From: Matt Arsenault
Date: Tue, 10 Nov 2020 16:06:59 +0000 (-0500)
Subject: AMDGPU: Select global saddr mode from SGPR pointer
X-Git-Tag: llvmorg-13-init~5978
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d2e52eec513a475fb69af7718e00a6aaac7738e8;p=platform%2Fupstream%2Fllvm.git

AMDGPU: Select global saddr mode from SGPR pointer

Use the 64-bit SGPR base with a 0 offset, since it's 1 fewer instruction
to materialize the 0 vs. the 64-bit copy.
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 339d6fb..2ad7fab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1821,10 +1821,9 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
   // Match the immediate offset first, which canonically is moved as low as
   // possible.
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue LHS = Addr.getOperand(0);
-    SDValue RHS = Addr.getOperand(1);
+  SDValue LHS, RHS;
+  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
     const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1852,11 +1851,24 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
   }
 
   // Match the variable offset.
-  if (Addr.getOpcode() != ISD::ADD)
-    return false;
+  if (Addr.getOpcode() != ISD::ADD) {
+    if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+        isa<ConstantSDNode>(Addr))
+      return false;
+
+    // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+    // moves required to copy a 64-bit SGPR to VGPR.
+    SAddr = Addr;
+    SDNode *VMov = CurDAG->getMachineNode(
+        AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+        CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+    VOffset = SDValue(VMov, 0);
+    Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+    return true;
+  }
 
-  SDValue LHS = Addr.getOperand(0);
-  SDValue RHS = Addr.getOperand(1);
+  LHS = Addr.getOperand(0);
+  RHS = Addr.getOperand(1);
   if (!LHS->isDivergent()) {
     // add (i64 sgpr), (zero_extend (i32 vgpr))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index feebe25..37a79ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3536,20 +3536,40 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
     return None;
 
   // Match the variable offset.
-  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
-    return None;
+  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
+    // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+    // drop this.
+    if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+        AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
+      return None;
+
+    // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+    // moves required to copy a 64-bit SGPR to VGPR.
+    const Register SAddr = AddrDef->Reg;
+    if (!isSGPR(SAddr))
+      return None;
+
+    MachineInstr *MI = Root.getParent();
+    MachineBasicBlock *MBB = MI->getParent();
+    Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+            VOffset)
+      .addImm(0);
+
+    return {{
+        [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
+        [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },  // voffset
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+    }};
+  }
 
   // Look through the SGPR->VGPR copy.
-  Register PtrBaseSrc =
+  Register SAddr =
       getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
-  if (!PtrBaseSrc)
-    return None;
-
-  const RegisterBank *BaseRB = RBI.getRegBank(PtrBaseSrc, *MRI, TRI);
-  if (BaseRB->getID() != AMDGPU::SGPRRegBankID)
+  if (!SAddr || !isSGPR(SAddr))
     return None;
 
-  Register SAddr = PtrBaseSrc;
   Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
 
   // It's possible voffset is an SGPR here, but the copy to VGPR will be
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8d301ae..02d9d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1679,7 +1679,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
 ; GPRIDX-NEXT:     wavefront_sgpr_count = 9
-; GPRIDX-NEXT:     workitem_vgpr_count = 4
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -1710,10 +1710,9 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 4
 ; GPRIDX-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s6
 ; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s7
-; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, 0
+; GPRIDX-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v5f64_s_s:
@@ -2194,7 +2193,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
 ; GPRIDX-NEXT:     wavefront_sgpr_count = 6
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -2211,17 +2210,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
 ; GPRIDX-NEXT:  ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GPRIDX-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, 0
 ; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 1
 ; GPRIDX-NEXT:    s_cselect_b32 s3, 2.0, 1.0
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 2
 ; GPRIDX-NEXT:    s_cselect_b32 s3, 0x40400000, s3
 ; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 3
 ; GPRIDX-NEXT:    s_cselect_b32 s2, 4.0, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    global_store_dword v[0:1], v2, off
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s2
+; GPRIDX-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v4f32_s_s_s:
@@ -2370,7 +2368,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
 ; GPRIDX-NEXT:     wavefront_sgpr_count = 7
-; GPRIDX-NEXT:     workitem_vgpr_count = 4
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -2389,8 +2387,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
 ;
GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8 ; GPRIDX-NEXT: s_mov_b32 s0, 0 ; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000 +; GPRIDX-NEXT: v_mov_b32_e32 v2, 0 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) -; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 @@ -2399,8 +2397,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3 ; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 -; GPRIDX-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_extract_v4f64_s_s_s: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 7901f22..88af446 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,36 +7,35 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: v_mov_b32_e32 v16, 0x100 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0 ; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80 +; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80 +; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16 -; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: v_mov_b32_e32 v4, s16 -; GCN-NEXT: v_mov_b32_e32 v5, s17 -; GCN-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NEXT: v_mov_b32_e32 v7, s19 -; GCN-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NEXT: v_mov_b32_e32 v9, s21 -; GCN-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NEXT: v_mov_b32_e32 v13, s25 -; GCN-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0 +; GCN-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NEXT: v_mov_b32_e32 v9, s45 +; GCN-NEXT: v_mov_b32_e32 v10, s46 +; GCN-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NEXT: v_mov_b32_e32 v12, s48 +; GCN-NEXT: v_mov_b32_e32 v13, s49 +; GCN-NEXT: v_mov_b32_e32 v14, s50 +; GCN-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: v_add_u32_e32 v0, 4, v16 ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -44,13 +43,13 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x 
i32> addrspace(1)* %out. ; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v1, s53 ; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s4, 0x50 ; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s55 ; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v35, s4, v16 +; GCN-NEXT: v_add_u32_e32 v35, 0x50, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v36, 0x54, v16 @@ -59,11 +58,10 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: v_add_u32_e32 v37, 0x58, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s5, 0x60 ; GCN-NEXT: v_add_u32_e32 v38, 0x5c, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s59 ; GCN-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v39, s5, v16 +; GCN-NEXT: v_add_u32_e32 v39, 0x60, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NEXT: buffer_store_dword v1, v39, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v40, 0x64, v16 @@ -72,11 +70,10 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: v_add_u32_e32 v41, 0x68, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s10, 0x70 ; GCN-NEXT: v_add_u32_e32 v42, 0x6c, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s63 ; GCN-NEXT: buffer_store_dword v1, v42, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v43, s10, v16 +; GCN-NEXT: v_add_u32_e32 v43, 0x70, v16 ; GCN-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v44, 0x74, v16 @@ -89,110 +86,104 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. 
; GCN-NEXT: v_mov_b32_e32 v1, s67 ; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s11, 0x90 ; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v51, s11, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NEXT: v_add_u32_e32 v51, 0x90, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s16 ; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NEXT: v_mov_b32_e32 v1, s18 ; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s28, 0xa0 ; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v55, s28, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NEXT: v_add_u32_e32 v55, 0xa0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s20 ; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NEXT: v_mov_b32_e32 v1, s22 ; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s29, 0xb0 ; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v59, s29, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NEXT: v_add_u32_e32 v59, 0xb0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s24 ; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NEXT: v_mov_b32_e32 v1, s26 ; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s38 ; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 
offen -; GCN-NEXT: s_movk_i32 s12, 0xd0 ; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v67, s12, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_add_u32_e32 v67, 0xd0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s13, 0xe0 ; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v71, s13, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: v_add_u32_e32 v71, 0xe0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: v_mov_b32_e32 v1, s45 ; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s14, 0xf0 ; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NEXT: v_mov_b32_e32 v1, s47 ; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v75, s14, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NEXT: v_add_u32_e32 v75, 0xf0, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NEXT: s_and_b32 s7, s7, 63 +; GCN-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NEXT: s_and_b32 s5, s5, 63 ; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NEXT: v_add_u32_e32 v17, 8, v16 ; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NEXT: s_lshl_b32 s5, s5, 2 ; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v18, 12, v16 ; GCN-NEXT: v_add_u32_e32 v19, 16, v16 @@ -208,8 +199,8 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. ; GCN-NEXT: v_add_u32_e32 v29, 56, v16 ; GCN-NEXT: v_add_u32_e32 v30, 60, v16 ; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_add_u32_e32 v1, s7, v16 +; GCN-NEXT: v_add_u32_e32 v1, s5, v16 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen @@ -288,86 +279,24 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out. 
; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 -; GCN-NEXT: v_mov_b32_e32 v65, s9 -; GCN-NEXT: s_add_u32 s6, s8, 16 -; GCN-NEXT: v_mov_b32_e32 v64, s8 -; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off -; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, 32 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[4:7], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, 48 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, 64 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[12:15], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, s4 -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_add_u32 s4, s8, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[20:23], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s10 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, 0x80 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[28:31], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s11 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s28 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s29 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, 0xc0 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s12 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s13 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s14 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[60:63], off +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[8:9] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[8:9] offset:48 +; GCN-NEXT: global_store_dwordx4 
v64, v[16:19], s[8:9] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[8:9] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[8:9] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[8:9] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[8:9] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[8:9] offset:144 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[8:9] offset:160 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[8:9] offset:176 +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[8:9] offset:192 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[8:9] offset:208 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[8:9] offset:224 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[8:9] offset:240 ; GCN-NEXT: s_endpgm %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index b3cbf7f..a29f31f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -1999,12 +1999,12 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_cselect_b32 s7, s16, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_s: @@ -2163,19 +2163,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: @@ -2335,10 +2335,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_s: @@ -2510,9 +2509,9 @@ define amdgpu_ps void 
@insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] @@ -2521,10 +2520,9 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_v: @@ -2699,9 +2697,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v6, s18 ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] @@ -2710,10 +2708,9 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_v: @@ -2874,20 +2871,20 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, 16 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_v: @@ -3022,20 +3019,20 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 
v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, 16 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: @@ -3177,13 +3174,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v10, 16 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_v: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 7d99993..fb54cf0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -6,19 +6,18 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i8_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off @@ -139,9 +138,8 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8 
define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i8_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_load_ushort v1, v[1:2], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 @@ -205,19 +203,18 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v2i8_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: global_load_ushort v1, v[1:2], off -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ushort v2, v2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_store_short v[0:1], v2, off @@ -274,9 +271,8 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v2i8_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ushort v2, v2, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir index 18ec87e..96e109e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir @@ -690,9 +690,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store 
seq_cst 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 ; GFX10: $vcc_hi = IMPLICIT_DEF @@ -700,9 +700,9 @@ body: | ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -791,9 +791,9 @@ body: | ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095 ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 ; GFX10: $vcc_hi = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir index f8eb541..7906ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir @@ -21,9 +21,9 @@ body: | ; WAVE32-LABEL: name: copy ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 - ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] ; WAVE32: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; WAVE32: GLOBAL_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:sgpr(p1) = COPY $sgpr2_sgpr3 %1:vgpr(p1) = COPY %0 %2:vgpr(s32) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir index b450aa8..e04f8d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -26,8 +26,8 @@ body: | ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK: %12:vreg_64 = nofpexcept 
V_ADD_F64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec - ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]] - ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %15, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; CHECK: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 @@ -76,8 +76,8 @@ body: | ; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec - ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]] - ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %16, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; CHECK: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir index 23ba321..4623a7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -15,16 +15,16 @@ body: | ; GFX9-LABEL: name: load_global_s32_from_sgpr ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(p1) = COPY %0 %2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1) @@ -400,19 +400,9 @@ body: | ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_2049 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2049 - ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], 
%subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc - ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc - ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: $vcc_hi = IMPLICIT_DEF @@ -442,19 +432,9 @@ body: | ; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049 ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247 - ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 - ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 - ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 - ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc - ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc - ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 - ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) - ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: $vcc_hi = IMPLICIT_DEF @@ -717,3 +697,52 @@ body: | $vgpr0 = COPY %4 ... 
+ +--- +name: load_global_s32_from_copy_undef_sgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GFX9-LABEL: name: load_global_s32_from_copy_undef_sgpr + ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_copy_undef_sgpr + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = G_IMPLICIT_DEF + %1:vgpr(p1) = COPY %0 + %2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %2 + +... + +--- +name: load_global_s32_from_undef_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; GFX9-LABEL: name: load_global_s32_from_undef_vgpr + ; GFX9: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_undef_vgpr + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:vgpr(p1) = G_IMPLICIT_DEF + %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %1 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll index e2fba85..5fc598b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -55,11 +55,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 ad ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0 store i32 %result, i32 addrspace(1)* %out @@ -106,11 +105,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -217,15 +215,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 ; GFX9-LABEL: global_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_inc v2, v[0:1], v2, off glc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -268,17 +263,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o ; GFX9-LABEL: global_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s2, 16 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_inc v2, v[0:1], v2, off glc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 
@llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -310,11 +300,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) n ; GFX9-LABEL: global_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] glc ; GFX9-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void @@ -348,13 +337,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* ; GFX9-LABEL: global_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 16 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc ; GFX9-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -508,18 +494,15 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_u32_e32 v1, 2, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v2 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 @@ -572,10 +555,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ad ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -624,10 +606,9 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -742,16 +723,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ; GFX9-LABEL: global_atomic_inc_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -796,18 +774,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o ; GFX9-LABEL: global_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s2, 32 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -841,12 +814,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n ; GFX9-LABEL: global_atomic_inc_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -882,14 +854,11 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* ; GFX9-LABEL: global_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s0, s0, 32 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -1247,19 +1216,16 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_add_u32_e32 v4, 2, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dword v[2:3], v4, off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dword v3, v2, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 @@ -1523,17 +1489,14 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_inc_rtn_u32 v4, v1, v0 -; GFX9-NEXT: ds_inc_rtn_u32 v5, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v5, off +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 8847cb0..3cb7545 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -267,10 +267,9 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32] ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s5, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32: @@ -286,10 +285,9 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32] ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s5, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -340,15 +338,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: @@ -358,14 +355,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -416,15 +412,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: @@ -434,14 +429,13 @@ define amdgpu_kernel void 
@test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -492,15 +486,14 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o ; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c ; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: @@ -510,14 +503,13 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o ; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) store float %result, float addrspace(1)* %out, align 4 @@ -579,9 +571,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double % ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s1 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s0 -; GFX10_W32-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f64: @@ -597,9 +588,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double % ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s1 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s0 -; GFX10_W64-NEXT: global_store_dwordx2 
v[2:3], v[0:1], off +; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) store double %result, double addrspace(1)* %out, align 8 @@ -658,10 +648,9 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou ; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc: @@ -676,10 +665,9 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) @@ -734,10 +722,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: @@ -751,10 +738,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) store float %result, float addrspace(1)* %out, align 4 @@ -808,10 +794,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: 
test_div_fmas_f32_imm_true_cond_to_vcc: @@ -825,10 +810,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 -; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) store float %result, float addrspace(1)* %out, align 4 @@ -899,7 +883,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x54 +; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) @@ -907,43 +891,37 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace ; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 ; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 -; GFX10_W32-NEXT: s_add_u32 s0, s4, 8 -; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0 -; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 -; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2 +; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v2, v2, v3, v1 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x54 +; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x54 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_clause 0x2 ; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] ; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 ; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 -; GFX10_W64-NEXT: s_add_u32 s0, s4, 8 -; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0 -; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 -; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3] +; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v2, v2, v3, v1 -; 
GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; GFX10_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -1058,14 +1036,11 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_add_u32 s0, s2, 8 -; GFX10_W32-NEXT: s_addc_u32 s1, s3, 0 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3 -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] offset:8 ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: @@ -1091,14 +1066,11 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out ; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_add_u32 s0, s2, 8 -; GFX10_W64-NEXT: s_addc_u32 s1, s3, 0 ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3 -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 -; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off +; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] offset:8 ; GFX10_W64-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 69d5236..9c0cb28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -53,10 +53,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -120,10 +119,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1 +; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -193,9 +191,8 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl ; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -265,9 +262,8 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl ; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -324,14 +320,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* % ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -386,14 +381,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* % ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -448,14 +442,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* % ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, 
s0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -510,14 +503,13 @@ define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* % ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, s0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -572,14 +564,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -634,14 +625,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -696,14 +686,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -758,14 +747,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -811,12 +799,11 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* % ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, s3, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, s3, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -857,12 +844,11 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* % ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, s2, s3, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, s2, s3, s2 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) %result0 = extractvalue { float, i1 } %result, 0 @@ -905,12 +891,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) %result0 = extractvalue { double, i1 } %result, 0 @@ -953,12 +938,11 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: 
$vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) %result0 = extractvalue { double, i1 } %result, 0 @@ -1005,14 +989,13 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, 1.0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -1063,14 +1046,13 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, 2.0, 2.0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -1137,10 +1119,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -1209,10 +1190,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -1255,12 +1235,11 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* ; GFX10-LABEL: test_div_scale_f32_val_undef_val: 
; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_div_scale_f32 v2, s2, s0, s0, 0x41000000 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1295,12 +1274,11 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1332,12 +1310,11 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1) ; GFX10-LABEL: test_div_scale_f32_undef_undef_val: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_div_scale_f32 v2, s2, s0, s0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -1372,15 +1349,14 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* ; ; GFX10-LABEL: test_div_scale_f64_val_undef_val: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40200000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) %result0 = extractvalue { double, i1 } %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll index 867cf8d..956432f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -66,12 +66,11 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 
s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) @@ -112,12 +111,11 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll index 22c1255..257615e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -62,10 +62,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 ; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe da -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -84,12 +83,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -116,10 +114,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 ; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe da -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -138,12 +135,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll index 33c95e3..dd7111b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -66,12 +66,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -112,12 +111,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll index 0212975..011d767 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -62,10 +62,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 ; GFX9-NEXT: image_load v[0:4], v[0:1], 
s[0:7] dmask:0xf unorm a16 tfe -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -84,12 +83,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -116,10 +114,9 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 ; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: global_store_dword v5, v4, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -138,12 +135,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll index e23ba4f..bfd3470 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -66,12 +66,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, 
i32 %r, <8 x i32> %rsrc, i32 1, i32 0) @@ -112,12 +111,11 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v5, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe -; GFX10-NEXT: v_mov_b32_e32 v6, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: global_store_dword v5, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll index 6c0fe47..235f606 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -23,13 +23,12 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { ; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf] ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11] -; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00] +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 store i32 %tmp0, i32 addrspace(1)* %out @@ -52,15 +51,14 @@ define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) { ; GFX10-LABEL: mov_dpp64_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; encoding: [0x01,0x02,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] ; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: 
[0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x7d,0x00] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0 store i64 %tmp0, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 812ad97..4badf1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -24,12 +24,11 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) store i32 %tmp0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 8046caf..291f40e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -387,9 +387,8 @@ define <3 x i32> @v_load_constant_v3i32_align16(<3 x i32> addrspace(4)* %ptr) { define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1] ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 @@ -398,99 +397,55 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; 
GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[12:13], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[14:15], off -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[6:7], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v0, s[0:1] offset:11 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v18 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, s0, v19 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v16, s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v7 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2 +; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v11, v5 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v8, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v7 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v3, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v12, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v12 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v9, v12, v10 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v3 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v6 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog ; @@ -577,9 +532,8 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)* inreg %ptr) { ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; 
GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1] ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 @@ -588,49 +542,27 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)* ; ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v0, s[0:1] offset:10 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v6, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v5, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v6, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v6 +; 
GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 5f4d409..3a4e590 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -91,24 +91,20 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_cbranch_scc0 BB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb1 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, gv2@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, gv2@gotpcrel32@hi+12 ; GFX9-NEXT: s_getpc_b64 s[2:3] -; GFX9-NEXT: s_add_u32 s2, s2, gv2@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s3, s3, gv2@gotpcrel32@hi+12 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+12 +; GFX9-NEXT: s_add_u32 s2, s2, gv3@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v0, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 @@ -123,15 +119,11 @@ define amdgpu_kernel void @localize_globals(i1 %cond) { ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: BB1_4: ; %bb2 ; GFX9-NEXT: s_endpgm entry: @@ -171,17 +163,13 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv2@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v0, s[6:7] ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv3@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv3@rel32@hi+12 -; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: 
BB2_2: ; %Flow ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -190,17 +178,13 @@ define void @localize_internal_globals(i1 %cond) { ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv0@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v0, s[6:7] ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12 -; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: BB2_4: ; %bb2 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 3fa2613..16c2b624 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -38,23 +38,21 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x10 ; GCN-NEXT: s_add_u32 s5, s32, 0x1000 +; GCN-NEXT: s_add_u32 s8, s5, 4 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: s_add_u32 s8, s5, 4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: v_add_u32_e32 v0, v2, v0 +; GCN-NEXT: global_store_dword v1, v0, s[6:7] ; GCN-NEXT: BB0_3: ; %bb.2 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off @@ -119,17 +117,15 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: s_add_u32 s4, s5, s4 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: v_add_u32_e32 v0, v2, v0 +; GCN-NEXT: 
global_store_dword v1, v0, s[6:7] ; GCN-NEXT: BB1_2: ; %bb.1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 8fe55b6..abb602a 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -133,9 +133,12 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* ; HSA-LABEl: {{^}}use_constant_to_global_addrspacecast: ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}} -; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] -; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] -; HSA: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} +; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] +; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] +; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} + +; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO:v[0-9]+]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}} define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* %ld = load volatile i32, i32 addrspace(1)* %stof @@ -186,10 +189,13 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 { ; HSA: enable_sgpr_queue_ptr = 0 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 -; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] -; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0 -; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] +; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] +; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0 +; CI: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] + +; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GFX9: global_store_dword [[ZERO]], [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]$}} define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll index 4222ec5..4025513 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll @@ -22,9 +22,9 @@ ; ELF: } ; GFX10-W32: NumSGPRsForWavesPerEU: 4 -; GFX10-W32: NumVGPRsForWavesPerEU: 3 +; GFX10-W32: NumVGPRsForWavesPerEU: 1 ; GFX10-W64: NumSGPRsForWavesPerEU: 2 -; GFX10-W64: NumVGPRsForWavesPerEU: 3 +; GFX10-W64: NumVGPRsForWavesPerEU: 1 define amdgpu_kernel void @simple(i32 addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 297f944..8082305 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -32,12 +32,10 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 @@ -66,9 +64,7 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NEXT: global_store_dword v[0:1], v40, off +; GCN-NEXT: global_store_dword v40, v40, s[34:35] ; GCN-NEXT: s_endpgm call void @func(i32 0) store i32 0, i32 addrspace(1)* %ptr @@ -88,10 +84,9 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* ; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 ; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NEXT: global_store_dword v[1:2], v0, off +; GCN-NEXT: global_store_dword v40, v0, s[34:35] ; GCN-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) store i32 %rv, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index e0e7636..4538436 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -161,7 +161,8 @@ define hidden void @func_indirect_use_workgroup_id_z() #1 { } ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x: -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off ; GCN: ; use s12 define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.x() @@ -171,7 +172,8 @@ define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 { } ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y: -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off ; GCN: ; use s13 define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.y() @@ -181,7 +183,8 @@ define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 { } ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z: -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off ; GCN: ; use s14 define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.z() diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll index 1d79f88..450bd02 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -33,11 +33,10 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add ; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, 2 -; GCN-NEXT: global_atomic_csub 
v0, v[0:1], v2, off offset:28 glc +; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc ; GCN-NEXT: BB0_2: ; %endif ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0800 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 22dde27..be1e841 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -32,11 +32,10 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, 2.0 -; GCN-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:28 +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28 ; GCN-NEXT: global_load_dword v0, v[0:1], off ; GCN-NEXT: BB0_2: ; %endif ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index c803b26..8f2ce73 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -74,7 +74,8 @@ done: ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: ; GCN: s_and_saveexec_b64 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} -; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}} +; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}} ; GCN: {{^}}BB2_2: ; GCN: s_or_b64 exec define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { @@ -697,7 +698,8 @@ done: ; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset: -; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}} +; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}} define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 1289032..271f6c7 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -226,32 +226,29 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s9 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 -; GFX900-NEXT: v_mov_b32_e32 v1, s5 -; GFX900-NEXT: global_load_ushort v2, v[0:1], off +; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 -; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; 
GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 -; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:4 -; GFX900-NEXT: v_mov_b32_e32 v0, s6 -; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 +; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 -; GFX900-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GFX900-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v3, v4 -; GFX900-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 -; GFX900-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 +; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX900-NEXT: s_endpgm ; ; FLATSCR-LABEL: vload2_private: @@ -259,36 +256,33 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 -; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off +; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4 -; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4 +; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6 -; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6 +; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v0, s2 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s3 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8 +; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_load_ushort v2, off, vcc_hi offset:4 +; FLATSCR-NEXT: scratch_load_ushort v0, off, vcc_hi offset:4 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 -; FLATSCR-NEXT: scratch_load_ushort v4, off, vcc_hi offset:6 +; FLATSCR-NEXT: scratch_load_ushort v3, off, vcc_hi offset:6 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 ; FLATSCR-NEXT: s_waitcnt vmcnt(1) -; FLATSCR-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; FLATSCR-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v3, v4 -; FLATSCR-NEXT: scratch_load_short_d16_hi v3, off, vcc_hi offset:8 -; FLATSCR-NEXT: v_lshl_or_b32 
v2, v4, 16, v2 +; FLATSCR-NEXT: v_mov_b32_e32 v1, v3 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, vcc_hi offset:8 +; FLATSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm entry: %loc = alloca [3 x i16], align 2, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll index e923363..d10d0dd 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -4,7 +4,7 @@ ; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, ; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] ; GCN-NOT: v_and_b32 -; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]], +; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -17,7 +17,7 @@ define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) { ; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, ; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] ; GCN-NOT: v_and_b32 -; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]], +; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -30,7 +30,7 @@ define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) { ; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, ; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] ; GCN-NOT: v_and_b32 -; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]], +; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -58,7 +58,7 @@ define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 ; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, ; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] ; GCN-NOT: v_or_b32 -; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]], +; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -71,7 +71,7 @@ define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) { ; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, ; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] ; GCN-NOT: v_or_b32 -; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]], +; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -84,7 +84,7 @@ define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) { ; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, ; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]] ; GCN-NOT: v_or_b32 -; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]], +; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -118,7 +118,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspac } ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16: -; GCN: v_cndmask_b32_e64 v2, 2, 9, +; GCN: 
v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9, define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, i16 -4, i16 3 %bo = sub i16 5, %sel diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 9f72a34..f9a3d19 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -958,14 +958,12 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out ; ; GFX9-LABEL: load_constant_adjacent_offsets: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -990,14 +988,12 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out ; ; GFX9-LABEL: load_constant_disjoint_offsets: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -1026,29 +1022,25 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* ; ; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets: ; GFX9-ALIGNED: ; %bb.0: -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 -; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v4 offset1:1 +; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-ALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets: ; GFX9-UNALIGNED: ; %bb.0: -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-UNALIGNED-NEXT: s_endpgm %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -1083,12 +1075,11 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspac ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 @@ -1154,10 +1145,9 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* % ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add_f32_e32 v2, v0, v9 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v9 +; GFX9-NEXT: global_store_dword v10, v0, s[0:1] ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 @@ -1221,13 +1211,12 @@ define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 @@ -1253,13 +1242,12 @@ define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addr ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %load = load i64, i64 
addrspace(3)* %in, align 4 store i64 %load, i64 addrspace(1)* %out, align 8 @@ -1304,6 +1292,7 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, s0, v1 ; GFX9-NEXT: v_add_u32_e32 v3, s1, v0 @@ -1322,10 +1311,8 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 -; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:40 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v8, v0, s[4:5] offset:40 ; GFX9-NEXT: s_endpgm float addrspace(1)* nocapture %arg, [4 x [4 x float]] addrspace(3)* %arg1, @@ -1402,27 +1389,26 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa ; GFX9-NEXT: s_mov_b32 s36, s0 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_add_u32 s36, s36, s3 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 -; GFX9-NEXT: v_lshl_add_u32 v40, v0, 2, s2 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: ds_read_b32 v41, v40 +; GFX9-NEXT: ds_read_b32 v42, v41 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: ds_read_b32 v0, v40 offset:4 +; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v41, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-NEXT: v_mov_b32_e32 v1, s35 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, v42, v0 +; GFX9-NEXT: global_store_dword v40, v0, s[34:35] ; GFX9-NEXT: s_endpgm %x = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x @@ -1508,41 +1494,38 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* ; ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-ALIGNED: ; %bb.0: ; %entry -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v0 offset:65 -; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v0 offset:66 -; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v0 offset:67 -; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v0 offset:68 -; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v0 offset:69 -; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v0 offset:70 -; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v0 offset:71 -; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v0 offset:72 +; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 +; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 +; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 +; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 +; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 +; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70 +; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71 +; 
GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v0, v1 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-ALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry -; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 +; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-UNALIGNED-NEXT: s_endpgm entry: %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index d1d3169..f3208c2 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -10,7 +10,7 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff ; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]] -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]] define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff ; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]] -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]] define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half 
%fabs, half addrspace(1)* %out @@ -65,7 +65,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half ; GFX89-NOT: and ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]] ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]] -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index b32dce3..1363c1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -84,13 +84,12 @@ define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 a ; GFX9-LABEL: global_store_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: v_mov_b32_e32 v2, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v[0:1], v2, off -; GFX9-NEXT: global_store_short v[0:1], v3, off offset:2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2 ; GFX9-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 2 @@ -193,11 +192,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(i16 addrspace(1)* %p, i16 a ; GFX9-LABEL: global_store_2xi16_align1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 1 @@ -283,11 +281,10 @@ define amdgpu_kernel void @global_store_2xi16_align4(i16 addrspace(1)* %p, i16 a ; GFX9-LABEL: global_store_2xi16_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 74c8b54..99251ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -12,7 +12,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void 
@test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half undef) store half %canonicalized, half addrspace(1)* %out @@ -34,7 +34,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16: ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) @@ -59,7 +59,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) @@ -70,7 +70,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* % ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16: ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace( ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16: ; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}} ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} @@ -100,7 +100,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* % ; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16: ; VI: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}} ; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 { %val = load half, half addrspace(1)* %out %val.fneg = fneg half %val @@ -113,7 +113,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half ad ; VI: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}| ; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}| ; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} @@ -128,7 +128,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ha ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void 
@test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, half addrspace(1)* %out @@ -137,7 +137,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, half addrspace(1)* %out @@ -146,7 +146,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, half addrspace(1)* %out @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, half addrspace(1)* %out @@ -164,7 +164,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, half addrspace(1)* %out @@ -173,7 +173,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out @@ -182,7 +182,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out @@ -191,7 +191,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half a ; GCN-LABEL: 
{{^}}test_default_denormals_fold_canonicalize_denormal1_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out @@ -200,7 +200,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out @@ -209,7 +209,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half a ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, half addrspace(1)* %out @@ -218,7 +218,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, half addrspace(1)* %out @@ -227,7 +227,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrs ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, half addrspace(1)* %out @@ -236,7 +236,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrs ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, half addrspace(1)* %out @@ -245,7 +245,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: 
{{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, half addrspace(1)* %out @@ -254,7 +254,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, half addrspace(1)* %out @@ -263,7 +263,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, half addrspace(1)* %out @@ -276,7 +276,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace ; VI-NOT: v_and_b32 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}} -; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX9: global_store_dword v{{.+}}, [[REG]], s define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -335,7 +335,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad ; VI-NOT: 0xffff ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}} -; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -352,7 +352,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa ; VI-NOT: v_and_b32 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}} -; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) @@ -362,7 +362,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -371,7 +371,7 @@ define 
amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace( ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -380,7 +380,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace( ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -389,7 +389,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace( ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -398,7 +398,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace( ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -407,7 +407,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrs ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -416,7 +416,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(< ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -425,7 +425,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x ; GCN-LABEL: 
{{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -434,7 +434,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(< ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -443,7 +443,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -452,7 +452,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -461,7 +461,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x hal ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -470,7 +470,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x hal ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -479,7 +479,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> a ; GCN-LABEL: 
{{^}}test_fold_canonicalize_snan1_value_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -488,7 +488,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> a ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -497,7 +497,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> a ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16: ; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -542,7 +542,7 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00 -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index d95194a..357de0e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -20,7 +20,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: ; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %canonicalized = call float @llvm.canonicalize.f32(float %val) @@ -31,7 +31,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32: ; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out @@ 
-41,7 +41,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32: ; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) @@ -53,7 +53,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}| ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: ; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}} ; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fneg = fneg float %val @@ -77,7 +77,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float undef) store float %canonicalized, float addrspace(1)* %out @@ -86,7 +86,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, float addrspace(1)* %out @@ -95,7 +95,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, float addrspace(1)* %out @@ -104,7 +104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} -; GCN: {{flat|global}}_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, float addrspace(1)* %out @@ -113,7 +113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float %canonicalized, float addrspace(1)* %out @@ -122,7 +122,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, float addrspace(1)* %out @@ -131,7 +131,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1) ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out @@ -140,7 +140,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(flo ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out @@ -149,7 +149,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out @@ -158,7 +158,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(flo ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) 
#3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out @@ -167,7 +167,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, float addrspace(1)* %out @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, float addrspace(1)* %out @@ -185,7 +185,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addr ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, float addrspace(1)* %out @@ -194,7 +194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addr ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, float addrspace(1)* %out @@ -203,7 +203,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, float addrspace(1)* %out @@ -212,7 +212,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float %canonicalized, 
float addrspace(1)* %out @@ -221,7 +221,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, float addrspace(1)* %out @@ -230,7 +230,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspac ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %canonicalized = call double @llvm.canonicalize.f64(double %val) @@ -240,7 +240,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out @@ -249,7 +249,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}| -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) @@ -260,7 +260,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}| -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) @@ -272,7 +272,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspac ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fneg = fneg double %val @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* ; GCN-LABEL: 
{{^}}test_fold_canonicalize_p0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, double addrspace(1)* %out @@ -294,7 +294,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, double addrspace(1)* %out @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, double addrspace(1)* %out @@ -314,7 +314,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, double addrspace(1)* %out @@ -324,7 +324,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, double addrspace(1)* %out @@ -334,7 +334,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { 
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out @@ -344,7 +344,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(dou ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out @@ -354,7 +354,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out @@ -364,7 +364,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(dou ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out @@ -374,7 +374,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, double addrspace(1)* %out @@ -384,7 +384,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, double 
addrspace(1)* %out @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, double addrspace(1)* %out @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, double addrspace(1)* %out @@ -414,7 +414,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, double addrspace(1)* %out @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, double addrspace(1)* %out @@ -434,7 +434,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 
d62155c..1a00573 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s
; GCN-LABEL: {{^}}div_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
@@ -13,7 +13,7 @@
; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float 1.000000e+00, %load, !fpmath !0
@@ -33,7 +33,7 @@ define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float -1.000000e+00, %load, !fpmath !0
@@ -53,7 +53,7 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -74,7 +74,7 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -112,7 +112,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%div = fdiv <4 x float> , %load, !fpmath !0
@@ -121,6 +121,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
}
; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
+; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
@@ -156,6 +157,7 @@ define amdgpu_kernel void
@div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* % } ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp: +; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] @@ -183,7 +185,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* % ; GCN-FLUSH: v_rcp_f32_e64 ; GCN-FLUSH: v_rcp_f32_e64 ; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]] -; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off +; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 %neg = fsub <4 x float> , %load @@ -221,7 +223,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* % ; GCN-FLUSH: v_rcp_f32_e32 ; GCN-FLUSH: v_rcp_f32_e32 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]] -; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off +; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 %neg = fsub <4 x float> , %load @@ -334,7 +336,7 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* % ; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] ; GCN-FLUSH: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] -; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off +; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv float %num, %load, !fpmath !0 @@ -345,7 +347,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num ; GCN-LABEL: {{^}}div_1_by_x_fast: ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]:[0-9]+\]}} define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv fast float 1.000000e+00, %load, !fpmath !0 @@ -356,7 +358,7 @@ define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) { ; GCN-LABEL: {{^}}div_minus_1_by_x_fast: ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %div = fdiv fast float -1.000000e+00, %load, !fpmath !0 @@ -367,7 +369,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) { ; GCN-LABEL: {{^}}div_1_by_minus_x_fast: ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] -; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void 
@div_1_by_minus_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %neg = fsub float -0.000000e+00, %load, !fpmath !0 @@ -379,7 +381,7 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) { ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast: ; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] -; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) { %load = load float, float addrspace(1)* %arg, align 4 %neg = fsub float -0.000000e+00, %load, !fpmath !0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index c4ee8ad..42229a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -26,7 +26,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, ; GFX89-NOT: _and ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}| ; GFX89-NOT: [[MUL]] -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GFX89: {{flat|global}}_store_short v{{.+}}, [[MUL]] define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs @@ -134,8 +134,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %o ; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]] ; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000 ; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]] -; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_ABS]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_NEG]] +; GFX9-DAG: global_store_dword v{{[0-9]+}}, [[V_ABS]], s{{\[[0-9]+:[0-9]+\]}} +; GFX9: global_store_dword v{{[0-9]+}}, [[V_NEG]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) %fneg = fsub <2 x half> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 5afcafc..cca5398 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -32,7 +32,7 @@ define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1) ; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]], ; GCN: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} ; GCN: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]] -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V_XOR]] define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 123c40a..f2d3c59 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -48,16 +48,15 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; 
GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v2, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32: @@ -108,12 +107,11 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -188,22 +186,21 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s5, s5, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 ; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -265,14 +262,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 25 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -373,13 +369,14 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s11, s7, 1 ; 
GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s7, s7, 1 +; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -396,11 +393,9 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 ; GFX9-NEXT: s_lshr_b32 s1, s4, 1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32: @@ -478,9 +473,8 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31 @@ -489,7 +483,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v4i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 0874b09..ef7eff2 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -51,13 +51,12 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 % ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32: @@ -105,12 +104,11 @@ define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -173,16 +171,15 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: 
s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32: @@ -240,14 +237,13 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 7 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -324,6 +320,7 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -335,11 +332,9 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32: @@ -409,9 +404,8 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 @@ -420,7 +414,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> ; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v4i32_imm: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 3991b27..122b10d 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll 
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2508,15 +2508,14 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[4:5], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 @@ -2541,6 +2540,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2548,14 +2548,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[4:5], off -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -2655,17 +2654,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v13, s5 -; GFX9-NEXT: v_mov_b32_e32 v12, s4 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[12:13], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, 
external_void_func_v16i32@rel32@hi+12 @@ -2690,6 +2688,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2697,16 +2696,15 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v13, s5 -; GFX10-NEXT: v_mov_b32_e32 v12, s4 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[12:13], off -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -2732,21 +2730,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v29, s5 -; GFX9-NEXT: v_mov_b32_e32 v28, s4 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 @@ -2771,6 +2768,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v28, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -2778,20 +2776,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v29, s5 -; GFX10-NEXT: v_mov_b32_e32 v28, s4 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[28:29], off -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -2817,21 +2814,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v29, s5 -; GFX9-NEXT: v_mov_b32_e32 v28, s4 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: 
s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 @@ -2860,28 +2856,28 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v28, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v32, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v29, s5 -; GFX10-NEXT: v_mov_b32_e32 v28, s4 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-NEXT: global_load_dword v32, v[0:1], off -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[28:29], off -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2987,15 +2983,14 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 -; GFX9-NEXT: global_load_ubyte v0, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:4 +; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 @@ -3020,6 +3015,7 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -3027,14 +3023,13 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ubyte v0, v1, s[4:5] +; GFX10-NEXT: global_load_dword v1, v1, s[4:5] offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ubyte v0, v[1:2], off -; GFX10-NEXT: global_load_dword v1, v[1:2], off offset:4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -3230,14 +3225,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 @@ -3281,6 +3275,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 @@ -3288,12 +3283,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index f93b6d3..0076685 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* % ; GFX900-NEXT: s_cbranch_execnz [[LOOP]] ; GFX908-NOT: v_add_f32 -; GFX908: global_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off +; GFX908: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s ; 
GFX908-NOT: s_cbranch_execnz define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index b9760a6..f28b9fd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -8,13 +8,26 @@ ; No vgpr offset, constants ; -------------------------------------------------------------------------------- +; SGPR base only +define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) { +; GCN-LABEL: global_load_saddr_i8_offset_0: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %load = load i8, i8 addrspace(1)* %sbase + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + ; SGPR base with maximum gfx9 immediate offset define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_4095: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -65,9 +78,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg4096: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -141,9 +153,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inr define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_2048: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -164,9 +175,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_2049: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2049 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -187,9 +197,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_2050: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte 
v0, v[0:1], off offset:2050 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -210,9 +219,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) { ; GCN-LABEL: global_load_saddr_i8_offset_neg2048: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048 @@ -226,9 +234,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inr define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg2049: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -250,9 +257,8 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inr define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg2050: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2050 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -2316,5 +2322,44 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128 ret <2 x half> %cast } +; -------------------------------------------------------------------------------- +; or-with-constant as add +; -------------------------------------------------------------------------------- + +; Check add-as-or with split 64-bit or. 
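+; These tests use a divergent base (a zero-extended VGPR index or'd with a
+; constant), so the SGPR-base (saddr) form is not selected here; the address is
+; kept in a VGPR pair and the constant is folded with v_or_b32, in contrast to
+; the uniform-base tests above, which now use a zeroed VGPR offset with an SGPR
+; address pair.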
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) { +; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: +; GCN: ; %bb.0: +; GCN-NEXT: v_or_b32_e32 v0, 16, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_load_ubyte v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 16 + %addr = inttoptr i64 %or to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) { +; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GCN: ; %bb.0: +; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_load_ubyte v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 4160 + %addr = inttoptr i64 %or to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + !0 = !{i32 0, i32 1073741824} ; (1 << 30) !1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 7bd2fd2..93dcde0 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_offset: ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -13,7 +13,7 @@ entry: } ; GCN-LABEL: {{^}}atomic_add_i32_max_neg_offset: -; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:-4096{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}} define amdgpu_kernel void @atomic_add_i32_max_neg_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 -1024 @@ -57,7 +57,7 @@ entry: ; SIVI: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -84,7 +84,7 @@ entry: ; SIVI: buffer_store_dword [[RET]] ; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -96,7 +96,7 @@ entry: ; GCN-LABEL: 
{{^}}atomic_add_i32: ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst @@ -107,8 +107,8 @@ entry: ; SIVI: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst @@ -144,7 +144,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_offset: ; SIVI: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -156,7 +156,7 @@ entry: ; SIVI: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_and [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -196,7 +196,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32: ; SIVI: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_and v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst @@ -207,7 +207,7 @@ entry: ; SIVI: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst @@ -244,7 +244,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_offset: ; SIVI: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -256,7 +256,7 @@ entry: ; SIVI: buffer_atomic_sub 
[[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_sub v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -296,7 +296,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32: ; SIVI: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_sub v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst @@ -307,7 +307,7 @@ entry: ; SIVI: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst @@ -344,7 +344,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_offset: ; SIVI: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -356,7 +356,7 @@ entry: ; SIVI: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -396,7 +396,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32: ; SIVI: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst @@ -407,7 +407,7 @@ entry: ; SIVI: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst @@ -444,7 +444,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_offset: ; SIVI: buffer_atomic_umax 
v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -456,7 +456,7 @@ entry: ; SIVI: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -495,7 +495,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32: ; SIVI: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst @@ -506,7 +506,7 @@ entry: ; SIVI: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst @@ -542,7 +542,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_offset: ; SIVI: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -554,7 +554,7 @@ entry: ; SIVI: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -593,7 +593,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32: ; SIVI: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst @@ -604,7 +604,7 @@ entry: ; SIVI: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_smin 
[[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst @@ -640,7 +640,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_offset: ; SIVI: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -652,7 +652,7 @@ entry: ; SIVI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -690,7 +690,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32: ; SIVI: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst @@ -701,7 +701,7 @@ entry: ; SIVI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst @@ -737,7 +737,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_offset: ; SIVI: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -749,7 +749,7 @@ entry: ; SIVI: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -788,7 +788,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32: ; SIVI: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], 
v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst @@ -799,7 +799,7 @@ entry: ; SIVI: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst @@ -835,7 +835,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_offset: ; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -846,7 +846,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_f32_offset: ; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) { entry: %gep = getelementptr float, float addrspace(1)* %out, i64 4 @@ -858,7 +858,7 @@ entry: ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -896,7 +896,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32: ; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -907,7 +907,7 @@ entry: ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -943,7 +943,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; SIVI: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:16{{$}} +; GFX9: global_atomic_cmpswap 
v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -955,7 +955,7 @@ entry: ; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword v[[RET]] -; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:16 glc{{$}} +; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -997,7 +997,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32: ; SIVI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst @@ -1008,7 +1008,7 @@ entry: ; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword v[[RET]] -; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off glc{{$}} +; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst @@ -1046,7 +1046,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_offset: ; SIVI: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -; GFX9: global_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}} define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -1058,7 +1058,7 @@ entry: ; SIVI: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_atomic_xor v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -1096,7 +1096,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32: ; SIVI: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}} +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst @@ -1107,7 +1107,7 @@ entry: ; SIVI: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: 
buffer_store_dword [[RET]] -; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}} +; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst @@ -1145,7 +1145,7 @@ entry: ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4 @@ -1161,7 +1161,7 @@ entry: ; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:-512 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}} define amdgpu_kernel void @atomic_load_i32_negoffset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %in, i64 -128 @@ -1175,7 +1175,7 @@ entry: ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16 glc{{$}} +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} define amdgpu_kernel void @atomic_load_f32_offset(float addrspace(1)* %in, float addrspace(1)* %out) { entry: %gep = getelementptr float, float addrspace(1)* %in, i64 4 @@ -1189,7 +1189,7 @@ entry: ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; SIVI: buffer_store_dword [[RET]] -; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off glc +; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 @@ -1244,7 +1244,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_offset: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}} +; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -1255,7 +1255,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}} +; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 @@ -1265,7 +1265,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_f32: ; SI: buffer_store_dword {{v[0-9]+}}, off, 
s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}} +; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_f32(float %in, float addrspace(1)* %out) { entry: store atomic float %in, float addrspace(1)* %out seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index 88bec6c..456080d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; CIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}} +; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -17,7 +17,7 @@ entry: ; CIVI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -55,7 +55,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64: ; SIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst @@ -66,7 +66,7 @@ entry: ; CIVI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst @@ -101,7 +101,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; CIVI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -113,7 +113,7 @@ entry: ; CIVI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_and_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -151,7 +151,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64: ; CIVI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst @@ -162,7 +162,7 @@ entry: ; CIVI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst @@ -197,7 +197,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; CIVI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -209,7 +209,7 @@ entry: ; CIVI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -247,7 +247,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64: ; CIVI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst @@ -258,7 +258,7 @@ entry: ; CIVI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 
addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst @@ -293,7 +293,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; CIVI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -305,7 +305,7 @@ entry: ; CIVI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -343,7 +343,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64: ; CIVI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst @@ -354,7 +354,7 @@ entry: ; CIVI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst @@ -389,7 +389,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; CIVI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -401,7 +401,7 @@ entry: ; CIVI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -439,7 +439,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64: ; CIVI: buffer_atomic_umax_x2 
v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst @@ -450,7 +450,7 @@ entry: ; CIVI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst @@ -485,7 +485,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; CIVI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -497,7 +497,7 @@ entry: ; CIVI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -535,7 +535,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64: ; CIVI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst @@ -546,7 +546,7 @@ entry: ; CIVI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst @@ -582,7 +582,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; CIVI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} 
offset:32{{$}} define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -594,7 +594,7 @@ entry: ; CIVI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -632,7 +632,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64: ; CIVI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst @@ -643,7 +643,7 @@ entry: ; CIVI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst @@ -678,7 +678,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; CIVI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -690,7 +690,7 @@ entry: ; CIVI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -728,7 +728,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64: ; CIVI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst @@ -739,7 +739,7 @@ entry: ; CIVI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, 
s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst @@ -775,7 +775,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -786,7 +786,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_f64_offset: ; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) { entry: %gep = getelementptr double, double addrspace(1)* %out, i64 4 @@ -798,7 +798,7 @@ entry: ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -836,7 +836,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64: ; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst @@ -847,7 +847,7 @@ entry: ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst @@ -882,7 +882,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; CIVI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, 
v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -894,7 +894,7 @@ entry: ; CIVI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -932,7 +932,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64: ; CIVI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst @@ -943,7 +943,7 @@ entry: ; CIVI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}} +; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst @@ -979,7 +979,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -1004,7 +1004,7 @@ entry: ; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: -; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -1044,7 +1044,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64: ; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile 
i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst @@ -1055,7 +1055,7 @@ entry: ; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst @@ -1095,7 +1095,7 @@ entry: ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32 glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4 @@ -1115,7 +1115,7 @@ entry: ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:-32 glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-32 glc{{$}} define amdgpu_kernel void @atomic_load_i64_neg_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %in, i64 -4 @@ -1129,7 +1129,7 @@ entry: ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; CIVI: buffer_store_dwordx2 [[RET]] -; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off glc{{$}} +; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8 @@ -1184,7 +1184,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_offset: ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -1195,7 +1195,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off{{$}} +; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]\]}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { entry: store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index 5df0896..069658c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -18,7 +18,7 @@ ; WAVE64: .sgpr_count: 8 ; WAVE32: .sgpr_count: 10 ; CHECK: .symbol: test.kd -; CHECK: .vgpr_count: 6 +; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 ; WAVE32: .wavefront_size: 32 define amdgpu_kernel void @test( @@ -50,8 +50,8 @@ entry: ; CHECK: .name: num_spilled_sgprs ; GFX700: .sgpr_spill_count: 38 ; GFX803: .sgpr_spill_count: 22 -; GFX900: .sgpr_spill_count: 22 -; GFX1010: .sgpr_spill_count: 22 +; GFX900: .sgpr_spill_count: 48 +; GFX1010: .sgpr_spill_count: 48 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], @@ -88,7 +88,7 @@ entry: ; CHECK: .name: num_spilled_vgprs ; CHECK: .symbol: num_spilled_vgprs.kd -; CHECK: .vgpr_spill_count: 14 +; CHECK: .vgpr_spill_count: {{13|14}} define amdgpu_kernel void @num_spilled_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 76de5b9..98e7983 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -17,7 +17,7 @@ ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 ; CHECK: NumSGPRs: 8 -; CHECK: NumVGPRs: 6 +; CHECK: NumVGPRs: {{3|6}} ; CHECK: MaxFlatWorkGroupSize: 1024 define amdgpu_kernel void @test( half addrspace(1)* %r, @@ -40,7 +40,7 @@ entry: ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 ; CHECK: NumSGPRs: 8 -; CHECK: NumVGPRs: 6 +; CHECK: NumVGPRs: {{3|6}} ; CHECK: MaxFlatWorkGroupSize: 256 define amdgpu_kernel void @test_max_flat_workgroup_size( half addrspace(1)* %r, @@ -59,7 +59,7 @@ entry: ; CHECK: CodeProps: ; GFX700: NumSpilledSGPRs: 38 ; GFX803: NumSpilledSGPRs: 22 -; GFX900: NumSpilledSGPRs: 22 +; GFX900: NumSpilledSGPRs: {{22|48}} define amdgpu_kernel void @num_spilled_sgprs( i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32], @@ -96,7 +96,7 @@ entry: ; CHECK-LABEL: - Name: num_spilled_vgprs ; CHECK: SymbolName: 'num_spilled_vgprs@kd' ; CHECK: CodeProps: -; CHECK: NumSpilledVGPRs: 14 +; CHECK: NumSpilledVGPRs: {{13|14}} define amdgpu_kernel void @num_spilled_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 192ddd4..6493b29 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -66,7 +66,7 @@ ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 ; Make sure we generate flat store for HSA ; PRE-GFX10: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; GFX10: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} ; HSA: .Lfunc_end0: ; HSA: .size simple, .Lfunc_end0-simple diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index ffa17c9..8b9931a 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -16,31 +16,30 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v3, s5, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 -; GFX9-NEXT: v_not_b32_e32 v6, v3 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v5, s4, v6 +; GFX9-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v4, s4, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: s_add_u32 s4, s4, 1 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 -; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -76,29 +75,28 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v3, s5, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 -; GFX9-NEXT: v_not_b32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3 -; GFX9-NEXT: v_add_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v2 ; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: s_add_u32 s4, s4, 1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 -; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm 
@@ -137,28 +135,27 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v2, s3 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 ; GFX9-NEXT: s_add_i32 s4, s4, 1 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 -; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -197,24 +194,23 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v3, s3, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX9-NEXT: v_sub_u32_e32 v3, s3, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: s_add_i32 s3, s3, 1 -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB3_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 2a7b47b..a7b137b 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -64,6 +64,7 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -73,31 +74,28 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: @@ -105,6 +103,7 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -112,10 +111,8 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -200,6 +197,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -207,16 +205,14 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 ; GFX9-NODL-NEXT: 
s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v1, s2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, s5, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MulMul: @@ -224,6 +220,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -231,16 +228,14 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, s2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, s5, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MulMul: @@ -248,6 +243,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 @@ -261,10 +257,8 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s6 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -341,6 +335,7 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -350,31 +345,28 @@ define amdgpu_kernel void 
@idot2(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: @@ -382,6 +374,7 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -389,10 +382,8 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot2_i32_i16 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -470,6 +461,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -479,20 +471,19 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: 
v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedTypedMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -502,14 +493,12 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedTypedMul: @@ -517,6 +506,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -529,10 +519,8 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -614,6 +602,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -623,31 +612,28 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_alt_AddOperands: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: @@ -655,6 +641,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -662,10 +649,8 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -743,6 +728,7 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -752,20 +738,19 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MixedExt: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword 
s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -775,14 +760,12 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MixedExt: @@ -790,6 +773,7 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -802,10 +786,8 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -875,38 +857,36 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, s2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_SameVec: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 -; 
GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, s2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_SameVec: @@ -914,6 +894,7 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 @@ -923,10 +904,8 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3 ; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1008,6 +987,7 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1017,31 +997,28 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: @@ -1049,6 +1026,7 @@ define amdgpu_kernel void @udot2_v4i16(<4 x 
i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1056,10 +1034,8 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1141,6 +1117,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4 @@ -1150,31 +1127,28 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_v4i16_Hi: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: @@ -1182,6 +1156,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1189,10 +1164,8 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: 
s_load_dword s1, s[2:3], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1274,6 +1247,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -1282,15 +1256,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 ; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8 ; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 ; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Even: @@ -1298,6 +1270,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -1306,15 +1279,13 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 ; GFX9-DL-NEXT: s_and_b32 s2, s2, s8 ; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 ; GFX9-DL-NEXT: s_and_b32 s4, s4, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Even: @@ -1323,6 +1294,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1335,10 +1307,8 @@ define amdgpu_kernel void 
@notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_and_b32 s0, s0, s7 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 ; GFX10-DL-NEXT: s_and_b32 s1, s2, s7 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1420,6 +1390,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -1428,15 +1399,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_v4i16_Middle: @@ -1444,6 +1413,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 @@ -1452,15 +1422,13 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_and_b32 s3, s3, s8 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-DL-NEXT: s_and_b32 s5, s5, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_v4i16_Middle: @@ -1469,6 +1437,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1481,10 +1450,8 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0 ; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1566,6 +1533,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1574,15 +1542,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 ; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notudot2_DiffIndex: @@ -1590,6 +1556,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1598,15 +1565,13 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 ; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notudot2_DiffIndex: @@ -1614,6 +1579,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1627,10 +1593,8 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_and_b32 s0, s0, s2 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1714,6 +1678,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1723,15 +1688,13 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v2, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_add1: @@ -1739,6 +1702,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1748,15 +1712,13 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_add1: @@ -1764,6 +1726,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1778,10 +1741,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1863,6 +1824,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1872,21 +1834,20 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_add1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1896,15 +1857,13 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_add1: @@ -1912,6 +1871,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: 
s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1925,10 +1885,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2014,6 +1972,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -2022,16 +1981,14 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul1: @@ -2039,6 +1996,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -2047,16 +2005,14 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-DL-NEXT: 
v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul1: @@ -2064,6 +2020,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -2078,10 +2035,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2164,6 +2119,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -2172,22 +2128,21 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v3, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -2196,16 +2151,14 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v3, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul1: @@ -2213,6 +2166,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -2226,10 +2180,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2316,6 +2268,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -2325,15 +2278,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_MultipleUses_mul2: @@ -2341,6 +2292,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -2350,15 +2302,13 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: 
v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_MultipleUses_mul2: @@ -2366,6 +2316,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -2380,10 +2331,8 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 ; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2466,6 +2415,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -2475,21 +2425,20 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot2_MultipleUses_mul2: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -2499,15 +2448,13 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2_MultipleUses_mul2: @@ -2515,6 +2462,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -2528,10 +2476,8 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2614,58 +2560,54 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: 
s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -2751,86 +2693,71 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notsdot2_sext8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 -; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notsdot2_sext8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-DL-NEXT: global_load_ushort v1, v[2:3], off +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] +; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, v0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v1 -; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v3 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm <2 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 629538a..c941080 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -73,6 +73,7 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -81,40 +82,37 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 ; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: @@ -122,6 +120,7 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -129,10 +128,8 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_i32_i8 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -251,65 +248,61 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v3, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_i32_i8 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -419,66 +412,62 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 -; GFX9-DL-NEXT: 
global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -580,6 +569,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -588,30 +578,29 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 ; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 ; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_multiuse_mul1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -620,24 +609,22 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* 
%src1, ; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 ; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 ; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v2, v1 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_multiuse_mul1: @@ -645,6 +632,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -664,10 +652,8 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -779,64 +765,62 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NODL-NEXT: 
v_bfe_i32 v1, v1, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s2, v3, v4 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, v3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc32_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 ; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 ; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 ; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 ; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, s2, v3, v4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32_vecMul: @@ -863,11 +847,10 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 ; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -962,122 +945,118 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v4, 8, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s5, v4 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, s5, v5 ; GFX9-NODL-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 +; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, s4, v5 +; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 ; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4 +; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s3, v5 ; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4 -; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, s2, v5 +; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4 -; GFX9-NODL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2 +; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s5 ; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000 -; 
GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s5, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 +; GFX9-DL-NEXT: v_and_b32_e32 v6, s5, v5 ; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80000 -; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v6, s4, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 ; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s3, v5 ; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4 -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s0 ; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v3 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v2 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s1 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v2 ; GFX10-DL-NEXT: 
s_lshr_b32 s0, s1, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s2 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s2 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 ; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s0 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s1, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v3, v8, 16, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s0 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index aa4dc4e..b491db0 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -76,6 +76,7 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -84,40 +85,37 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: 
v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: @@ -125,6 +123,7 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -132,10 +131,8 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -246,66 +243,62 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 +; 
GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -416,66 +409,62 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; 
GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1 +; 
GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -563,60 +552,56 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2 +; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s3, s3, 0x80008 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s4, s2 +; GFX9-DL-NEXT: s_and_b32 s2, s3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 
v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_movk_i32 s1, 0xff @@ -625,10 +610,10 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -720,66 +705,62 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; 
GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s3, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s1, s0, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -883,94 +864,90 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* % ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; 
GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-DL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s4, 0xff -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_movk_i32 s6, 0xff +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: s_and_b32 s2, s0, s4 -; GFX10-DL-NEXT: s_and_b32 s3, s1, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 +; GFX10-DL-NEXT: s_and_b32 s2, s0, s6 +; GFX10-DL-NEXT: s_and_b32 s3, s1, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -1076,6 +1053,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1084,24 +1062,22 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 ; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_mul1: @@ -1109,6 +1085,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, 
s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1117,24 +1094,22 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v3, v2 ; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_mul1: @@ -1142,6 +1117,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1162,10 +1138,8 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1282,6 +1256,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1291,24 +1266,22 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 ; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, s10, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_multiuse_add1: @@ -1316,6 +1289,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1325,24 +1299,22 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v1, v2 ; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_add_u32_e32 v1, s10, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_add_u32_e32 v2, s10, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v3, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v3, v1 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v3, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_multiuse_add1: @@ -1351,6 +1323,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_movk_i32 s7, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1371,10 +1344,8 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; 
GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1487,91 +1458,87 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, 
s3, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 -; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v4, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0 ; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -1685,6 +1652,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1692,24 +1660,22 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 ; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 ; GFX9-NODL-NEXT: s_bfe_u32 
s7, s3, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 ; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc32_vecMul: @@ -1717,6 +1683,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 @@ -1724,24 +1691,22 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 ; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 ; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 ; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s4 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, s3, v3, v4 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32_vecMul: @@ -1770,11 +1735,10 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1870,7 +1834,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1878,35 +1843,34 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 -; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s6, 16, v4 +; GFX9-NODL-NEXT: v_lshl_or_b32 v5, s4, 16, v5 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 +; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2 +; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NODL-NEXT: v_add_u32_sdwa v1, 
v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -1914,66 +1878,62 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16 ; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 -; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, s6, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, s4, 16, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 -; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off +; GFX9-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_lshr_b32 s2, s1, 16 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, s1, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -2091,110 +2051,106 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: global_load_ubyte v4, v0, s[0:1] ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s2, v1 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 +; GFX9-NODL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v4, v0, s[0:1] ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 
dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s2, v1 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s1 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; 
GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v4 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index f6252c1..2e3411f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -129,43 +129,42 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 ; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 ; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 ; GFX9-NEXT: s_bfe_i32 s13, s3, 
0x40010 -; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1 ; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 ; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s12, v2, v1 ; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 ; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_i32_i24 v1, s14, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 ; GFX9-NEXT: s_ashr_i32 s3, s3, 28 -; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s16, v2, v1 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32: @@ -181,14 +180,13 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: @@ -201,6 +199,7 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -209,10 +208,8 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -416,169 +413,165 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s18, -1 -; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s16, s16, s3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-NEXT: s_lshr_b32 s2, s0, 12 -; GFX9-NEXT: s_lshr_b32 s3, s1, 12 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s3 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, s8, v3 -; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 +; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: s_lshr_b32 s4, s2, 12 +; GFX9-NEXT: s_lshr_b32 s5, s3, 12 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX9-NEXT: v_mul_i32_i24_e32 v2, s10, v2 +; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 -; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s1, s1, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s15 -; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v7, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s8, v6, v1 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s12, v7, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s14, v8, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, 
s16, v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s18, -1 -; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 -; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s3 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s8, v3 -; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12 +; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s10, v2 +; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s15 -; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 
; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v6, v1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v7, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v8, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v9, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; 
GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -784,175 +777,171 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s3, s1, 12 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s2, 12 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: s_lshr_b32 s5, s3, 12 +; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40000 +; GFX9-NEXT: s_lshr_b32 s6, s4, 12 +; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40004 +; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40008 +; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s6 +; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: s_bfe_i32 
s11, s1, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: v_mul_i32_i24_e32 v2, s11, v2 +; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_mad_i32_i24 v1, s7, v5, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s9, v6, v1 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s13, v7, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s15, v8, v1 +; GFX9-NEXT: v_mad_i32_i24 v1, s17, v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mad_i32_i24 v1, s3, v2, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40000 +; 
GFX9-DL-NEXT: s_lshr_b32 s6, s4, 12 +; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s6 +; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 -; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s11, v2 +; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 +; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v5, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s9, v6, v1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s13, v7, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s15, v8, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s17, v9, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -1149,45 +1138,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 ; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v2 ; 
GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 ; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v3, v1 ; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s8, v3, v1 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s10, v3, v1 ; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 ; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s12, v3, v1 ; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_i32_i24 v1, s14, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: s_ashr_i32 s3, s3, 28 -; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX9-NEXT: v_mad_i32_i24 v1, s16, v3, v1 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mad_i32_i24 v1, s2, v3, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_multiuses_mul1: @@ -1204,45 +1192,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 ; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2 ; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 ; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 ; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v3, v1 ; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v3, v1 ; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-DL-NEXT: 
v_mad_i32_i24 v1, s10, v3, v1 ; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 ; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v3, v1 ; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v3, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v3, v1 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v3, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_multiuses_mul1: @@ -1256,6 +1243,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1288,10 +1276,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1485,6 +1471,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s2, 28 ; GFX9-NEXT: s_ashr_i32 s11, s3, 28 @@ -1502,26 +1489,24 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 ; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; 
GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_i32_i24 v1, s2, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_i32_i24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mad_i32_i24 v1, s5, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mad_i32_i24 v1, s4, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc32_vecMul: @@ -1537,14 +1522,13 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32_vecMul: @@ -1557,6 +1541,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1565,10 +1550,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -1748,11 +1731,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_and_b32 s11, s2, 15 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s12, s6, 28 @@ -1762,40 +1745,39 @@ define amdgpu_kernel void 
@idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 ; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 ; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v6, v4, v6 -; GFX9-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v5, v1, v5 +; GFX9-NEXT: v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16_vecMul: @@ -1821,11 +1803,11 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 ; GFX9-DL-NEXT: 
s_bfe_u32 s2, s2, 0x40004 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 @@ -1835,115 +1817,112 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 -; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v6, v4, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v5, v1, v5 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX10-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s10, s0, 15 ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s9, s1, 15 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s11, s1, 15 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40004 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s10 +; GFX10-DL-NEXT: s_bfe_u32 s11, s1, 0x4000c ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s8 -; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s11 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; 
GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 
addrspace(1)* nocapture %dst) { @@ -2147,288 +2126,284 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s7, s1, 4 -; GFX9-NEXT: s_lshr_b32 s14, s2, 4 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX9-NEXT: s_lshr_b32 s8, s1, 12 -; GFX9-NEXT: s_lshr_b32 s9, s1, 8 -; GFX9-NEXT: s_lshr_b32 s15, s2, 12 -; GFX9-NEXT: s_lshr_b32 s16, s2, 8 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s15 +; GFX9-NEXT: s_lshr_b32 s9, s3, 4 +; GFX9-NEXT: s_lshr_b32 s16, s4, 4 +; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s3 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-NEXT: s_lshr_b32 s10, s3, 12 +; GFX9-NEXT: s_lshr_b32 s11, s3, 8 +; GFX9-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-NEXT: s_lshr_b32 s18, s4, 8 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17 +; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s1, 20 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s10, s2, 20 -; GFX9-NEXT: s_lshr_b32 s11, s2, 16 +; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3 -; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11 -; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX9-NEXT: s_lshr_b32 s5, s1, 28 -; GFX9-NEXT: s_lshr_b32 s6, s1, 24 -; 
GFX9-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-NEXT: s_lshr_b32 s13, s2, 24 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX9-NEXT: v_or_b32_e32 v5, v3, v5 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s5, s3, 20 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_lshr_b32 s12, s4, 20 +; GFX9-NEXT: s_lshr_b32 s13, s4, 16 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v4, v4, v11 +; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 +; GFX9-NEXT: s_lshr_b32 s7, s3, 28 +; GFX9-NEXT: s_lshr_b32 s8, s3, 24 +; GFX9-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-NEXT: s_lshr_b32 s15, s4, 24 +; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_or_b32_e32 v5, v3, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; 
GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v5 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4 -; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 12 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 8 -; GFX9-DL-NEXT: s_lshr_b32 s15, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s16, s2, 8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s15 +; GFX9-DL-NEXT: s_lshr_b32 s9, s3, 4 +; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s3 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-DL-NEXT: s_lshr_b32 s10, s3, 12 +; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 8 +; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12 +; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; 
GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 20 -; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 20 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 16 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 28 -; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 -; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5 +; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 20 +; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 20 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 16 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v4, v11 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 +; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 28 +; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 24 +; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX9-DL-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v18 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 
dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v16 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v4 +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 +; GFX9-DL-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v6 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v5 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s22, -1 ; GFX10-DL-NEXT: s_mov_b32 s23, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s20, s20, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off 
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s14 -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4 +; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v12 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v13 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v19, v13 ; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16 -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 20 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s12 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v11 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v10 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v9, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v12 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 24 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v8, v6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v14 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v6, v12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v8 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v4 +; 
GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v5, v11 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v7 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v8 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 8e4ddbf1..b37d1d1 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -144,26 +144,25 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 ; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32: @@ -179,14 +178,13 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: @@ -199,6 +197,7 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -207,10 +206,8 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -394,156 +391,152 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s18, -1 -; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s16, s16, s3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-NEXT: s_and_b32 s1, s1, 15 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-NEXT: 
s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-NEXT: s_and_b32 s3, s3, 15 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s18, -1 -; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 -; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; 
GFX9-DL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-DL-NEXT: s_and_b32 s3, s3, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s6, -1 -; GFX10-DL-NEXT: s_mov_b32 s7, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s4, s4, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 -; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -727,156 +720,152 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s18, -1 -; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s16, s16, s3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-NEXT: s_and_b32 s1, s1, 15 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: s_and_b32 s0, s0, 15 -; 
GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004 +; GFX9-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-NEXT: s_and_b32 s3, s3, 15 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s2, s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s18, -1 -; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 -; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: 
s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-DL-NEXT: s_and_b32 s3, s3, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s6, -1 -; GFX10-DL-NEXT: s_mov_b32 s7, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s4, s4, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s10, -1 +; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 ; GFX10-DL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 -; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -1064,165 +1053,161 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s18, -1 -; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s16, s16, s3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s0, 15 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 -; 
GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: s_and_b32 s10, s2, 15 +; GFX9-NEXT: s_and_b32 s17, s3, 15 +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s18, -1 -; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 -; GFX9-DL-NEXT: s_addc_u32 s17, s17, 
0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; 
%bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i4 addrspace(1)* nocapture %dst) { @@ -1394,165 +1379,161 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s18, -1 -; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: 
s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s16, s16, s3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s0, 15 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: s_and_b32 s10, s2, 15 +; GFX9-NEXT: s_and_b32 s17, s3, 15 +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: global_store_byte v0, v1, 
s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s18, -1 -; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 -; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; 
GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s2, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s2, s3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; 
GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i4 addrspace(1)* nocapture %dst) { @@ -1748,28 +1729,27 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 ; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s11, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_multiuses_mul1: @@ -1803,28 +1783,27 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; 
GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_multiuses_mul1: @@ -1838,6 +1817,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -1870,10 +1850,8 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2083,26 +2061,25 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 ; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 
+; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc32_vecMul: @@ -2118,14 +2095,13 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: @@ -2138,6 +2114,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -2146,10 +2123,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i32 addrspace(1)* nocapture %dst) { @@ -2308,11 +2283,13 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: global_load_ushort v5, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 @@ -2326,54 +2303,53 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, s3, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-NEXT: s_and_b32 s17, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; 
GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 ; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s11, s2, 15 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s4, v0 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s2, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v2 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 -; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, v4, v5 ; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 @@ -2387,102 +2363,97 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s3, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s3, v1 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c ; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, 
s16 +; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 ; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v0 ; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 ; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s4, v0 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s2, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v2 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_e32 v5, v4, v5 ; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: 
s_bfe_u32 s4, s1, 0x40004 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s3 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s3 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s2, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s6 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s6 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: global_store_short v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i16 addrspace(1)* nocapture %dst) { @@ -2678,214 +2649,210 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 
+; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s22, -1 -; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_addc_u32 s21, s21, 0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-NEXT: s_and_b32 s14, s2, 15 -; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: s_lshr_b32 s6, s1, 28 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: s_and_b32 s7, s1, 15 -; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, s7, v7 -; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX9-NEXT: s_bfe_u32 s5, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s15, s4, 28 +; GFX9-NEXT: s_and_b32 s16, s4, 15 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: s_lshr_b32 s8, s3, 28 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_and_b32 s9, s3, 15 +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 
v7, s17 +; GFX9-NEXT: s_bfe_u32 s11, s3, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: v_mul_lo_u16_e32 v2, s5, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v4, s7, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v6, s9, v6 +; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX9-NEXT: v_mul_lo_u16_e32 v8, s11, v8 +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v5 +; GFX9-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s22, -1 -; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_addc_u32 s21, 
s21, 0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-DL-NEXT: s_and_b32 s14, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s16, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: s_and_b32 s7, s1, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s7, v7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 +; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 28 +; GFX9-DL-NEXT: s_and_b32 s16, s4, 15 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_and_b32 s9, s3, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-DL-NEXT: s_bfe_u32 s10, s3, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-DL-NEXT: s_bfe_u32 s11, s3, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s18 +; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s5, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s7, v4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, s9, v6 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, s11, v8 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_add_u32_e32 v1, v4, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-DL-NEXT: s_mov_b32 s10, -1 -; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 -; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-DL-NEXT: s_mov_b32 s14, -1 +; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 +; GFX10-DL-NEXT: s_add_u32 s12, s12, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s3, s5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s3, s7 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008 ; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s2 -; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s2 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x40010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-NEXT: s_lshr_b32 s6, s1, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s5, s0 -; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s0 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 -; GFX10-DL-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 -; 
GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v7 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i8 addrspace(1)* nocapture %dst) { @@ -3037,165 +3004,161 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_mov_b32 s18, -1 -; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s22, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NEXT: s_add_u32 s16, s16, s3 -; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-NEXT: s_add_u32 s20, s20, s3 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s0, 15 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-NEXT: 
v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: s_and_b32 s10, s2, 15 +; GFX9-NEXT: s_and_b32 s17, s3, 15 +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008 +; GFX9-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_mov_b32 s18, -1 -; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GFX9-DL-NEXT: s_mov_b32 s22, -1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_add_u32 s16, s16, s3 -; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000 +; GFX9-DL-NEXT: s_add_u32 s20, s20, s3 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 -; 
GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: s_and_b32 s10, s2, 15 +; GFX9-DL-NEXT: s_and_b32 s17, s3, 15 +; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28 +; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i4 addrspace(1)* nocapture %dst) { @@ -3326,6 +3289,7 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 @@ -3340,9 +3304,9 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 ; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v1, v2 ; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 ; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 ; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c @@ -3350,40 +3314,37 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 ; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 ; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 
s8 -; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: v_mad_u32_u24 v0, s13, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_mad_u32_u24 v0, s15, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s17, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mad_u32_u24 v1, s13, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_u32_u24 v1, s15, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_u32_u24 v1, s17, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_variant1: ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: @@ -3391,6 +3352,7 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 @@ -3398,10 +3360,8 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off +; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s0, v0 +; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, i32 addrspace(1)* %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 9f4b3b8..e26d577 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -7,14 +7,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; GFX9-LABEL: s_insertelement_v2i16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: 
s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CIVI-LABEL: s_insertelement_v2i16_0: @@ -42,14 +41,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reg: @@ -94,17 +92,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s2 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -161,14 +158,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_0_reghi: @@ -215,17 +211,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s3, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: 
s_pack_lh_b32_b16 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s3 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -281,21 +276,20 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s3, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s1 +; GFX9-NEXT: ; use s3 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s2 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -361,14 +355,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, ; GFX9-LABEL: s_insertelement_v2i16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x3e7 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CIVI-LABEL: s_insertelement_v2i16_1: @@ -395,14 +388,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_1_reg: @@ -446,15 +438,14 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; GFX9-LABEL: s_insertelement_v2f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CIVI-LABEL: s_insertelement_v2f16_0: @@ -480,14 +471,13 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out ; GFX9-LABEL: s_insertelement_v2f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x4500 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; CIVI-LABEL: s_insertelement_v2f16_1: @@ -1050,19 +1040,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_andn2_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s0, 0x3e703e7 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_lshl_b32 s3, s4, 4 +; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s3, 0x3e703e7 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_insertelement_v2i16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 3b02e85..a3dda8d 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -855,10 +855,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-GFX9: kernarg_segment_byte_size = 28 -; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 -; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 -; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 +; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 +; HSA-GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 @@ -904,9 +905,10 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; FIXME: Why not all scalar loads? 
; GCN-LABEL: {{^}}array_3xi16: -; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2 -; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4 -; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6 +; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:2 +; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:4 +; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:6 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, i8 addrspace(1)* undef store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef @@ -914,7 +916,8 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { } ; GCN-LABEL: {{^}}small_array_round_down_offset: -; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1 +; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:1 define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { %val = extractvalue [1 x i8] %arg, 0 store volatile i8 %val, i8 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 8e90431..ef3ad70 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -75,8 +75,9 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-VI: kernarg_segment_byte_size = 28 -; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 -; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 +; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 +; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { @@ -156,9 +157,8 @@ entry: ; Byref pointers should only be treated as offsets from kernarg ; GCN-LABEL: {{^}}byref_constant_i8_arg: ; GCN: kernarg_segment_byte_size = 12 -; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 -; GCN: global_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { %in = load i8, i8 addrspace(4)* %in.byref %ext = zext i8 %in to i32 @@ -168,9 +168,8 @@ define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %ou ; GCN-LABEL: {{^}}byref_constant_i16_arg: ; GCN: kernarg_segment_byte_size = 12 -; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 -; GCN: global_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8 define 
amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) { %in = load i16, i16 addrspace(4)* %in.byref %ext = zext i16 %in to i32 @@ -207,8 +206,8 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* noca ; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}} ; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]] ; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]] -; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_IN]] -; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_AFTER_OFFSET]] +; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s +; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { %in = load i32, i32 addrspace(4)* %in.byref store volatile i32 %in, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll index 4ab8c34..5e9b711 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll @@ -21,7 +21,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_csub: -; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off glc +; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc define amdgpu_kernel void @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %ptr, i32 %data) @@ -29,7 +29,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_csub_off4: -; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 glc +; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc define amdgpu_kernel void @global_atomic_csub_off4(i32 addrspace(1)* %ptr, i32 %data) { main_body: %p = getelementptr i32, i32 addrspace(1)* %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 4d49d87..130ce9e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -63,9 +63,10 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %pt } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off glc{{$}} +; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -73,9 +74,11 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -; GFX9: global_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16 glc{{$}} + +; 
GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -84,18 +87,22 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %o } ; GCN-LABEL: {{^}}global_atomic_dec_noret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]], off{{$}} + +; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } ; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -; GFX9: global_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16{{$}} + +; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -338,9 +345,11 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off glc{{$}} + +; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -349,9 +358,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* 
%ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -361,9 +371,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off{{$}} +; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -371,9 +382,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32{{$}} +; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll index aee4479..19322c1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -38,7 +38,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_add_f32: -; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off +; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) { main_body: %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) @@ -46,7 +46,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_add_f32_off4: -; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 +; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) { main_body: %p = getelementptr float, float addrspace(1)* %ptr, i64 1 @@ -55,7 +55,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:-4 +; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4 define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) { main_body: %p = getelementptr float, float addrspace(1)* %ptr, i64 -1 @@ -64,7 +64,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off +; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void 
@global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) @@ -72,7 +72,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 +; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1 @@ -81,7 +81,7 @@ main_body: } ; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:-4 +; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}} define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1 @@ -92,7 +92,7 @@ main_body: ; Make sure this artificially selects with an incorrect subtarget, but ; the feature set. ; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget: -; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off +; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 { %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index ff9e1cc..6b66070 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -70,7 +70,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off glc{{$}} +; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -80,7 +80,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -; GFX9: global_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16 glc{{$}} +; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -91,7 +91,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o ; GCN-LABEL: {{^}}global_atomic_inc_noret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_inc 
[[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]], off{{$}} +; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) n ; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -; GFX9: global_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16{{$}} +; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) @@ -193,9 +193,10 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off glc{{$}} +; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -204,9 +205,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32 glc{{$}} +; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -216,10 +218,11 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off{{$}} +; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, 
s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -227,9 +230,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32{{$}} +; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 5df4734..e64555c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -32,12 +32,11 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -71,11 +70,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index ca5a818..562e0f5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -76,17 +76,20 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1 ; ; GFX6789-LABEL: load_1d_tfe: ; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: v_mov_b32_e32 v6, 0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v7, v6 +; GFX6789-NEXT: 
v_mov_b32_e32 v8, v6 +; GFX6789-NEXT: v_mov_b32_e32 v9, v6 +; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v0, v6 +; GFX6789-NEXT: v_mov_b32_e32 v1, v7 +; GFX6789-NEXT: v_mov_b32_e32 v2, v8 +; GFX6789-NEXT: v_mov_b32_e32 v3, v9 +; GFX6789-NEXT: v_mov_b32_e32 v4, v10 ; GFX6789-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v6, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -94,27 +97,29 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1 ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -160,17 +165,20 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1 ; ; GFX6789-LABEL: load_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: v_mov_b32_e32 v6, 0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: 
v_mov_b32_e32 v7, v6 +; GFX6789-NEXT: v_mov_b32_e32 v8, v6 +; GFX6789-NEXT: v_mov_b32_e32 v9, v6 +; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v0, v6 +; GFX6789-NEXT: v_mov_b32_e32 v1, v7 +; GFX6789-NEXT: v_mov_b32_e32 v2, v8 +; GFX6789-NEXT: v_mov_b32_e32 v3, v9 +; GFX6789-NEXT: v_mov_b32_e32 v4, v10 ; GFX6789-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm lwe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v6, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -178,27 +186,29 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1 ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm lwe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_lwe: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -282,18 +292,21 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1 ; ; GFX6789-LABEL: load_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v7, 0 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: 
v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v8, v7 +; GFX6789-NEXT: v_mov_b32_e32 v9, v7 +; GFX6789-NEXT: v_mov_b32_e32 v10, v7 +; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v0, v7 +; GFX6789-NEXT: v_mov_b32_e32 v1, v8 +; GFX6789-NEXT: v_mov_b32_e32 v2, v9 +; GFX6789-NEXT: v_mov_b32_e32 v3, v10 +; GFX6789-NEXT: v_mov_b32_e32 v4, v11 ; GFX6789-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v7, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -301,28 +314,30 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1 ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -408,19 +423,22 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa ; ; GFX6789-LABEL: load_3d_tfe_lwe: ; 
GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v8, 0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v2 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v9, v8 +; GFX6789-NEXT: v_mov_b32_e32 v10, v8 +; GFX6789-NEXT: v_mov_b32_e32 v11, v8 +; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v0, v8 +; GFX6789-NEXT: v_mov_b32_e32 v1, v9 +; GFX6789-NEXT: v_mov_b32_e32 v2, v10 +; GFX6789-NEXT: v_mov_b32_e32 v3, v11 +; GFX6789-NEXT: v_mov_b32_e32 v4, v12 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v8, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -428,29 +446,31 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_tfe_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: 
[0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -536,19 +556,22 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace ; ; GFX6789-LABEL: load_cube_lwe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v8, 0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v2 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v9, v8 +; GFX6789-NEXT: v_mov_b32_e32 v10, v8 +; GFX6789-NEXT: v_mov_b32_e32 v11, v8 +; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v0, v8 +; GFX6789-NEXT: v_mov_b32_e32 v1, v9 +; GFX6789-NEXT: v_mov_b32_e32 v2, v10 +; GFX6789-NEXT: v_mov_b32_e32 v3, v11 +; GFX6789-NEXT: v_mov_b32_e32 v4, v12 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v8, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -556,29 +579,31 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_cube_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] ; GFX10-NEXT: 
image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -662,18 +687,21 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp ; ; GFX6789-LABEL: load_1darray_tfe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v7, 0 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v8, v7 +; GFX6789-NEXT: v_mov_b32_e32 v9, v7 +; GFX6789-NEXT: v_mov_b32_e32 v10, v7 +; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v0, v7 +; GFX6789-NEXT: v_mov_b32_e32 v1, v8 +; GFX6789-NEXT: v_mov_b32_e32 v2, v9 +; GFX6789-NEXT: v_mov_b32_e32 v3, v10 +; GFX6789-NEXT: v_mov_b32_e32 v4, v11 ; GFX6789-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe da -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v7, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -681,28 +709,30 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe da -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1darray_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e] +; 
GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -788,19 +818,22 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp ; ; GFX6789-LABEL: load_2darray_lwe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v8, 0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v2 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v9, v8 +; GFX6789-NEXT: v_mov_b32_e32 v10, v8 +; GFX6789-NEXT: v_mov_b32_e32 v11, v8 +; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v0, v8 +; GFX6789-NEXT: v_mov_b32_e32 v1, v9 +; GFX6789-NEXT: v_mov_b32_e32 v2, v10 +; GFX6789-NEXT: v_mov_b32_e32 v3, v11 +; GFX6789-NEXT: v_mov_b32_e32 v4, v12 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v8, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -808,29 +841,31 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darray_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 
v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -916,19 +951,22 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp ; ; GFX6789-LABEL: load_2dmsaa_both: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v8, 0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v2 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v9, v8 +; GFX6789-NEXT: v_mov_b32_e32 v10, v8 +; GFX6789-NEXT: v_mov_b32_e32 v11, v8 +; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v0, v8 +; GFX6789-NEXT: v_mov_b32_e32 v1, v9 +; GFX6789-NEXT: v_mov_b32_e32 v2, v10 +; GFX6789-NEXT: v_mov_b32_e32 v3, v11 +; GFX6789-NEXT: v_mov_b32_e32 v4, v12 ; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v8, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -936,29 +974,31 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2dmsaa_both: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; 
GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1046,20 +1086,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad ; ; GFX6789-LABEL: load_2darraymsaa_tfe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v9, 0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v3 ; GFX6789-NEXT: v_mov_b32_e32 v7, v2 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v10, v9 +; GFX6789-NEXT: v_mov_b32_e32 v11, v9 +; GFX6789-NEXT: v_mov_b32_e32 v12, v9 +; GFX6789-NEXT: v_mov_b32_e32 v13, v9 +; GFX6789-NEXT: v_mov_b32_e32 v0, v9 +; GFX6789-NEXT: v_mov_b32_e32 v1, v10 +; GFX6789-NEXT: v_mov_b32_e32 v2, v11 +; GFX6789-NEXT: v_mov_b32_e32 v3, v12 +; GFX6789-NEXT: v_mov_b32_e32 v4, v13 ; GFX6789-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v9, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -1067,30 +1110,32 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: 
global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v12 ; encoding: [0x0c,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v9, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x09,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1174,18 +1219,21 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa ; ; GFX6789-LABEL: load_mip_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v7, 0 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v8, v7 +; GFX6789-NEXT: v_mov_b32_e32 v9, v7 +; GFX6789-NEXT: v_mov_b32_e32 v10, v7 +; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v0, v7 +; GFX6789-NEXT: v_mov_b32_e32 v1, v8 +; GFX6789-NEXT: v_mov_b32_e32 v2, v9 +; GFX6789-NEXT: v_mov_b32_e32 v3, v10 +; GFX6789-NEXT: v_mov_b32_e32 v4, v11 ; GFX6789-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf unorm lwe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v7, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; 
GFX6789-NEXT: ; return to shader part epilog ; @@ -1193,28 +1241,30 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load_mip v[0:4], v[0:1], s[0:7] dmask:0xf unorm lwe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1d_lwe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1300,19 +1350,22 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa ; ; GFX6789-LABEL: load_mip_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 +; GFX6789-NEXT: v_mov_b32_e32 v8, 0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v2 ; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 +; GFX6789-NEXT: v_mov_b32_e32 v9, v8 +; GFX6789-NEXT: v_mov_b32_e32 v10, v8 +; GFX6789-NEXT: v_mov_b32_e32 v11, v8 +; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v0, v8 +; GFX6789-NEXT: v_mov_b32_e32 v1, v9 +; GFX6789-NEXT: v_mov_b32_e32 v2, v10 +; GFX6789-NEXT: v_mov_b32_e32 v3, v11 +; GFX6789-NEXT: v_mov_b32_e32 v4, 
v12 ; GFX6789-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v5, s8 -; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v8, v4, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -1320,29 +1373,31 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: image_load_mip v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v5, s8 -; NOPRT-NEXT: v_mov_b32_e32 v6, s9 +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[5:6], v4, off +; NOPRT-NEXT: global_store_dword v5, v4, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2d_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1698,16 +1753,18 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 a ; ; GFX6789-LABEL: load_1d_tfe_V4_dmask3: ; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: v_mov_b32_e32 v5, 0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 +; GFX6789-NEXT: v_mov_b32_e32 v6, v5 +; GFX6789-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX6789-NEXT: v_mov_b32_e32 v8, v5 +; GFX6789-NEXT: v_mov_b32_e32 v0, v5 +; GFX6789-NEXT: v_mov_b32_e32 v1, v6 +; GFX6789-NEXT: v_mov_b32_e32 v2, v7 +; GFX6789-NEXT: v_mov_b32_e32 v3, v8 ; GFX6789-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v4, s8 -; GFX6789-NEXT: v_mov_b32_e32 v5, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[4:5], v3, off +; GFX6789-NEXT: global_store_dword v5, v3, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -1715,26 +1772,27 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 a ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v3, 0 ; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v4, s8 -; NOPRT-NEXT: v_mov_b32_e32 v5, s9 +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[4:5], v3, off +; NOPRT-NEXT: global_store_dword v4, v3, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V4_dmask3: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, s9 ; encoding: [0x09,0x02,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; encoding: [0x05,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; encoding: [0x05,0x03,0x0e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; encoding: [0x05,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; encoding: [0x05,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; encoding: [0x06,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; encoding: [0x07,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v8 ; encoding: [0x08,0x03,0x06,0x7e] ; GFX10-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x17,0x01,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[4:5], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x03,0x7d,0x00] +; GFX10-NEXT: global_store_dword v5, v3, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x03,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1776,15 +1834,16 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 a ; ; GFX6789-LABEL: load_1d_tfe_V4_dmask2: ; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: v_mov_b32_e32 v4, 0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 +; GFX6789-NEXT: v_mov_b32_e32 v5, v4 +; GFX6789-NEXT: v_mov_b32_e32 v6, v4 +; GFX6789-NEXT: v_mov_b32_e32 v0, v4 +; GFX6789-NEXT: v_mov_b32_e32 v1, v5 +; GFX6789-NEXT: v_mov_b32_e32 v2, v6 ; GFX6789-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v3, s8 -; GFX6789-NEXT: v_mov_b32_e32 v4, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: 
global_store_dword v[3:4], v2, off +; GFX6789-NEXT: global_store_dword v4, v2, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -1792,25 +1851,25 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 a ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v2, 0 ; NOPRT-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x6 unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v3, s8 -; NOPRT-NEXT: v_mov_b32_e32 v4, s9 +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[3:4], v2, off +; NOPRT-NEXT: global_store_dword v3, v2, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V4_dmask2: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; encoding: [0x80,0x02,0x08,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, s9 ; encoding: [0x09,0x02,0x08,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; encoding: [0x04,0x03,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; encoding: [0x04,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; encoding: [0x05,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v6 ; encoding: [0x06,0x03,0x04,0x7e] ; GFX10-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x16,0x01,0xf0,0x03,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[3:4], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x02,0x7d,0x00] +; GFX10-NEXT: global_store_dword v4, v2, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x02,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1850,14 +1909,14 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 a ; ; GFX6789-LABEL: load_1d_tfe_V4_dmask1: ; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: v_mov_b32_e32 v3, 0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 +; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v0, v3 +; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v2, s8 -; GFX6789-NEXT: v_mov_b32_e32 v3, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[2:3], v1, off +; GFX6789-NEXT: global_store_dword v3, v1, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -1865,24 +1924,23 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 a ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v1, 0 ; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v2, s8 -; NOPRT-NEXT: v_mov_b32_e32 v3, s9 +; NOPRT-NEXT: v_mov_b32_e32 v2, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[2:3], v1, off +; NOPRT-NEXT: global_store_dword v2, v1, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader 
part epilog ; ; GFX10-LABEL: load_1d_tfe_V4_dmask1: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e] ; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1922,14 +1980,14 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 a ; ; GFX6789-LABEL: load_1d_tfe_V2_dmask1: ; GFX6789: ; %bb.0: ; %main_body +; GFX6789-NEXT: v_mov_b32_e32 v3, 0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 +; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v0, v3 +; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe -; GFX6789-NEXT: v_mov_b32_e32 v2, s8 -; GFX6789-NEXT: v_mov_b32_e32 v3, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[2:3], v1, off +; GFX6789-NEXT: global_store_dword v3, v1, s[8:9] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -1937,24 +1995,23 @@ define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 a ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: v_mov_b32_e32 v1, 0 ; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe -; NOPRT-NEXT: v_mov_b32_e32 v2, s8 -; NOPRT-NEXT: v_mov_b32_e32 v3, s9 +; NOPRT-NEXT: v_mov_b32_e32 v2, 0 ; NOPRT-NEXT: s_waitcnt vmcnt(0) -; NOPRT-NEXT: global_store_dword v[2:3], v1, off +; NOPRT-NEXT: global_store_dword v2, v1, s[8:9] ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V2_dmask1: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e] ; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index fa744b8..d09ecc8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -83,15 +83,15 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: global_store_dword v4, v3, s[12:13] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; @@ -99,16 +99,16 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s28, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, v5 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10-NEXT: v_mov_b32_e32 v0, s12 -; GFX10-NEXT: v_mov_b32_e32 v1, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[0:1], v3, off ; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: global_store_dword v4, v3, s[12:13] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 02f57ff..56a2165 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -60,18 +60,21 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec +; GFX6789-NEXT: v_mov_b32_e32 v6, 0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v7, v6 +; GFX6789-NEXT: v_mov_b32_e32 v8, v6 +; GFX6789-NEXT: v_mov_b32_e32 v9, v6 +; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v0, v6 +; GFX6789-NEXT: v_mov_b32_e32 v1, v7 +; GFX6789-NEXT: v_mov_b32_e32 v2, v8 +; GFX6789-NEXT: v_mov_b32_e32 v3, v9 +; GFX6789-NEXT: v_mov_b32_e32 
v4, v10 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe -; GFX6789-NEXT: v_mov_b32_e32 v5, s12 -; GFX6789-NEXT: v_mov_b32_e32 v6, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v6, v4, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -79,19 +82,22 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe] ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -482,18 +488,21 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec +; GFX6789-NEXT: v_mov_b32_e32 v6, 0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 -; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v1, v0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 +; GFX6789-NEXT: v_mov_b32_e32 v7, v6 +; GFX6789-NEXT: v_mov_b32_e32 v8, v6 +; GFX6789-NEXT: v_mov_b32_e32 v9, v6 +; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v0, v6 +; GFX6789-NEXT: v_mov_b32_e32 v1, v7 +; GFX6789-NEXT: v_mov_b32_e32 v2, v8 +; GFX6789-NEXT: v_mov_b32_e32 v3, v9 +; GFX6789-NEXT: v_mov_b32_e32 v4, v10 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], 
s[8:11] dmask:0xf lwe -; GFX6789-NEXT: v_mov_b32_e32 v5, s12 -; GFX6789-NEXT: v_mov_b32_e32 v6, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[5:6], v4, off +; GFX6789-NEXT: global_store_dword v6, v4, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; @@ -501,19 +510,22 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe] ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1767,29 +1779,29 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x ; ; GFX6789-LABEL: sample_c_d_o_2darray_V1_tfe: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: v_mov_b32_e32 v9, 0 -; GFX6789-NEXT: v_mov_b32_e32 v10, v9 +; GFX6789-NEXT: v_mov_b32_e32 v11, 0 +; GFX6789-NEXT: v_mov_b32_e32 v12, v11 +; GFX6789-NEXT: v_mov_b32_e32 v9, v11 +; GFX6789-NEXT: v_mov_b32_e32 v10, v12 ; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da -; GFX6789-NEXT: v_mov_b32_e32 v0, s12 -; GFX6789-NEXT: v_mov_b32_e32 v1, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[0:1], v10, off ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 +; GFX6789-NEXT: global_store_dword v11, v10, s[12:13] ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e] -; GFX10-NEXT: v_mov_b32_e32 
v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e] ; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08] -; GFX10-NEXT: v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v11, v1, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x01,0x0c,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 2574266..ddb3f3f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1287,8 +1287,41 @@ bb: ; GCN: v_accvgpr_read_b32 ; GCN: v_accvgpr_read_b32 ; GCN: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg) { +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) { bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0) + ;store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep + ret void +} + +; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved +; in the middle of the expanded agpr reg_sequence. The broadcast of +; the individual AGPR->AGPR components should avoid the intermediate AGPR case.
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code: +; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000 +; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]] +; GCN: s_nop 0 +; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]] +; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]] +; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]] +; GCN: s_nop 0 +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0) store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll index 7b1fb01..111fd35 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.get.waveid.in.workgroup.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.s.get.waveid.in.workgroup() #0 ; GFX10: s_get_waveid_in_workgroup [[DEST:s[0-9]+]] ; GFX10: s_waitcnt lgkmcnt(0) ; GFX10: v_mov_b32_e32 [[VDEST:v[0-9]+]], [[DEST]] -; GFX10: global_store_dword v[{{[0-9:]+}}], [[VDEST]], off +; GFX10: global_store_dword v{{[0-9]+}}, [[VDEST]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @test_s_get_waveid_in_workgroup(i32 addrspace(1)* %out) { ; Make sure %out is loaded and assiciated wait count already inserted store i32 0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll index c97c43a..0ae7d454 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -15,7 +15,7 @@ ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 -; GCN: store_dword v[{{[0-9:]+}}], [[V]] +; GCN: store_dword v{{.+}}, [[V]] ; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4 ; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4 @@ -36,7 +36,7 @@ bb: ; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} ; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} ; GCN-NOT: cndmask -; GCN: store_dword v[{{[0-9:]+}}], [[V]] +; GCN: store_dword v{{.+}}, [[V]] ; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4 ; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index e2519851..45bacd4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -45,16 +45,13 @@ define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a) ; GFX9-LABEL: cos_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 -; GFX9-NEXT: v_cos_f16_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX9-NEXT: v_cos_f16_e32 v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.cos.f16(half %a.val) @@ -118,21 +115,18 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118 +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0 -; GFX9-NEXT: v_cos_f16_e32 v2, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cos_f16_e32 v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cos_f16_e32 v3, v3 +; GFX9-NEXT: v_cos_f16_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index e4de730..6cd0c21 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -45,16 +45,13 @@ define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) ; GFX9-LABEL: sin_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 -; GFX9-NEXT: v_sin_f16_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX9-NEXT: v_sin_f16_e32 v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.sin.f16(half %a.val) @@ -118,21 +115,18 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118 +; GFX9-NEXT: global_load_dword 
v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0 -; GFX9-NEXT: v_sin_f16_e32 v2, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_sin_f16_e32 v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1 +; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sin_f16_e32 v3, v3 +; GFX9-NEXT: v_sin_f16_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 0a60413..eaae56f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -97,7 +97,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] +; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]] ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { @@ -112,7 +112,7 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i3 ; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} ; EG: MEM_RAT @@ -144,7 +144,7 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1) ; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = sext <1 x i32> %ld to <1 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 94ea587..2b08a9d 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -46,12 +46,11 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 ; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_waitcnt vmcnt(1) ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 -; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; MUBUF-NEXT: v_mov_b32_e32 v3, s5 -; MUBUF-NEXT: global_store_dwordx2 v[2:3], 
v[0:1], off +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; MUBUF-NEXT: s_endpgm ; ; FLATSCR-LABEL: local_stack_offset_uses_sp: @@ -79,10 +78,9 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v3, s1 -; FLATSCR-NEXT: v_mov_b32_e32 v2, s0 -; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; FLATSCR-NEXT: s_endpgm entry: %pin.low = alloca i32, align 8192, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index d0d8788..71ec879 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -11,12 +11,11 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index e93ee2c..7d165b0 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -11,7 +11,7 @@ ; GFX8: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]] ; GFX9: v_mad_legacy_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]] ; GFX10: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]] -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R]] +; GCN: {{flat|global}}_store_short v{{.+}}, v[[R]] ; GCN: s_endpgm define amdgpu_kernel void @mad_u16( i16 addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/mai-inline.ll b/llvm/test/CodeGen/AMDGPU/mai-inline.ll index a9d8c5c..8b7f542 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-inline.ll +++ b/llvm/test/CodeGen/AMDGPU/mai-inline.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}accvgpr_write_read: ; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1 ; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]] -; GFX908: global_store_dword {{[^,]+}}, [[VREG]], off +; GFX908: global_store_dword v{{[0-9]+}}, [[VREG]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) { bb: %in.1 = load float, float addrspace(1)* %arg diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll index f6ee09f..5e2e2df 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -347,8 +347,8 @@ entry: ; GCN-LABEL: {{^}}nontemporal_global_0: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} -; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc slc{{$}} +; GFX10: 
global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} slc{{$}} ; GFX10: .amdhsa_kernel nontemporal_global_0 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index ab38736..2c5931e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -51,17 +51,16 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 -; GCN-NEXT: v_mov_b32_e32 v12, s18 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NEXT: v_mov_b32_e32 v13, s19 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -71,14 +70,14 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocaptu ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[18:19] +; GCN-NEXT: global_store_dwordx4 v12, v[4:7], s[18:19] offset:16 +; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[18:19] offset:32 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[18:19] offset:48 ; GCN-NEXT: s_endpgm bb: %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16 @@ -179,12 +178,11 @@ define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias noca ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 -; GCN-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NEXT: v_mov_b32_e32 v8, s4 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 0457030..b5f2ee4 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -30,23 +30,21 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_cbranch_scc1 BB0_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 ; MUBUF-NEXT: s_mov_b32 s32, s6 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; MUBUF-NEXT: 
v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, 1 ; MUBUF-NEXT: s_add_i32 s6, s6, s7 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_mov_b32_e32 v1, s6 -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: v_mov_b32_e32 v1, s5 -; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: global_store_dword v1, v0, s[4:5] ; MUBUF-NEXT: BB0_3: ; %bb.2 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off @@ -76,14 +74,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s4 ; FLATSCR-NEXT: s_add_i32 s4, s4, s2 -; FLATSCR-NEXT: scratch_load_dword v1, off, s4 +; FLATSCR-NEXT: scratch_load_dword v2, off, s4 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 -; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] ; FLATSCR-NEXT: BB0_3: ; %bb.2 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off @@ -137,23 +133,21 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 ; MUBUF-NEXT: s_mov_b32 s32, s6 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, 1 ; MUBUF-NEXT: s_add_i32 s6, s6, s7 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; MUBUF-NEXT: v_mov_b32_e32 v1, s6 -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: v_mov_b32_e32 v1, s5 -; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: global_store_dword v1, v0, s[4:5] ; MUBUF-NEXT: BB1_2: ; %bb.1 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off @@ -178,14 +172,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_add_i32 s2, s2, s3 -; FLATSCR-NEXT: scratch_load_dword v1, off, s2 +; 
FLATSCR-NEXT: scratch_load_dword v2, off, s2 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 -; FLATSCR-NEXT: v_mov_b32_e32 v1, s1 -; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] ; FLATSCR-NEXT: BB1_2: ; %bb.1 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll index 3b0795d2..27d01ef 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -632,10 +632,9 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) { ; GFX9-LABEL: global_inst_salu_offset_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -643,11 +642,10 @@ define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) { ; GFX10-LABEL: global_inst_salu_offset_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -661,10 +659,9 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p ; GFX9-LABEL: global_inst_salu_offset_11bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -672,11 +669,10 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p ; GFX10-LABEL: global_inst_salu_offset_11bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -690,10 +686,9 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p ; GFX9-LABEL: global_inst_salu_offset_12bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, 
v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -745,10 +740,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1) ; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -756,11 +750,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1) ; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm @@ -774,10 +767,9 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1) ; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -834,10 +826,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* ; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm @@ -916,10 +907,9 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace ; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll index da52bce..ca41899 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -9,7 +9,7 @@ ; GCN: v_cndmask_b32 ; GCN: v_cndmask_b32 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0, -; GCN: 
store_dword v[{{[0-9:]+}}], [[RES]] +; GCN: store_dword v{{.+}}, [[RES]] ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: store <4 x float> , <4 x float> addrspace(5)* %alloca, align 4 @@ -44,7 +44,7 @@ entry: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] -; GCN: store_dwordx4 v[{{[0-9:]+}}], +; GCN: store_dwordx4 v{{.+}}, ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index 1b709dc..a9182aa 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -17,10 +17,10 @@ declare i64 @llvm.readcyclecounter() #0 ; GETREG-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 ; GETREG-DAG: s_getreg_b32 [[CNT1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20) ; GETREG-DAG: v_mov_b32_e32 v[[VCNT1:[0-9]+]], [[CNT1]] -; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT1]]:[[ZERO]]], off +; GETREG: global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT1]]:[[ZERO]]] ; GETREG: s_getreg_b32 [[CNT2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20) ; GETREG: v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]] -; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT2]]:[[ZERO]]], off +; GETREG: global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT2]]:[[ZERO]]] define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.readcyclecounter() diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 50ac228..d01dbbf 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -59,21 +59,20 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s3, s7, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 @@ -134,19 +133,16 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; 
GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_add_i32 v4, s0, v4 clamp -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v4 -; GFX9-NEXT: global_store_dword v[0:1], v5, off -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v[2:3], v0, off +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_add_i32 s1, s0, s1 +; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] +; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 @@ -214,24 +210,17 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; GFX9-LABEL: v_saddo_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v5, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_i32 v6, v4, v5 clamp -; GFX9-NEXT: v_add_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v6 -; GFX9-NEXT: global_store_dword v[0:1], v4, off -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_byte v[2:3], v0, off +; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 @@ -296,23 +285,20 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* ; GFX9-LABEL: s_saddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_add_u32 s0, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_addc_u32 s1, s5, s7 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX9-NEXT: s_add_u32 s8, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_addc_u32 s9, s5, s7 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc ; 
GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v[2:3], v0, off +; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 @@ -381,27 +367,20 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v[6:7], v0, off +; GFX9-NEXT: global_store_byte v6, v0, s[6:7] ; GFX9-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 @@ -481,28 +460,21 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_i32 v8, v0, v2 clamp +; GFX9-NEXT: v_add_i32 v5, v0, v2 clamp ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll 
b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index e089ac0a..c91a54c 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -165,8 +165,8 @@ define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace ; CI: buffer_load_dword ; CI: buffer_store_dword -; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4 -; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12 +; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 +; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 ; GFX9: ds_write_b32 define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 @@ -216,11 +216,11 @@ define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %ou ; CI: buffer_store_dword ; CI: s_endpgm -; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400 -; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408 -; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12 -; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400 -; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408 +; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400 +; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408 ; GFX9: global_store_dword ; GFX9: s_endpgm define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll index c376886..2e0633d 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: ; %bb.0: ; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0 ; GCN: s_waitcnt lgkmcnt(0) -; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off +; GCN: global_store_dword v define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll index a884336..6085e1a 100644 --- a/llvm/test/CodeGen/AMDGPU/store-global.ll +++ b/llvm/test/CodeGen/AMDGPU/store-global.ll @@ -65,7 +65,7 @@ entry: ; SIVI-DAG: buffer_store_byte ; SIVI-DAG: buffer_store_short -; GFX9-DAG: global_store_byte_d16_hi v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off offset:2 +; GFX9-DAG: global_store_byte_d16_hi v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:2 ; GFX9-DAG: global_store_short ; EG: MEM_RAT MSKOR @@ -80,7 +80,7 @@ entry: ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}} ; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]] ; SIVI: buffer_store_dword [[VAND]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAND]] +; 
GFX9: global_store_dword v{{[0-9]+}}, [[VAND]], s ; EG: MEM_RAT_CACHELESS STORE_RAW ; EG-NOT: MEM_RAT diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 90336ca..f66cb6b 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -75,24 +75,22 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6 +; GFX9-NEXT: ds_write_b32 v1, v3 ; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll index 627ba9e..85029a5 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}trunc_store_v4i64_v4i8: -; GCN: global_store_dword v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off +; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @trunc_store_v4i64_v4i8(< 4 x i8> addrspace(1)* %out, <4 x i64> %in) { entry: %trunc = trunc <4 x i64> %in to < 4 x i8> @@ -10,7 +10,7 @@ entry: } ; GCN-LABEL: {{^}}trunc_store_v8i64_v8i8: -; GCN: global_store_dwordx2 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off +; GCN: global_store_dwordx2 v{{[0-9]+}}, v{{\[[0-9]:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @trunc_store_v8i64_v8i8(< 8 x i8> addrspace(1)* %out, <8 x i64> %in) { entry: %trunc = trunc <8 x i64> %in to < 8 x i8> @@ -19,7 +19,7 @@ entry: } ; GCN-LABEL: {{^}}trunc_store_v8i64_v8i16: -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off +; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @trunc_store_v8i64_v8i16(< 8 x i16> addrspace(1)* %out, <8 x i64> %in) { entry: %trunc = trunc <8 x i64> %in to < 8 x i16> @@ -28,8 +28,8 @@ entry: } ; GCN-LABEL: {{^}}trunc_store_v8i64_v8i32: -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off +; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 +; GCN: global_store_dwordx4 
v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @trunc_store_v8i64_v8i32(< 8 x i32> addrspace(1)* %out, <8 x i64> %in) { entry: %trunc = trunc <8 x i64> %in to <8 x i32> @@ -38,10 +38,10 @@ entry: } ; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32: -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off +; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:48 +; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 +; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 +; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) { entry: %trunc = trunc <16 x i64> %in to <16 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index bc8ccf9..b3e158f 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -798,16 +798,15 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4) ; GFX9-LABEL: shuffle_scalar_load_v8i32_0123: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 58737e6..b5b42d8 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -233,11 +233,11 @@ bb13: ; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo ; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc +; GCN: global_store_dword ; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo ; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec ; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo ; GFX1064: s_and_b64 [[MASK0]], [[MASK0]], exec -; GCN: global_store_dword ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]] ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]] ; GCN: BB{{.*}}: ; %Flow @@ -476,9 +476,12 @@ exit: } ; GCN-LABEL: {{^}}fdiv_f32: +; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, 
s{{[0-9]+}}, s{{[0-9]+}} ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} + ; GCN-NOT: vcc ; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { @@ -774,7 +777,7 @@ main_body: ; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]], +; GCN: store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -789,7 +792,7 @@ define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src ; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}} ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]], +; GCN: store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, i64 addrspace(1)* %out @@ -801,7 +804,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] ; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] -; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]], +; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) @@ -814,7 +817,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} ; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}} ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} -; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]], +; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) { %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) store i32 %result, i32 addrspace(1)* %out