FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
-FunctionPass *createSIFixupVectorISelPass();
FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
void initializeSIFixVGPRCopiesPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
-void initializeSIFixupVectorISelPass(PassRegistry &);
-extern char &SIFixupVectorISelID;
-
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
- initializeSIFixupVectorISelPass(*PR);
initializeSIFoldOperandsPass(*PR);
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
- // TODO: We have to add FinalizeISel
- // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel
- // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded.
- // Will be removed as soon as SIFixupVectorISel is changed
- // to work with V_ADD/SUB_U64_PSEUDO instead.
- addPass(&FinalizeISelID);
- addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
}
SIAddIMGInit.cpp
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
- SIFixupVectorISel.cpp
SIFixVGPRCopies.cpp
SIPreAllocateWWMRegs.cpp
SIFoldOperands.cpp
+++ /dev/null
-//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-/// \file
-/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
-/// Currently this will convert GLOBAL_{LOAD|STORE}_*
-/// and GLOBAL_Atomic_* instructions into their _SADDR variants,
-/// feeding the sreg into the saddr field of the new instruction.
-/// We currently handle a REG_SEQUENCE feeding the vaddr
-/// and decompose it into a base and index.
-///
-/// Transform:
-/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_CO_U32_e64 %21:sgpr_32, %22:vgpr_32
-/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
-/// %24:vgpr_32, %19:sreg_64_xexec
-/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
-/// %11:vreg_64 = COPY %16:vreg_64
-/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
-/// Into:
-/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
-/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
-/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
-///
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#define DEBUG_TYPE "si-fixup-vector-isel"
-
-using namespace llvm;
-
-static cl::opt<bool> EnableGlobalSGPRAddr(
- "amdgpu-enable-global-sgpr-addr",
- cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"),
- cl::init(false));
-
-STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
-STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
-
-namespace {
-
-class SIFixupVectorISel : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFixupVectorISel() : MachineFunctionPass(ID) {
- initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
- "SI Fixup Vector ISel", false, false)
-
-char SIFixupVectorISel::ID = 0;
-
-char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
-
-FunctionPass *llvm::createSIFixupVectorISelPass() {
- return new SIFixupVectorISel();
-}
-
-static bool findSRegBaseAndIndex(MachineOperand *Op,
- unsigned &BaseReg,
- unsigned &IndexReg,
- MachineRegisterInfo &MRI,
- const SIRegisterInfo *TRI) {
- SmallVector<MachineOperand *, 8> Worklist;
- Worklist.push_back(Op);
- while (!Worklist.empty()) {
- MachineOperand *WOp = Worklist.pop_back_val();
- if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg()))
- continue;
- MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
- switch (DefInst->getOpcode()) {
- default:
- continue;
- case AMDGPU::COPY:
- Worklist.push_back(&DefInst->getOperand(1));
- break;
- case AMDGPU::REG_SEQUENCE:
- if (DefInst->getNumOperands() != 5)
- continue;
- Worklist.push_back(&DefInst->getOperand(1));
- Worklist.push_back(&DefInst->getOperand(3));
- break;
- case AMDGPU::V_ADD_CO_U32_e64:
- // The V_ADD_* and its analogous V_ADDCV_* are generated by
- // a previous pass which lowered from an ADD_64_PSEUDO,
- // which generates subregs to break up the 64 bit args.
- if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
- continue;
- BaseReg = DefInst->getOperand(2).getReg();
- if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
- continue;
- IndexReg = DefInst->getOperand(3).getReg();
- // Chase the IndexReg.
- MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
- if (!MI || !MI->isCopy())
- continue;
- // Make sure the reg class is 64 bit for Index.
- // If the Index register is a subreg, we want it to reference
- // a 64 bit register which we will use as the Index reg.
- const TargetRegisterClass *IdxRC, *BaseRC;
- IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
- if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
- continue;
- IndexReg = MI->getOperand(1).getReg();
- // Chase the BaseReg.
- MI = MRI.getUniqueVRegDef(BaseReg);
- if (!MI || !MI->isCopy())
- continue;
- // Make sure the register class is 64 bit for Base.
- BaseReg = MI->getOperand(1).getReg();
- BaseRC = MRI.getRegClass(BaseReg);
- if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
- continue;
- // Make sure Base is SReg and Index is VReg.
- if (!TRI->isSGPRReg(MRI, BaseReg))
- return false;
- if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
- return false;
- // clear any killed flags on Index and Base regs, used later.
- MRI.clearKillFlags(IndexReg);
- MRI.clearKillFlags(BaseReg);
- return true;
- }
- }
- return false;
-}
-
-// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
-static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
- MachineFunction &MF,
- MachineRegisterInfo &MRI,
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI) {
- if (!EnableGlobalSGPRAddr)
- return false;
- bool FuncModified = false;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
- int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
- if (NewOpcd < 0)
- continue;
- // Update our statistics on opportunities seen.
- ++NumSGPRGlobalOccurs;
- LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
- // Need a Base and Index or we cant transform to _SADDR.
- unsigned BaseReg = 0;
- unsigned IndexReg = 0;
- MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
- if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
- continue;
- ++NumSGPRGlobalSaddrs;
- FuncModified = true;
- // Create the new _SADDR Memory instruction.
- bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
- MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
- MachineInstr *NewGlob = nullptr;
- NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
- if (HasVdst)
- NewGlob->addOperand(MF, MI.getOperand(0));
- NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
- if (VData)
- NewGlob->addOperand(MF, *VData);
- NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
- NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
-
- MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
- // Atomics dont have a GLC, so omit the field if not there.
- if (Glc)
- NewGlob->addOperand(MF, *Glc);
-
- MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc);
- if (DLC)
- NewGlob->addOperand(MF, *DLC);
-
- NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
- // _D16 have an vdst_in operand, copy it in.
- MachineOperand *VDstInOp = TII->getNamedOperand(MI,
- AMDGPU::OpName::vdst_in);
- if (VDstInOp)
- NewGlob->addOperand(MF, *VDstInOp);
- NewGlob->copyImplicitOps(MF, MI);
- NewGlob->cloneMemRefs(MF, MI);
- // Remove the old Global Memop instruction.
- MI.eraseFromParent();
- LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
- }
- return FuncModified;
-}
-
-bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
- // Only need to run this in SelectionDAG path.
- if (MF.getProperties().hasProperty(
- MachineFunctionProperties::Property::Selected))
- return false;
-
- if (skipFunction(MF.getFunction()))
- return false;
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
- bool FuncModified = false;
- for (MachineBasicBlock &MBB : MF) {
- // Cleanup missed Saddr opportunites from ISel.
- FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
- }
- return FuncModified;
-}
-; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}name:{{[ ]*}}vector_clause
; GCN: BUNDLE
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
; GFX9-NOT: m0
; GCN-DAG: {{buffer|flat|global}}_load_dword [[VAL:v[0-9]+]]
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
-
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VBASE]]
; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
@lds = addrspace(3) global [512 x float] undef, align 4
; GCN-LABEL: @simple_write2st64_one_val_f32_0_1
; CI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0n
+; GFX9-NOT: m0
; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4{{$}}
; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
+++ /dev/null
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel -amdgpu-enable-global-sgpr-addr %s -o - | FileCheck -check-prefix=GCN %s
-
-# Coverage tests for GLOBAL_* to their _SADDR equivalent.
-
-# GCN-LABEL: name: global_load_store_atomics
-# GCN: GLOBAL_LOAD_DWORD_SADDR
-# GCN: GLOBAL_STORE_DWORD_SADDR
-# GCN: GLOBAL_LOAD_DWORDX2_SADDR
-# GCN: GLOBAL_STORE_DWORDX2_SADDR
-# GCN: GLOBAL_LOAD_DWORDX3_SADDR
-# GCN: GLOBAL_STORE_DWORDX3_SADDR
-# GCN: GLOBAL_LOAD_DWORDX4_SADDR
-# GCN: GLOBAL_STORE_DWORDX4_SADDR
-# GCN: GLOBAL_LOAD_SSHORT_SADDR
-# GCN: GLOBAL_STORE_SHORT_SADDR
-# GCN: GLOBAL_LOAD_USHORT_SADDR
-# GCN: GLOBAL_STORE_SHORT_SADDR
-# GCN: GLOBAL_LOAD_UBYTE_SADDR
-# GCN: GLOBAL_STORE_BYTE_SADDR
-# GCN: GLOBAL_LOAD_SBYTE_SADDR
-# GCN: GLOBAL_STORE_BYTE_SADDR
-# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR
-# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_SHORT_D16_SADDR
-# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR
-
-# GCN: GLOBAL_ATOMIC_XOR_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_XOR_SADDR %
-# GCN: GLOBAL_ATOMIC_SMIN_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMIN_SADDR %
-# GCN: GLOBAL_ATOMIC_AND_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_AND_SADDR %
-# GCN: GLOBAL_ATOMIC_SWAP_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SWAP_SADDR %
-# GCN: GLOBAL_ATOMIC_SMAX_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMAX_SADDR %
-# GCN: GLOBAL_ATOMIC_UMIN_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMIN_SADDR %
-# GCN: GLOBAL_ATOMIC_UMAX_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMAX_SADDR %
-# GCN: GLOBAL_ATOMIC_OR_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_OR_SADDR %
-# GCN: GLOBAL_ATOMIC_ADD_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_ADD_SADDR %
-# GCN: GLOBAL_ATOMIC_SUB_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SUB_SADDR %
-# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR %
-# GCN: GLOBAL_ATOMIC_INC_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_INC_SADDR %
-# GCN: GLOBAL_ATOMIC_DEC_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_DEC_SADDR %
-
-# GCN: GLOBAL_ATOMIC_OR_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_OR_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_AND_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_AND_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_INC_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_INC_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR %
-
-name: global_load_store_atomics
-body: |
- bb.0:
- liveins: $vgpr0, $sgpr0_sgpr1
-
- %1:sgpr_64 = COPY $sgpr0_sgpr1
- %0:vgpr_32 = COPY $vgpr0
- %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 )
- %5:sreg_32_xm0 = S_MOV_B32 2
- %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec
- %7:sreg_32_xm0 = S_MOV_B32 0
- %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1
- %21:sgpr_32 = COPY %4.sub0
- %22:vgpr_32 = COPY %14.sub0
- %23:sgpr_32 = COPY %4.sub1
- %24:vgpr_32 = COPY %14.sub1
- %17:vgpr_32, %19:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %22, 0, implicit $exec
- %25:vgpr_32 = COPY %23
- %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, 0, implicit $exec
- %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1
- %11:vreg_64 = COPY %16
-
- %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, 0, implicit $exec :: (load 4)
- GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
- GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
- GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
- GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
- GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
- GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
- GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-
- %53:vgpr_32 = GLOBAL_ATOMIC_XOR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %53, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_XOR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %54:vgpr_32 = GLOBAL_ATOMIC_SMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %54, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %55:vgpr_32 = GLOBAL_ATOMIC_AND_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %55, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_AND %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %56:vgpr_32 = GLOBAL_ATOMIC_SWAP_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %56, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SWAP %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %57:vgpr_32 = GLOBAL_ATOMIC_SMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %57, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %58:vgpr_32 = GLOBAL_ATOMIC_UMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %58, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_UMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %59:vgpr_32 = GLOBAL_ATOMIC_UMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %59, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_UMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %60:vgpr_32 = GLOBAL_ATOMIC_OR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %60, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_OR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %61:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %61, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_ADD %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %62:vgpr_32 = GLOBAL_ATOMIC_SUB_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %62, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SUB %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %63:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %63, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_CMPSWAP %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %64:vgpr_32 = GLOBAL_ATOMIC_INC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %64, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_INC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %65:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORD %11, %65, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_DEC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %66:vreg_64 = GLOBAL_ATOMIC_OR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %66, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_OR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %67:vreg_64 = GLOBAL_ATOMIC_XOR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %67, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_XOR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %68:vreg_64 = GLOBAL_ATOMIC_AND_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %68, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_AND_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %69:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %69, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_ADD_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %70:vreg_64 = GLOBAL_ATOMIC_SUB_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %70, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SUB_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %71:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %71, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_DEC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %72:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %72, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_INC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %73:vreg_64 = GLOBAL_ATOMIC_SMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %73, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %74:vreg_64 = GLOBAL_ATOMIC_SWAP_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %74, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SWAP_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %75:vreg_64 = GLOBAL_ATOMIC_SMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %75, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_SMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %76:vreg_64 = GLOBAL_ATOMIC_UMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %76, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_UMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %77:vreg_64 = GLOBAL_ATOMIC_UMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %77, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_UMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- %79:sgpr_128 = REG_SEQUENCE %4, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
- %80:vreg_128 = COPY %79
-
- %78:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
- GLOBAL_STORE_DWORDX2 %11, %78, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
- GLOBAL_ATOMIC_CMPSWAP_X2 %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
- S_ENDPGM 0
-...
+++ /dev/null
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s
-
-; Test for a conv2d like sequence of loads.
-
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
-; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
-; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
-
-define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %idx = zext i32 %id to i64
- %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx
- %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1
- %load0 = load i64, i64 addrspace(1)* %ptr0
- %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2
- %load1 = load i64, i64 addrspace(1)* %ptr1
- %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3
- %load2 = load i64, i64 addrspace(1)* %ptr2
- %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4
- %load3 = load i64, i64 addrspace(1)* %ptr3
- %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4
- %load4 = load i64, i64 addrspace(1)* %ptr4
- %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3
- %load5 = load i64, i64 addrspace(1)* %ptr5
- %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2
- %load6 = load i64, i64 addrspace(1)* %ptr6
- %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1
- %load7 = load i64, i64 addrspace(1)* %ptr7
- %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0
- %load8 = load i64, i64 addrspace(1)* %ptr8
- %add0 = add i64 %load1, %load0
- %add1 = add i64 %load3, %load2
- %add2 = add i64 %load5, %load4
- %add3 = add i64 %load7, %load6
- %add4 = add i64 %add0, %load8
- %add5 = add i64 %add2, %add1
- %add6 = add i64 %add4, %add3
- %add7 = add i64 %add6, %add5
- %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx
- %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1
- store volatile i64 %add7, i64 addrspace(1)* %ptr9
-
-; Test various offset boundaries.
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
- %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
- %load11 = load i64, i64 addrspace(1)* %gep11
- %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
- %load12 = load i64, i64 addrspace(1)* %gep12
- %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
- %load13 = load i64, i64 addrspace(1)* %gep13
- %add11 = add i64 %load11, %load12
- %add12 = add i64 %add11, %load13
- store volatile i64 %add12, i64 addrspace(1)* undef
-
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
- %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024
- %load21 = load i64, i64 addrspace(1)* %gep21
- %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048
- %load22 = load i64, i64 addrspace(1)* %gep22
- %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512
- %load23 = load i64, i64 addrspace(1)* %gep23
- %add21 = add i64 %load22, %load21
- %add22 = add i64 %add21, %load23
- store volatile i64 %add22, i64 addrspace(1)* undef
-
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
- %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257
- %load31 = load i64, i64 addrspace(1)* %gep31
- %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256
- %load32 = load i64, i64 addrspace(1)* %gep32
- %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
- %load33 = load i64, i64 addrspace(1)* %gep33
- %add34 = add i64 %load32, %load31
- %add35 = add i64 %add34, %load33
- store volatile i64 %add35, i64 addrspace(1)* undef
- ret void
-}
-
-; GFX9-LABEL: {{^}}_amdgpu_cs_main:
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
-; GFX9-NEXT: s_waitcnt
-; GFX9-NOT: global_load_dword
-
-define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
-bb:
- %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
- %tmp2 = load volatile <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
- store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { convergent nounwind }
-attributes #1 = { nounwind readnone speculatable }
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign -amdgpu-enable-global-sgpr-addr -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
declare i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
+; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_global_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
declare i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
+; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_global_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
; GCN-LABEL: vector_clause:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GCN-NEXT: v_mov_b32_e32 v17, 0
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v[16:17], s[2:3]
-; GCN-NEXT: global_load_dwordx4 v[4:7], v[16:17], s[2:3] offset:16
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[16:17], s[2:3] offset:32
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[16:17], s[2:3] offset:48
-; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_add_co_u32_e32 v16, vcc, s2, v18
+; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v0, vcc
+; GCN-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GCN-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GCN-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GCN-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GCN-NEXT: v_mov_b32_e32 v17, s5
+; GCN-NEXT: v_add_co_u32_e32 v16, vcc, s4, v18
+; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], s[4:5]
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], s[4:5] offset:16
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], s[4:5] offset:32
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], s[4:5] offset:48
+; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
; GCN-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[8:9], v[0:1], s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: global_load_dwordx2 v[8:9], v[0:1], off
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
@stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}}
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52
define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()