const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- LiveIntervals *LIS;
+ AliasAnalysis *AA;
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
- MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize);
+ MachineBasicBlock::iterator findMatchingDSInst(
+ MachineBasicBlock::iterator I,
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove);
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
MachineBasicBlock::iterator mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
public:
static char ID;
SILoadStoreOptimizer()
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
- LIS(nullptr) {}
+ AA(nullptr) {}
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
return "SI Load / Store Optimizer";
}
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoPHIs);
- }
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreserved<LiveVariables>();
- AU.addRequired<LiveIntervals>();
+ AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
return new SILoadStoreOptimizer(TM);
}
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+ ArrayRef<MachineInstr*> InstsToMove) {
+ MachineBasicBlock *MBB = I->getParent();
+ ++I;
+ for (MachineInstr *MI : InstsToMove) {
+ MI->removeFromParent();
+ MBB->insert(I, MI);
+ }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &Defs) {
+ for (const MachineOperand &Def : MI.defs()) {
+ Defs.push_back(&Def);
+ }
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+ ArrayRef<MachineInstr*> InstsToMove,
+ const SIInstrInfo *TII,
+ AliasAnalysis *AA) {
+
+ assert(MemOp.mayLoadOrStore());
+
+ for (MachineInstr *InstToMove : InstsToMove) {
+ if (!InstToMove->mayLoadOrStore())
+ continue;
+ if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
+ return false;
+ }
+ return true;
+}
+
bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned Size) {
MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize){
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove) {
MachineBasicBlock::iterator E = I->getParent()->end();
- MachineBasicBlock &MBB = *I->getParent();
MachineBasicBlock::iterator MBBI = I;
++MBBI;
- if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
- return E;
-
- // Don't merge volatiles.
- if (MBBI->hasOrderedMemoryRef())
- return E;
-
- int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
- AMDGPU::OpName::offset);
- unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
- unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
- // Check both offsets fit in the reduced range.
- if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
- return MBBI;
- }
+ SmallVector<const MachineOperand *, 8> DefsToMove;
+ addDefsToList(*I, DefsToMove);
+
+ for ( ; MBBI != E; ++MBBI) {
+
+ if (MBBI->getOpcode() != I->getOpcode()) {
+
+ // This is not a matching DS instruction, but we can keep looking as
+ // long as one of these conditions are met:
+ // 1. It is safe to move I down past MBBI.
+ // 2. It is safe to move MBBI down past the instruction that I will
+ // be merged into.
+
+ if (MBBI->hasUnmodeledSideEffects())
+ // We can't re-order this instruction with respect to other memory
+ // opeations, so we fail both conditions mentioned above.
+ return E;
+ if (MBBI->mayLoadOrStore() &&
+ !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+ // We fail condition #1, but we may still be able to satisfy condition
+ // #2. Add this instruction to the move list and then we will check
+ // if condition #2 holds once we have selected the matching instruction.
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ continue;
+ }
+
+ // When we match I with another DS instruction we will be moving I down
+ // to the location of the matched instruction any uses of I will need to
+ // be moved down as well.
+ for (const MachineOperand *Def : DefsToMove) {
+ bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
+ // If ReadDef is true, then there is a use of Def between I
+ // and the instruction that I will potentially be merged with. We
+ // will need to move this instruction after the merged instructions.
+ if (ReadDef) {
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ break;
+ }
+ }
+ continue;
+ }
+
+ // Don't merge volatiles.
+ if (MBBI->hasOrderedMemoryRef())
+ return E;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg0.getReg() == AddrReg1.getReg() &&
+ AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ AMDGPU::OpName::offset);
+ unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+ unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+ // Check both offsets fit in the reduced range.
+ // We also need to go through the list of instructions that we plan to
+ // move and make sure they are all safe to move down past the merged
+ // instruction.
+ if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+ canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+ return MBBI;
+ }
+
+ // We've found a load/store that we couldn't merge for some reason.
+ // We could potentially keep looking, but we'd need to make sure that
+ // it was safe to move I and also all the instruction in InstsToMove
+ // down past this instruction.
+ // FIXME: This is too conservative.
+ break;
+ }
return E;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
DebugLoc DL = I->getDebugLoc();
MachineInstrBuilder Read2
- = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+ = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
.addOperand(*AddrReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- LIS->InsertMachineInstrInMaps(*Read2);
-
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
- // The new write to the original destination register is now the copy. Steal
- // the old SlotIndex.
- LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
- LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+ moveInstsAfter(Copy1, InstsToMove);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
- LIS->shrinkToUses(&AddrRegLI);
-
- LIS->createAndComputeVirtRegInterval(DestReg);
-
- if (UpdateM0Range) {
- SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
- M0Segment->end = Read2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
- return Read2.getInstr();
+ return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = I->getDebugLoc();
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
MachineInstrBuilder Write2
- = BuildMI(*MBB, I, DL, Write2Desc)
+ = BuildMI(*MBB, Paired, DL, Write2Desc)
.addOperand(*Addr) // addr
.addOperand(*Data0) // data0
.addOperand(*Data1) // data1
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
- // XXX - How do we express subregisters here?
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+ moveInstsAfter(Write2, InstsToMove);
- LIS->RemoveMachineInstrFromMaps(*I);
- LIS->RemoveMachineInstrFromMaps(*Paired);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- // This doesn't handle physical registers like M0
- LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
- if (UpdateM0Range) {
- SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
- M0Segment->end = Write2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
- return Write2.getInstr();
+ return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
continue;
}
+ SmallVector<MachineInstr*, 8> InstsToMove;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeRead2Pair(I, Match, Size);
+ I = mergeRead2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeWrite2Pair(I, Match, Size);
+ I = mergeWrite2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
-
- LIS = &getAnalysis<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
}
; FUNC-LABEL: {{^}}local_load_v16i16:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; EG: LDS_READ_RET
ret void
}
-; FIXME: Should have 2 ds_read_b64
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN: ds_write2_b64
; GCN: ds_write2_b64
}
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
+
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; EG: LDS_READ_RET
; EG: LDS_READ_RET
}
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
ret void
}
-; FIXME: Missed read2
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
; EG: LDS_READ_RET
; EG: LDS_READ_RET