//
// Future improvements:
//
-// - This currently relies on the scheduler to place loads and stores next to
-// each other, and then only merges adjacent pairs of instructions. It would
-// be good to be more flexible with interleaved instructions, and possibly run
-// before scheduling. It currently missing stores of constants because loading
+// - This is currently missing stores of constants because loading
// the constant into the data register is placed between the stores, although
// this is arguably a scheduling problem.
//
DS_READ,
DS_WRITE,
S_BUFFER_LOAD_IMM,
- BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
- BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
- BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
- BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
- BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
- BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+ BUFFER_LOAD,
+ BUFFER_STORE,
};
enum RegisterEnum {
}
}
+/// Maps an instruction opcode to its InstClassEnum value.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
- if (TII.isMUBUF(Opc)) {
- const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
-
- // If we couldn't identify the opcode, bail out.
- if (baseOpcode == -1) {
- return UNKNOWN;
- }
-
- switch (baseOpcode) {
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
- return BUFFER_LOAD_OFFEN;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
- return BUFFER_LOAD_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
- return BUFFER_STORE_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
- return BUFFER_STORE_OFFSET;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
- return BUFFER_LOAD_OFFEN_exact;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
- return BUFFER_LOAD_OFFSET_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
- return BUFFER_STORE_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
- return BUFFER_STORE_OFFSET_exact;
- default:
- return UNKNOWN;
- }
- }
-
switch (Opc) {
+ default:
+ if (TII.isMUBUF(Opc)) {
+ switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ return BUFFER_LOAD;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return BUFFER_STORE;
+ }
+ }
+ return UNKNOWN;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::DS_READ_B32:
- case AMDGPU::DS_READ_B64:
case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64:
case AMDGPU::DS_READ_B64_gfx9:
return DS_READ;
case AMDGPU::DS_WRITE_B32:
- case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B64_gfx9:
return DS_WRITE;
+ }
+}
+
+/// Determines the subclass of an instruction from its opcode. Only
+/// instructions of the same subclass can be merged together.
+static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
+ switch (Opc) {
default:
- return UNKNOWN;
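+ // For buffer instructions the subclass is the base opcode, so only
+ // instructions with the same addressing mode and exact-ness can pair.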
+ if (TII.isMUBUF(Opc))
+ return AMDGPU::getMUBUFBaseOpcode(Opc);
+ return -1;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return Opc;
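+ // All S_BUFFER_LOAD widths share one subclass, so loads of different
+ // widths can be paired with each other.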
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
}
}
if (InstClass == UNKNOWN) {
return false;
}
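+ // Only instructions in the same subclass (see getInstSubclass) can be merged.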
+ const unsigned InstSubclass = getInstSubclass(Opc, *TII);
// Do not merge VMEM buffer instructions with "swizzled" bit set.
int Swizzled =
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
for (; MBBI != E; ++MBBI) {
- const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
- (IsDS && (MBBI->getOpcode() != Opc))) {
- // This is not a matching DS instruction, but we can keep looking as
+ (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
+ // This is not a matching instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
// 2. It is safe to move MBBI down past the instruction that I will
switch (CI.InstClass) {
default:
+ assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
// FIXME: Handle d16 correctly
- return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+ return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
+ Width);
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
case S_BUFFER_LOAD_IMM:
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
- if (CI.Offset0 > CI.Offset1) {
- switch (CI.Width0) {
- default:
- return std::make_pair(0, 0);
- case 1:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
- case 2:
- return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
- case 3:
- return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
- }
- case 2:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
- case 2:
- return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
- }
- case 3:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
- }
- }
+
+ if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
+ return std::make_pair(0, 0);
+
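+ // If the first instruction has the higher offset, its data occupies the
+ // high subregisters of the merged result.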
+ bool ReverseOrder = CI.Offset0 > CI.Offset1;
+
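+ // Idxs[Start][Width - 1] is the subregister index covering Width dwords
+ // starting at dword Start of the merged register.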
+ static const unsigned Idxs[4][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+ {AMDGPU::sub3, 0, 0, 0},
+ };
+ unsigned Idx0;
+ unsigned Idx1;
+
+ assert(CI.Width0 >= 1 && CI.Width0 <= 3);
+ assert(CI.Width1 >= 1 && CI.Width1 <= 3);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][CI.Width1 - 1];
+ Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
} else {
- switch (CI.Width0) {
- default:
- return std::make_pair(0, 0);
- case 1:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
- case 2:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
- case 3:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
- }
- case 2:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
- case 2:
- return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
- }
- case 3:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
- }
- }
+ Idx0 = Idxs[0][CI.Width0 - 1];
+ Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
}
+
+ return std::make_pair(Idx0, Idx1);
}
const TargetRegisterClass *
OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
}
break;
- case BUFFER_LOAD_OFFEN:
- case BUFFER_LOAD_OFFSET:
- case BUFFER_LOAD_OFFEN_exact:
- case BUFFER_LOAD_OFFSET_exact:
+ case BUFFER_LOAD:
if (findMatchingInst(CI)) {
Modified = true;
removeCombinedInst(MergeList, *CI.Paired);
OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
}
break;
- case BUFFER_STORE_OFFEN:
- case BUFFER_STORE_OFFSET:
- case BUFFER_STORE_OFFEN_exact:
- case BUFFER_STORE_OFFSET_exact:
+ case BUFFER_STORE:
if (findMatchingInst(CI)) {
Modified = true;
removeCombinedInst(MergeList, *CI.Paired);