From daf6e66ac5d2f5305f493e90923d11b91a27e7b3 Mon Sep 17 00:00:00 2001
From: Amara Emerson
Date: Wed, 17 Apr 2019 21:30:07 +0000
Subject: [PATCH] [GlobalISel] Add legalization support for non-power-2 loads
 and stores

Legalize things like i24 load/store by splitting them into smaller
power-of-2 operations.

This matches how SelectionDAG handles these operations.

Differential Revision: https://reviews.llvm.org/D59971

llvm-svn: 358613
---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h        |  4 +
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp    | 97 +++++++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp   | 13 ++-
 .../CodeGen/AArch64/GlobalISel/arm64-fallback.ll   | 20 -----
 .../GlobalISel/legalize-non-pow2-load-store.mir    | 49 +++++++++++
 5 files changed, 152 insertions(+), 31 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 6e4c967..d1eea06 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -639,6 +639,10 @@ public:
     return actionIf(LegalizeAction::Unsupported,
                     LegalityPredicates::memSizeInBytesNotPow2(0));
   }
+  LegalizeRuleSet &lowerIfMemSizeNotPow2() {
+    return actionIf(LegalizeAction::Lower,
+                    LegalityPredicates::memSizeInBytesNotPow2(0));
+  }
 
   LegalizeRuleSet &customIf(LegalityPredicate Predicate) {
     // We have no choice but conservatively assume that a custom action with a
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 08759f2..62bc010 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1484,10 +1484,56 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     auto &MMO = **MI.memoperands_begin();
 
     if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) {
-      // In the case of G_LOAD, this was a non-extending load already and we're
-      // about to lower to the same instruction.
-      if (MI.getOpcode() == TargetOpcode::G_LOAD)
+      if (MI.getOpcode() == TargetOpcode::G_LOAD) {
+        // This load needs splitting into power-of-2 sized loads.
+        if (DstTy.isVector())
           return UnableToLegalize;
+        if (isPowerOf2_32(DstTy.getSizeInBits()))
+          return UnableToLegalize; // Don't know what we're being asked to do.
+
+        // Our strategy here is to generate any-extending loads for the smaller
+        // pieces, each producing the next power-of-2 result type, then combine
+        // the two results together before truncating back down to the
+        // non-pow-2 type.
+        // E.g. v1 = i24 load =>
+        // v2 = i32 load (2 byte)
+        // v3 = i32 load (1 byte)
+        // v4 = i32 shl v3, 16
+        // v5 = i32 or v4, v2
+        // v1 = i24 trunc v5
+        // By doing this we generate the correct truncate, which should get
+        // combined away as an artifact with a matching extend.
+        uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits());
+        uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize;
+
+        MachineFunction &MF = MIRBuilder.getMF();
+        MachineMemOperand *LargeMMO =
+            MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+        MachineMemOperand *SmallMMO = MF.getMachineMemOperand(
+            &MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+
+        LLT PtrTy = MRI.getType(PtrReg);
+        unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits());
+        LLT AnyExtTy = LLT::scalar(AnyExtSize);
+        unsigned LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
+        unsigned SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
+        auto LargeLoad =
+            MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO);
+
+        auto OffsetCst =
+            MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+        unsigned GEPReg = MRI.createGenericVirtualRegister(PtrTy);
+        auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
+        auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0),
+                                              *SmallMMO);
+
+        auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
+        auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
+        auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
+        MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)});
+        MI.eraseFromParent();
+        return Legalized;
+      }
       MIRBuilder.buildLoad(DstReg, PtrReg, MMO);
       MI.eraseFromParent();
       return Legalized;
@@ -1516,6 +1562,51 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     return UnableToLegalize;
   }
+  case TargetOpcode::G_STORE: {
+    // Lower a non-power-of-2 store into multiple pow-2 stores.
+    // E.g. split an i24 store into an i16 store + i8 store.
+    // We do this by first extending the stored value to the next largest power
+    // of 2 type, and then using truncating stores to store the components.
+    // By doing this, as with G_LOAD, we generate an extend that can be
+    // artifact-combined away instead of leaving behind extracts.
+    unsigned SrcReg = MI.getOperand(0).getReg();
+    unsigned PtrReg = MI.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    MachineMemOperand &MMO = **MI.memoperands_begin();
+    if (SrcTy.getSizeInBits() != MMO.getSize() /* in bytes */ * 8)
+      return UnableToLegalize;
+    if (SrcTy.isVector())
+      return UnableToLegalize;
+    if (isPowerOf2_32(SrcTy.getSizeInBits()))
+      return UnableToLegalize; // Don't know what we're being asked to do.
+
+    // Extend to the next pow-2.
+    const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits()));
+    auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg);
+
+    // Obtain the smaller value by shifting away the larger value.
+    uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits());
+    uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize;
+    auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize);
+    auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt);
+
+    // Generate the GEP and truncating stores.
+    LLT PtrTy = MRI.getType(PtrReg);
+    auto OffsetCst =
+        MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+    unsigned GEPReg = MRI.createGenericVirtualRegister(PtrTy);
+    auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
+
+    MachineFunction &MF = MIRBuilder.getMF();
+    MachineMemOperand *LargeMMO =
+        MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+    MachineMemOperand *SmallMMO =
+        MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+    MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO);
+    MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
   case TargetOpcode::G_CTLZ:
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 85110b2..8f7a521 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -234,14 +234,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .legalForTypesWithMemDesc({{s32, p0, 8, 8},
                                  {s32, p0, 16, 8}})
       .clampScalar(0, s8, s64)
-      .widenScalarToNextPow2(0)
-      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
-      // how to do that yet.
-      .unsupportedIfMemSizeNotPow2()
+      .lowerIfMemSizeNotPow2()
       // Lower any any-extending loads left into G_ANYEXT and G_LOAD
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
+      .widenScalarToNextPow2(0)
       .clampMaxNumElements(0, s32, 2)
       .clampMaxNumElements(0, s64, 1)
       .customIf(IsPtrVecPred);
@@ -249,6 +247,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
   getActionDefinitionsBuilder(G_STORE)
       .legalForTypesWithMemDesc({{s8, p0, 8, 8},
                                  {s16, p0, 16, 8},
+                                 {s32, p0, 8, 8},
+                                 {s32, p0, 16, 8},
                                  {s32, p0, 32, 8},
                                  {s64, p0, 64, 8},
                                  {p0, p0, 64, 8},
@@ -259,10 +259,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
                                  {v4s32, p0, 128, 8},
                                  {v2s64, p0, 128, 8}})
       .clampScalar(0, s8, s64)
-      .widenScalarToNextPow2(0)
-      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
-      // how to do that yet.
-      .unsupportedIfMemSizeNotPow2()
+      .lowerIfMemSizeNotPow2()
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
                Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index a0c3af5..a21c251 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -54,26 +54,6 @@ false:
 }
 
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %3:_(s32) = G_LOAD %1:_(p0) :: (load 3 from `i24* undef`, align 1) (in function: odd_type_load)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type_load
-; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type_load
-define i32 @odd_type_load() {
-entry:
-  %ld = load i24, i24* undef, align 1
-  %cst = zext i24 %ld to i32
-  ret i32 %cst
-}
-
 ; General legalizer inability to handle types whose size wasn't a power of 2.
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1:_(s42), %0:_(p0) :: (store 6 into %ir.addr, align 8) (in function: odd_type)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type
-; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type:
-define void @odd_type(i42* %addr) {
-  %val42 = load i42, i42* %addr
-  store i42 %val42, i42* %addr
-  ret void
-}
-
 ; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1:_(<7 x s32>), %0:_(p0) :: (store 28 into %ir.addr, align 32) (in function: odd_vector)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_vector
 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_vector:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir
new file mode 100644
index 0000000..d968ba0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64"
+
+  define i32 @load_store_test(i24* %ptr, i24* %ptr2) {
+    %val = load i24, i24* %ptr
+    store i24 %val, i24* %ptr2
+    ret i32 0
+  }
+
+...
+---
+name:            load_store_test
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: load_store_test
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2 from %ir.ptr, align 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 1 from %ir.ptr + 2, align 4)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C2]](s32)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LOAD]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32)
+    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
+    ; CHECK: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store 2 into %ir.ptr2, align 4)
+    ; CHECK: G_STORE [[LSHR]](s32), [[GEP1]](p0) :: (store 1 into %ir.ptr2 + 2, align 4)
+    ; CHECK: $w0 = COPY [[C]](s32)
+    ; CHECK: RET_ReallyLR implicit $w0
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %3:_(s32) = G_CONSTANT i32 0
+    %2:_(s24) = G_LOAD %0(p0) :: (load 3 from %ir.ptr, align 4)
+    G_STORE %2(s24), %1(p0) :: (store 3 into %ir.ptr2, align 4)
+    $w0 = COPY %3(s32)
+    RET_ReallyLR implicit $w0
+
+...
-- 
2.7.4
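
The split-access scheme above can be sanity-checked outside of LLVM. The following is a minimal standalone C++ sketch, not LLVM MIRBuilder code; the helper names storeI24 and loadI24 are made up for illustration, and a little-endian host is assumed to match the AArch64 target. It mirrors the lowered MIR for an i24 access with LargeSplitSize = 16 and SmallSplitSize = 8: the store writes the low 16 bits at offset 0 and bits [23:16] at offset 2, and the load recombines the two pieces as or(shl(small, 16), large).

#include <cassert>
#include <cstdint>
#include <cstring>

// Mirrors the lowered G_STORE: any-extend the value, store the low 16 bits
// as a truncating 2-byte store at offset 0, shift right by LargeSplitSize
// (16) and store bits [23:16] as a truncating 1-byte store at offset 2.
// Assumes a little-endian host, matching the AArch64 target.
static void storeI24(uint8_t *p, uint32_t v) {
  uint16_t large = static_cast<uint16_t>(v);     // trunc to 16 bits
  uint8_t small = static_cast<uint8_t>(v >> 16); // bits [23:16]
  std::memcpy(p, &large, sizeof(large)); // 2-byte store at offset 0
  p[2] = small;                          // 1-byte store at offset 2
}

// Mirrors the lowered G_LOAD: a 2-byte load at offset 0 (low bits) and a
// 1-byte load at offset 2 (high bits), recombined with shl/or. In MIR a
// trunc back to s24 follows; here the result already fits in 24 bits.
static uint32_t loadI24(const uint8_t *p) {
  uint16_t large;
  std::memcpy(&large, p, sizeof(large)); // 2-byte load at offset 0
  uint8_t small = p[2];                  // 1-byte load at offset 2
  return (static_cast<uint32_t>(small) << 16) | large; // or(shl(small,16),large)
}

int main() {
  uint8_t buf[3];
  storeI24(buf, 0xABCDEF);           // arbitrary 24-bit value
  assert(loadI24(buf) == 0xABCDEFu); // round-trips through the split accesses
  return 0;
}

Note that the byte layout written by the two truncating stores is what forces the load side to shift the small (high) piece rather than the large one: the 2-byte access always covers the low-order bits of the value on a little-endian target.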