From c5e3f688271f9cd733b5ccba1830c290cbdeac64 Mon Sep 17 00:00:00 2001
From: Ruiling Song
Date: Thu, 8 May 2014 10:18:22 +0800
Subject: [PATCH] GBE: Merge successive load/store together for better
 performance.

Gen supports at most 4 DWORDs of read/write per instruction, so merging
successive reads/writes yields fewer instructions and better performance.
This improves the LuxMark medium scene by about 10%.

Signed-off-by: Ruiling Song
Reviewed-by: Zhigang Gong
---
 backend/src/CMakeLists.txt                       |   1 +
 backend/src/llvm/llvm_gen_backend.hpp            |   3 +
 backend/src/llvm/llvm_loadstore_optimization.cpp | 269 +++++++++++++++++++++++
 backend/src/llvm/llvm_to_gen.cpp                 |   3 +-
 4 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 backend/src/llvm/llvm_loadstore_optimization.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 2d59644..3bb31e5 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -146,6 +146,7 @@ else (GBE_USE_BLOB)
     llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_barrier_nodup.cpp
     llvm/llvm_to_gen.cpp
+    llvm/llvm_loadstore_optimization.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
     llvm/llvm_to_gen.hpp
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 56dd27f..26323a3 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,6 +84,9 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  /*! Merge load/store if possible */
+  llvm::BasicBlockPass *createLoadStoreOptimizationPass();
+
   /*! Scalarize all vector op instructions */
   llvm::FunctionPass* createScalarizePass();
   /*! Remove/add NoDuplicate function attribute for barrier functions. */
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
new file mode 100644
index 0000000..a597927
--- /dev/null
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song
+ *
+ * The idea: since Gen supports at most 4 successive DWORD loads/stores
+ * in a single instruction, merging successive loads/stores that are
+ * compatible is beneficial. The method of checking whether two
+ * loads/stores are compatible is borrowed from the Vectorize passes in
+ * LLVM.
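+ *
+ * For example, four consecutive DWORD loads such as
+ *   x0 = p[0]; x1 = p[1]; x2 = p[2]; x3 = p[3];
+ * become one 4-wide vector load, conceptually
+ *   v = *(float4 *)p;
+ * followed by four extracts; consecutive stores are handled symmetrically.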
+ */
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+
+using namespace llvm;
+namespace gbe {
+  class GenLoadStoreOptimization : public BasicBlockPass {
+
+  public:
+    static char ID;
+    ScalarEvolution *SE;
+    DataLayout *TD;
+    GenLoadStoreOptimization() : BasicBlockPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.setPreservesCFG();
+    }
+
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      SE = &getAnalysis<ScalarEvolution>();
+      TD = getAnalysisIfAvailable<DataLayout>();
+      return optimizeLoadStore(BB);
+    }
+
+    Type *getValueType(Value *insn);
+    Value *getPointerOperand(Value *I);
+    unsigned getAddressSpace(Value *I);
+    bool isSimpleLoadStore(Value *I);
+    bool optimizeLoadStore(BasicBlock &BB);
+
+    bool isLoadStoreCompatible(Value *A, Value *B);
+    void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
+                                               SmallVector<Instruction*, 4> &merged,
+                                               BasicBlock::iterator &start,
+                                               unsigned maxLimit,
+                                               bool isLoad);
+
+    virtual const char *getPassName() const {
+      return "Merge compatible loads/stores for Gen";
+    }
+  };
+
+  char GenLoadStoreOptimization::ID = 0;
+
+  Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+    if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+    return NULL;
+  }
+  unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
+    if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+    if (StoreInst *S = dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+    return -1;
+  }
+  bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
+    if (LoadInst *L = dyn_cast<LoadInst>(I)) return L->isSimple();
+    if (StoreInst *S = dyn_cast<StoreInst>(I)) return S->isSimple();
+    return false;
+  }
+  Type *GenLoadStoreOptimization::getValueType(Value *insn) {
+    if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
+    if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
+
+    return NULL;
+  }
+
+  bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
+    Value *ptrA = getPointerOperand(A);
+    Value *ptrB = getPointerOperand(B);
+    unsigned ASA = getAddressSpace(A);
+    unsigned ASB = getAddressSpace(B);
+
+    // Check that the address spaces match and that the pointers are valid.
+    if (!ptrA || !ptrB || (ASA != ASB)) return false;
+
+    if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
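+    // Below: the pointers must have identical type, and ScalarEvolution
+    // must prove that they are a compile-time constant distance apart
+    // equal to one element size, i.e. B accesses the element right after
+    // A (e.g. p[i] and p[i+1]).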
+    // Check that A and B are of the same type.
+    if (ptrA->getType() != ptrB->getType()) return false;
+
+    // Calculate the distance.
+    const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
+    const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
+    const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
+    const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
+
+    // Non-constant distance.
+    if (!constOffSCEV) return false;
+
+    int64_t offset = constOffSCEV->getValue()->getSExtValue();
+    Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
+    // The instructions are consecutive if the size of the first load/store
+    // is the same as the offset.
+    int64_t sz = TD->getTypeStoreSize(Ty);
+    return ((-offset) == sz);
+  }
+
+  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(merged[i]);
+    }
+    LoadInst *ld = cast<LoadInst>(merged[0]);
+    unsigned align = ld->getAlignment();
+    unsigned addrSpace = ld->getPointerAddressSpace();
+    // Insert before the first load.
+    Builder.SetInsertPoint(ld);
+    VectorType *vecTy = VectorType::get(ld->getType(), size);
+    Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
+                                          PointerType::get(vecTy, addrSpace));
+    LoadInst *vecValue = Builder.CreateLoad(vecPtr);
+    vecValue->setAlignment(align);
+
+    for (unsigned i = 0; i < size; ++i) {
+      Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
+      values[i]->replaceAllUsesWith(S);
+    }
+  }
+
+  BasicBlock::iterator
+  GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
+                                                  SmallVector<Instruction*, 4> &merged,
+                                                  BasicBlock::iterator &start,
+                                                  unsigned maxLimit,
+                                                  bool isLoad) {
+
+    BasicBlock::iterator stepForward = start;
+    if(!isSimpleLoadStore(start)) return stepForward;
+
+    merged.push_back(start);
+
+    BasicBlock::iterator E = BB.end();
+    BasicBlock::iterator J = ++start;
+
+    for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
+      if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
+        if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
+          merged.push_back(J);
+          stepForward = ++J;
+        }
+      } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+        // Simply stop here to keep the read/write order.
+        break;
+      }
+
+      if(merged.size() >= 4) break;
+    }
+    return stepForward;
+  }
+
+  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
+    }
+    StoreInst *st = cast<StoreInst>(merged[0]);
+    unsigned addrSpace = st->getPointerAddressSpace();
+
+    unsigned align = st->getAlignment();
+    // Insert before the last store.
+    Builder.SetInsertPoint(merged[size-1]);
+
+    Type *dataTy = st->getValueOperand()->getType();
+    VectorType *vecTy = VectorType::get(dataTy, size);
+    Value *parent = UndefValue::get(vecTy);
+    for(unsigned i = 0; i < size; i++) {
+      parent = Builder.CreateInsertElement(parent, values[i],
+                 ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
+    }
+
+    Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+    StoreInst *newST = Builder.CreateStore(parent, newPtr);
+    newST->setAlignment(align);
+  }
+
+  bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
+    bool changed = false;
+    SmallVector<Instruction*, 4> merged;
+    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ++BBI) {
+      if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
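+        // Only scalar DWORD (float or i32) accesses are merge candidates:
+        // collect up to 4 consecutive compatible accesses starting at BBI
+        // and rewrite them as a single vector load/store.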
+        bool isLoad = isa<LoadInst>(*BBI);
+        Type *ty = getValueType(BBI);
+        if(ty->isVectorTy()) continue;
+        // We only support DWORD data type merging.
+        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
+        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
+        if(merged.size() > 1) {
+          if(isLoad)
+            mergeLoad(BB, merged);
+          else
+            mergeStore(BB, merged);
+          // Remove the merged instructions.
+          int size = merged.size();
+          for(int i = 0; i < size; i++)
+            merged[i]->eraseFromParent();
+          changed = true;
+        }
+        merged.clear();
+      }
+    }
+    return changed;
+  }
+
+  BasicBlockPass *createLoadStoreOptimizationPass() {
+    return new GenLoadStoreOptimization();
+  }
+};
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 37a5b2b..9282b3f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -187,7 +187,7 @@ namespace gbe
   runModulePass(mod, libraryInfo, optLevel);
 
   llvm::PassManager passes;
-
+  passes.add(new DataLayout(&mod));
   // Print the code before further optimizations
   if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
@@ -198,6 +198,7 @@ namespace gbe
   passes.add(createIntrinsicLoweringPass());
   passes.add(createFunctionInliningPass(200000));
   passes.add(createScalarReplAggregatesPass()); // Break up allocas
+  passes.add(createLoadStoreOptimizationPass());
   passes.add(createRemoveGEPPass(unit));
   passes.add(createConstantPropagationPass());
   passes.add(createLowerSwitchPass());
-- 
2.7.4