From: Yang Rong
Date: Fri, 17 May 2013 07:11:51 +0000 (+0800)
Subject: Add a scalarize llvm pass.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3849534353cc7d84125b0b83937314bab52f561c;p=contrib%2Fbeignet.git

Add a scalarize llvm pass.

The previous implementation expanded vector ops in GenWriter, where they
are hard to optimize. This patch adds a new LLVM pass that scalarizes the
IR instead: it expands all normal vector ops into scalar ops, except for
load/store, image read/write and function arguments. Fake
ExtractElement/InsertElement instructions are inserted so that
dead-instruction elimination does not remove the scalarized values, and
the unit's valueMap holds the relationship between these fake instructions
and the real load/store instructions. (Illustrative sketches of the
transformation are appended after the patch.)

Signed-off-by: Yang Rong
Reviewed-by: Zhigang Gong
---
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 1829964..183517a 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -83,6 +83,7 @@ else (GBE_USE_BLOB)
     backend/program.h
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
+    llvm/llvm_scalarize.cpp
     llvm/llvm_to_gen.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index ae78638..3b293f5 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -24,9 +24,12 @@
 #ifndef __GBE_IR_UNIT_HPP__
 #define __GBE_IR_UNIT_HPP__
 
+#include "llvm/Value.h"
+
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
 #include "sys/hash_map.hpp"
+#include "sys/map.hpp"
 
 namespace gbe {
 namespace ir {
@@ -41,6 +44,7 @@ namespace ir {
   {
   public:
     typedef hash_map<std::string, Function*> FunctionSet;
+    typedef std::pair<llvm::Value*, uint32_t> ValueIndex;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -71,11 +75,27 @@ namespace ir {
     ConstantSet& getConstantSet(void) { return constantSet; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
+
+    /*! Some values will not be allocated. For example, the vector extracts
+     *  and insertions created when scalarizing vector loads/stores
+     */
+    void newValueProxy(llvm::Value *real,
+                       llvm::Value *fake,
+                       uint32_t realIndex = 0u,
+                       uint32_t fakeIndex = 0u) {
+      const ValueIndex key(fake, fakeIndex);
+      const ValueIndex value(real, realIndex);
+      GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+      valueMap[key] = value;
+    }
+    /*! Return the value map */
+    const map<ValueIndex, ValueIndex>& getValueMap(void) const { return valueMap; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
     hash_map<std::string, Function*> functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
     PointerSize pointerSize; //!< Size shared by all pointers
+    map<ValueIndex, ValueIndex> valueMap; //!< fake to real value map for vector load/store
     GBE_CLASS(Unit);
   };
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 8dcf15c..3855011 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -60,7 +60,7 @@
  * dependencies on endianness or ABIs. Fortunately, the ptx (and nvptx for LLVM
  * 3.2) profile is pretty well adapted to our needs since NV and Gen GPU are
  * kind of similar, or at least they are similar enough to share the same front
- * end. 
+ * end.
* * Problems * ======== @@ -126,10 +126,8 @@ #include "ir/context.hpp" #include "ir/unit.hpp" #include "ir/liveness.hpp" -#include "sys/map.hpp" #include "sys/set.hpp" #include "sys/cvar.hpp" -#include /* Not defined for LLVM 3.0 */ #if !defined(LLVM_VERSION_MAJOR) @@ -207,7 +205,7 @@ namespace gbe /*! Type to register family translation */ static ir::RegisterFamily getFamily(const ir::Context &ctx, const Type *type) { - GBE_ASSERT(isScalarType(type) == true); + GBE_ASSERT(isScalarType(type) == true); if (type == Type::getInt1Ty(type->getContext())) return ir::FAMILY_BOOL; if (type == Type::getInt8Ty(type->getContext())) @@ -269,6 +267,8 @@ namespace gbe class RegisterTranslator { public: + /*! Indices will be zero for scalar values */ + typedef std::pair ValueIndex; RegisterTranslator(ir::Context &ctx) : ctx(ctx) {} /*! Empty the maps */ @@ -289,6 +289,11 @@ namespace gbe GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice valueMap[key] = value; } + /*! After scalarize pass, there are some valueMap in unit, + * use this function to copy from unit valueMap */ + void initValueMap(const map& vMap) { + valueMap.insert(vMap.begin(), vMap.end()); + } /*! Mostly used for the preallocated registers (lids, gids) */ void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) { const ValueIndex key(value, index); @@ -325,10 +330,9 @@ namespace gbe }; return ir::Register(); } - /*! Get the register from the given value at given index possibly iterating - * in the value map to get the final real register - */ - ir::Register getScalar(Value *value, uint32_t index = 0u) { + + /*! iterating in the value map to get the final real register */ + void getRealValue(Value* &value, uint32_t& index) { auto end = valueMap.end(); for (;;) { auto it = valueMap.find(std::make_pair(value, index)); @@ -339,6 +343,14 @@ namespace gbe index = it->second.second; } } + } + + /*! Get the register from the given value at given index possibly iterating + * in the value map to get the final real register + */ + ir::Register getScalar(Value *value, uint32_t index = 0u) { + getRealValue(value, index); + const auto key = std::make_pair(value, index); GBE_ASSERT(scalarMap.find(key) != scalarMap.end()); return scalarMap[key]; @@ -351,16 +363,8 @@ namespace gbe } /*! Says if the value exists. Otherwise, it is undefined */ bool valueExists(Value *value, uint32_t index) { - auto end = valueMap.end(); - for (;;) { - auto it = valueMap.find(std::make_pair(value, index)); - if (it == end) - break; - else { - value = it->second.first; - index = it->second.second; - } - } + getRealValue(value, index); + const auto key = std::make_pair(value, index); return scalarMap.find(key) != scalarMap.end(); } @@ -375,8 +379,6 @@ namespace gbe this->insertRegister(reg, key, index); return reg; } - /*! Indices will be zero for scalar values */ - typedef std::pair ValueIndex; /*! Map value to ir::Register */ map scalarMap; /*! Map values to values when this is only a translation (eq bitcast) */ @@ -384,28 +386,6 @@ namespace gbe /*! Actually allocates the registers */ ir::Context &ctx; }; - /*! All intrinsic Gen functions */ - enum OCLInstrinsic { -#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID, -#include "llvm_gen_ocl_function.hxx" -#undef DECL_LLVM_GEN_FUNCTION - }; - - /*! Build the hash map for OCL functions on Gen */ - struct OCLIntrinsicMap { - /*! 
Build the intrinsic hash map */ - OCLIntrinsicMap(void) { -#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \ - map.insert(std::make_pair(#NAME, GEN_OCL_##ID)); -#include "llvm_gen_ocl_function.hxx" -#undef DECL_LLVM_GEN_FUNCTION - } - /*! Sort intrinsics with their names */ - hash_map map; - }; - - /*! Sort the OCL Gen instrinsic functions (built on pre-main) */ - static const OCLIntrinsicMap instrinsicMap; /*! Translate LLVM IR code to Gen IR code */ class GenWriter : public FunctionPass, public InstVisitor @@ -423,7 +403,7 @@ namespace gbe */ set conditionSet; /*! We visit each function twice. Once to allocate the registers and once to - * emit the Gen IR instructions + * emit the Gen IR instructions */ enum Pass { PASS_EMIT_REGISTERS = 0, @@ -663,7 +643,7 @@ namespace gbe if (dyn_cast(CPV)) { return doIt(uint32_t(0)); // XXX Handle type } else { - if (dyn_cast(CPV)) + if (dyn_cast(CPV)) CPV = extractConstantElem(CPV, index); GBE_ASSERTM(dyn_cast(CPV) == NULL, "Unsupported constant expression"); @@ -756,6 +736,9 @@ namespace gbe } ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) { + //the real value may be constant, so get real value before constant check + regTranslator.getRealValue(value, elemID); + if (dyn_cast(value)) { ConstantExpr *ce = dyn_cast(value); if(ce->isCast()) { @@ -867,6 +850,7 @@ namespace gbe "Returned value for kernel functions is forbidden"); // Loop over the arguments and output registers for them if (!F.arg_empty()) { + uint32_t argID = 0; Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); // Insert a new register for each function argument @@ -875,10 +859,33 @@ namespace gbe uint32_t argID = 1; // Start at one actually for (; I != E; ++I, ++argID) { #else - for (; I != E; ++I) { + for (; I != E; ++I, ++argID) { #endif /* LLVM_VERSION_MINOR <= 1 */ const std::string &argName = I->getName().str(); Type *type = I->getType(); + + //add support for vector argument + if(type->isVectorTy()) { + VectorType *vectorType = cast(type); + + this->newRegister(I); + ir::Register reg = getRegister(I, 0); + + Type *elemType = vectorType->getElementType(); + const uint32_t elemSize = getTypeByteSize(unit, elemType); + const uint32_t elemNum = vectorType->getNumElements(); + //vector's elemType always scalar type + ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize); + + ir::Function& fn = ctx.getFunction(); + for(uint32_t i=1; i < elemNum; i++) { + ir::PushLocation argLocation(fn, argID, elemSize*i); + reg = getRegister(I, i); + ctx.appendPushedConstant(reg, argLocation); //add to push map for reg alloc + } + continue; + } + GBE_ASSERTM(isScalarType(type) == true, "vector type in the function argument is not supported yet"); const ir::Register reg = regTranslator.newScalar(I); @@ -916,7 +923,6 @@ namespace gbe ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize); ctx.getFunction().getImageSet()->append(reg, &ctx); break; - break; default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE); } } @@ -1141,6 +1147,7 @@ namespace gbe ctx.startFunction(F.getName()); this->regTranslator.clear(); + this->regTranslator.initValueMap(unit.getValueMap()); this->labelMap.clear(); this->emitFunctionPrototype(F); @@ -1495,141 +1502,15 @@ namespace gbe ir::Context &ctx; }; - void GenWriter::regAllocateInsertElement(InsertElementInst &I) { - Value *modified = I.getOperand(0); - Value *toInsert = I.getOperand(1); - Value *index = I.getOperand(2); - - // Get the index for the insertion - Constant *CPV = dyn_cast(index); - GBE_ASSERTM(CPV != NULL, "only constant 
indices when inserting values"); - auto x = processConstant(CPV, InsertExtractFunctor(ctx)); - GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, - "Invalid index type for InsertElement"); - - // Crash on overrun - VectorType *vectorType = cast(modified->getType()); - const uint32_t elemNum = vectorType->getNumElements(); - const uint32_t modifiedID = x.data.u32; - GBE_ASSERTM(modifiedID < elemNum, "Out-of-bound index for InsertElement"); - - // The source vector is not constant - if (!isa(modified) || isa(modified)) { - // Non modified values are just proxies - for (uint32_t elemID = 0; elemID < elemNum; ++elemID) - if (elemID != modifiedID) - regTranslator.newValueProxy(modified, &I, elemID, elemID); - } - // The source vector is constant - else { - // Non modified values will use LOADI - for (uint32_t elemID = 0; elemID < elemNum; ++elemID) - if (elemID != modifiedID) { - const ir::Type type = getType(ctx, toInsert->getType()); - const ir::Register reg = ctx.reg(getFamily(type)); - regTranslator.insertRegister(reg, &I, elemID); - } - } - - // If the element to insert is an immediate we will generate a LOADI. - // Otherwise, the value is just a proxy of the inserted value - if (dyn_cast(toInsert) != NULL) { - const ir::Type type = getType(ctx, toInsert->getType()); - const ir::Register reg = ctx.reg(getFamily(type)); - regTranslator.insertRegister(reg, &I, modifiedID); - } else - regTranslator.newValueProxy(toInsert, &I, 0, modifiedID); - } - - void GenWriter::emitInsertElement(InsertElementInst &I) { - // Note that we check everything in regAllocateInsertElement - Value *modified = I.getOperand(0); - Value *toInsert = I.getOperand(1); - Value *index = I.getOperand(2); - - // Get the index of the value to insert - Constant *indexCPV = dyn_cast(index); - auto x = processConstant(indexCPV, InsertExtractFunctor(ctx)); - const uint32_t modifiedID = x.data.u32; - - // The source vector is constant. 
We need to insert LOADI for the unmodified - // values - if (isa(modified) && !isa(modified)) { - VectorType *vectorType = cast(modified->getType()); - const uint32_t elemNum = vectorType->getNumElements(); - for (uint32_t elemID = 0; elemID < elemNum; ++elemID) - if (elemID != modifiedID) { - Constant *sourceCPV = dyn_cast(modified); - if (isa(extractConstantElem(sourceCPV, elemID)) == false) { - const ir::ImmediateIndex immIndex = this->newImmediate(sourceCPV, elemID); - const ir::Immediate imm = ctx.getImmediate(immIndex); - const ir::Register reg = regTranslator.getScalar(&I, elemID); - ctx.LOADI(imm.type, reg, immIndex); - } - } - } - - // If the inserted value is not a constant, we just use a proxy - if (dyn_cast(toInsert) == NULL) - return; - - // We need a LOADI if we insert an immediate - Constant *toInsertCPV = dyn_cast(toInsert); - const ir::ImmediateIndex immIndex = this->newImmediate(toInsertCPV); - const ir::Immediate imm = ctx.getImmediate(immIndex); - const ir::Register reg = regTranslator.getScalar(&I, modifiedID); - ctx.LOADI(imm.type, reg, immIndex); - } - - void GenWriter::regAllocateExtractElement(ExtractElementInst &I) { - Value *extracted = I.getOperand(0); - Value *index = I.getOperand(1); - GBE_ASSERTM(isa(extracted) == false, - "TODO support constant vector for extract"); - Constant *CPV = dyn_cast(index); - GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values"); - auto x = processConstant(CPV, InsertExtractFunctor(ctx)); - GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, - "Invalid index type for InsertElement"); - - // Crash on overrun - const uint32_t extractedID = x.data.u32; -#if GBE_DEBUG - VectorType *vectorType = cast(extracted->getType()); - const uint32_t elemNum = vectorType->getNumElements(); - GBE_ASSERTM(extractedID < elemNum, "Out-of-bound index for InsertElement"); -#endif /* GBE_DEBUG */ - - // Easy when the vector is not immediate - regTranslator.newValueProxy(extracted, &I, extractedID, 0); - } - - void GenWriter::emitExtractElement(ExtractElementInst &I) { - // TODO -> insert LOADI when the extracted vector is constant - } + /*! 
Because there are still fake insert/extract instructions for
+   * load/store, keep these functions empty here */
+  void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
+  void GenWriter::emitInsertElement(InsertElementInst &I) {}
 
-  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {
-    Value *first = I.getOperand(0);
-    Value *second = I.getOperand(1);
-    GBE_ASSERTM(!isa(first) || isa(first),
-                "TODO support constant vector for shuffle");
-    GBE_ASSERTM(!isa(second) || isa(second),
-                "TODO support constant vector for shuffle");
-    VectorType *dstType = cast(I.getType());
-    VectorType *srcType = cast(first->getType());
-    const uint32_t dstElemNum = dstType->getNumElements();
-    const uint32_t srcElemNum = srcType->getNumElements();
-    for (uint32_t elemID = 0; elemID < dstElemNum; ++elemID) {
-      uint32_t srcID = I.getMaskValue(elemID);
-      Value *src = first;
-      if (srcID >= srcElemNum) {
-        srcID -= srcElemNum;
-        src = second;
-      }
-      regTranslator.newValueProxy(src, &I, srcID, elemID);
-    }
-  }
+  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {}
+  void GenWriter::emitExtractElement(ExtractElementInst &I) {}
 
+  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
   void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
 
   void GenWriter::regAllocateSelectInst(SelectInst &I) {
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index c270924..2ad879e 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -28,6 +28,9 @@
 #include "llvm/Pass.h"
 #include "sys/platform.hpp"
+#include "sys/map.hpp"
+#include "sys/hash_map.hpp"
+#include <string>
 
 // LLVM Type
 namespace llvm { class Type; }
@@ -37,6 +40,29 @@ namespace gbe
   // Final target of the Gen backend
   namespace ir { class Unit; }
 
+  /*! All intrinsic Gen functions */
+  enum OCLInstrinsic {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+  };
+
+  /*! Build the hash map for OCL functions on Gen */
+  struct OCLIntrinsicMap {
+    /*! Build the intrinsic hash map */
+    OCLIntrinsicMap(void) {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
+      map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+    }
+    /*! Sort intrinsics with their names */
+    hash_map<std::string, OCLInstrinsic> map;
+  };
+
+  /*! Sort the OCL Gen intrinsic functions (built on pre-main) */
+  static const OCLIntrinsicMap instrinsicMap;
+
   /*! Pad the offset */
   uint32_t getPadding(uint32_t offset, uint32_t align);
 
@@ -55,6 +81,8 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  llvm::FunctionPass* createScalarizePass(ir::Unit &unit);
+
 } /* namespace gbe */
 
 #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
new file mode 100644
index 0000000..4de1fce
--- /dev/null
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -0,0 +1,834 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + */ + +/** + * \file llvm_scalarize.cpp + * \author Yang Rong + * + * This file is derived from: + * https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=605 + */ + +//===- Scalarize.cpp - Scalarize LunarGLASS IR ----------------------------===// +// +// LunarGLASS: An Open Modular Shader Compiler Architecture +// Copyright (C) 2010-2011 LunarG, Inc. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; version 2 of the +// License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +// 02110-1301, USA. +// +//===----------------------------------------------------------------------===// +// +// Author: Michael Ilseman, LunarG +// +//===----------------------------------------------------------------------===// +// +// Scalarize the IR. +// * Loads of uniforms become multiple loadComponent calls +// +// * Reads/writes become read/writeComponent calls +// +// * Component-wise operations become multiple ops over each component +// +// * Texture call become recomponsed texture calls +// +// * Vector ops disappear, with their users referring to the scalarized +// * components +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Function.h" +#include "llvm/InstrTypes.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/IRBuilder.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/raw_ostream.h" + +#include "llvm/llvm_gen_backend.hpp" +#include "ir/unit.hpp" +#include "sys/map.hpp" + + +using namespace llvm; + +namespace gbe { + + struct VectorValues { + VectorValues() : vals() + { } + + void setComponent(int c, llvm::Value* val) + { + assert(c >= 0 && c < 16 && "Out of bounds component"); + vals[c] = val; + } + llvm::Value* getComponent(int c) + { + assert(c >= 0 && c < 16 && "Out of bounds component"); + assert(vals[c] && "Requesting non-existing component"); + return vals[c]; + } + + // {Value* x, Value* y, Value* z, Value* w} + llvm::Value* vals[16]; + }; + + class Scalarize : public FunctionPass { + + public: + // Standard pass stuff + static char ID; + + Scalarize(ir::Unit& unit) : FunctionPass(ID), unit(unit) + { + initializeLoopInfoPass(*PassRegistry::getPassRegistry()); + initializeDominatorTreePass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function&); + void print(raw_ostream&, const Module* = 0) const; + virtual void getAnalysisUsage(AnalysisUsage&) const; + + 
protected: + // An instruction is valid post-scalarization iff it is fully scalar or it + // is a gla_loadn + bool isValid(const Instruction*); + + // Take an instruction that produces a vector, and scalarize it + bool scalarize(Instruction*); + bool scalarizePerComponent(Instruction*); + bool scalarizeFuncCall(CallInst *); + bool scalarizeLoad(LoadInst*); + bool scalarizeStore(StoreInst*); + //bool scalarizeIntrinsic(IntrinsicInst*); + bool scalarizeExtract(ExtractElementInst*); + bool scalarizeInsert(InsertElementInst*); + bool scalarizeShuffleVector(ShuffleVectorInst*); + bool scalarizePHI(PHINode*); + void scalarizeArgs(Function& F); + // ... + + // Helpers to make the actual multiple scalar calls, one per + // component. Updates the given VectorValues's components with the new + // Values. + void makeScalarizedCalls(Function*, ArrayRef, int numComponents, VectorValues&); + + void makePerComponentScalarizedCalls(Instruction*, ArrayRef); + + // Makes a scalar form of the given instruction: replaces the operands + // and chooses a correct return type + Instruction* createScalarInstruction(Instruction* inst, ArrayRef); + + // Gather the specified components in the given values. Returns the + // component if the given value is a vector, or the scalar itself. + void gatherComponents(int component, ArrayRef args, SmallVectorImpl& componentArgs); + + // Get the assigned component for that value. If the value is a scalar, + // returns the scalar. If it's a constant, returns that component. If + // it's an instruction, returns the vectorValues of that instruction for + // that component + Value* getComponent(int component, Value*); + + // Used for assertion purposes. Whether we can get the component out with + // a getComponent call + bool canGetComponent(Value*); + + // Used for assertion purposes. Whether for every operand we can get + // components with a getComponent call + bool canGetComponentArgs(User*); + + // Delete the instruction in the deadList + void dce(); + + + int GetConstantInt(const Value* value); + bool IsPerComponentOp(const Instruction* inst); + bool IsPerComponentOp(const Value* value); + + //these function used to add extract and insert instructions when load/store etc. + void extractFromeVector(Value* insn); + Value* InsertToVector(Value* insn, Value* vecValue); + + Type* GetBasicType(Value* value) { + return GetBasicType(value->getType()); + } + + Type* GetBasicType(Type* type) { + switch(type->getTypeID()) { + case Type::VectorTyID: + case Type::ArrayTyID: + return GetBasicType(type->getContainedType(0)); + default: + break; + } + return type; + } + + int GetComponentCount(const Type* type) { + if (type->getTypeID() == Type::VectorTyID) + return llvm::dyn_cast(type)->getNumElements(); + else + return 1; + } + + int GetComponentCount(const Value* value) { + return GetComponentCount(value->getType()); + } + + DenseMap vectorVals; + Module* module; + IRBuilder<>* builder; + + Type* intTy; + Type* floatTy; + ir::Unit &unit; + + std::vector deadList; + + // List of vector phis that were not completely scalarized because some + // of their operands hadn't before been visited (i.e. 
loop variant + // variables) + SmallVector incompletePhis; + }; + + Value* Scalarize::getComponent(int component, Value* v) + { + assert(canGetComponent(v) && "getComponent called on unhandled vector"); + + if (v->getType()->isVectorTy()) { + if (ConstantDataVector* c = dyn_cast(v)) { + return c->getElementAsConstant(component); + } else if (ConstantVector* c = dyn_cast(v)) { + return c->getOperand(component); + } else if (isa(v)) { + return Constant::getNullValue(GetBasicType(v)); + } else if (isa(v)) { + return UndefValue::get(GetBasicType(v)); + } else { + return vectorVals[v].getComponent(component); + } + } else { + return v; + } + } + + bool IsPerComponentOp(const llvm::Value* value) + { + const llvm::Instruction* inst = llvm::dyn_cast(value); + return inst && IsPerComponentOp(inst); + } + + bool Scalarize::IsPerComponentOp(const Instruction* inst) + { + //if (const IntrinsicInst* intr = dyn_cast(inst)) + // return IsPerComponentOp(intr); + + if (inst->isTerminator()) + return false; + + switch (inst->getOpcode()) { + + // Cast ops are only per-component if they cast back to the same vector + // width + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + return GetComponentCount(inst->getOperand(0)) == GetComponentCount(inst); + + // Vector ops + case Instruction::InsertElement: + case Instruction::ExtractElement: + case Instruction::ShuffleVector: + + // Ways of accessing/loading/storing vectors + case Instruction::ExtractValue: + case Instruction::InsertValue: + + // Memory ops + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Store: + case Instruction::GetElementPtr: + // Phis are a little special. We consider them not to be per-component + // because the mechanism of choice is a single value (what path we took to + // get here), and doesn't choose per-component (as select would). The caller + // should know to handle phis specially + case Instruction::PHI: + // Call insts, conservatively are no per-component + case Instruction::Call: + // Misc + case Instruction::LandingPad: //--- 3.0 + case Instruction::VAArg: + return false; + } // end of switch (inst->getOpcode()) + + return true; + } + int Scalarize::GetConstantInt(const Value* value) + { + const ConstantInt *constantInt = dyn_cast(value); + + // this might still be a constant expression, rather than a numeric constant, + // e.g., expression with undef's in it, so it was not folded + if (! 
constantInt) + NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("non-simple constant"); + + return constantInt->getValue().getSExtValue(); + } + bool Scalarize::canGetComponent(Value* v) + { + if (v->getType()->isVectorTy()) { + if (isa(v) || isa(v) || isa(v) || isa(v)) { + return true; + } else { + assert((isa(v) || isa(v)) && "Non-constant non-instuction?"); + return vectorVals.count(v); + } + } else { + return true; + } + } + + bool Scalarize::canGetComponentArgs(User* u) + { + if (PHINode* phi = dyn_cast(u)) { + for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i) + if (!canGetComponent(phi->getIncomingValue(i))) + return false; + } else { + for (User::op_iterator i = u->op_begin(), e = u->op_end(); i != e; ++i) + if (!canGetComponent(*i)) + return false; + } + return true; + } + + void Scalarize::gatherComponents(int component, ArrayRef args, SmallVectorImpl& componentArgs) + { + componentArgs.clear(); + for (ArrayRef::iterator i = args.begin(), e = args.end(); i != e; ++i) + componentArgs.push_back(getComponent(component, *i)); + } + + Instruction* Scalarize::createScalarInstruction(Instruction* inst, ArrayRef args) + { + // TODO: Refine the below into one large switch + + unsigned op = inst->getOpcode(); + if (inst->isCast()) { + assert(args.size() == 1 && "incorrect number of arguments for cast op"); + return CastInst::Create((Instruction::CastOps)op, args[0], GetBasicType(inst)); + } + + if (inst->isBinaryOp()) { + assert(args.size() == 2 && "incorrect number of arguments for binary op"); + return BinaryOperator::Create((Instruction::BinaryOps)op, args[0], args[1]); + } + + if (PHINode* phi = dyn_cast(inst)) { + PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues()); + assert(args.size() % 2 == 0 && "Odd number of arguments for a PHI"); + + // Loop over pairs of operands: [Value*, BasicBlock*] + for (unsigned int i = 0; i < args.size(); i++) { + BasicBlock* bb = phi->getIncomingBlock(i); //dyn_cast(args[i+1]); + //assert(bb && "Non-basic block incoming block?"); + res->addIncoming(args[i], bb); + } + + return res; + } + + if (CmpInst* cmpInst = dyn_cast(inst)) { + assert(args.size() == 2 && "incorrect number of arguments for comparison"); + return CmpInst::Create(cmpInst->getOpcode(), cmpInst->getPredicate(), args[0], args[1]); + } + + if (isa(inst)) { + assert(args.size() == 3 && "incorrect number of arguments for select"); + return SelectInst::Create(args[0], args[1], args[2]); + } + + if (IntrinsicInst* intr = dyn_cast(inst)) { + if (! IsPerComponentOp(inst)) + NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarize instruction on a non-per-component intrinsic"); + + // TODO: Assumption is that all per-component intrinsics have all their + // arguments be overloadable. Need to find some way to assert on this + // assumption. This is due to how getDeclaration operates; it only takes + // a list of types that fit overloadable slots. 
+ SmallVector tys(1, GetBasicType(inst->getType())); + // Call instructions have the decl as a last argument, so skip it + for (ArrayRef::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) { + tys.push_back(GetBasicType((*i)->getType())); + } + + Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys); + return CallInst::Create(f, args); + } + + NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(), + // inst->getOpcodeName()); + return 0; + + } + + + void Scalarize::makeScalarizedCalls(Function* f, ArrayRef args, int count, VectorValues& vVals) + { + assert(count > 0 && count <= 16 && "invalid number of vector components"); + for (int i = 0; i < count; ++i) { + Value* res; + SmallVector callArgs(args.begin(), args.end()); + callArgs.push_back(ConstantInt::get(intTy, i)); + + res = builder->CreateCall(f, callArgs); + vVals.setComponent(i, res); + } + } + + void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef args) + { + int count = GetComponentCount(inst); + assert(count > 0 && count <= 16 && "invalid number of vector components"); + assert((inst->getNumOperands() == args.size() || isa(inst)) + && "not enough arguments passed for instruction"); + + VectorValues& vVals = vectorVals[inst]; + + for (int i = 0; i < count; ++i) { + // Set this component of each arg + SmallVector callArgs(args.size(), 0); + gatherComponents(i, args, callArgs); + + Instruction* res = createScalarInstruction(inst, callArgs); + + vVals.setComponent(i, res); + builder->Insert(res); + } + } + + bool Scalarize::isValid(const Instruction* inst) + { + // The result + if (inst->getType()->isVectorTy()) + return false; + + // The arguments + for (Instruction::const_op_iterator i = inst->op_begin(), e = inst->op_end(); i != e; ++i) { + const Value* v = (*i); + assert(v); + if (v->getType()->isVectorTy()) + return false; + } + + return true; + } + + bool Scalarize::scalarize(Instruction* inst) + { + if (isValid(inst)) + return false; + + assert(! 
vectorVals.count(inst) && "We've already scalarized this somehow?"); + assert((canGetComponentArgs(inst) || isa(inst)) && + "Scalarizing an op whose arguments haven't been scalarized "); + builder->SetInsertPoint(inst); + + if (IsPerComponentOp(inst)) + return scalarizePerComponent(inst); + + if (LoadInst* ld = dyn_cast(inst)) + return scalarizeLoad(ld); + + if (CallInst* call = dyn_cast(inst)) + return scalarizeFuncCall(call); + + if (ExtractElementInst* extr = dyn_cast(inst)) + return scalarizeExtract(extr); + + if (InsertElementInst* ins = dyn_cast(inst)) + return scalarizeInsert(ins); + + if (ShuffleVectorInst* sv = dyn_cast(inst)) + return scalarizeShuffleVector(sv); + + if (PHINode* phi = dyn_cast(inst)) + return scalarizePHI(phi); + + if (isa(inst) || isa(inst)) + // TODO: need to come up with a struct/array model for scalarization + NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarizing struct/array ops"); + + if (StoreInst* st = dyn_cast(inst)) + return scalarizeStore(st); + + NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unhandled instruction ", inst->getOpcode(), inst->getOpcodeName()); + return false; + } + + bool Scalarize::scalarizeShuffleVector(ShuffleVectorInst* sv) + { + // %res = shuffleVector %foo, bar, <...> + // ==> nothing (just make a new VectorValues with the new components) + VectorValues& vVals = vectorVals[sv]; + + int size = GetComponentCount(sv); + int srcSize = GetComponentCount(sv->getOperand(0)->getType()); + + for (int i = 0; i < size; ++i) { + int select = sv->getMaskValue(i); + + if (select < 0) { + vVals.setComponent(i, UndefValue::get(GetBasicType(sv->getOperand(0)))); + continue; + } + + // Otherwise look up the corresponding component from the correct + // source. + Value* selectee; + if (select < srcSize) { + selectee = sv->getOperand(0); + } else { + // Choose from the second operand + select -= srcSize; + selectee = sv->getOperand(1); + } + + vVals.setComponent(i, getComponent(select, selectee)); + } + + return true; + } + + bool Scalarize::scalarizePerComponent(Instruction* inst) + { + // dst = op %foo, %bar + // ==> dstx = op ty %foox, ty %barx + // dsty = op ty %fooy, ty %bary + // ... + + SmallVector args(inst->op_begin(), inst->op_end()); + + makePerComponentScalarizedCalls(inst, args); + + return true; + } + + bool Scalarize::scalarizePHI(PHINode* phi) + { + // dst = phi [ %foo, %bb1 ], [ %bar, %bb2], ... + // ==> dstx = phi ty [ %foox, %bb1 ], [ %barx, %bb2], ... + // dsty = phi ty [ %fooy, %bb1 ], [ %bary, %bb2], ... + + // If the scalar values are all known up-front, then just make the full + // phinode now. 
If they are not yet known (phinode for a loop variant + // variable), then deferr the arguments until later + + if (canGetComponentArgs(phi)) { + SmallVector args(phi->op_begin(), phi->op_end()); + makePerComponentScalarizedCalls(phi, args); + } else { + makePerComponentScalarizedCalls(phi, ArrayRef()); + incompletePhis.push_back(phi); + } + + return true; + } + + void Scalarize::extractFromeVector(Value* insn) { + VectorValues& vVals = vectorVals[insn]; + + for (int i = 0; i < GetComponentCount(insn); ++i) { + Value *cv = ConstantInt::get(intTy, i); + Value *EI = builder->CreateExtractElement(insn, cv); + vVals.setComponent(i, EI); + //unit.fakeInsnMap[EI] = insn; + unit.newValueProxy(insn, EI, i, 0); + } + } + + Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) { + //VectorValues& vVals = vectorVals[writeValue]; + //unit.vecValuesMap[call] = vectorVals[writeValue]; + + //add fake insert instructions to avoid removed + Value *II = NULL; + for (int i = 0; i < GetComponentCount(vecValue); ++i) { + Value *vec = II ? II : UndefValue::get(vecValue->getType()); + Value *cv = ConstantInt::get(intTy, i); + II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv); + //unit.vecValuesMap[insn].setComponent(i, getComponent(i, writeValue)); + //unit.newValueProxy(getComponent(i, vecValue), vecValue, 0, i); + //unit.fakeInsnMap[II] = insn; + } + + for (int i = 0; i < GetComponentCount(vecValue); ++i) { + unit.newValueProxy(getComponent(i, vecValue), II, 0, i); + } + return II; + } + + bool Scalarize::scalarizeFuncCall(CallInst* call) { + if (Function *F = call->getCalledFunction()) { + if (F->getIntrinsicID() != 0) { //Intrinsic functions + NOT_IMPLEMENTED; + } else { + Value *Callee = call->getCalledValue(); + const std::string fnName = Callee->getName(); + auto it = instrinsicMap.map.find(fnName); + GBE_ASSERT(it != instrinsicMap.map.end()); + + // Get the function arguments + CallSite CS(call); + CallSite::arg_iterator CI = CS.arg_begin() + 3; + + switch (it->second) { + default: break; + case GEN_OCL_READ_IMAGE0: + case GEN_OCL_READ_IMAGE1: + case GEN_OCL_READ_IMAGE2: + case GEN_OCL_READ_IMAGE3: + case GEN_OCL_READ_IMAGE4: + case GEN_OCL_READ_IMAGE5: + case GEN_OCL_READ_IMAGE10: + case GEN_OCL_READ_IMAGE11: + case GEN_OCL_READ_IMAGE12: + case GEN_OCL_READ_IMAGE13: + case GEN_OCL_READ_IMAGE14: + case GEN_OCL_READ_IMAGE15: + { + extractFromeVector(call); + break; + } + case GEN_OCL_WRITE_IMAGE10: + case GEN_OCL_WRITE_IMAGE11: + case GEN_OCL_WRITE_IMAGE12: + case GEN_OCL_WRITE_IMAGE13: + case GEN_OCL_WRITE_IMAGE14: + case GEN_OCL_WRITE_IMAGE15: + CI++; + case GEN_OCL_WRITE_IMAGE0: + case GEN_OCL_WRITE_IMAGE1: + case GEN_OCL_WRITE_IMAGE2: + case GEN_OCL_WRITE_IMAGE3: + case GEN_OCL_WRITE_IMAGE4: + case GEN_OCL_WRITE_IMAGE5: + { + *CI = InsertToVector(call, *CI); + break; + } + } + } + } + return false; + } + + bool Scalarize::scalarizeLoad(LoadInst* ld) + { + extractFromeVector(ld); + return false; + } + + bool Scalarize::scalarizeStore(StoreInst* st) { + st->setOperand(0, InsertToVector(st, st->getValueOperand())); + return false; + } + + bool Scalarize::scalarizeExtract(ExtractElementInst* extr) + { + // %res = extractelement %foo, %i + // ==> nothing (just use %foo's %ith component instead of %res) + + if (! isa(extr->getOperand(1))) { + // TODO: Variably referenced components. Probably handle/emulate through + // a series of selects. 
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components"); + } + //if (isa(extr->getOperand(0))) + // return false; + int component = GetConstantInt(extr->getOperand(1)); + Value* v = getComponent(component, extr->getOperand(0)); + if(extr == v) + return false; + extr->replaceAllUsesWith(v); + + return true; + } + + bool Scalarize::scalarizeInsert(InsertElementInst* ins) + { + // %res = insertValue %foo, %i + // ==> nothing (just make a new VectorValues with the new component) + + if (! isa(ins->getOperand(2))) { + // TODO: Variably referenced components. Probably handle/emulate through + // a series of selects. + NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components"); + } + + int component = GetConstantInt(ins->getOperand(2)); + + VectorValues& vVals = vectorVals[ins]; + for (int i = 0; i < GetComponentCount(ins); ++i) { + vVals.setComponent(i, i == component ? ins->getOperand(1) + : getComponent(i, ins->getOperand(0))); + } + + return true; + } + + void Scalarize::scalarizeArgs(Function& F) { + if (F.arg_empty()) + return; + ReversePostOrderTraversal rpot(&F); + BasicBlock::iterator instI = (*rpot.begin())->begin(); + builder->SetInsertPoint(instI); + + Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); + +#if LLVM_VERSION_MINOR <= 1 + const AttrListPtr &PAL = F.getAttributes(); + uint32_t argID = 1; // Start at one actually + for (; I != E; ++I, ++argID) { +#else + for (; I != E; ++I) { +#endif /* LLVM_VERSION_MINOR <= 1 */ + Type *type = I->getType(); + + if(type->isVectorTy()) + extractFromeVector(I); + } + return; + } + + bool Scalarize::runOnFunction(Function& F) + { + switch (F.getCallingConv()) { + case CallingConv::PTX_Device: + return false; + case CallingConv::PTX_Kernel: + break; + default: GBE_ASSERTM(false, "Unsupported calling convention"); + } + + bool changed = false; + module = F.getParent(); + intTy = IntegerType::get(module->getContext(), 32); + floatTy = Type::getFloatTy(module->getContext()); + builder = new IRBuilder<>(module->getContext()); + + scalarizeArgs(F); + + typedef ReversePostOrderTraversal RPOTType; + RPOTType rpot(&F); + for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) { + for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) { + bool scalarized = scalarize(instI); + if (scalarized) { + changed = true; + // TODO: uncomment when done + deadList.push_back(instI); + } + } + } + + // Fill in the incomplete phis + for (SmallVectorImpl::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end(); + phiI != phiE; ++phiI) { + assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized"); + + // Fill in each component of this phi + VectorValues& vVals = vectorVals[*phiI]; + for (int c = 0; c < GetComponentCount(*phiI); ++c) { + PHINode* compPhi = dyn_cast(vVals.getComponent(c)); + assert(compPhi && "Vector phi got scalarized to non-phis?"); + + // Loop over pairs of operands: [Value*, BasicBlock*] + for (unsigned int i = 0; i < (*phiI)->getNumOperands(); i++) { + BasicBlock* bb = (*phiI)->getIncomingBlock(i); + assert(bb && "Non-basic block incoming block?"); + compPhi->addIncoming(getComponent(c, (*phiI)->getOperand(i)), bb); + } + } + } + + dce(); + + delete builder; + builder = 0; + + return changed; + } + + void Scalarize::dce() + { + //two passes delete for some phinode + for (std::vector::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) { + 
      (*i)->dropAllReferences();
+      if((*i)->use_empty())
+        (*i)->eraseFromParent();
+    }
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      if((*i)->getParent())
+        (*i)->eraseFromParent();
+    }
+    deadList.clear();
+  }
+
+  void Scalarize::getAnalysisUsage(AnalysisUsage& AU) const
+  {
+  }
+
+  void Scalarize::print(raw_ostream&, const Module*) const
+  {
+    return;
+  }
+  FunctionPass* createScalarizePass(ir::Unit &unit)
+  {
+    return new Scalarize(unit);
+  }
+  char Scalarize::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index ea3d9eb..559cde0 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -69,6 +69,7 @@ namespace gbe
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
       passes.add(createPrintModulePass(&*o));
+    passes.add(createScalarizePass(unit));        // Expand all vector ops
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
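
---
To make the scalarization concrete, here is a hand-written sketch of the
transformation (an illustration, not part of the patch; the names %p and %q
and the 2-wide vectors are invented, and the listing uses the LLVM 3.1-era
IR syntax this backend targets). Loads and stores keep their vector form;
the arithmetic in between is rewritten per component; the fake
extractelement/insertelement instructions are the ones registered through
unit.newValueProxy() so that the scalar chain survives dead-instruction
elimination and GenWriter can map it back to the real vector values.

Before the pass:

    %v = load <2 x float>* %p
    %d = fadd <2 x float> %v, %v
    store <2 x float> %d, <2 x float>* %q

After the pass (conceptually):

    %v   = load <2 x float>* %p                                ; load stays vector
    %v.0 = extractelement <2 x float> %v, i32 0                ; fake extract, proxied to (%v, 0)
    %v.1 = extractelement <2 x float> %v, i32 1                ; fake extract, proxied to (%v, 1)
    %d.0 = fadd float %v.0, %v.0                               ; per-component scalar ops
    %d.1 = fadd float %v.1, %v.1
    %t0  = insertelement <2 x float> undef, float %d.0, i32 0  ; fake inserts rebuild
    %t1  = insertelement <2 x float> %t0, float %d.1, i32 1    ; the store operand
    store <2 x float> %t1, <2 x float>* %q                     ; store stays vector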
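The fake values above never get their own registers: GenWriter resolves them
through the valueMap proxy chain (RegisterTranslator::getRealValue in the
patch). Below is a minimal, self-contained C++ sketch of that chain-following
lookup; the types and names are stand-ins invented for illustration, while
the real keys are (llvm::Value*, uint32_t) pairs.

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <utility>

    // (value, component index) pair, as in Unit::ValueIndex
    typedef std::pair<const void*, uint32_t> ValueIndex;
    static std::map<ValueIndex, ValueIndex> valueMap; // fake -> real

    // Follow the proxy chain until we reach a value that owns a register
    static ValueIndex getRealValue(ValueIndex key) {
      for (;;) {
        std::map<ValueIndex, ValueIndex>::const_iterator it = valueMap.find(key);
        if (it == valueMap.end())
          return key;     // no more proxies: this is the real value
        key = it->second; // hop one proxy closer to the real value
      }
    }

    int main(void) {
      int load, extract, use; // stand-ins for llvm::Value objects
      // The pass records: fake extract == component 1 of the real load
      valueMap[ValueIndex(&extract, 0u)] = ValueIndex(&load, 1u);
      // A later proxy may chain onto the fake extract
      valueMap[ValueIndex(&use, 0u)] = ValueIndex(&extract, 0u);
      // Both hops resolve to component 1 of the real vector load
      assert(getRealValue(ValueIndex(&use, 0u)) == ValueIndex(&load, 1u));
      return 0;
    }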
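One more worked detail, on the vector kernel arguments the pass deliberately
leaves alone (a reading of the emitFunctionPrototype change above, not text
from the patch): for a float4 argument, GenWriter declares one VALUE input of
elemNum*elemSize = 4*4 = 16 bytes bound to component 0, then appends
components 1..3 with appendPushedConstant at PushLocation byte offsets
elemSize*i, i.e. 4, 8 and 12, so every component ends up in its own scalar
register.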