--- /dev/null
+//===- MatmulOptimizer.h -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_MATMULOPTIMIZER_H
+#define POLLY_MATMULOPTIMIZER_H
+
+#include "isl/isl-noexceptions.h"
+
+namespace llvm {
+class TargetTransformInfo;
+}
+
+namespace polly {
+struct Dependences;
+
+/// Apply the BLIS matmul optimization pattern if possible.
+///
+/// Make the loops containing the matrix multiplication the innermost loops
+/// and apply the BLIS matmul optimization pattern. BLIS implements
+/// gemm as three nested loops around a macro-kernel, plus two packing
+/// routines. The macro-kernel is implemented in terms of two additional
+/// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
+/// (i.e., outer product) update.
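+///
+/// Schematically, the loop structure described in [1] is roughly as follows:
+///
+///   for (jc = 0; jc < N; jc += Nc)          // the three loops around
+///     for (pc = 0; pc < K; pc += Kc)        // the macro-kernel
+///       for (ic = 0; ic < M; ic += Mc)
+///         for (jr = 0; jr < Nc; jr += Nr)   // the two loops around
+///           for (ir = 0; ir < Mc; ir += Mr) // the micro-kernel
+///             // the micro-kernel: a loop of rank-1 updates of an
+///             // Mr x Nr block of C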
+///
+/// For a detailed description please see [1].
+///
+/// The order of the loops defines the data reused in the BLIS implementation
+/// of gemm ([1]). In particular, elements of the matrix B, the second
+/// operand of matrix multiplication, are reused between iterations of the
+/// innermost loop. To keep the reused data in cache, only elements of matrix
+/// A, the first operand of matrix multiplication, should be evicted during
+/// an iteration of the innermost loop. To provide such a cache replacement
+/// policy, elements of the matrix A can, in particular, be loaded first and,
+/// consequently, be least-recently-used.
+///
+/// In our case, matrices are stored in row-major order instead of the
+/// column-major order used in the BLIS implementation ([1]). This affects
+/// only the form of the BLIS micro-kernel and the computation of its
+/// parameters. In particular, reused elements of the matrix B are
+/// successively multiplied by specific elements of the matrix A.
+///
+/// Refs.:
+/// [1] - Analytical Modeling is Enough for High Performance BLIS
+/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+/// Technical Report, 2014
+/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMicroKernelParams
+/// @see getMacroKernelParams
+///
+/// TODO: Implement the packing transformation.
+///
+/// @param Node The node that contains a band to be optimized. The node
+/// is required to successfully pass
+/// ScheduleTreeOptimizer::isMatrMultPattern.
+/// @param TTI Target Transform Info.
+/// @param D The dependencies.
+///
+/// @returns The transformed schedule node, or a null schedule node if the
+///          optimization cannot be applied.
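+///
+/// A possible call site might look as follows (an illustrative sketch; the
+/// names Node, TTI and D stand for the arguments described above):
+/// @code
+///   isl::schedule_node Transformed = tryOptimizeMatMulPattern(Node, TTI, D);
+///   if (!Transformed.is_null())
+///     Node = Transformed;
+/// @endcode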
+isl::schedule_node
+tryOptimizeMatMulPattern(isl::schedule_node Node,
+ const llvm::TargetTransformInfo *TTI,
+ const Dependences *D);
+
+} // namespace polly
+#endif // POLLY_MATMULOPTIMIZER_H
private:
llvm::raw_ostream &OS;
};
-
-/// Build the desired set of partial tile prefixes.
-///
-/// We build a set of partial tile prefixes, which are prefixes of the vector
-/// loop that have exactly VectorWidth iterations.
-///
-/// 1. Drop all constraints involving the dimension that represents the
-/// vector loop.
-/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
-/// iterations.
-/// 3. Subtract loop domain from it, project out the vector loop dimension and
-/// get a set that contains prefixes, which do not have exactly VectorWidth
-/// iterations.
-/// 4. Project out the vector loop dimension of the set that was build on the
-/// first step and subtract the set built on the previous step to get the
-/// desired set of prefixes.
-///
-/// @param ScheduleRange A range of a map, which describes a prefix schedule
-/// relation.
-isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
} // namespace polly
namespace llvm {
#ifndef POLLY_SCHEDULETREETRANSFORM_H
#define POLLY_SCHEDULETREETRANSFORM_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "isl/isl-noexceptions.h"
#include <cassert>
/// Replace the AST band @p BandToUnroll by a partially unrolled equivalent.
isl::schedule applyPartialUnroll(isl::schedule_node BandToUnroll, int Factor);
+/// Build the desired set of partial tile prefixes.
+///
+/// We build a set of partial tile prefixes, which are prefixes of the vector
+/// loop that have exactly VectorWidth iterations.
+///
+/// 1. Drop all constraints involving the dimension that represents the
+/// vector loop.
+/// 2. Constrain the last dimension to get a set that has exactly VectorWidth
+///    iterations.
+/// 3. Subtract the loop domain from it, project out the vector loop dimension,
+///    and get a set that contains the prefixes that do not have exactly
+///    VectorWidth iterations.
+/// 4. Project out the vector loop dimension of the set that was built in the
+///    first step and subtract the set built in the previous step to get the
+///    desired set of prefixes.
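+///
+/// For instance, given a schedule range of the form
+/// { [t, j] : 0 <= t <= 2 and 4t <= j < min(4t + 4, 10) } (an illustrative
+/// example) and VectorWidth = 4, the expected result is
+/// { [t] : 0 <= t <= 1 }, since only the last tile (t = 2) has fewer than
+/// VectorWidth iterations of the vector loop.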
+///
+/// @param ScheduleRange The range of a map that describes a prefix schedule
+///                      relation.
+/// @param VectorWidth   The number of iterations of the vector loop in a
+///                      full tile.
+isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
+
+/// Create an isl::union_set, which describes the isolate option based on
+/// IsolateDomain.
+///
+/// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
+/// belong to the current band node.
+/// @param OutDimsNum The number of dimensions that should belong to
+///                   the current band node.
+isl::union_set getIsolateOptions(isl::set IsolateDomain, isl_size OutDimsNum);
+
+/// Create an isl::union_set, which describes the specified option for the
+/// dimension of the current node.
+///
+/// @param Ctx An isl::ctx, which is used to create the isl::union_set.
+/// @param Option The name of the option.
+isl::union_set getDimOptions(isl::ctx Ctx, const char *Option);
+
+/// Tile a schedule node.
+///
+/// @param Node The node to tile.
+/// @param Identifier A name that identifies this kind of tiling and
+/// that is used to mark the tiled loops in the
+/// generated AST.
+/// @param TileSizes A vector of tile sizes that should be used for
+/// tiling.
+/// @param DefaultTileSize A default tile size that is used for dimensions
+/// that are not covered by the TileSizes vector.
+isl::schedule_node tileNode(isl::schedule_node Node, const char *Identifier,
+ llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
+
+/// Tile a schedule node and unroll point loops.
+///
+/// @param Node The node to apply register tiling to.
+/// @param TileSizes A vector of tile sizes that should be used for
+/// tiling.
+/// @param DefaultTileSize A default tile size that is used for dimensions
+///                        that are not covered by the TileSizes vector.
+isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
+ llvm::ArrayRef<int> TileSizes,
+ int DefaultTileSize);
+
} // namespace polly
#endif // POLLY_SCHEDULETREETRANSFORM_H
Transform/RewriteByReferenceParameters.cpp
Transform/ScopInliner.cpp
Transform/ManualOptimizer.cpp
+ Transform/MatmulOptimizer.cpp
${POLLY_HEADER_FILES}
LINK_COMPONENTS
--- /dev/null
+//===- MatmulOptimizer.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/MatmulOptimizer.h"
+#include "polly/DependenceInfo.h"
+#include "polly/Options.h"
+#include "polly/ScheduleTreeTransform.h"
+#include "polly/ScopInfo.h"
+#include "polly/ScopPass.h"
+#include "polly/Simplify.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/raw_ostream.h"
+#include "isl/ctx.h"
+#include "isl/schedule_node.h"
+#include "isl/schedule_type.h"
+#include "isl/union_map.h"
+#include "isl/union_set.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "polly-opt-isl"
+
+using namespace llvm;
+using namespace polly;
+
+namespace llvm {
+class Value;
+}
+
+static cl::opt<int> LatencyVectorFma(
+ "polly-target-latency-vector-fma",
+ cl::desc("The minimal number of cycles between issuing two "
+ "dependent consecutive vector fused multiply-add "
+ "instructions."),
+ cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> ThroughputVectorFma(
+ "polly-target-throughput-vector-fma",
+ cl::desc("A throughput of the processor floating-point arithmetic units "
+ "expressed in the number of vector fused multiply-add "
+ "instructions per clock cycle."),
+ cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> FirstCacheLevelSize(
+ "polly-target-1st-cache-level-size",
+ cl::desc("The size of the first cache level specified in bytes."),
+ cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> FirstCacheLevelDefaultSize(
+ "polly-target-1st-cache-level-default-size",
+ cl::desc("The default size of the first cache level specified in bytes"
+ " (if not enough were provided by the TargetTransformInfo)."),
+ cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelSize(
+ "polly-target-2nd-cache-level-size",
+ cl::desc("The size of the second level specified in bytes."), cl::Hidden,
+ cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelDefaultSize(
+ "polly-target-2nd-cache-level-default-size",
+ cl::desc("The default size of the second cache level specified in bytes"
+ " (if not enough were provided by the TargetTransformInfo)."),
+ cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+// This option, along with --polly-target-2nd-cache-level-associativity,
+// --polly-target-1st-cache-level-size, and --polly-target-2nd-cache-level-size
+// represent the parameters of the target cache, which do not have typical
+// values that can be used by default. However, to apply the pattern matching
+// optimizations, we use the values of the parameters of Intel Core i7-3820
+// SandyBridge in case the parameters are not specified or not provided by the
+// TargetTransformInfo.
+static cl::opt<int> FirstCacheLevelAssociativity(
+ "polly-target-1st-cache-level-associativity",
+ cl::desc("The associativity of the first cache level."), cl::Hidden,
+ cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> FirstCacheLevelDefaultAssociativity(
+ "polly-target-1st-cache-level-default-associativity",
+ cl::desc("The default associativity of the first cache level"
+ " (if not enough were provided by the TargetTransformInfo)."),
+ cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelAssociativity(
+ "polly-target-2nd-cache-level-associativity",
+ cl::desc("The associativity of the second cache level."), cl::Hidden,
+ cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelDefaultAssociativity(
+ "polly-target-2nd-cache-level-default-associativity",
+ cl::desc("The default associativity of the second cache level"
+ " (if not enough were provided by the TargetTransformInfo)."),
+ cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> VectorRegisterBitwidth(
+ "polly-target-vector-register-bitwidth",
+ cl::desc("The size in bits of a vector register (if not set, this "
+ "information is taken from LLVM's target information."),
+ cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> PollyPatternMatchingNcQuotient(
+ "polly-pattern-matching-nc-quotient",
+ cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
+ "macro-kernel, by Nr, the parameter of the micro-kernel"),
+ cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+namespace {
+/// Parameters of the micro kernel.
+///
+/// Parameters that determine the size of the rank-1 (i.e., outer product)
+/// update used in the optimized matrix multiplication.
+struct MicroKernelParamsTy {
+ int Mr;
+ int Nr;
+};
+
+/// Parameters of the macro kernel.
+///
+/// Parameters that determine the sizes of the blocks of partitioned matrices
+/// used in the optimized matrix multiplication.
+struct MacroKernelParamsTy {
+ int Mc;
+ int Nc;
+ int Kc;
+};
+
+/// Parameters of the matrix multiplication operands.
+///
+/// Parameters that describe the access relations representing the operands of
+/// the matrix multiplication.
+struct MatMulInfoTy {
+ MemoryAccess *A = nullptr;
+ MemoryAccess *B = nullptr;
+ MemoryAccess *ReadFromC = nullptr;
+ MemoryAccess *WriteToC = nullptr;
+ int i = -1;
+ int j = -1;
+ int k = -1;
+};
+
+/// Create an isl::union_set, which describes the option of the form
+/// [isolate[] -> unroll[x]].
+///
+/// @param Ctx An isl::ctx, which is used to create the isl::union_set.
+static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) {
+ isl::space Space = isl::space(Ctx, 0, 0, 1);
+ isl::map UnrollIsolatedSetOption = isl::map::universe(Space);
+ isl::id DimInId = isl::id::alloc(Ctx, "isolate", nullptr);
+ isl::id DimOutId = isl::id::alloc(Ctx, "unroll", nullptr);
+ UnrollIsolatedSetOption =
+ UnrollIsolatedSetOption.set_tuple_id(isl::dim::in, DimInId);
+ UnrollIsolatedSetOption =
+ UnrollIsolatedSetOption.set_tuple_id(isl::dim::out, DimOutId);
+ return UnrollIsolatedSetOption.wrap();
+}
+
+/// Permute the two dimensions of the isl map.
+///
+/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
+/// have type @p DimType.
+///
+/// @param Map The isl map to be modified.
+/// @param DimType The type of the dimensions.
+/// @param DstPos The first dimension.
+/// @param SrcPos The second dimension.
+/// @return The modified map.
+static isl::map permuteDimensions(isl::map Map, isl::dim DimType,
+ unsigned DstPos, unsigned SrcPos) {
+ assert((isl_size)DstPos < Map.dim(DimType) &&
+ (isl_size)SrcPos < Map.dim(DimType));
+ if (DstPos == SrcPos)
+ return Map;
+ isl::id DimId;
+ if (Map.has_tuple_id(DimType))
+ DimId = Map.get_tuple_id(DimType);
+ auto FreeDim = DimType == isl::dim::in ? isl::dim::out : isl::dim::in;
+ isl::id FreeDimId;
+ if (Map.has_tuple_id(FreeDim))
+ FreeDimId = Map.get_tuple_id(FreeDim);
+ auto MaxDim = std::max(DstPos, SrcPos);
+ auto MinDim = std::min(DstPos, SrcPos);
+ Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1);
+ Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1);
+ Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1);
+ Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1);
+ if (DimId)
+ Map = Map.set_tuple_id(DimType, DimId);
+ if (FreeDimId)
+ Map = Map.set_tuple_id(FreeDim, FreeDimId);
+ return Map;
+}
+
+/// Check the form of the access relation.
+///
+/// Check that the access relation @p AccMap has the form M[i][j], where i
+/// is the input dimension at position @p FirstPos and j is the input
+/// dimension at position @p SecondPos. If a position is passed as -1, it is
+/// determined by the check and updated accordingly.
+///
+/// @param Domain    The domain of the statement that contains @p AccMap.
+/// @param AccMap    The access relation to be checked.
+/// @param FirstPos  The index of the input dimension that is mapped to
+///                  the first output dimension.
+/// @param SecondPos The index of the input dimension that is mapped to the
+///                  second output dimension.
+/// @return True in case @p AccMap has the expected form and false,
+/// otherwise.
+static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
+ int &SecondPos) {
+ isl::space Space = AccMap.get_space();
+ isl::map Universe = isl::map::universe(Space);
+
+ if (Space.dim(isl::dim::out) != 2)
+ return false;
+
+ // MatMul has the form:
+ // for (i = 0; i < N; i++)
+ // for (j = 0; j < M; j++)
+ // for (k = 0; k < P; k++)
+ // C[i, j] += A[i, k] * B[k, j]
+ //
+ // Permutation of three outer loops: 3! = 6 possibilities.
+ int FirstDims[] = {0, 0, 1, 1, 2, 2};
+ int SecondDims[] = {1, 2, 2, 0, 0, 1};
+ for (int i = 0; i < 6; i += 1) {
+ auto PossibleMatMul =
+ Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
+ .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
+
+ AccMap = AccMap.intersect_domain(Domain);
+ PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
+
+    // If AccMap spans the entire domain (i.e., it is not a partial write),
+    // compute FirstPos and SecondPos.
+    // If AccMap != PossibleMatMul here (both maps have been restricted to the
+    // statement domain at this point), the writes are not complete; in other
+    // words, it is a partial write, and partial writes must be rejected.
+ if (AccMap.is_equal(PossibleMatMul)) {
+ if (FirstPos != -1 && FirstPos != FirstDims[i])
+ continue;
+ FirstPos = FirstDims[i];
+ if (SecondPos != -1 && SecondPos != SecondDims[i])
+ continue;
+ SecondPos = SecondDims[i];
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// Does the memory access represent a non-scalar operand of the matrix
+/// multiplication.
+///
+/// Check that the memory access @p MemAccess is the read access to a non-scalar
+/// operand of the matrix multiplication or its result.
+///
+/// @param MemAccess The memory access to be checked.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return True in case the memory access represents the read access
+/// to a non-scalar operand of the matrix multiplication and
+/// false, otherwise.
+static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
+ MatMulInfoTy &MMI) {
+ if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
+ return false;
+ auto AccMap = MemAccess->getLatestAccessRelation();
+ isl::set StmtDomain = MemAccess->getStatement()->getDomain();
+ if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
+ MMI.ReadFromC = MemAccess;
+ return true;
+ }
+ if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
+ MMI.A = MemAccess;
+ return true;
+ }
+ if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
+ MMI.B = MemAccess;
+ return true;
+ }
+ return false;
+}
+
+/// Check accesses to operands of the matrix multiplication.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, are scalar in terms of loops
+/// containing the matrix multiplication, in case they do not represent
+/// accesses to the non-scalar operands of the matrix multiplication or
+/// its result.
+///
+/// @param PartialSchedule The partial schedule of the SCoP statement.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return True in case the corresponding SCoP statement
+/// represents matrix multiplication and false,
+/// otherwise.
+static bool containsOnlyMatrMultAcc(isl::map PartialSchedule,
+ MatMulInfoTy &MMI) {
+ auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in);
+ auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user());
+ isl_size OutDimNum = PartialSchedule.dim(isl::dim::out);
+ assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
+ "and, consequently, the corresponding scheduling "
+ "functions have at least three dimensions.");
+ auto MapI =
+ permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1);
+ auto MapJ =
+ permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
+ auto MapK =
+ permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
+
+ auto Accesses = getAccessesInOrder(*Stmt);
+ for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
+ auto *MemAccessPtr = *MemA;
+ if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
+ !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
+ !(MemAccessPtr->isStrideZero(MapI)) &&
+ MemAccessPtr->isStrideZero(MapJ) && MemAccessPtr->isStrideZero(MapK))
+ return false;
+ }
+ return true;
+}
+
+/// Check for dependencies corresponding to the matrix multiplication.
+///
+/// Check that there is only a true dependence of the form
+/// S(..., k, ...) -> S(..., k + 1, ...), where S is the SCoP statement
+/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
+/// to the dependency produced by the matrix multiplication.
+///
+/// @param Schedule The schedule of the SCoP statement.
+/// @param D The SCoP dependencies.
+/// @param Pos The parameter to describe an acceptable true dependence.
+/// In case it has a negative value, try to determine its
+/// acceptable value.
+/// @return True in case dependencies correspond to the matrix multiplication
+/// and false, otherwise.
+static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D,
+ int &Pos) {
+ isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW);
+ isl::union_map Red = D->getDependences(Dependences::TYPE_RED);
+ if (Red)
+ Dep = Dep.unite(Red);
+ auto DomainSpace = Schedule.get_space().domain();
+ auto Space = DomainSpace.map_from_domain_and_range(DomainSpace);
+ auto Deltas = Dep.extract_map(Space).deltas();
+ isl_size DeltasDimNum = Deltas.dim(isl::dim::set);
+ for (int i = 0; i < DeltasDimNum; i++) {
+ auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i);
+ Pos = Pos < 0 && Val.is_one() ? i : Pos;
+ if (Val.is_nan() || !(Val.is_zero() || (i == Pos && Val.is_one())))
+ return false;
+ }
+ if (DeltasDimNum == 0 || Pos < 0)
+ return false;
+ return true;
+}
+
+/// Check if the SCoP statement could probably be optimized with analytical
+/// modeling.
+///
+/// containsMatrMult tries to determine whether the following conditions
+/// are true:
+/// 1. The last memory access modeling an array, MA1, represents writing to
+/// memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
+/// S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
+/// under consideration.
+/// 2. There is only one loop-carried true dependency, and it has the
+///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no other
+///    loop-carried or anti dependencies.
+/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
+/// reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
+/// S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
+/// and all memory accesses of the SCoP that are different from MA1, MA2,
+/// MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
+/// of loops i1, i2 and i3.
+///
+/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
+/// to check.
+/// @param D               The SCoP dependencies.
+/// @param MMI             Parameters of the matrix multiplication operands.
+static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D,
+ MatMulInfoTy &MMI) {
+ auto InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
+ auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+ if (Stmt->size() <= 1)
+ return false;
+
+ auto Accesses = getAccessesInOrder(*Stmt);
+ for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
+ auto *MemAccessPtr = *MemA;
+ if (!MemAccessPtr->isLatestArrayKind())
+ continue;
+ if (!MemAccessPtr->isWrite())
+ return false;
+ auto AccMap = MemAccessPtr->getLatestAccessRelation();
+ if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
+ return false;
+ MMI.WriteToC = MemAccessPtr;
+ break;
+ }
+
+ if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
+ return false;
+
+ if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
+ return false;
+
+ if (!MMI.A || !MMI.B || !MMI.ReadFromC)
+ return false;
+ return true;
+}
+
+/// Permute two dimensions of the band node.
+///
+/// Permute FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
+ unsigned FirstDim,
+ unsigned SecondDim) {
+ assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band &&
+ (unsigned)isl_schedule_node_band_n_member(Node.get()) >
+ std::max(FirstDim, SecondDim));
+ auto PartialSchedule =
+ isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
+ auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim);
+ auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim);
+ PartialSchedule =
+ PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
+ PartialSchedule =
+ PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
+ Node = isl::manage(isl_schedule_node_delete(Node.release()));
+ return Node.insert_partial_schedule(PartialSchedule);
+}
+
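+/// Create the BLIS micro-kernel.
+///
+/// Apply register tiling with the Mr and Nr parameters of the micro-kernel as
+/// tile sizes and interchange the two innermost modified dimensions.
+///
+/// @param Node              The schedule node to be modified.
+/// @param MicroKernelParams Parameters of the micro-kernel to be used as
+///                          tile sizes.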
+static isl::schedule_node
+createMicroKernel(isl::schedule_node Node,
+ MicroKernelParamsTy MicroKernelParams) {
+ Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
+ 1);
+ Node = Node.parent().parent();
+ return permuteBandNodeDimensions(Node, 0, 1).child(0).child(0);
+}
+
+/// Create the BLIS macro-kernel.
+///
+/// We create the BLIS macro-kernel by applying a combination of tiling
+/// of dimensions of the band node and interchanging of two innermost
+/// modified dimensions. The values of MacroKernelParams's fields are used
+/// as tile sizes.
+///
+/// @param Node The schedule node to be modified.
+/// @param MacroKernelParams Parameters of the macro kernel
+/// to be used as tile sizes.
+static isl::schedule_node
+createMacroKernel(isl::schedule_node Node,
+ MacroKernelParamsTy MacroKernelParams) {
+ assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
+ if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
+ MacroKernelParams.Kc == 1)
+ return Node;
+ int DimOutNum = isl_schedule_node_band_n_member(Node.get());
+ std::vector<int> TileSizes(DimOutNum, 1);
+ TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
+ TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
+ TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
+ Node = tileNode(Node, "1st level tiling", TileSizes, 1);
+ Node = Node.parent().parent();
+ Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
+ Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
+
+ // Mark the outermost loop as parallelizable.
+ Node = Node.band_member_set_coincident(0, true);
+
+ return Node.child(0).child(0);
+}
+
+/// Get the size of the widest type of the matrix multiplication operands
+/// in bytes, including alignment padding.
+///
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The size of the widest type of the matrix multiplication operands
+/// in bytes, including alignment padding.
+static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
+ auto *S = MMI.A->getStatement()->getParent();
+ auto &DL = S->getFunction().getParent()->getDataLayout();
+ auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
+ auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
+ auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
+ return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
+}
+
+/// Get the size of the widest type of the matrix multiplication operands
+/// in bits.
+///
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The size of the widest type of the matrix multiplication operands
+/// in bits.
+static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
+ auto *S = MMI.A->getStatement()->getParent();
+ auto &DL = S->getFunction().getParent()->getDataLayout();
+ auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
+ auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
+ auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
+ return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
+}
+
+/// Get parameters of the BLIS micro kernel.
+///
+/// We choose the Mr and Nr parameters of the micro kernel to be large enough
+/// such that no stalls caused by the combination of latencies and dependencies
+/// are introduced during the updates of the resulting matrix of the matrix
+/// multiplication. However, they should also be as small as possible to
+/// release more registers for entries of multiplied matrices.
+///
+/// @param TTI Target Transform Info.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The structure of type MicroKernelParamsTy.
+/// @see MicroKernelParamsTy
+static struct MicroKernelParamsTy
+getMicroKernelParams(const TargetTransformInfo *TTI, MatMulInfoTy MMI) {
+ assert(TTI && "The target transform info should be provided.");
+
+  // Nvec - Number of double-precision floating-point numbers that can be held
+  // by a vector register. Use 2 by default.
+ long RegisterBitwidth = VectorRegisterBitwidth;
+
+ if (RegisterBitwidth == -1)
+ RegisterBitwidth =
+ TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ auto ElementSize = getMatMulTypeSize(MMI);
+ assert(ElementSize > 0 && "The element size of the matrix multiplication "
+ "operands should be greater than zero.");
+ auto Nvec = RegisterBitwidth / ElementSize;
+ if (Nvec == 0)
+ Nvec = 2;
+ int Nr = ceil(sqrt((double)(Nvec * LatencyVectorFma * ThroughputVectorFma)) /
+ Nvec) *
+ Nvec;
+ int Mr = ceil((double)(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr));
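+  // For example, with the default latency (8) and throughput (1) of vector
+  // FMA instructions and a 256-bit vector register holding four doubles
+  // (Nvec = 4, an assumed configuration), the formulas above yield
+  // Nr = ceil(sqrt(32) / 4) * 4 = 8 and Mr = ceil(32 / 8) = 4, i.e. a 4x8
+  // micro-kernel.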
+ return {Mr, Nr};
+}
+
+/// Determine parameters of the target cache.
+///
+/// @param TTI Target Transform Info.
+static void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
+ auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
+ auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
+ if (FirstCacheLevelSize == -1) {
+ if (TTI->getCacheSize(L1DCache).hasValue())
+ FirstCacheLevelSize = TTI->getCacheSize(L1DCache).getValue();
+ else
+ FirstCacheLevelSize = static_cast<int>(FirstCacheLevelDefaultSize);
+ }
+ if (SecondCacheLevelSize == -1) {
+ if (TTI->getCacheSize(L2DCache).hasValue())
+ SecondCacheLevelSize = TTI->getCacheSize(L2DCache).getValue();
+ else
+ SecondCacheLevelSize = static_cast<int>(SecondCacheLevelDefaultSize);
+ }
+ if (FirstCacheLevelAssociativity == -1) {
+ if (TTI->getCacheAssociativity(L1DCache).hasValue())
+ FirstCacheLevelAssociativity =
+ TTI->getCacheAssociativity(L1DCache).getValue();
+ else
+ FirstCacheLevelAssociativity =
+ static_cast<int>(FirstCacheLevelDefaultAssociativity);
+ }
+ if (SecondCacheLevelAssociativity == -1) {
+ if (TTI->getCacheAssociativity(L2DCache).hasValue())
+ SecondCacheLevelAssociativity =
+ TTI->getCacheAssociativity(L2DCache).getValue();
+ else
+ SecondCacheLevelAssociativity =
+ static_cast<int>(SecondCacheLevelDefaultAssociativity);
+ }
+}
+
+/// Get parameters of the BLIS macro kernel.
+///
+/// During the computation of matrix multiplication, blocks of partitioned
+/// matrices are mapped to different layers of the memory hierarchy.
+/// To optimize data reuse, blocks should be ideally kept in cache between
+/// iterations. Since parameters of the macro kernel determine sizes of these
+/// blocks, there are upper and lower bounds on these parameters.
+///
+/// @param TTI Target Transform Info.
+/// @param MicroKernelParams Parameters of the micro-kernel
+/// to be taken into account.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The structure of type MacroKernelParamsTy.
+/// @see MacroKernelParamsTy
+/// @see MicroKernelParamsTy
+static struct MacroKernelParamsTy
+getMacroKernelParams(const llvm::TargetTransformInfo *TTI,
+ const MicroKernelParamsTy &MicroKernelParams,
+ MatMulInfoTy MMI) {
+ getTargetCacheParameters(TTI);
+  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+  // determining all the parameters of a macro-kernel requires information
+  // about the first two cache levels. The code below additionally checks that
+  // the associativity degree of each cache level is greater than two.
+  // Otherwise, another algorithm for determining the parameters should be
+  // used.
+ if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
+ FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
+ FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
+ return {1, 1, 1};
+ // The quotient should be greater than zero.
+ if (PollyPatternMatchingNcQuotient <= 0)
+ return {1, 1, 1};
+ int Car = floor(
+ (FirstCacheLevelAssociativity - 1) /
+ (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
+
+  // Car can be computed to be zero since it is the result of a floor to int.
+  // If Car is zero, Kc and, in turn, Cac become zero as well. On Mac OS, the
+  // subsequent division by Cac == 0 does not raise a signal and causes
+  // negative tile sizes to be computed. Prevent this by returning early.
+ if (Car == 0)
+ return {1, 1, 1};
+
+ auto ElementSize = getMatMulAlignTypeSize(MMI);
+ assert(ElementSize > 0 && "The element size of the matrix multiplication "
+ "operands should be greater than zero.");
+ int Kc = (Car * FirstCacheLevelSize) /
+ (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize);
+ double Cac =
+ static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) /
+ SecondCacheLevelSize;
+ int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
+ int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
+
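+  // For example, with the default cache parameters defined above (a 32 KiB
+  // 8-way L1 and a 256 KiB 8-way L2), double-precision operands
+  // (ElementSize = 8) and an assumed 4x8 micro-kernel, the formulas above
+  // yield Car = 2, Kc = 256, Mc = 96 and Nc = 2048.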
+ assert(Mc > 0 && Nc > 0 && Kc > 0 &&
+ "Matrix block sizes should be greater than zero");
+ return {Mc, Nc, Kc};
+}
+
+/// Create an access relation that is specific to
+/// the matrix multiplication pattern.
+///
+/// Create an access relation of the following form:
+/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
+/// where I is @p FirstDim, J is @p SecondDim.
+///
+/// It can be used, for example, to create relations that help to access
+/// elements of the operands of a matrix multiplication consecutively after
+/// the creation of the BLIS micro and macro kernels.
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+///
+/// Subsequently, the described access relation is applied to the range of
+/// @p MapOldIndVar, which is used to map the original induction variables to
+/// the ones produced by the schedule transformations. This helps to define
+/// relations using a new space and, at the same time, keep them in the
+/// original one.
+///
+/// @param MapOldIndVar The relation, which maps original induction variables
+/// to the ones, which are produced by schedule
+/// transformations.
+/// @param FirstDim, SecondDim The input dimensions that are used to define
+/// the specified access relation.
+/// @return The specified access relation.
+static isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
+ unsigned SecondDim) {
+ auto AccessRelSpace = isl::space(MapOldIndVar.get_ctx(), 0, 9, 3);
+ auto AccessRel = isl::map::universe(AccessRelSpace);
+ AccessRel = AccessRel.equate(isl::dim::in, FirstDim, isl::dim::out, 0);
+ AccessRel = AccessRel.equate(isl::dim::in, 5, isl::dim::out, 1);
+ AccessRel = AccessRel.equate(isl::dim::in, SecondDim, isl::dim::out, 2);
+ return MapOldIndVar.apply_range(AccessRel);
+}
+
+static isl::schedule_node createExtensionNode(isl::schedule_node Node,
+ isl::map ExtensionMap) {
+ auto Extension = isl::union_map(ExtensionMap);
+ auto NewNode = isl::schedule_node::from_extension(Extension);
+ return Node.graft_before(NewNode);
+}
+
+/// Apply the packing transformation.
+///
+/// The packing transformation can be described as a data-layout
+/// transformation that requires introducing a new array, copying data
+/// to the array, and changing memory access locations to reference the array.
+/// It can be used to ensure that elements of the new array are read with
+/// in-stride accesses, aligned to cache line boundaries, and preloaded into
+/// certain cache levels.
+///
+/// As an example, let us consider the packing of the array A that would help
+/// to read its elements with in-stride accesses. An access to the array A
+/// is represented by an access relation that has the form
+/// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
+/// the form S[i, j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
+/// k mod Kc, j mod Nr, i mod Mr].
+///
+/// To ensure that elements of the array A are read in-stride access, we add
+/// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
+/// Scop::createScopArrayInfo, change the access relation
+/// S[i, j, k] -> A[i, k] to
+/// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
+/// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
+/// the copy statement created by Scop::addScopStmt.
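+///
+/// The access to the array B is packed analogously, using a new array
+/// Packed_B[Nc/Nr][Kc][Nr] (see the creation of "Packed_B" below).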
+///
+/// @param Node The schedule node to be optimized.
+/// @param MapOldIndVar The relation, which maps original induction variables
+/// to the ones, which are produced by schedule
+/// transformations.
+/// @param MicroParams, MacroParams Parameters of the BLIS kernel
+/// to be taken into account.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The optimized schedule node.
+static isl::schedule_node
+optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
+ MicroKernelParamsTy MicroParams,
+ MacroKernelParamsTy MacroParams,
+ MatMulInfoTy &MMI) {
+ auto InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
+ auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+
+ // Create a copy statement that corresponds to the memory access to the
+ // matrix B, the second operand of the matrix multiplication.
+ Node = Node.parent().parent().parent().parent().parent().parent();
+ Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
+ auto AccRel = getMatMulAccRel(MapOldIndVar, 3, 7);
+ unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
+ unsigned SecondDimSize = MacroParams.Kc;
+ unsigned ThirdDimSize = MicroParams.Nr;
+ auto *SAI = Stmt->getParent()->createScopArrayInfo(
+ MMI.B->getElementType(), "Packed_B",
+ {FirstDimSize, SecondDimSize, ThirdDimSize});
+ AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
+ auto OldAcc = MMI.B->getLatestAccessRelation();
+ MMI.B->setNewAccessRelation(AccRel);
+ auto ExtMap = MapOldIndVar.project_out(isl::dim::out, 2,
+ MapOldIndVar.dim(isl::dim::out) - 2);
+ ExtMap = ExtMap.reverse();
+ ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
+ auto Domain = Stmt->getDomain();
+
+  // Restrict the domain of the copy statement so that it is only executed
+  // when its originating statement is executed.
+ auto DomainId = Domain.get_tuple_id();
+ auto *NewStmt = Stmt->getParent()->addScopStmt(
+ OldAcc, MMI.B->getLatestAccessRelation(), Domain);
+ ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
+ ExtMap = ExtMap.intersect_range(Domain);
+ ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
+ Node = createExtensionNode(Node, ExtMap);
+
+ // Create a copy statement that corresponds to the memory access
+ // to the matrix A, the first operand of the matrix multiplication.
+ Node = Node.child(0);
+ AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
+ FirstDimSize = MacroParams.Mc / MicroParams.Mr;
+ ThirdDimSize = MicroParams.Mr;
+ SAI = Stmt->getParent()->createScopArrayInfo(
+ MMI.A->getElementType(), "Packed_A",
+ {FirstDimSize, SecondDimSize, ThirdDimSize});
+ AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
+ OldAcc = MMI.A->getLatestAccessRelation();
+ MMI.A->setNewAccessRelation(AccRel);
+ ExtMap = MapOldIndVar.project_out(isl::dim::out, 3,
+ MapOldIndVar.dim(isl::dim::out) - 3);
+ ExtMap = ExtMap.reverse();
+ ExtMap = ExtMap.fix_si(isl::dim::out, MMI.j, 0);
+ NewStmt = Stmt->getParent()->addScopStmt(
+ OldAcc, MMI.A->getLatestAccessRelation(), Domain);
+
+  // Restrict the domain of the copy statement so that it is only executed
+  // when its originating statement is executed.
+ ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
+ ExtMap = ExtMap.intersect_range(Domain);
+ ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
+ Node = createExtensionNode(Node, ExtMap);
+ return Node.child(0).child(0).child(0).child(0).child(0);
+}
+
+/// Get a relation mapping the original induction variables to the ones
+/// produced by schedule transformations.
+///
+/// @param Node The schedule node produced as the result of creation
+/// of the BLIS kernels.
+/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
+/// to be taken into account.
+/// @return The relation mapping original induction variables to the ones
+/// produced by schedule transformation.
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMacroKernelParams
+static isl::map
+getInductionVariablesSubstitution(isl::schedule_node Node,
+ MicroKernelParamsTy MicroKernelParams,
+ MacroKernelParamsTy MacroKernelParams) {
+ auto Child = Node.child(0);
+ auto UnMapOldIndVar = Child.get_prefix_schedule_union_map();
+ auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar);
+ if (MapOldIndVar.dim(isl::dim::out) > 9)
+ return MapOldIndVar.project_out(isl::dim::out, 0,
+ MapOldIndVar.dim(isl::dim::out) - 9);
+ return MapOldIndVar;
+}
+
+/// Isolate a set of partial tile prefixes and unroll the isolated part.
+///
+/// The isolated set should contain only partial tile prefixes that have
+/// exactly Mr x Nr iterations of the two innermost loops produced by
+/// the optimization of the matrix multiplication. Mr and Nr are parameters of
+/// the micro-kernel.
+///
+/// In case of parametric bounds, this helps to auto-vectorize the unrolled
+/// innermost loops, using the SLP vectorizer.
+///
+/// @param Node The schedule node to be modified.
+/// @param MicroKernelParams Parameters of the micro-kernel
+/// to be taken into account.
+/// @return The modified isl_schedule_node.
+static isl::schedule_node
+isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
+ struct MicroKernelParamsTy MicroKernelParams) {
+ isl::schedule_node Child = Node.get_child(0);
+ isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation();
+ isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range();
+ isl_size Dims = Prefix.dim(isl::dim::set);
+ Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1);
+ Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
+ Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);
+
+ isl::union_set IsolateOption =
+ getIsolateOptions(Prefix.add_dims(isl::dim::set, 3), 3);
+ isl::ctx Ctx = Node.get_ctx();
+ auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll"));
+ Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
+ Node = Node.band_set_ast_build_options(Options);
+ Node = Node.parent().parent().parent();
+ IsolateOption = getIsolateOptions(Prefix, 3);
+ Options = IsolateOption.unite(getDimOptions(Ctx, "separate"));
+ Node = Node.band_set_ast_build_options(Options);
+ Node = Node.child(0).child(0).child(0);
+ return Node;
+}
+
+/// Mark @p BasePtr with "Inter iteration alias-free" mark node.
+///
+/// @param Node The child of the mark node to be inserted.
+/// @param BasePtr The pointer to be marked.
+/// @return The modified isl_schedule_node.
+static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node,
+ Value *BasePtr) {
+ if (!BasePtr)
+ return Node;
+
+ auto Id =
+ isl::id::alloc(Node.get_ctx(), "Inter iteration alias-free", BasePtr);
+ return Node.insert_mark(Id).child(0);
+}
+
+/// Insert "Loop Vectorizer Disabled" mark node.
+///
+/// @param Node The child of the mark node to be inserted.
+/// @return The modified isl_schedule_node.
+static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
+ auto Id = isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr);
+ return Node.insert_mark(Id).child(0);
+}
+
+/// Restore the initial ordering of dimensions of the band node
+///
+/// In case the band node represents all the dimensions of the iteration
+/// domain, recreate the band node to restore the initial ordering of the
+/// dimensions.
+///
+/// @param Node The band node to be modified.
+/// @return The modified schedule node.
+static isl::schedule_node
+getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
+ assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
+ if (isl_schedule_node_get_type(Node.child(0).get()) != isl_schedule_node_leaf)
+ return Node;
+ auto Domain = Node.get_universe_domain();
+ assert(isl_union_set_n_set(Domain.get()) == 1);
+ if (Node.get_schedule_depth() != 0 ||
+ (isl::set(Domain).dim(isl::dim::set) !=
+ isl_schedule_node_band_n_member(Node.get())))
+ return Node;
+ Node = isl::manage(isl_schedule_node_delete(Node.copy()));
+ auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff();
+ auto PartialScheduleMultiPwAff =
+ isl::multi_union_pw_aff(PartialSchedulePwAff);
+ PartialScheduleMultiPwAff =
+ PartialScheduleMultiPwAff.reset_tuple_id(isl::dim::set);
+ return Node.insert_partial_schedule(PartialScheduleMultiPwAff);
+}
+
+static isl::schedule_node optimizeMatMulPattern(isl::schedule_node Node,
+ const TargetTransformInfo *TTI,
+ MatMulInfoTy &MMI) {
+ assert(TTI && "The target transform info should be provided.");
+ Node = markInterIterationAliasFree(
+ Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr());
+ int DimOutNum = isl_schedule_node_band_n_member(Node.get());
+ assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
+ "and, consequently, the corresponding scheduling "
+ "functions have at least three dimensions.");
+ Node = getBandNodeWithOriginDimOrder(Node);
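+  // Move the i, j and k dimensions of the matrix multiplication to the last
+  // three positions of the band, tracking how the positions of j and k change
+  // as dimensions are swapped.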
+ Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
+ int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
+ int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
+ Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
+ NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
+ Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
+ auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
+ auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI);
+ Node = createMacroKernel(Node, MacroKernelParams);
+ Node = createMicroKernel(Node, MicroKernelParams);
+ if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
+ MacroKernelParams.Kc == 1)
+ return Node;
+ auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams,
+ MacroKernelParams);
+ if (!MapOldIndVar)
+ return Node;
+ Node = markLoopVectorizerDisabled(Node.parent()).child(0);
+ Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
+ return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
+ MacroKernelParams, MMI);
+}
+
+/// Check if this node contains a partial schedule that could
+/// probably be optimized with analytical modeling.
+///
+/// isMatrMultPattern tries to determine whether the following conditions
+/// are true:
+/// 1. the partial schedule contains only one statement.
+/// 2. there are exactly three input dimensions.
+/// 3. all memory accesses of the statement will have stride 0 or 1, if we
+/// interchange loops (switch the variable used in the inner loop to
+/// the outer loop).
+/// 4. all memory accesses of the statement except for the last one are
+///    read memory accesses and the last one is a write memory access.
+/// 5. all subscripts of the last memory access of the statement don't
+/// contain the variable used in the inner loop.
+/// If this is the case, we could try to use an approach that is similar to
+/// the one used to get close-to-peak performance of matrix multiplications.
+///
+/// @param Node The node to check.
+/// @param D The SCoP dependencies.
+/// @param MMI Parameters of the matrix multiplication operands.
+static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
+ MatMulInfoTy &MMI) {
+ auto PartialSchedule = isl::manage(
+ isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
+ Node = Node.child(0);
+ auto LeafType = isl_schedule_node_get_type(Node.get());
+ Node = Node.parent();
+ if (LeafType != isl_schedule_node_leaf ||
+ isl_schedule_node_band_n_member(Node.get()) < 3 ||
+ Node.get_schedule_depth() != 0 ||
+ isl_union_map_n_map(PartialSchedule.get()) != 1)
+ return false;
+ auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule);
+ if (containsMatrMult(NewPartialSchedule, D, MMI))
+ return true;
+ return false;
+}
+
+} // namespace
+
+isl::schedule_node
+polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
+ const llvm::TargetTransformInfo *TTI,
+ const Dependences *D) {
+ MatMulInfoTy MMI;
+ if (isMatrMultPattern(Node, D, MMI)) {
+ LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
+ return optimizeMatMulPattern(Node, TTI, MMI);
+ }
+ return {};
+}
#include "polly/ScheduleOptimizer.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/DependenceInfo.h"
-#include "polly/LinkAllPasses.h"
#include "polly/ManualOptimizer.h"
+#include "polly/MatmulOptimizer.h"
#include "polly/Options.h"
#include "polly/ScheduleTreeTransform.h"
-#include "polly/ScopInfo.h"
-#include "polly/ScopPass.h"
-#include "polly/Simplify.h"
#include "polly/Support/ISLOStream.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "isl/ctx.h"
#include "isl/options.h"
-#include "isl/printer.h"
-#include "isl/schedule.h"
-#include "isl/schedule_node.h"
-#include "isl/union_map.h"
-#include "isl/union_set.h"
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <cstdlib>
-#include <string>
-#include <vector>
using namespace llvm;
using namespace polly;
+namespace llvm {
+class Loop;
+class Module;
+} // namespace llvm
+
#define DEBUG_TYPE "polly-opt-isl"
static cl::opt<std::string>
cl::init(true), cl::ZeroOrMore,
cl::cat(PollyCategory));
-static cl::opt<int> LatencyVectorFma(
- "polly-target-latency-vector-fma",
- cl::desc("The minimal number of cycles between issuing two "
- "dependent consecutive vector fused multiply-add "
- "instructions."),
- cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> ThroughputVectorFma(
- "polly-target-throughput-vector-fma",
- cl::desc("A throughput of the processor floating-point arithmetic units "
- "expressed in the number of vector fused multiply-add "
- "instructions per clock cycle."),
- cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-// This option, along with --polly-target-2nd-cache-level-associativity,
-// --polly-target-1st-cache-level-size, and --polly-target-2st-cache-level-size
-// represent the parameters of the target cache, which do not have typical
-// values that can be used by default. However, to apply the pattern matching
-// optimizations, we use the values of the parameters of Intel Core i7-3820
-// SandyBridge in case the parameters are not specified or not provided by the
-// TargetTransformInfo.
-static cl::opt<int> FirstCacheLevelAssociativity(
- "polly-target-1st-cache-level-associativity",
- cl::desc("The associativity of the first cache level."), cl::Hidden,
- cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> FirstCacheLevelDefaultAssociativity(
- "polly-target-1st-cache-level-default-associativity",
- cl::desc("The default associativity of the first cache level"
- " (if not enough were provided by the TargetTransformInfo)."),
- cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelAssociativity(
- "polly-target-2nd-cache-level-associativity",
- cl::desc("The associativity of the second cache level."), cl::Hidden,
- cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelDefaultAssociativity(
- "polly-target-2nd-cache-level-default-associativity",
- cl::desc("The default associativity of the second cache level"
- " (if not enough were provided by the TargetTransformInfo)."),
- cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> FirstCacheLevelSize(
- "polly-target-1st-cache-level-size",
- cl::desc("The size of the first cache level specified in bytes."),
- cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> FirstCacheLevelDefaultSize(
- "polly-target-1st-cache-level-default-size",
- cl::desc("The default size of the first cache level specified in bytes"
- " (if not enough were provided by the TargetTransformInfo)."),
- cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelSize(
- "polly-target-2nd-cache-level-size",
- cl::desc("The size of the second level specified in bytes."), cl::Hidden,
- cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelDefaultSize(
- "polly-target-2nd-cache-level-default-size",
- cl::desc("The default size of the second cache level specified in bytes"
- " (if not enough were provided by the TargetTransformInfo)."),
- cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> VectorRegisterBitwidth(
- "polly-target-vector-register-bitwidth",
- cl::desc("The size in bits of a vector register (if not set, this "
- "information is taken from LLVM's target information."),
- cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
static cl::opt<int> FirstLevelDefaultTileSize(
"polly-default-tile-size",
cl::desc("The default tile size (if not enough were provided by"
" --polly-register-tile-sizes)"),
cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
-static cl::opt<int> PollyPatternMatchingNcQuotient(
- "polly-pattern-matching-nc-quotient",
- cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
- "macro-kernel, by Nr, the parameter of the micro-kernel"),
- cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
-
static cl::list<int>
RegisterTileSizes("polly-register-tile-sizes",
cl::desc("A tile size for each loop dimension, filled "
"Number of matrix multiplication patterns detected and optimized");
namespace {
-/// Parameters of the micro kernel.
-///
-/// Parameters, which determine sizes of rank-1 (i.e., outer product) update
-/// used in the optimized matrix multiplication.
-struct MicroKernelParamsTy {
- int Mr;
- int Nr;
-};
-
-/// Parameters of the macro kernel.
-///
-/// Parameters, which determine sizes of blocks of partitioned matrices
-/// used in the optimized matrix multiplication.
-struct MacroKernelParamsTy {
- int Mc;
- int Nc;
- int Kc;
-};
-
/// Additional parameters of the schedule optimizer.
///
/// Target Transform Info and the SCoP dependencies used by the schedule
const Dependences *D;
};
-/// Parameters of the matrix multiplication operands.
-///
-/// Parameters, which describe access relations that represent operands of the
-/// matrix multiplication.
-struct MatMulInfoTy {
- MemoryAccess *A = nullptr;
- MemoryAccess *B = nullptr;
- MemoryAccess *ReadFromC = nullptr;
- MemoryAccess *WriteToC = nullptr;
- int i = -1;
- int j = -1;
- int k = -1;
-};
-
class ScheduleTreeOptimizer {
public:
/// Apply schedule tree transformations.
int VectorWidth);
private:
- /// Tile a schedule node.
- ///
- /// @param Node The node to tile.
- /// @param Identifier An name that identifies this kind of tiling and
- /// that is used to mark the tiled loops in the
- /// generated AST.
- /// @param TileSizes A vector of tile sizes that should be used for
- /// tiling.
- /// @param DefaultTileSize A default tile size that is used for dimensions
- /// that are not covered by the TileSizes vector.
- static isl::schedule_node tileNode(isl::schedule_node Node,
- const char *Identifier,
- llvm::ArrayRef<int> TileSizes,
- int DefaultTileSize);
-
- /// Tile a schedule node and unroll point loops.
- ///
- /// @param Node The node to register tile.
- /// @param TileSizes A vector of tile sizes that should be used for
- /// tiling.
- /// @param DefaultTileSize A default tile size that is used for dimensions
- static isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
- llvm::ArrayRef<int> TileSizes,
- int DefaultTileSize);
-
- /// Apply the BLIS matmul optimization pattern.
- ///
- /// Make the loops containing the matrix multiplication be the innermost
- /// loops and apply the BLIS matmul optimization pattern. BLIS implements
- /// gemm as three nested loops around a macro-kernel, plus two packing
- /// routines. The macro-kernel is implemented in terms of two additional
- /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
- /// (i.e., outer product) update.
- ///
- /// For a detailed description please see [1].
- ///
- /// The order of the loops defines the data reused in the BLIS implementation
- /// of gemm ([1]). In particular, elements of the matrix B, the second
- /// operand of matrix multiplication, are reused between iterations of the
- /// innermost loop. To keep the reused data in cache, only elements of matrix
- /// A, the first operand of matrix multiplication, should be evicted during
- /// an iteration of the innermost loop. To provide such a cache replacement
- /// policy, elements of the matrix A can, in particular, be loaded first and,
- /// consequently, be least-recently-used.
- ///
- /// In our case matrices are stored in row-major order instead of
- /// column-major order used in the BLIS implementation ([1]). It affects only
- /// on the form of the BLIS micro kernel and the computation of its
- /// parameters. In particular, reused elements of the matrix B are
- /// successively multiplied by specific elements of the matrix A.
- ///
- /// Refs.:
- /// [1] - Analytical Modeling is Enough for High Performance BLIS
- /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
- /// Technical Report, 2014
- /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
- ///
- /// @see ScheduleTreeOptimizer::createMicroKernel
- /// @see ScheduleTreeOptimizer::createMacroKernel
- /// @see getMicroKernelParams
- /// @see getMacroKernelParams
- ///
- /// TODO: Implement the packing transformation.
- ///
- /// @param Node The node that contains a band to be optimized. The node
- /// is required to successfully pass
- /// ScheduleTreeOptimizer::isMatrMultPattern.
- /// @param TTI Target Transform Info.
- /// @param MMI Parameters of the matrix multiplication operands.
- /// @returns The transformed schedule.
- static isl::schedule_node
- optimizeMatMulPattern(isl::schedule_node Node,
- const llvm::TargetTransformInfo *TTI,
- MatMulInfoTy &MMI);
-
/// Check if this node is a band node we want to tile.
///
/// We look for innermost band nodes where individual dimensions are marked as
/// (currently unused).
static isl::schedule_node standardBandOpts(isl::schedule_node Node,
void *User);
-
- /// Check if this node contains a partial schedule that could
- /// probably be optimized with analytical modeling.
- ///
- /// isMatrMultPattern tries to determine whether the following conditions
- /// are true:
- /// 1. the partial schedule contains only one statement.
- /// 2. there are exactly three input dimensions.
- /// 3. all memory accesses of the statement will have stride 0 or 1, if we
- /// interchange loops (switch the variable used in the inner loop to
- /// the outer loop).
- /// 4. all memory accesses of the statement except the last one are read
- /// accesses, and the last one is a write access.
- /// 5. all subscripts of the last memory access of the statement don't
- /// contain the variable used in the inner loop.
- /// If this is the case, we could try to use an approach that is similar to
- /// the one used to get close-to-peak performance of matrix multiplications.
- ///
- /// @param Node The node to check.
- /// @param D The SCoP dependencies.
- /// @param MMI Parameters of the matrix multiplication operands.
- static bool isMatrMultPattern(isl::schedule_node Node,
- const polly::Dependences *D, MatMulInfoTy &MMI);
-
- /// Create the BLIS macro-kernel.
- ///
- /// We create the BLIS macro-kernel by applying a combination of tiling
- /// of dimensions of the band node and interchanging of two innermost
- /// modified dimensions. The values of the MacroKernelParams fields are used
- /// as tile sizes.
- ///
- /// @param Node The schedule node to be modified.
- /// @param MacroKernelParams Parameters of the macro kernel
- /// to be used as tile sizes.
- static isl::schedule_node
- createMacroKernel(isl::schedule_node Node,
- MacroKernelParamsTy MacroKernelParams);
-
- /// Create the BLIS micro-kernel.
- ///
- /// We create the BLIS micro-kernel by applying a combination of tiling
- /// of dimensions of the band node and interchanging of the two innermost
- /// modified dimensions. The values passed in MicroKernelParams are used
- /// as tile sizes.
- ///
- /// @param Node The schedule node to be modified.
- /// @param MicroKernelParams Parameters of the micro kernel
- /// to be used as tile sizes.
- /// @see MicroKernelParamsTy
- static isl::schedule_node
- createMicroKernel(isl::schedule_node Node,
- MicroKernelParamsTy MicroKernelParams);
};
-/// Create an isl::union_set, which describes the isolate option based on
-/// IsolateDomain.
-///
-/// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
-/// belong to the current band node.
-/// @param OutDimsNum A number of dimensions that should belong to
-/// the current band node.
-static isl::union_set getIsolateOptions(isl::set IsolateDomain,
- isl_size OutDimsNum) {
- isl_size Dims = IsolateDomain.dim(isl::dim::set);
- assert(OutDimsNum <= Dims &&
- "The isl::set IsolateDomain is used to describe the range of schedule "
- "dimensions values, which should be isolated. Consequently, the "
- "number of its dimensions should be greater than or equal to the "
- "number of the schedule dimensions.");
- isl::map IsolateRelation = isl::map::from_domain(IsolateDomain);
- IsolateRelation = IsolateRelation.move_dims(isl::dim::out, 0, isl::dim::in,
- Dims - OutDimsNum, OutDimsNum);
- isl::set IsolateOption = IsolateRelation.wrap();
- isl::id Id = isl::id::alloc(IsolateOption.get_ctx(), "isolate", nullptr);
- IsolateOption = IsolateOption.set_tuple_id(Id);
- return isl::union_set(IsolateOption);
-}
-
-/// Create an isl::union_set, which describes the specified option for the
-/// dimension of the current node.
-///
-/// @param Ctx An isl::ctx, which is used to create the isl::union_set.
-/// @param Option The name of the option.
-isl::union_set getDimOptions(isl::ctx Ctx, const char *Option) {
- isl::space Space(Ctx, 0, 1);
- auto DimOption = isl::set::universe(Space);
- auto Id = isl::id::alloc(Ctx, Option, nullptr);
- DimOption = DimOption.set_tuple_id(Id);
- return isl::union_set(DimOption);
-}
-
-/// Create an isl::union_set, which describes the option of the form
-/// [isolate[] -> unroll[x]].
-///
-/// @param Ctx An isl::ctx, which is used to create the isl::union_set.
-static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) {
- isl::space Space = isl::space(Ctx, 0, 0, 1);
- isl::map UnrollIsolatedSetOption = isl::map::universe(Space);
- isl::id DimInId = isl::id::alloc(Ctx, "isolate", nullptr);
- isl::id DimOutId = isl::id::alloc(Ctx, "unroll", nullptr);
- UnrollIsolatedSetOption =
- UnrollIsolatedSetOption.set_tuple_id(isl::dim::in, DimInId);
- UnrollIsolatedSetOption =
- UnrollIsolatedSetOption.set_tuple_id(isl::dim::out, DimOutId);
- return UnrollIsolatedSetOption.wrap();
-}
-
-/// Make the last dimension of Set take values from 0 to VectorWidth - 1.
-///
-/// @param Set A set, which should be modified.
-/// @param VectorWidth A parameter, which determines the constraint.
-static isl::set addExtentConstraints(isl::set Set, int VectorWidth) {
- unsigned Dims = Set.dim(isl::dim::set);
- isl::space Space = Set.get_space();
- isl::local_space LocalSpace = isl::local_space(Space);
- isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
- ExtConstr = ExtConstr.set_constant_si(0);
- ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, 1);
- Set = Set.add_constraint(ExtConstr);
- ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
- ExtConstr = ExtConstr.set_constant_si(VectorWidth - 1);
- ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
- return Set.add_constraint(ExtConstr);
-}
-} // namespace
-
-isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange,
- int VectorWidth) {
- isl_size Dims = ScheduleRange.dim(isl::dim::set);
- isl::set LoopPrefixes =
- ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1);
- auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth);
- isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange);
- BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1);
- LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1);
- return LoopPrefixes.subtract(BadPrefixes);
-}
-
-namespace {
isl::schedule_node
ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
int VectorWidth) {
return Node.insert_mark(LoopMarker);
}
-isl::schedule_node ScheduleTreeOptimizer::tileNode(isl::schedule_node Node,
- const char *Identifier,
- ArrayRef<int> TileSizes,
- int DefaultTileSize) {
- auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
- auto Dims = Space.dim(isl::dim::set);
- auto Sizes = isl::multi_val::zero(Space);
- std::string IdentifierString(Identifier);
- for (auto i : seq<isl_size>(0, Dims)) {
- auto tileSize =
- i < (isl_size)TileSizes.size() ? TileSizes[i] : DefaultTileSize;
- Sizes = Sizes.set_val(i, isl::val(Node.get_ctx(), tileSize));
- }
- auto TileLoopMarkerStr = IdentifierString + " - Tiles";
- auto TileLoopMarker =
- isl::id::alloc(Node.get_ctx(), TileLoopMarkerStr, nullptr);
- Node = Node.insert_mark(TileLoopMarker);
- Node = Node.child(0);
- Node =
- isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
- Node = Node.child(0);
- auto PointLoopMarkerStr = IdentifierString + " - Points";
- auto PointLoopMarker =
- isl::id::alloc(Node.get_ctx(), PointLoopMarkerStr, nullptr);
- Node = Node.insert_mark(PointLoopMarker);
- return Node.child(0);
-}
-
-isl::schedule_node ScheduleTreeOptimizer::applyRegisterTiling(
- isl::schedule_node Node, ArrayRef<int> TileSizes, int DefaultTileSize) {
- Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
- auto Ctx = Node.get_ctx();
- return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}"));
-}
-
static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
assert(isl_schedule_node_n_children(Node.get()) == 1);
return Node;
}
-/// Permute the two dimensions of the isl map.
-///
-/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
-/// have type @p DimType.
-///
-/// @param Map The isl map to be modified.
-/// @param DimType The type of the dimensions.
-/// @param DstPos The first dimension.
-/// @param SrcPos The second dimension.
-/// @return The modified map.
-isl::map permuteDimensions(isl::map Map, isl::dim DimType, unsigned DstPos,
- unsigned SrcPos) {
- assert((isl_size)DstPos < Map.dim(DimType) &&
- (isl_size)SrcPos < Map.dim(DimType));
- if (DstPos == SrcPos)
- return Map;
- isl::id DimId;
- if (Map.has_tuple_id(DimType))
- DimId = Map.get_tuple_id(DimType);
- auto FreeDim = DimType == isl::dim::in ? isl::dim::out : isl::dim::in;
- isl::id FreeDimId;
- if (Map.has_tuple_id(FreeDim))
- FreeDimId = Map.get_tuple_id(FreeDim);
- auto MaxDim = std::max(DstPos, SrcPos);
- auto MinDim = std::min(DstPos, SrcPos);
- Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1);
- Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1);
- Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1);
- Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1);
- if (DimId)
- Map = Map.set_tuple_id(DimType, DimId);
- if (FreeDimId)
- Map = Map.set_tuple_id(FreeDim, FreeDimId);
- return Map;
-}
-
-/// Check the form of the access relation.
-///
-/// Check that the access relation @p AccMap has the form M[i][j], where i
-/// is a @p FirstPos and j is a @p SecondPos.
-///
-/// @param AccMap The access relation to be checked.
-/// @param FirstPos The index of the input dimension that is mapped to
-/// the first output dimension.
-/// @param SecondPos The index of the input dimension that is mapped to the
-/// second output dimension.
-/// @return True in case @p AccMap has the expected form and false,
-/// otherwise.
-static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
- int &SecondPos) {
- isl::space Space = AccMap.get_space();
- isl::map Universe = isl::map::universe(Space);
-
- if (Space.dim(isl::dim::out) != 2)
- return false;
-
- // MatMul has the form:
- // for (i = 0; i < N; i++)
- // for (j = 0; j < M; j++)
- // for (k = 0; k < P; k++)
- // C[i, j] += A[i, k] * B[k, j]
- //
- // Permutation of three outer loops: 3! = 6 possibilities.
- int FirstDims[] = {0, 0, 1, 1, 2, 2};
- int SecondDims[] = {1, 2, 2, 0, 0, 1};
- for (int i = 0; i < 6; i += 1) {
- auto PossibleMatMul =
- Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
- .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
-
- AccMap = AccMap.intersect_domain(Domain);
- PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
-
- // If AccMap spans the entire domain (i.e., it is not a partial write),
- // compute FirstPos and SecondPos.
- // If AccMap != PossibleMatMul here (both maps have been restricted to the
- // statement domain at this point), the writes are not complete. In other
- // words, it is a partial write, and partial writes must be rejected.
- if (AccMap.is_equal(PossibleMatMul)) {
- if (FirstPos != -1 && FirstPos != FirstDims[i])
- continue;
- FirstPos = FirstDims[i];
- if (SecondPos != -1 && SecondPos != SecondDims[i])
- continue;
- SecondPos = SecondDims[i];
- return true;
- }
- }
-
- return false;
-}
-
-/// Does the memory access represent a non-scalar operand of the matrix
-/// multiplication?
-///
-/// Check that the memory access @p MemAccess is the read access to a non-scalar
-/// operand of the matrix multiplication or its result.
-///
-/// @param MemAccess The memory access to be checked.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return True in case the memory access represents the read access
-/// to a non-scalar operand of the matrix multiplication and
-/// false, otherwise.
-static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
- MatMulInfoTy &MMI) {
- if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
- return false;
- auto AccMap = MemAccess->getLatestAccessRelation();
- isl::set StmtDomain = MemAccess->getStatement()->getDomain();
- if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
- MMI.ReadFromC = MemAccess;
- return true;
- }
- if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
- MMI.A = MemAccess;
- return true;
- }
- if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
- MMI.B = MemAccess;
- return true;
- }
- return false;
-}
-
-/// Check accesses to operands of the matrix multiplication.
-///
-/// Check that accesses of the SCoP statement, which corresponds to
-/// the partial schedule @p PartialSchedule, are scalar in terms of loops
-/// containing the matrix multiplication, in case they do not represent
-/// accesses to the non-scalar operands of the matrix multiplication or
-/// its result.
-///
-/// @param PartialSchedule The partial schedule of the SCoP statement.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return True in case the corresponding SCoP statement
-/// represents matrix multiplication and false,
-/// otherwise.
-static bool containsOnlyMatrMultAcc(isl::map PartialSchedule,
- MatMulInfoTy &MMI) {
- auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in);
- auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user());
- isl_size OutDimNum = PartialSchedule.dim(isl::dim::out);
- assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
- "and, consequently, the corresponding scheduling "
- "functions have at least three dimensions.");
- auto MapI =
- permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1);
- auto MapJ =
- permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
- auto MapK =
- permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
-
- auto Accesses = getAccessesInOrder(*Stmt);
- for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
- auto *MemAccessPtr = *MemA;
- if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
- !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
- !(MemAccessPtr->isStrideZero(MapI)) &&
- MemAccessPtr->isStrideZero(MapJ) && MemAccessPtr->isStrideZero(MapK))
- return false;
- }
- return true;
-}
-
-/// Check for dependencies corresponding to the matrix multiplication.
-///
-/// Check that there is only true dependence of the form
-/// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement
-/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
-/// to the dependency produced by the matrix multiplication.
-///
-/// @param Schedule The schedule of the SCoP statement.
-/// @param D The SCoP dependencies.
-/// @param Pos The parameter to describe an acceptable true dependence.
-/// In case it has a negative value, try to determine its
-/// acceptable value.
-/// @return True in case dependencies correspond to the matrix multiplication
-/// and false, otherwise.
-static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D,
- int &Pos) {
- isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW);
- isl::union_map Red = D->getDependences(Dependences::TYPE_RED);
- if (Red)
- Dep = Dep.unite(Red);
- auto DomainSpace = Schedule.get_space().domain();
- auto Space = DomainSpace.map_from_domain_and_range(DomainSpace);
- auto Deltas = Dep.extract_map(Space).deltas();
- isl_size DeltasDimNum = Deltas.dim(isl::dim::set);
- for (int i = 0; i < DeltasDimNum; i++) {
- auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i);
- Pos = Pos < 0 && Val.is_one() ? i : Pos;
- if (Val.is_nan() || !(Val.is_zero() || (i == Pos && Val.is_one())))
- return false;
- }
- if (DeltasDimNum == 0 || Pos < 0)
- return false;
- return true;
-}
-
-/// Check if the SCoP statement could probably be optimized with analytical
-/// modeling.
-///
-/// containsMatrMult tries to determine whether the following conditions
-/// are true:
-/// 1. The last memory access modeling an array, MA1, represents writing to
-/// memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
-/// S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
-/// under consideration.
-/// 2. There is only one loop-carried true dependency, and it has the
-/// form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
-/// other loop-carried or anti dependencies.
-/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
-/// reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
-/// S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
-/// and all memory accesses of the SCoP that are different from MA1, MA2,
-/// MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
-/// of loops i1, i2 and i3.
-///
-/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
-/// to check.
-/// @param D The SCoP dependencies.
-/// @param MMI Parameters of the matrix multiplication operands.
-static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D,
- MatMulInfoTy &MMI) {
- auto InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
- auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
- if (Stmt->size() <= 1)
- return false;
-
- auto Accesses = getAccessesInOrder(*Stmt);
- for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
- auto *MemAccessPtr = *MemA;
- if (!MemAccessPtr->isLatestArrayKind())
- continue;
- if (!MemAccessPtr->isWrite())
- return false;
- auto AccMap = MemAccessPtr->getLatestAccessRelation();
- if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
- return false;
- MMI.WriteToC = MemAccessPtr;
- break;
- }
-
- if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
- return false;
-
- if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
- return false;
-
- if (!MMI.A || !MMI.B || !MMI.ReadFromC)
- return false;
- return true;
-}
-
-/// Permute two dimensions of the band node.
-///
-/// Permute FirstDim and SecondDim dimensions of the Node.
-///
-/// @param Node The band node to be modified.
-/// @param FirstDim The first dimension to be permuted.
-/// @param SecondDim The second dimension to be permuted.
-static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
- unsigned FirstDim,
- unsigned SecondDim) {
- assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band &&
- (unsigned)isl_schedule_node_band_n_member(Node.get()) >
- std::max(FirstDim, SecondDim));
- auto PartialSchedule =
- isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
- auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim);
- auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim);
- PartialSchedule =
- PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
- PartialSchedule =
- PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
- Node = isl::manage(isl_schedule_node_delete(Node.release()));
- return Node.insert_partial_schedule(PartialSchedule);
-}
-
-isl::schedule_node ScheduleTreeOptimizer::createMicroKernel(
- isl::schedule_node Node, MicroKernelParamsTy MicroKernelParams) {
- Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
- 1);
- Node = Node.parent().parent();
- return permuteBandNodeDimensions(Node, 0, 1).child(0).child(0);
-}
-
-isl::schedule_node ScheduleTreeOptimizer::createMacroKernel(
- isl::schedule_node Node, MacroKernelParamsTy MacroKernelParams) {
- assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
- if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
- MacroKernelParams.Kc == 1)
- return Node;
- int DimOutNum = isl_schedule_node_band_n_member(Node.get());
- std::vector<int> TileSizes(DimOutNum, 1);
- TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
- TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
- TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
- Node = tileNode(Node, "1st level tiling", TileSizes, 1);
- Node = Node.parent().parent();
- Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
- Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
-
- // Mark the outermost loop as parallelizable.
- Node = Node.band_member_set_coincident(0, true);
-
- return Node.child(0).child(0);
-}
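For orientation, the loop structure that the macro- and micro-kernel construction above aims to produce can be sketched in plain C++ as follows. This is a minimal sketch, not the code Polly generates: the block sizes are hypothetical placeholders, packing is omitted (it is handled by the separate packing transformation below), and the exact loop order produced by the schedule transformation may differ in detail.

```cpp
#include <algorithm>

// Hypothetical block and register-tile sizes; the real values come from
// getMacroKernelParams and getMicroKernelParams.
constexpr int Mc = 96, Nc = 2048, Kc = 256;
constexpr int Mr = 4, Nr = 8;

// C (M x N, row-major) += A (M x K, row-major) * B (K x N, row-major).
void gemmBlisSketch(int M, int N, int K, const double *A, const double *B,
                    double *C) {
  // Three loops around the macro-kernel (packing of B and A would be
  // inserted after the pc and ic loops, respectively).
  for (int jc = 0; jc < N; jc += Nc)
    for (int pc = 0; pc < K; pc += Kc)
      for (int ic = 0; ic < M; ic += Mc)
        // Macro-kernel: two loops around the micro-kernel.
        for (int jr = jc; jr < std::min(jc + Nc, N); jr += Nr)
          for (int ir = ic; ir < std::min(ic + Mc, M); ir += Mr)
            // Micro-kernel: a loop around a rank-1 (outer product) update
            // of an Mr x Nr block of C.
            for (int p = pc; p < std::min(pc + Kc, K); ++p)
              for (int i = ir; i < std::min(ir + Mr, M); ++i)
                for (int j = jr; j < std::min(jr + Nr, N); ++j)
                  C[i * N + j] += A[i * K + p] * B[p * N + j];
}
```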
-
-/// Get the size of the widest type of the matrix multiplication operands
-/// in bytes, including alignment padding.
-///
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The size of the widest type of the matrix multiplication operands
-/// in bytes, including alignment padding.
-static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
- auto *S = MMI.A->getStatement()->getParent();
- auto &DL = S->getFunction().getParent()->getDataLayout();
- auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
- auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
- auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
- return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
-}
-
-/// Get the size of the widest type of the matrix multiplication operands
-/// in bits.
-///
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The size of the widest type of the matrix multiplication operands
-/// in bits.
-static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
- auto *S = MMI.A->getStatement()->getParent();
- auto &DL = S->getFunction().getParent()->getDataLayout();
- auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
- auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
- auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
- return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
-}
-
-/// Get parameters of the BLIS micro kernel.
-///
-/// We choose the Mr and Nr parameters of the micro kernel to be large enough
-/// such that no stalls caused by the combination of latencies and dependencies
-/// are introduced during the updates of the resulting matrix of the matrix
-/// multiplication. However, they should also be as small as possible to
-/// release more registers for entries of multiplied matrices.
-///
-/// @param TTI Target Transform Info.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The structure of type MicroKernelParamsTy.
-/// @see MicroKernelParamsTy
-static struct MicroKernelParamsTy
-getMicroKernelParams(const TargetTransformInfo *TTI, MatMulInfoTy MMI) {
- assert(TTI && "The target transform info should be provided.");
-
- // Nvec - Number of double-precision floating-point numbers that can be held
- // by a vector register. Use 2 by default.
- long RegisterBitwidth = VectorRegisterBitwidth;
-
- if (RegisterBitwidth == -1)
- RegisterBitwidth =
- TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
- auto ElementSize = getMatMulTypeSize(MMI);
- assert(ElementSize > 0 && "The element size of the matrix multiplication "
- "operands should be greater than zero.");
- auto Nvec = RegisterBitwidth / ElementSize;
- if (Nvec == 0)
- Nvec = 2;
- int Nr = ceil(sqrt((double)(Nvec * LatencyVectorFma * ThroughputVectorFma)) /
- Nvec) *
- Nvec;
- int Mr = ceil((double)(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr));
- return {Mr, Nr};
-}
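As a worked example of the formula above, the following standalone sketch reproduces the computation for one assumed configuration: a 256-bit vector register, 64-bit elements, and FMA latency and throughput values of 8 and 1. Under these assumptions it yields a 4x8 micro-kernel.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Assumed inputs; in Polly they come from TTI, the matrix element type, and
  // the LatencyVectorFma/ThroughputVectorFma options.
  const long RegisterBitwidth = 256, ElementSize = 64;
  const long LatencyVectorFma = 8, ThroughputVectorFma = 1;

  long Nvec = RegisterBitwidth / ElementSize; // 4 elements per register
  double Work =
      static_cast<double>(Nvec * LatencyVectorFma * ThroughputVectorFma);
  int Nr = static_cast<int>(std::ceil(std::sqrt(Work) / Nvec) * Nvec); // 8
  int Mr = static_cast<int>(std::ceil(Work / Nr));                     // 4
  std::printf("Mr = %d, Nr = %d\n", Mr, Nr); // prints "Mr = 4, Nr = 8"
  return 0;
}
```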
-
-/// Determine parameters of the target cache.
-///
-/// @param TTI Target Transform Info.
-void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
- auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
- auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
- if (FirstCacheLevelSize == -1) {
- if (TTI->getCacheSize(L1DCache).hasValue())
- FirstCacheLevelSize = TTI->getCacheSize(L1DCache).getValue();
- else
- FirstCacheLevelSize = static_cast<int>(FirstCacheLevelDefaultSize);
- }
- if (SecondCacheLevelSize == -1) {
- if (TTI->getCacheSize(L2DCache).hasValue())
- SecondCacheLevelSize = TTI->getCacheSize(L2DCache).getValue();
- else
- SecondCacheLevelSize = static_cast<int>(SecondCacheLevelDefaultSize);
- }
- if (FirstCacheLevelAssociativity == -1) {
- if (TTI->getCacheAssociativity(L1DCache).hasValue())
- FirstCacheLevelAssociativity =
- TTI->getCacheAssociativity(L1DCache).getValue();
- else
- FirstCacheLevelAssociativity =
- static_cast<int>(FirstCacheLevelDefaultAssociativity);
- }
- if (SecondCacheLevelAssociativity == -1) {
- if (TTI->getCacheAssociativity(L2DCache).hasValue())
- SecondCacheLevelAssociativity =
- TTI->getCacheAssociativity(L2DCache).getValue();
- else
- SecondCacheLevelAssociativity =
- static_cast<int>(SecondCacheLevelDefaultAssociativity);
- }
-}
-
-/// Get parameters of the BLIS macro kernel.
-///
-/// During the computation of matrix multiplication, blocks of partitioned
-/// matrices are mapped to different layers of the memory hierarchy.
-/// To optimize data reuse, blocks should be ideally kept in cache between
-/// iterations. Since parameters of the macro kernel determine sizes of these
-/// blocks, there are upper and lower bounds on these parameters.
-///
-/// @param TTI Target Transform Info.
-/// @param MicroKernelParams Parameters of the micro-kernel
-/// to be taken into account.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The structure of type MacroKernelParamsTy.
-/// @see MacroKernelParamsTy
-/// @see MicroKernelParamsTy
-static struct MacroKernelParamsTy
-getMacroKernelParams(const llvm::TargetTransformInfo *TTI,
- const MicroKernelParamsTy &MicroKernelParams,
- MatMulInfoTy MMI) {
- getTargetCacheParameters(TTI);
- // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
- // determining all parameters of the macro-kernel requires information about
- // the first two cache levels, and the associativity degree of each level must
- // be greater than two. Otherwise, another algorithm for determining the
- // parameters should be used.
- if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
- FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
- FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
- return {1, 1, 1};
- // The quotient should be greater than zero.
- if (PollyPatternMatchingNcQuotient <= 0)
- return {1, 1, 1};
- int Car = floor(
- (FirstCacheLevelAssociativity - 1) /
- (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
-
- // Car can be zero because it is the result of an integer floor division.
- // If Car is zero, Kc and consequently Cac become zero as well. On Mac OS,
- // the resulting division by zero does not raise a signal but produces
- // negative tile sizes, so return early if this happens.
- if (Car == 0)
- return {1, 1, 1};
-
- auto ElementSize = getMatMulAlignTypeSize(MMI);
- assert(ElementSize > 0 && "The element size of the matrix multiplication "
- "operands should be greater than zero.");
- int Kc = (Car * FirstCacheLevelSize) /
- (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize);
- double Cac =
- static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) /
- SecondCacheLevelSize;
- int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
- int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
-
- assert(Mc > 0 && Nc > 0 && Kc > 0 &&
- "Matrix block sizes should be greater than zero");
- return {Mc, Nc, Kc};
-}
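The following standalone sketch reproduces the computation above for one assumed configuration: Mr = 4 and Nr = 8 from the micro-kernel, 8-byte elements, a 32 KiB 8-way L1 cache, a 256 KiB 8-way L2 cache, and an Nc quotient of 256. Under these assumptions the result is Mc = 96, Nc = 2048, and Kc = 256.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Assumed inputs; in Polly they come from the micro-kernel parameters,
  // the target cache parameters, and PollyPatternMatchingNcQuotient.
  const int Mr = 4, Nr = 8, ElementSize = 8;
  const int L1Size = 32 * 1024, L1Assoc = 8;
  const int L2Size = 256 * 1024, L2Assoc = 8;
  const int NcQuotient = 256;

  int Car = std::floor((L1Assoc - 1) /
                       (1 + static_cast<double>(Nr) / Mr));      // 2
  int Kc = (Car * L1Size) / (Mr * L1Assoc * ElementSize);        // 256
  double Cac =
      static_cast<double>(Kc * ElementSize * L2Assoc) / L2Size;  // 0.0625
  int Mc = std::floor((L2Assoc - 2) / Cac);                      // 96
  int Nc = NcQuotient * Nr;                                      // 2048
  std::printf("Mc = %d, Nc = %d, Kc = %d\n", Mc, Nc, Kc);
  return 0;
}
```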
-
-/// Create an access relation that is specific to
-/// the matrix multiplication pattern.
-///
-/// Create an access relation of the following form:
-/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
-/// where I is @p FirstDim, J is @p SecondDim.
-///
-/// It can be used, for example, to create relations that help to access
-/// elements of the matrix multiplication operands consecutively after the
-/// BLIS micro and macro kernels have been created.
-///
-/// @see ScheduleTreeOptimizer::createMicroKernel
-/// @see ScheduleTreeOptimizer::createMacroKernel
-///
-/// Subsequently, the described access relation is applied to the range of
-/// @p MapOldIndVar, which maps the original induction variables to the ones
-/// produced by the schedule transformations. This makes it possible to define
-/// relations in the new space while, at the same time, keeping them in the
-/// original one.
-///
-/// @param MapOldIndVar The relation, which maps original induction variables
-/// to the ones, which are produced by schedule
-/// transformations.
-/// @param FirstDim, SecondDim The input dimensions that are used to define
-/// the specified access relation.
-/// @return The specified access relation.
-isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
- unsigned SecondDim) {
- auto AccessRelSpace = isl::space(MapOldIndVar.get_ctx(), 0, 9, 3);
- auto AccessRel = isl::map::universe(AccessRelSpace);
- AccessRel = AccessRel.equate(isl::dim::in, FirstDim, isl::dim::out, 0);
- AccessRel = AccessRel.equate(isl::dim::in, 5, isl::dim::out, 1);
- AccessRel = AccessRel.equate(isl::dim::in, SecondDim, isl::dim::out, 2);
- return MapOldIndVar.apply_range(AccessRel);
-}
-
-isl::schedule_node createExtensionNode(isl::schedule_node Node,
- isl::map ExtensionMap) {
- auto Extension = isl::union_map(ExtensionMap);
- auto NewNode = isl::schedule_node::from_extension(Extension);
- return Node.graft_before(NewNode);
-}
-
-/// Apply the packing transformation.
-///
-/// The packing transformation can be described as a data-layout
-/// transformation that introduces a new array, copies data to it, and
-/// changes memory access locations to reference the new array. It can be
-/// used to ensure that elements of the new array are read with in-stride
-/// accesses, aligned to cache line boundaries, and preloaded into certain
-/// cache levels.
-///
-/// As an example let us consider the packing of the array A that would help
-/// to read its elements with in-stride access. An access to the array A
-/// is represented by an access relation that has the form
-/// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
-/// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
-/// k mod Kc, j mod Nr, i mod Mr].
-///
-/// To ensure that elements of the array A are read in-stride access, we add
-/// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
-/// Scop::createScopArrayInfo, change the access relation
-/// S[i, j, k] -> A[i, k] to
-/// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
-/// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
-/// the copy statement created by Scop::addScopStmt.
-///
-/// @param Node The schedule node to be optimized.
-/// @param MapOldIndVar The relation, which maps original induction variables
-/// to the ones, which are produced by schedule
-/// transformations.
-/// @param MicroParams, MacroParams Parameters of the BLIS kernel
-/// to be taken into account.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The optimized schedule node.
-static isl::schedule_node
-optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
- MicroKernelParamsTy MicroParams,
- MacroKernelParamsTy MacroParams,
- MatMulInfoTy &MMI) {
- auto InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
- auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
-
- // Create a copy statement that corresponds to the memory access to the
- // matrix B, the second operand of the matrix multiplication.
- Node = Node.parent().parent().parent().parent().parent().parent();
- Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
- auto AccRel = getMatMulAccRel(MapOldIndVar, 3, 7);
- unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
- unsigned SecondDimSize = MacroParams.Kc;
- unsigned ThirdDimSize = MicroParams.Nr;
- auto *SAI = Stmt->getParent()->createScopArrayInfo(
- MMI.B->getElementType(), "Packed_B",
- {FirstDimSize, SecondDimSize, ThirdDimSize});
- AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
- auto OldAcc = MMI.B->getLatestAccessRelation();
- MMI.B->setNewAccessRelation(AccRel);
- auto ExtMap = MapOldIndVar.project_out(isl::dim::out, 2,
- MapOldIndVar.dim(isl::dim::out) - 2);
- ExtMap = ExtMap.reverse();
- ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
- auto Domain = Stmt->getDomain();
-
- // Restrict the domain of the copy statement so that it only executes when
- // its originating statement is executed.
- auto DomainId = Domain.get_tuple_id();
- auto *NewStmt = Stmt->getParent()->addScopStmt(
- OldAcc, MMI.B->getLatestAccessRelation(), Domain);
- ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
- ExtMap = ExtMap.intersect_range(Domain);
- ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
- Node = createExtensionNode(Node, ExtMap);
-
- // Create a copy statement that corresponds to the memory access
- // to the matrix A, the first operand of the matrix multiplication.
- Node = Node.child(0);
- AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
- FirstDimSize = MacroParams.Mc / MicroParams.Mr;
- ThirdDimSize = MicroParams.Mr;
- SAI = Stmt->getParent()->createScopArrayInfo(
- MMI.A->getElementType(), "Packed_A",
- {FirstDimSize, SecondDimSize, ThirdDimSize});
- AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
- OldAcc = MMI.A->getLatestAccessRelation();
- MMI.A->setNewAccessRelation(AccRel);
- ExtMap = MapOldIndVar.project_out(isl::dim::out, 3,
- MapOldIndVar.dim(isl::dim::out) - 3);
- ExtMap = ExtMap.reverse();
- ExtMap = ExtMap.fix_si(isl::dim::out, MMI.j, 0);
- NewStmt = Stmt->getParent()->addScopStmt(
- OldAcc, MMI.A->getLatestAccessRelation(), Domain);
-
- // Restrict the domain of the copy statement so that it only executes when
- // its originating statement is executed.
- ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
- ExtMap = ExtMap.intersect_range(Domain);
- ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
- Node = createExtensionNode(Node, ExtMap);
- return Node.child(0).child(0).child(0).child(0).child(0);
-}
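To make the index mapping of the packing copies concrete, here is a plain-C++ sketch of what the generated copy statements amount to at run time. The routine names are illustrative, the sketch assumes Mc and Nc are multiples of Mr and Nr, and Polly emits these copies as SCoP copy statements rather than explicit loops.

```cpp
// Copy an Mc x Kc block of the row-major matrix A (leading dimension lda),
// starting at (i0, k0), into Packed_A laid out as [Mc/Mr][Kc][Mr].
void packA(int Mc, int Kc, int Mr, const double *A, int lda, int i0, int k0,
           double *PackedA) {
  for (int i = 0; i < Mc; ++i)
    for (int k = 0; k < Kc; ++k)
      PackedA[((i / Mr) * Kc + k) * Mr + i % Mr] = A[(i0 + i) * lda + (k0 + k)];
}

// Copy a Kc x Nc block of the row-major matrix B (leading dimension ldb),
// starting at (k0, j0), into Packed_B laid out as [Nc/Nr][Kc][Nr].
void packB(int Kc, int Nc, int Nr, const double *B, int ldb, int k0, int j0,
           double *PackedB) {
  for (int k = 0; k < Kc; ++k)
    for (int j = 0; j < Nc; ++j)
      PackedB[((j / Nr) * Kc + k) * Nr + j % Nr] = B[(k0 + k) * ldb + (j0 + j)];
}
```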
-
-/// Get a relation mapping the original induction variables to the ones
-/// produced by the schedule transformations.
-///
-/// @param Node The schedule node produced as the result of creation
-/// of the BLIS kernels.
-/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
-/// to be taken into account.
-/// @return The relation mapping original induction variables to the ones
-/// produced by schedule transformation.
-/// @see ScheduleTreeOptimizer::createMicroKernel
-/// @see ScheduleTreeOptimizer::createMacroKernel
-/// @see getMacroKernelParams
-isl::map
-getInductionVariablesSubstitution(isl::schedule_node Node,
- MicroKernelParamsTy MicroKernelParams,
- MacroKernelParamsTy MacroKernelParams) {
- auto Child = Node.child(0);
- auto UnMapOldIndVar = Child.get_prefix_schedule_union_map();
- auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar);
- if (MapOldIndVar.dim(isl::dim::out) > 9)
- return MapOldIndVar.project_out(isl::dim::out, 0,
- MapOldIndVar.dim(isl::dim::out) - 9);
- return MapOldIndVar;
-}
-
-/// Isolate a set of partial tile prefixes and unroll the isolated part.
-///
-/// The isolated set contains only those partial tile prefixes that have
-/// exactly Mr x Nr iterations of the two innermost loops produced by
-/// the optimization of the matrix multiplication. Mr and Nr are parameters of
-/// the micro-kernel.
-///
-/// In case of parametric bounds, this helps to auto-vectorize the unrolled
-/// innermost loops, using the SLP vectorizer.
-///
-/// @param Node The schedule node to be modified.
-/// @param MicroKernelParams Parameters of the micro-kernel
-/// to be taken into account.
-/// @return The modified isl_schedule_node.
-static isl::schedule_node
-isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
- struct MicroKernelParamsTy MicroKernelParams) {
- isl::schedule_node Child = Node.get_child(0);
- isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation();
- isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range();
- isl_size Dims = Prefix.dim(isl::dim::set);
- Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1);
- Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
- Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);
-
- isl::union_set IsolateOption =
- getIsolateOptions(Prefix.add_dims(isl::dim::set, 3), 3);
- isl::ctx Ctx = Node.get_ctx();
- auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll"));
- Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
- Node = Node.band_set_ast_build_options(Options);
- Node = Node.parent().parent().parent();
- IsolateOption = getIsolateOptions(Prefix, 3);
- Options = IsolateOption.unite(getDimOptions(Ctx, "separate"));
- Node = Node.band_set_ast_build_options(Options);
- Node = Node.child(0).child(0).child(0);
- return Node;
-}
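Viewed at the C level, the isolation performed above roughly amounts to separating the full tiles from the remainder iterations so that the full tiles, which have a known constant trip count, can be fully unrolled and later SLP-vectorized. A one-dimensional sketch of this idea, with an assumed tile size T:

```cpp
// Split a loop of N iterations into full tiles of size T, whose inner loop
// has exactly T iterations and is therefore safe to unroll, and a remainder
// loop for the leftover iterations.
void isolateFullTilesSketch(int N, int T, double *X) {
  int Full = (N / T) * T;
  for (int i = 0; i < Full; i += T)
    for (int j = 0; j < T; ++j) // exactly T iterations: candidate for unrolling
      X[i + j] += 1.0;
  for (int i = Full; i < N; ++i) // partial tile (remainder)
    X[i] += 1.0;
}
```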
-
-/// Mark @p BasePtr with "Inter iteration alias-free" mark node.
-///
-/// @param Node The child of the mark node to be inserted.
-/// @param BasePtr The pointer to be marked.
-/// @return The modified isl_schedule_node.
-static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node,
- Value *BasePtr) {
- if (!BasePtr)
- return Node;
-
- auto Id =
- isl::id::alloc(Node.get_ctx(), "Inter iteration alias-free", BasePtr);
- return Node.insert_mark(Id).child(0);
-}
-
-/// Insert "Loop Vectorizer Disabled" mark node.
-///
-/// @param Node The child of the mark node to be inserted.
-/// @return The modified isl_schedule_node.
-static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
- auto Id = isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr);
- return Node.insert_mark(Id).child(0);
-}
-
-/// Restore the initial ordering of dimensions of the band node
-///
-/// In case the band node represents all the dimensions of the iteration
-/// domain, recreate the band node to restore the initial ordering of the
-/// dimensions.
-///
-/// @param Node The band node to be modified.
-/// @return The modified schedule node.
-static isl::schedule_node
-getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
- assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
- if (isl_schedule_node_get_type(Node.child(0).get()) != isl_schedule_node_leaf)
- return Node;
- auto Domain = Node.get_universe_domain();
- assert(isl_union_set_n_set(Domain.get()) == 1);
- if (Node.get_schedule_depth() != 0 ||
- (isl::set(Domain).dim(isl::dim::set) !=
- isl_schedule_node_band_n_member(Node.get())))
- return Node;
- Node = isl::manage(isl_schedule_node_delete(Node.copy()));
- auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff();
- auto PartialScheduleMultiPwAff =
- isl::multi_union_pw_aff(PartialSchedulePwAff);
- PartialScheduleMultiPwAff =
- PartialScheduleMultiPwAff.reset_tuple_id(isl::dim::set);
- return Node.insert_partial_schedule(PartialScheduleMultiPwAff);
-}
-
-isl::schedule_node
-ScheduleTreeOptimizer::optimizeMatMulPattern(isl::schedule_node Node,
- const TargetTransformInfo *TTI,
- MatMulInfoTy &MMI) {
- assert(TTI && "The target transform info should be provided.");
- Node = markInterIterationAliasFree(
- Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr());
- int DimOutNum = isl_schedule_node_band_n_member(Node.get());
- assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
- "and, consequently, the corresponding scheduling "
- "functions have at least three dimensions.");
- Node = getBandNodeWithOriginDimOrder(Node);
- Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
- int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
- int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
- Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
- NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
- Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
- auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
- auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI);
- Node = createMacroKernel(Node, MacroKernelParams);
- Node = createMicroKernel(Node, MicroKernelParams);
- if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
- MacroKernelParams.Kc == 1)
- return Node;
- auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams,
- MacroKernelParams);
- if (!MapOldIndVar)
- return Node;
- Node = markLoopVectorizerDisabled(Node.parent()).child(0);
- Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
- return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
- MacroKernelParams, MMI);
-}
-
-bool ScheduleTreeOptimizer::isMatrMultPattern(isl::schedule_node Node,
- const Dependences *D,
- MatMulInfoTy &MMI) {
- auto PartialSchedule = isl::manage(
- isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
- Node = Node.child(0);
- auto LeafType = isl_schedule_node_get_type(Node.get());
- Node = Node.parent();
- if (LeafType != isl_schedule_node_leaf ||
- isl_schedule_node_band_n_member(Node.get()) < 3 ||
- Node.get_schedule_depth() != 0 ||
- isl_union_map_n_map(PartialSchedule.get()) != 1)
- return false;
- auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule);
- if (containsMatrMult(NewPartialSchedule, D, MMI))
- return true;
- return false;
-}
-
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
void *User) {
const OptimizerAdditionalInfoTy *OAI =
static_cast<const OptimizerAdditionalInfoTy *>(User);
- MatMulInfoTy MMI;
- if (PMBasedOpts && User &&
- isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)) {
- LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
- MatMulOpts++;
- return optimizeMatMulPattern(isl::manage(Node), OAI->TTI, MMI).release();
+ if (PMBasedOpts && User) {
+ if (isl::schedule_node PatternOptimizedSchedule = tryOptimizeMatMulPattern(
+ isl::manage_copy(Node), OAI->TTI, OAI->D)) {
+ MatMulOpts++;
+ isl_schedule_node_free(Node);
+ return PatternOptimizedSchedule.release();
+ }
}
return standardBandOpts(isl::manage(Node), User).release();
return Modulo.domain();
}
+/// Make the last dimension of Set take values from 0 to VectorWidth - 1.
+///
+/// @param Set A set, which should be modified.
+/// @param VectorWidth A parameter, which determines the constraint.
+static isl::set addExtentConstraints(isl::set Set, int VectorWidth) {
+ unsigned Dims = Set.dim(isl::dim::set);
+ isl::space Space = Set.get_space();
+ isl::local_space LocalSpace = isl::local_space(Space);
+ isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
+ ExtConstr = ExtConstr.set_constant_si(0);
+ ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, 1);
+ Set = Set.add_constraint(ExtConstr);
+ ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
+ ExtConstr = ExtConstr.set_constant_si(VectorWidth - 1);
+ ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
+ return Set.add_constraint(ExtConstr);
+}
} // namespace
bool polly::isBandMark(const isl::schedule_node &Node) {
return NewLoop.get_schedule();
}
+
+isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange,
+ int VectorWidth) {
+ isl_size Dims = ScheduleRange.dim(isl::dim::set);
+ isl::set LoopPrefixes =
+ ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1);
+ auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth);
+ isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange);
+ BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1);
+ LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1);
+ return LoopPrefixes.subtract(BadPrefixes);
+}
+
+isl::union_set polly::getIsolateOptions(isl::set IsolateDomain,
+ isl_size OutDimsNum) {
+ isl_size Dims = IsolateDomain.dim(isl::dim::set);
+ assert(OutDimsNum <= Dims &&
+ "The isl::set IsolateDomain is used to describe the range of schedule "
+ "dimensions values, which should be isolated. Consequently, the "
+ "number of its dimensions should be greater than or equal to the "
+ "number of the schedule dimensions.");
+ isl::map IsolateRelation = isl::map::from_domain(IsolateDomain);
+ IsolateRelation = IsolateRelation.move_dims(isl::dim::out, 0, isl::dim::in,
+ Dims - OutDimsNum, OutDimsNum);
+ isl::set IsolateOption = IsolateRelation.wrap();
+ isl::id Id = isl::id::alloc(IsolateOption.get_ctx(), "isolate", nullptr);
+ IsolateOption = IsolateOption.set_tuple_id(Id);
+ return isl::union_set(IsolateOption);
+}
+
+isl::union_set polly::getDimOptions(isl::ctx Ctx, const char *Option) {
+ isl::space Space(Ctx, 0, 1);
+ auto DimOption = isl::set::universe(Space);
+ auto Id = isl::id::alloc(Ctx, Option, nullptr);
+ DimOption = DimOption.set_tuple_id(Id);
+ return isl::union_set(DimOption);
+}
+
+isl::schedule_node polly::tileNode(isl::schedule_node Node,
+ const char *Identifier,
+ ArrayRef<int> TileSizes,
+ int DefaultTileSize) {
+ auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
+ auto Dims = Space.dim(isl::dim::set);
+ auto Sizes = isl::multi_val::zero(Space);
+ std::string IdentifierString(Identifier);
+ for (auto i : seq<isl_size>(0, Dims)) {
+ auto tileSize =
+ i < (isl_size)TileSizes.size() ? TileSizes[i] : DefaultTileSize;
+ Sizes = Sizes.set_val(i, isl::val(Node.get_ctx(), tileSize));
+ }
+ auto TileLoopMarkerStr = IdentifierString + " - Tiles";
+ auto TileLoopMarker =
+ isl::id::alloc(Node.get_ctx(), TileLoopMarkerStr, nullptr);
+ Node = Node.insert_mark(TileLoopMarker);
+ Node = Node.child(0);
+ Node =
+ isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
+ Node = Node.child(0);
+ auto PointLoopMarkerStr = IdentifierString + " - Points";
+ auto PointLoopMarker =
+ isl::id::alloc(Node.get_ctx(), PointLoopMarkerStr, nullptr);
+ Node = Node.insert_mark(PointLoopMarker);
+ return Node.child(0);
+}
+
+isl::schedule_node polly::applyRegisterTiling(isl::schedule_node Node,
+ ArrayRef<int> TileSizes,
+ int DefaultTileSize) {
+ Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
+ auto Ctx = Node.get_ctx();
+ return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}"));
+}
add_polly_unittest(ScheduleOptimizerTests
- ScheduleOptimizerTest.cpp
+ ScheduleTreeTransformTest.cpp
)
-//===- ScheduleOptimizerTest.cpp ------------------------------------------===//
+//===- ScheduleTreeTransformTest.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
-#include "polly/ScheduleOptimizer.h"
+#include "polly/ScheduleTreeTransform.h"
#include "gtest/gtest.h"
-#include "isl/stream.h"
-#include "isl/val.h"
+#include "isl/ctx.h"
using namespace isl;
using namespace polly;
namespace {
-TEST(ScheduleOptimizer, getPartialTilePrefixes) {
-
+TEST(ScheduleTreeTransform, getPartialTilePrefixes) {
isl_ctx *ctx = isl_ctx_alloc();
{