[Polly] Move MatMul optimization into its own file. NFC.

author Michael Kruse <llvm-project@meinersbur.de>

Sat, 5 Jun 2021 04:17:41 +0000 (23:17 -0500)

committer Michael Kruse <llvm-project@meinersbur.de>

Sat, 5 Jun 2021 04:22:30 +0000 (23:22 -0500)
author Michael Kruse <llvm-project@meinersbur.de>
Sat, 5 Jun 2021 04:17:41 +0000 (23:17 -0500)
committer Michael Kruse <llvm-project@meinersbur.de>
Sat, 5 Jun 2021 04:22:30 +0000 (23:22 -0500)
diff --git a/polly/include/polly/MatmulOptimizer.h b/polly/include/polly/MatmulOptimizer.h

new file mode 100644 (file)

index 0000000..e00003d
--- /dev/null
+++ b/polly/include/polly/MatmulOptimizer.h
@@ -0,0 +1,74 @@
+//===- MatmulOptimizer.h -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_MATMULOPTIMIZER_H
+#define POLLY_MATMULOPTIMIZER_H
+
+#include "isl/isl-noexceptions.h"
+
+namespace llvm {
+class TargetTransformInfo;
+}
+
+namespace polly {
+struct Dependences;
+
+/// Apply the BLIS matmul optimization pattern if possible.
+///
+/// Make the loops containing the matrix multiplication be the innermost
+/// loops and apply the BLIS matmul optimization pattern. BLIS implements
+/// gemm as three nested loops around a macro-kernel, plus two packing
+/// routines. The macro-kernel is implemented in terms of two additional
+/// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
+/// (i.e., outer product) update.
+///
+/// For a detailed description please see [1].
+///
+/// The order of the loops defines the data reused in the BLIS implementation
+/// of gemm ([1]). In particular, elements of the matrix B, the second
+/// operand of matrix multiplication, are reused between iterations of the
+/// innermost loop. To keep the reused data in cache, only elements of matrix
+/// A, the first operand of matrix multiplication, should be evicted during
+/// an iteration of the innermost loop. To provide such a cache replacement
+/// policy, elements of the matrix A can, in particular, be loaded first and,
+/// consequently, be least-recently-used.
+///
+/// In our case matrices are stored in row-major order instead of
+/// column-major order used in the BLIS implementation ([1]). This affects
+/// only the form of the BLIS micro kernel and the computation of its
+/// parameters. In particular, reused elements of the matrix B are
+/// successively multiplied by specific elements of the matrix A.
+///
+/// Refs.:
+/// [1] - Analytical Modeling is Enough for High Performance BLIS
+/// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
+/// Technical Report, 2014
+/// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMicroKernelParams
+/// @see getMacroKernelParams
+///
+/// TODO: Implement the packing transformation.
+///
+/// @param Node The node that contains a band to be optimized. The node
+///             is required to successfully pass
+///             ScheduleTreeOptimizer::isMatrMultPattern.
+/// @param TTI  Target Transform Info.
+/// @param D    The dependencies.
+///
+/// @returns    The transformed schedule or nullptr if the optimization
+///             cannot be applied.
+isl::schedule_node
+tryOptimizeMatMulPattern(isl::schedule_node Node,
+                         const llvm::TargetTransformInfo *TTI,
+                         const Dependences *D);
+
+} // namespace polly
+#endif // POLLY_MATMULOPTIMIZER_H
diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h

index 1056b74cfa18917657224a6c1ae7856f19dcfaf8..8c326f6ac3494d6dabfec5632a5b81828b62d0f3 100644 (file)
--- a/polly/include/polly/ScheduleOptimizer.h
+++ b/polly/include/polly/ScheduleOptimizer.h
@@ -37,26 +37,6 @@ struct IslScheduleOptimizerPrinterPass
  private:
    llvm::raw_ostream &OS;
  };
-
-/// Build the desired set of partial tile prefixes.
-///
-/// We build a set of partial tile prefixes, which are prefixes of the vector
-/// loop that have exactly VectorWidth iterations.
-///
-/// 1. Drop all constraints involving the dimension that represents the
-///    vector loop.
-/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
-///    iterations.
-/// 3. Subtract loop domain from it, project out the vector loop dimension and
-///    get a set that contains prefixes, which do not have exactly VectorWidth
-///    iterations.
-/// 4. Project out the vector loop dimension of the set that was build on the
-///    first step and subtract the set built on the previous step to get the
-///    desired set of prefixes.
-///
-/// @param ScheduleRange A range of a map, which describes a prefix schedule
-///                      relation.
-isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
  } // namespace polly
  
  namespace llvm {
diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h

index 3786f70624503af846966d80590c8366e898d62f..8727414c0bc3a611d69da7ebfd576ec907fed11a 100644 (file)
--- a/polly/include/polly/ScheduleTreeTransform.h
+++ b/polly/include/polly/ScheduleTreeTransform.h
@@ -13,6 +13,7 @@
  #ifndef POLLY_SCHEDULETREETRANSFORM_H
  #define POLLY_SCHEDULETREETRANSFORM_H
  
+#include "llvm/ADT/ArrayRef.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "isl/isl-noexceptions.h"
  #include <cassert>
@@ -164,6 +165,65 @@ isl::schedule applyFullUnroll(isl::schedule_node BandToUnroll);
  /// Replace the AST band @p BandToUnroll by a partially unrolled equivalent.
  isl::schedule applyPartialUnroll(isl::schedule_node BandToUnroll, int Factor);
  
+/// Build the desired set of partial tile prefixes.
+///
+/// We build a set of partial tile prefixes, which are prefixes of the vector
+/// loop that have exactly VectorWidth iterations.
+///
+/// 1. Drop all constraints involving the dimension that represents the
+///    vector loop.
+/// 2. Constrain the last dimension to get a set, which has exactly VectorWidth
+///    iterations.
+/// 3. Subtract loop domain from it, project out the vector loop dimension and
+///    get a set that contains prefixes, which do not have exactly VectorWidth
+///    iterations.
+/// 4. Project out the vector loop dimension of the set that was built on the
+///    first step and subtract the set built on the previous step to get the
+///    desired set of prefixes.
+///
+/// @param ScheduleRange A range of a map, which describes a prefix schedule
+///                      relation.
+isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth);
+
+/// Create an isl::union_set, which describes the isolate option based on
+/// IsolateDomain.
+///
+/// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
+///                      belong to the current band node.
+/// @param OutDimsNum    A number of dimensions that should belong to
+///                      the current band node.
+isl::union_set getIsolateOptions(isl::set IsolateDomain, isl_size OutDimsNum);
+
+/// Create an isl::union_set, which describes the specified option for the
+/// dimension of the current node.
+///
+/// @param Ctx    An isl::ctx, which is used to create the isl::union_set.
+/// @param Option The name of the option.
+isl::union_set getDimOptions(isl::ctx Ctx, const char *Option);
+
+/// Tile a schedule node.
+///
+/// @param Node            The node to tile.
+/// @param Identifier      A name that identifies this kind of tiling and
+///                        that is used to mark the tiled loops in the
+///                        generated AST.
+/// @param TileSizes       A vector of tile sizes that should be used for
+///                        tiling.
+/// @param DefaultTileSize A default tile size that is used for dimensions
+///                        that are not covered by the TileSizes vector.
+isl::schedule_node tileNode(isl::schedule_node Node, const char *Identifier,
+                            llvm::ArrayRef<int> TileSizes, int DefaultTileSize);
+
+/// Tile a schedule node and unroll point loops.
+///
+/// @param Node            The node to register tile.
+/// @param TileSizes       A vector of tile sizes that should be used for
+///                        tiling.
+/// @param DefaultTileSize A default tile size for dimensions not in TileSizes.
+isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
+                                       llvm::ArrayRef<int> TileSizes,
+                                       int DefaultTileSize);
+
  } // namespace polly
  
  #endif // POLLY_SCHEDULETREETRANSFORM_H
diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt

index 8f75a7edc5c64256fa88f5aaac8768e12118e7ee..65fed2634ab857e5a59f5804a734920c43cd8c4c 100644 (file)
--- a/polly/lib/CMakeLists.txt
+++ b/polly/lib/CMakeLists.txt
@@ -99,6 +99,7 @@ add_llvm_pass_plugin(Polly
    Transform/RewriteByReferenceParameters.cpp
    Transform/ScopInliner.cpp
    Transform/ManualOptimizer.cpp
+  Transform/MatmulOptimizer.cpp
    ${POLLY_HEADER_FILES}
  
    LINK_COMPONENTS
diff --git a/polly/lib/Transform/MatmulOptimizer.cpp b/polly/lib/Transform/MatmulOptimizer.cpp

new file mode 100644 (file)

index 0000000..27c3b2d
--- /dev/null
+++ b/polly/lib/Transform/MatmulOptimizer.cpp
@@ -0,0 +1,1001 @@
+//===- MatmulOptimizer.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/MatmulOptimizer.h"
+#include "polly/DependenceInfo.h"
+#include "polly/Options.h"
+#include "polly/ScheduleTreeTransform.h"
+#include "polly/ScopInfo.h"
+#include "polly/ScopPass.h"
+#include "polly/Simplify.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/raw_ostream.h"
+#include "isl/ctx.h"
+#include "isl/schedule_node.h"
+#include "isl/schedule_type.h"
+#include "isl/union_map.h"
+#include "isl/union_set.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "polly-opt-isl"
+
+using namespace llvm;
+using namespace polly;
+
+namespace llvm {
+class Value;
+}
+
+static cl::opt<int> LatencyVectorFma(
+    "polly-target-latency-vector-fma",
+    cl::desc("The minimal number of cycles between issuing two "
+             "dependent consecutive vector fused multiply-add "
+             "instructions."),
+    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> ThroughputVectorFma(
+    "polly-target-throughput-vector-fma",
+    cl::desc("A throughput of the processor floating-point arithmetic units "
+             "expressed in the number of vector fused multiply-add "
+             "instructions per clock cycle."),
+    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> FirstCacheLevelSize(
+    "polly-target-1st-cache-level-size",
+    cl::desc("The size of the first cache level specified in bytes."),
+    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> FirstCacheLevelDefaultSize(
+    "polly-target-1st-cache-level-default-size",
+    cl::desc("The default size of the first cache level specified in bytes"
+             " (if not enough were provided by the TargetTransformInfo)."),
+    cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelSize(
+    "polly-target-2nd-cache-level-size",
+    cl::desc("The size of the second level specified in bytes."), cl::Hidden,
+    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelDefaultSize(
+    "polly-target-2nd-cache-level-default-size",
+    cl::desc("The default size of the second cache level specified in bytes"
+             " (if not enough were provided by the TargetTransformInfo)."),
+    cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+// This option, along with --polly-target-2nd-cache-level-associativity,
+// --polly-target-1st-cache-level-size, and --polly-target-2nd-cache-level-size
+// represent the parameters of the target cache, which do not have typical
+// values that can be used by default. However, to apply the pattern matching
+// optimizations, we use the values of the parameters of Intel Core i7-3820
+// SandyBridge in case the parameters are not specified or not provided by the
+// TargetTransformInfo.
+static cl::opt<int> FirstCacheLevelAssociativity(
+    "polly-target-1st-cache-level-associativity",
+    cl::desc("The associativity of the first cache level."), cl::Hidden,
+    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> FirstCacheLevelDefaultAssociativity(
+    "polly-target-1st-cache-level-default-associativity",
+    cl::desc("The default associativity of the first cache level"
+             " (if not enough were provided by the TargetTransformInfo)."),
+    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelAssociativity(
+    "polly-target-2nd-cache-level-associativity",
+    cl::desc("The associativity of the second cache level."), cl::Hidden,
+    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> SecondCacheLevelDefaultAssociativity(
+    "polly-target-2nd-cache-level-default-associativity",
+    cl::desc("The default associativity of the second cache level"
+             " (if not enough were provided by the TargetTransformInfo)."),
+    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+// Overrides the vector register width; -1 means "ask the target".
+static cl::opt<int> VectorRegisterBitwidth(
+    "polly-target-vector-register-bitwidth",
+    cl::desc("The size in bits of a vector register (if not set, this "
+             "information is taken from LLVM's target information)."),
+    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+// Ratio Nc / Nr between the macro-kernel and micro-kernel parameters.
+static cl::opt<int> PollyPatternMatchingNcQuotient(
+    "polly-pattern-matching-nc-quotient",
+    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the "
+             "macro-kernel, by Nr, the parameter of the micro-kernel"),
+    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+namespace {
+/// Parameters of the micro kernel.
+///
+/// Parameters, which determine sizes of rank-1 (i.e., outer product) update
+/// used in the optimized matrix multiplication.
+struct MicroKernelParamsTy {
+  int Mr; // First register tile size handed to applyRegisterTiling.
+  int Nr; // Second register tile size handed to applyRegisterTiling.
+};
+
+/// Parameters of the macro kernel.
+///
+/// Parameters, which determine sizes of blocks of partitioned matrices
+/// used in the optimized matrix multiplication.
+struct MacroKernelParamsTy {
+  int Mc; // Tile size of the third-to-last band member.
+  int Nc; // Tile size of the second-to-last band member.
+  int Kc; // Tile size of the last band member.
+};
+
+/// Parameters of the matrix multiplication operands.
+///
+/// Parameters, which describe access relations that represent operands of the
+/// matrix multiplication, together with the loop dimensions they are indexed
+/// by. Pointers stay null and indices stay -1 until they have been determined.
+struct MatMulInfoTy {
+  MemoryAccess *A = nullptr; // Read access of the form M[i][k].
+  MemoryAccess *B = nullptr; // Read access of the form M[k][j].
+  MemoryAccess *ReadFromC = nullptr; // Read access of the form M[i][j].
+  MemoryAccess *WriteToC = nullptr; // Write access of the form M[i][j].
+  int i = -1; // Input dimension used as first subscript of C.
+  int j = -1; // Input dimension used as second subscript of C.
+  int k = -1; // Dimension that carries the dependence of distance one.
+};
+
+/// Build the AST build option [isolate[] -> unroll[x]].
+///
+/// The option is a wrapped universe relation without parameters that has no
+/// input dimension, one output dimension, and the tuple names "isolate" and
+/// "unroll".
+///
+/// @param Ctx An isl::ctx, which is used to create the isl::union_set.
+static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) {
+  // Zero parameters, zero input dimensions, one output dimension.
+  isl::map Option = isl::map::universe(isl::space(Ctx, 0, 0, 1));
+  Option = Option.set_tuple_id(isl::dim::in,
+                               isl::id::alloc(Ctx, "isolate", nullptr));
+  Option = Option.set_tuple_id(isl::dim::out,
+                               isl::id::alloc(Ctx, "unroll", nullptr));
+  return Option.wrap();
+}
+
+/// Permute the two dimensions of the isl map.
+///
+/// Swap the @p DstPos and @p SrcPos dimensions of type @p DimType in the isl
+/// map @p Map by routing both through the opposite tuple of the map.
+///
+/// @param Map     The isl map to be modified.
+/// @param DimType The type of the dimensions.
+/// @param DstPos  The first dimension.
+/// @param SrcPos  The second dimension.
+/// @return        The modified map (@p Map itself if both positions agree).
+static isl::map permuteDimensions(isl::map Map, isl::dim DimType,
+                                  unsigned DstPos, unsigned SrcPos) {
+  assert((isl_size)DstPos < Map.dim(DimType) &&
+         (isl_size)SrcPos < Map.dim(DimType));
+  if (DstPos == SrcPos)
+    return Map;
+  isl::id DimId; // Stays null if the tuple has no id.
+  if (Map.has_tuple_id(DimType))
+    DimId = Map.get_tuple_id(DimType);
+  auto FreeDim = DimType == isl::dim::in ? isl::dim::out : isl::dim::in;
+  isl::id FreeDimId; // Stays null if the opposite tuple has no id.
+  if (Map.has_tuple_id(FreeDim))
+    FreeDimId = Map.get_tuple_id(FreeDim);
+  auto MaxDim = std::max(DstPos, SrcPos);
+  auto MinDim = std::min(DstPos, SrcPos);
+  Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1); // Park the higher dim.
+  Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1); // Lower: 0, higher: 1.
+  Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1); // Higher -> MinDim.
+  Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1); // Lower -> MaxDim.
+  if (DimId) // Re-apply the ids saved before the dimensions were moved.
+    Map = Map.set_tuple_id(DimType, DimId);
+  if (FreeDimId)
+    Map = Map.set_tuple_id(FreeDim, FreeDimId);
+  return Map;
+}
+
+/// Check the form of the access relation.
+///
+/// Check that the access relation @p AccMap has the form M[i][j], where i
+/// is a @p FirstPos and j is a @p SecondPos.
+///
+/// @param AccMap    The access relation to be checked.
+/// @param FirstPos  The index of the input dimension that is mapped to
+///                  the first output dimension.
+/// @param SecondPos The index of the input dimension that is mapped to the
+///                  second output dimension.
+/// @return          True in case @p AccMap has the expected form and false,
+///                  otherwise.
+static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
+                               int &SecondPos) {
+  isl::space Space = AccMap.get_space();
+  isl::map Universe = isl::map::universe(Space);
+
+  if (Space.dim(isl::dim::out) != 2)
+    return false;
+
+  // MatMul has the form:
+  // for (i = 0; i < N; i++)
+  //   for (j = 0; j < M; j++)
+  //     for (k = 0; k < P; k++)
+  //       C[i, j] += A[i, k] * B[k, j]
+  //
+  // Permutation of three outer loops: 3! = 6 possibilities.
+  int FirstDims[] = {0, 0, 1, 1, 2, 2};
+  int SecondDims[] = {1, 2, 2, 0, 0, 1};
+  for (int i = 0; i < 6; i += 1) {
+    auto PossibleMatMul =
+        Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
+            .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
+
+    AccMap = AccMap.intersect_domain(Domain);
+    PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
+
+    // If AccMap spans entire domain (Non-partial write),
+    // compute FirstPos and SecondPos.
+    // If AccMap != PossibleMatMul here (the two maps have been gisted at
+    // this point), it means that the writes are not complete, or in other
+    // words, it is a Partial write and Partial writes must be rejected.
+    if (AccMap.is_equal(PossibleMatMul)) {
+      if (FirstPos != -1 && FirstPos != FirstDims[i])
+        continue;
+      FirstPos = FirstDims[i];
+      if (SecondPos != -1 && SecondPos != SecondDims[i])
+        continue;
+      SecondPos = SecondDims[i];
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Does the memory access represent a non-scalar operand of the matrix
+/// multiplication.
+///
+/// Check that the memory access @p MemAccess is the read access to a non-scalar
+/// operand of the matrix multiplication or its result.
+///
+/// @param MemAccess The memory access to be checked.
+/// @param MMI       Parameters of the matrix multiplication operands.
+/// @return          True in case the memory access represents the read access
+///                  to a non-scalar operand of the matrix multiplication and
+///                  false, otherwise.
+static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
+                                        MatMulInfoTy &MMI) {
+  if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
+    return false;
+  auto AccMap = MemAccess->getLatestAccessRelation();
+  isl::set StmtDomain = MemAccess->getStatement()->getDomain();
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
+    MMI.ReadFromC = MemAccess;
+    return true;
+  }
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
+    MMI.A = MemAccess;
+    return true;
+  }
+  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
+    MMI.B = MemAccess;
+    return true;
+  }
+  return false;
+}
+
+/// Check accesses to operands of the matrix multiplication.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, are scalar in terms of loops
+/// containing the matrix multiplication, in case they do not represent
+/// accesses to the non-scalar operands of the matrix multiplication or
+/// its result.
+///
+/// @param  PartialSchedule The partial schedule of the SCoP statement.
+/// @param  MMI             Parameters of the matrix multiplication operands.
+/// @return                 True in case the corresponding SCoP statement
+///                         represents matrix multiplication and false,
+///                         otherwise.
+static bool containsOnlyMatrMultAcc(isl::map PartialSchedule,
+                                    MatMulInfoTy &MMI) {
+  auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in);
+  auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user());
+  isl_size OutDimNum = PartialSchedule.dim(isl::dim::out);
+  assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
+                          "and, consequently, the corresponding scheduling "
+                          "functions have at least three dimensions.");
+  auto MapI =
+      permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1);
+  auto MapJ =
+      permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
+  auto MapK =
+      permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
+
+  auto Accesses = getAccessesInOrder(*Stmt);
+  for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
+    auto *MemAccessPtr = *MemA;
+    if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
+        !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
+        !(MemAccessPtr->isStrideZero(MapI)) &&
+        MemAccessPtr->isStrideZero(MapJ) && MemAccessPtr->isStrideZero(MapK))
+      return false;
+  }
+  return true;
+}
+
+/// Check for dependencies corresponding to the matrix multiplication.
+///
+/// Check that there is only true dependence of the form
+/// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement
+/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
+/// to the dependency produced by the matrix multiplication.
+///
+/// @param  Schedule The schedule of the SCoP statement.
+/// @param  D The SCoP dependencies.
+/// @param  Pos The parameter to describe an acceptable true dependence.
+///             In case it has a negative value, try to determine its
+///             acceptable value.
+/// @return True in case dependencies correspond to the matrix multiplication
+///         and false, otherwise.
+static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D,
+                                  int &Pos) {
+  isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW);
+  isl::union_map Red = D->getDependences(Dependences::TYPE_RED);
+  if (Red)
+    Dep = Dep.unite(Red);
+  auto DomainSpace = Schedule.get_space().domain();
+  auto Space = DomainSpace.map_from_domain_and_range(DomainSpace);
+  auto Deltas = Dep.extract_map(Space).deltas();
+  isl_size DeltasDimNum = Deltas.dim(isl::dim::set);
+  for (int i = 0; i < DeltasDimNum; i++) {
+    auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i);
+    Pos = Pos < 0 && Val.is_one() ? i : Pos;
+    if (Val.is_nan() || !(Val.is_zero() || (i == Pos && Val.is_one())))
+      return false;
+  }
+  if (DeltasDimNum == 0 || Pos < 0)
+    return false;
+  return true;
+}
+
+/// Check if the SCoP statement could probably be optimized with analytical
+/// modeling.
+///
+/// containsMatrMult tries to determine whether the following conditions
+/// are true:
+/// 1. The last memory access modeling an array, MA1, represents writing to
+///    memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
+///    S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
+///    under consideration.
+/// 2. There is only one loop-carried true dependency, and it has the
+///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
+///    loop-carried or anti dependencies.
+/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
+///    reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
+///    S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
+///    and all memory accesses of the SCoP that are different from MA1, MA2,
+///    MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
+///    of loops i1, i2 and i3.
+///
+/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
+///        to check.
+/// @D     The SCoP dependencies.
+/// @MMI   Parameters of the matrix multiplication operands.
+static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D,
+                             MatMulInfoTy &MMI) {
+  auto InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
+  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+  if (Stmt->size() <= 1)
+    return false;
+
+  auto Accesses = getAccessesInOrder(*Stmt);
+  for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
+    auto *MemAccessPtr = *MemA;
+    if (!MemAccessPtr->isLatestArrayKind())
+      continue;
+    if (!MemAccessPtr->isWrite())
+      return false;
+    auto AccMap = MemAccessPtr->getLatestAccessRelation();
+    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
+      return false;
+    MMI.WriteToC = MemAccessPtr;
+    break;
+  }
+
+  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
+    return false;
+
+  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
+    return false;
+
+  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
+    return false;
+  return true;
+}
+
+/// Permute two dimensions of the band node.
+///
+/// Permute FirstDim and SecondDim dimensions of the Node.
+///
+/// @param Node The band node to be modified.
+/// @param FirstDim The first dimension to be permuted.
+/// @param SecondDim The second dimension to be permuted.
+static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
+                                                    unsigned FirstDim,
+                                                    unsigned SecondDim) {
+  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band &&
+         (unsigned)isl_schedule_node_band_n_member(Node.get()) >
+             std::max(FirstDim, SecondDim));
+  auto PartialSchedule =
+      isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
+  auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim);
+  auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim);
+  PartialSchedule =
+      PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
+  PartialSchedule =
+      PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
+  Node = isl::manage(isl_schedule_node_delete(Node.release()));
+  return Node.insert_partial_schedule(PartialSchedule);
+}
+
+/// Create the BLIS micro-kernel.
+///
+/// Register-tile @p Node with the tile sizes Mr and Nr (and 1 for any other
+/// dimension), move two levels up in the schedule tree, and interchange the
+/// first two members of the band found there.
+///
+/// @param Node              The schedule node to be modified.
+/// @param MicroKernelParams Parameters of the micro kernel to be used as
+///                          register tile sizes.
+/// @return The node two levels below the permuted band.
+static isl::schedule_node
+createMicroKernel(isl::schedule_node Node,
+                  MicroKernelParamsTy MicroKernelParams) {
+  const int RegisterTileSizes[] = {MicroKernelParams.Mr, MicroKernelParams.Nr};
+  isl::schedule_node Tiled = applyRegisterTiling(Node, RegisterTileSizes, 1);
+  isl::schedule_node OuterBand = Tiled.parent().parent();
+  return permuteBandNodeDimensions(OuterBand, 0, 1).child(0).child(0);
+}
+
+/// Create the BLIS macro-kernel.
+///
+/// We create the BLIS macro-kernel by applying a combination of tiling
+/// of dimensions of the band node and interchanging of two innermost
+/// modified dimensions. The values of of MacroKernelParams's fields are used
+/// as tile sizes.
+///
+/// @param Node The schedule node to be modified.
+/// @param MacroKernelParams Parameters of the macro kernel
+///                          to be used as tile sizes.
+static isl::schedule_node
+createMacroKernel(isl::schedule_node Node,
+                  MacroKernelParamsTy MacroKernelParams) {
+  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
+  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
+      MacroKernelParams.Kc == 1)
+    return Node;
+  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
+  std::vector<int> TileSizes(DimOutNum, 1);
+  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
+  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
+  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
+  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
+  Node = Node.parent().parent();
+  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
+  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
+
+  // Mark the outermost loop as parallelizable.
+  Node = Node.band_member_set_coincident(0, true);
+
+  return Node.child(0).child(0);
+}
+
+/// Get the largest allocation size among the element types of the matrix
+/// multiplication operands.
+///
+/// The element types of the accesses A, B and WriteToC are considered; the
+/// sizes are the alloc sizes reported by the module's data layout, i.e., they
+/// are given in bytes and include alignment padding.
+///
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The size of the widest type of the matrix multiplication operands
+///         in bytes, including alignment padding.
+static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
+  const Scop *ParentScop = MMI.A->getStatement()->getParent();
+  const DataLayout &DL =
+      ParentScop->getFunction().getParent()->getDataLayout();
+  return std::max({DL.getTypeAllocSize(MMI.A->getElementType()),
+                   DL.getTypeAllocSize(MMI.B->getElementType()),
+                   DL.getTypeAllocSize(MMI.WriteToC->getElementType())});
+}
+
+/// Get the largest bit width among the element types of the matrix
+/// multiplication operands.
+///
+/// The element types of the accesses A, B and WriteToC are considered; the
+/// sizes are taken from the module's data layout.
+///
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The size of the widest type of the matrix multiplication operands
+///         in bits.
+static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
+  const Scop *ParentScop = MMI.A->getStatement()->getParent();
+  const DataLayout &DL =
+      ParentScop->getFunction().getParent()->getDataLayout();
+  return std::max({DL.getTypeSizeInBits(MMI.A->getElementType()),
+                   DL.getTypeSizeInBits(MMI.B->getElementType()),
+                   DL.getTypeSizeInBits(MMI.WriteToC->getElementType())});
+}
+
+/// Get parameters of the BLIS micro kernel.
+///
+/// We choose the Mr and Nr parameters of the micro kernel to be large enough
+/// such that no stalls caused by the combination of latencies and dependencies
+/// are introduced during the updates of the resulting matrix of the matrix
+/// multiplication. However, they should also be as small as possible to
+/// release more registers for entries of multiplied matrices.
+///
+/// @param TTI Target Transform Info.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The structure of type MicroKernelParamsTy.
+/// @see MicroKernelParamsTy
+static struct MicroKernelParamsTy
+getMicroKernelParams(const TargetTransformInfo *TTI, MatMulInfoTy MMI) {
+  assert(TTI && "The target transform info should be provided.");
+
+  // Nvec - Number of double-precision floating-point numbers that can be hold
+  // by a vector register. Use 2 by default.
+  long RegisterBitwidth = VectorRegisterBitwidth;
+
+  if (RegisterBitwidth == -1)
+    RegisterBitwidth =
+        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+  auto ElementSize = getMatMulTypeSize(MMI);
+  assert(ElementSize > 0 && "The element size of the matrix multiplication "
+                            "operands should be greater than zero.");
+  auto Nvec = RegisterBitwidth / ElementSize;
+  if (Nvec == 0)
+    Nvec = 2;
+  int Nr = ceil(sqrt((double)(Nvec * LatencyVectorFma * ThroughputVectorFma)) /
+                Nvec) *
+           Nvec;
+  int Mr = ceil((double)(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr));
+  return {Mr, Nr};
+}
+
+/// Determine parameters of the target cache.
+///
+/// @param TTI Target Transform Info.
+static void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
+  auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
+  auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
+  if (FirstCacheLevelSize == -1) {
+    if (TTI->getCacheSize(L1DCache).hasValue())
+      FirstCacheLevelSize = TTI->getCacheSize(L1DCache).getValue();
+    else
+      FirstCacheLevelSize = static_cast<int>(FirstCacheLevelDefaultSize);
+  }
+  if (SecondCacheLevelSize == -1) {
+    if (TTI->getCacheSize(L2DCache).hasValue())
+      SecondCacheLevelSize = TTI->getCacheSize(L2DCache).getValue();
+    else
+      SecondCacheLevelSize = static_cast<int>(SecondCacheLevelDefaultSize);
+  }
+  if (FirstCacheLevelAssociativity == -1) {
+    if (TTI->getCacheAssociativity(L1DCache).hasValue())
+      FirstCacheLevelAssociativity =
+          TTI->getCacheAssociativity(L1DCache).getValue();
+    else
+      FirstCacheLevelAssociativity =
+          static_cast<int>(FirstCacheLevelDefaultAssociativity);
+  }
+  if (SecondCacheLevelAssociativity == -1) {
+    if (TTI->getCacheAssociativity(L2DCache).hasValue())
+      SecondCacheLevelAssociativity =
+          TTI->getCacheAssociativity(L2DCache).getValue();
+    else
+      SecondCacheLevelAssociativity =
+          static_cast<int>(SecondCacheLevelDefaultAssociativity);
+  }
+}
+
+/// Get parameters of the BLIS macro kernel.
+///
+/// During the computation of matrix multiplication, blocks of partitioned
+/// matrices are mapped to different layers of the memory hierarchy.
+/// To optimize data reuse, blocks should be ideally kept in cache between
+/// iterations. Since parameters of the macro kernel determine sizes of these
+/// blocks, there are upper and lower bounds on these parameters.
+///
+/// @param TTI Target Transform Info.
+/// @param MicroKernelParams Parameters of the micro-kernel
+///                          to be taken into account.
+/// @param MMI Parameters of the matrix multiplication operands.
+/// @return The structure of type MacroKernelParamsTy.
+/// @see MacroKernelParamsTy
+/// @see MicroKernelParamsTy
+static struct MacroKernelParamsTy
+getMacroKernelParams(const llvm::TargetTransformInfo *TTI,
+                     const MicroKernelParamsTy &MicroKernelParams,
+                     MatMulInfoTy MMI) {
+  getTargetCacheParameters(TTI);
+  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
+  // it requires information about the first two levels of a cache to determine
+  // all the parameters of a macro-kernel. It also checks that an associativity
+  // degree of a cache level is greater than two. Otherwise, another algorithm
+  // for determination of the parameters should be used.
+  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
+        FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
+        FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
+    return {1, 1, 1};
+  // The quotient should be greater than zero.
+  if (PollyPatternMatchingNcQuotient <= 0)
+    return {1, 1, 1};
+  int Car = floor(
+      (FirstCacheLevelAssociativity - 1) /
+      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
+
+  // Car can be computed to be zero since it is floor to int.
+  // On Mac OS, division by 0 does not raise a signal. This causes negative
+  // tile sizes to be computed. Prevent division by Cac==0 by early returning
+  // if this happens.
+  if (Car == 0)
+    return {1, 1, 1};
+
+  auto ElementSize = getMatMulAlignTypeSize(MMI);
+  assert(ElementSize > 0 && "The element size of the matrix multiplication "
+                            "operands should be greater than zero.");
+  int Kc = (Car * FirstCacheLevelSize) /
+           (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize);
+  double Cac =
+      static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) /
+      SecondCacheLevelSize;
+  int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
+  int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
+
+  assert(Mc > 0 && Nc > 0 && Kc > 0 &&
+         "Matrix block sizes should be  greater than zero");
+  return {Mc, Nc, Kc};
+}
+
+/// Create an access relation that is specific to
+///        the matrix multiplication pattern.
+///
+/// Create an access relation of the following form:
+/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
+/// where I is @p FirstDim, J is @p SecondDim.
+///
+/// It can be used, for example, to create relations that helps to consequently
+/// access elements of operands of a matrix multiplication after creation of
+/// the BLIS micro and macro kernels.
+///
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+///
+/// Subsequently, the described access relation is applied to the range of
+/// @p MapOldIndVar, that is used to map original induction variables to
+/// the ones, which are produced by schedule transformations. It helps to
+/// define relations using a new space and, at the same time, keep them
+/// in the original one.
+///
+/// @param MapOldIndVar The relation, which maps original induction variables
+///                     to the ones, which are produced by schedule
+///                     transformations.
+/// @param FirstDim, SecondDim The input dimensions that are used to define
+///        the specified access relation.
+/// @return The specified access relation.
+static isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
+                                unsigned SecondDim) {
+  auto AccessRelSpace = isl::space(MapOldIndVar.get_ctx(), 0, 9, 3);
+  auto AccessRel = isl::map::universe(AccessRelSpace);
+  AccessRel = AccessRel.equate(isl::dim::in, FirstDim, isl::dim::out, 0);
+  AccessRel = AccessRel.equate(isl::dim::in, 5, isl::dim::out, 1);
+  AccessRel = AccessRel.equate(isl::dim::in, SecondDim, isl::dim::out, 2);
+  return MapOldIndVar.apply_range(AccessRel);
+}
+
+/// Graft an extension node created from @p ExtensionMap before @p Node.
+///
+/// @param Node         The node before which the extension is grafted.
+/// @param ExtensionMap The map the extension node is created from.
+/// @return The schedule node returned by the graft operation.
+static isl::schedule_node createExtensionNode(isl::schedule_node Node,
+                                              isl::map ExtensionMap) {
+  return Node.graft_before(
+      isl::schedule_node::from_extension(isl::union_map(ExtensionMap)));
+}
+
+/// Apply the packing transformation to the operands A and B.
+///
+/// Packing is a data-layout transformation: a new array is introduced, the
+/// data is copied into it, and the memory accesses are redirected to the new
+/// array. It can be used to ensure that the elements are read with in-stride
+/// access, aligned to cache line boundaries, and preloaded into certain cache
+/// levels.
+///
+/// As an example, consider packing the array A. An access to A is represented
+/// by the access relation S[i, j, k] -> A[i, k], and the scheduling function
+/// of the SCoP statement S has the form
+/// S[i, j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr), k mod Kc,
+/// j mod Nr, i mod Mr]. To read A in-stride, a new array
+/// Packed_A[Mc/Mr][Kc][Mr] is added to the SCoP using
+/// Scop::createScopArrayInfo, the access relation is changed to
+/// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr] using
+/// MemoryAccess::setNewAccessRelation, and the data is copied to the new
+/// array by a copy statement created with Scop::addScopStmt.
+///
+/// The function creates Packed_B[Nc/Nr][Kc][Nr] first and Packed_A second.
+/// The copy statement of each operand is grafted into the schedule tree as
+/// an extension node (see createExtensionNode).
+///
+/// @param Node The schedule node to be optimized.
+/// @param MapOldIndVar The relation, which maps original induction variables
+///                     to the ones, which are produced by schedule
+///                     transformations.
+/// @param MicroParams, MacroParams Parameters of the BLIS kernel
+///                                 to be taken into account.
+/// @param MMI Parameters of the matrix multiplication operands; the access
+///            relations of MMI.A and MMI.B are replaced.
+/// @return The optimized schedule node.
+static isl::schedule_node
+optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
+                                 MicroKernelParamsTy MicroParams,
+                                 MacroKernelParamsTy MacroParams,
+                                 MatMulInfoTy &MMI) {
+  auto InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
+  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+
+  // Create the copy statement for the matrix B, the second operand: climb six
+  // levels up and split the band there after its first two members.
+  Node = Node.parent().parent().parent().parent().parent().parent();
+  Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
+  auto AccRel = getMatMulAccRel(MapOldIndVar, 3, 7);
+  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
+  unsigned SecondDimSize = MacroParams.Kc;
+  unsigned ThirdDimSize = MicroParams.Nr;
+  auto *SAI = Stmt->getParent()->createScopArrayInfo(
+      MMI.B->getElementType(), "Packed_B",
+      {FirstDimSize, SecondDimSize, ThirdDimSize});
+  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
+  auto OldAcc = MMI.B->getLatestAccessRelation(); // Source of the copy.
+  MMI.B->setNewAccessRelation(AccRel);
+  auto ExtMap = MapOldIndVar.project_out(isl::dim::out, 2,
+                                         MapOldIndVar.dim(isl::dim::out) - 2);
+  ExtMap = ExtMap.reverse(); // Now maps the two kept dims to the statement.
+  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
+  auto Domain = Stmt->getDomain();
+
+  // Restrict the domain of the copy statement so that it only executes when
+  // its originating statement is executed.
+  auto DomainId = Domain.get_tuple_id();
+  auto *NewStmt = Stmt->getParent()->addScopStmt(
+      OldAcc, MMI.B->getLatestAccessRelation(), Domain);
+  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
+  ExtMap = ExtMap.intersect_range(Domain);
+  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
+  Node = createExtensionNode(Node, ExtMap);
+
+  // Create the copy statement for the matrix A, the first operand of the
+  // matrix multiplication. SecondDimSize (Kc) is reused from above.
+  Node = Node.child(0);
+  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
+  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
+  ThirdDimSize = MicroParams.Mr;
+  SAI = Stmt->getParent()->createScopArrayInfo(
+      MMI.A->getElementType(), "Packed_A",
+      {FirstDimSize, SecondDimSize, ThirdDimSize});
+  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
+  OldAcc = MMI.A->getLatestAccessRelation(); // Source of the copy.
+  MMI.A->setNewAccessRelation(AccRel);
+  ExtMap = MapOldIndVar.project_out(isl::dim::out, 3,
+                                    MapOldIndVar.dim(isl::dim::out) - 3);
+  ExtMap = ExtMap.reverse(); // Now maps the three kept dims to the statement.
+  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.j, 0);
+  NewStmt = Stmt->getParent()->addScopStmt(
+      OldAcc, MMI.A->getLatestAccessRelation(), Domain);
+
+  // Restrict the domain of the copy statement so that it only executes when
+  // its originating statement is executed.
+  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
+  ExtMap = ExtMap.intersect_range(Domain);
+  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
+  Node = createExtensionNode(Node, ExtMap);
+  return Node.child(0).child(0).child(0).child(0).child(0);
+}
+
+/// Get a relation mapping induction variables produced by schedule
+/// transformations to the original ones.
+///
+/// @param Node The schedule node produced as the result of creation
+///        of the BLIS kernels.
+/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
+///                                             to be taken into account.
+/// @return  The relation mapping original induction variables to the ones
+///          produced by schedule transformation.
+/// @see ScheduleTreeOptimizer::createMicroKernel
+/// @see ScheduleTreeOptimizer::createMacroKernel
+/// @see getMacroKernelParams
+static isl::map
+getInductionVariablesSubstitution(isl::schedule_node Node,
+                                  MicroKernelParamsTy MicroKernelParams,
+                                  MacroKernelParamsTy MacroKernelParams) {
+  auto Child = Node.child(0);
+  auto UnMapOldIndVar = Child.get_prefix_schedule_union_map();
+  auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar);
+  if (MapOldIndVar.dim(isl::dim::out) > 9)
+    return MapOldIndVar.project_out(isl::dim::out, 0,
+                                    MapOldIndVar.dim(isl::dim::out) - 9);
+  return MapOldIndVar;
+}
+
+/// Isolate a set of partial tile prefixes and unroll the isolated part.
+///
+/// The set should ensure that it contains only partial tile prefixes that have
+/// exactly Mr x Nr iterations of the two innermost loops produced by
+/// the optimization of the matrix multiplication. Mr and Nr are parameters of
+/// the micro-kernel.
+///
+/// In case of parametric bounds, this helps to auto-vectorize the unrolled
+/// innermost loops, using the SLP vectorizer.
+///
+/// @param Node              The schedule node to be modified.
+/// @param MicroKernelParams Parameters of the micro-kernel
+///                          to be taken into account.
+/// @return The modified isl_schedule_node.
+static isl::schedule_node
+isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
+                                 struct MicroKernelParamsTy MicroKernelParams) {
+  isl::schedule_node Child = Node.get_child(0);
+  isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation();
+  isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range();
+  isl_size Dims = Prefix.dim(isl::dim::set);
+  Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1);
+  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
+  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);
+
+  isl::union_set IsolateOption =
+      getIsolateOptions(Prefix.add_dims(isl::dim::set, 3), 3);
+  isl::ctx Ctx = Node.get_ctx();
+  auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll"));
+  Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
+  Node = Node.band_set_ast_build_options(Options);
+  Node = Node.parent().parent().parent();
+  IsolateOption = getIsolateOptions(Prefix, 3);
+  Options = IsolateOption.unite(getDimOptions(Ctx, "separate"));
+  Node = Node.band_set_ast_build_options(Options);
+  Node = Node.child(0).child(0).child(0);
+  return Node;
+}
+
+/// Mark @p BasePtr with "Inter iteration alias-free" mark node.
+///
+/// Nothing is inserted if @p BasePtr is null.
+///
+/// @param Node The child of the mark node to be inserted.
+/// @param BasePtr The pointer to be marked.
+/// @return The modified isl_schedule_node.
+static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node,
+                                                      Value *BasePtr) {
+  if (BasePtr == nullptr)
+    return Node;
+
+  isl::id MarkId =
+      isl::id::alloc(Node.get_ctx(), "Inter iteration alias-free", BasePtr);
+  isl::schedule_node Mark = Node.insert_mark(MarkId);
+  return Mark.child(0);
+}
+
+/// Insert "Loop Vectorizer Disabled" mark node.
+///
+/// The mark carries no user pointer.
+///
+/// @param Node The child of the mark node to be inserted.
+/// @return The modified isl_schedule_node.
+static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
+  isl::id MarkId =
+      isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr);
+  isl::schedule_node Mark = Node.insert_mark(MarkId);
+  return Mark.child(0);
+}
+
+/// Restore the initial ordering of dimensions of the band node
+///
+/// In case the band node represents all the dimensions of the iteration
+/// domain, recreate the band node to restore the initial ordering of the
+/// dimensions.
+///
+/// @param Node The band node to be modified.
+/// @return The modified schedule node.
+static isl::schedule_node
+getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
+  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
+  if (isl_schedule_node_get_type(Node.child(0).get()) != isl_schedule_node_leaf)
+    return Node;
+  auto Domain = Node.get_universe_domain();
+  assert(isl_union_set_n_set(Domain.get()) == 1);
+  if (Node.get_schedule_depth() != 0 ||
+      (isl::set(Domain).dim(isl::dim::set) !=
+       isl_schedule_node_band_n_member(Node.get())))
+    return Node;
+  Node = isl::manage(isl_schedule_node_delete(Node.copy()));
+  auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff();
+  auto PartialScheduleMultiPwAff =
+      isl::multi_union_pw_aff(PartialSchedulePwAff);
+  PartialScheduleMultiPwAff =
+      PartialScheduleMultiPwAff.reset_tuple_id(isl::dim::set);
+  return Node.insert_partial_schedule(PartialScheduleMultiPwAff);
+}
+
+static isl::schedule_node optimizeMatMulPattern(isl::schedule_node Node,
+                                                const TargetTransformInfo *TTI,
+                                                MatMulInfoTy &MMI) {
+  assert(TTI && "The target transform info should be provided.");
+  Node = markInterIterationAliasFree(
+      Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr());
+  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
+  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
+                          "and, consequently, the corresponding scheduling "
+                          "functions have at least three dimensions.");
+  Node = getBandNodeWithOriginDimOrder(Node);
+  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
+  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
+  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
+  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
+  NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
+  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
+  auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
+  auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI);
+  Node = createMacroKernel(Node, MacroKernelParams);
+  Node = createMicroKernel(Node, MicroKernelParams);
+  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
+      MacroKernelParams.Kc == 1)
+    return Node;
+  auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams,
+                                                        MacroKernelParams);
+  if (!MapOldIndVar)
+    return Node;
+  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
+  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
+  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
+                                          MacroKernelParams, MMI);
+}
+
+/// Check if this node contains a partial schedule that could
+///        probably be optimized with analytical modeling.
+///
+/// isMatrMultPattern tries to determine whether the following conditions
+/// are true:
+/// 1. the partial schedule contains only one statement.
+/// 2. there are exactly three input dimensions.
+/// 3. all memory accesses of the statement will have stride 0 or 1, if we
+///    interchange loops (switch the variable used in the inner loop to
+///    the outer loop).
+/// 4. all memory accesses of the statement except from the last one, are
+///    read memory access and the last one is write memory access.
+/// 5. all subscripts of the last memory access of the statement don't
+///    contain the variable used in the inner loop.
+/// If this is the case, we could try to use an approach that is similar to
+/// the one used to get close-to-peak performance of matrix multiplications.
+///
+/// @param Node The node to check.
+/// @param D    The SCoP dependencies.
+/// @param MMI  Parameters of the matrix multiplication operands.
+static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
+                              MatMulInfoTy &MMI) {
+  auto PartialSchedule = isl::manage(
+      isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
+  Node = Node.child(0);
+  auto LeafType = isl_schedule_node_get_type(Node.get());
+  Node = Node.parent();
+  if (LeafType != isl_schedule_node_leaf ||
+      isl_schedule_node_band_n_member(Node.get()) < 3 ||
+      Node.get_schedule_depth() != 0 ||
+      isl_union_map_n_map(PartialSchedule.get()) != 1)
+    return false;
+  auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule);
+  if (containsMatrMult(NewPartialSchedule, D, MMI))
+    return true;
+  return false;
+}
+
+} // namespace
+
+isl::schedule_node
+polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
+                                const llvm::TargetTransformInfo *TTI,
+                                const Dependences *D) {
+  MatMulInfoTy MMI;
+  if (isMatrMultPattern(Node, D, MMI)) {
+    LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
+    return optimizeMatMulPattern(Node, TTI, MMI);
+  }
+  return {};
+}
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp

index 913828994cb332a0cf58adf70e1ec42de819b800..9c893f5e06f729e299efbe399824f408592bc6a2 100644 (file)
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -48,40 +48,25 @@
  #include "polly/ScheduleOptimizer.h"
  #include "polly/CodeGen/CodeGeneration.h"
  #include "polly/DependenceInfo.h"
-#include "polly/LinkAllPasses.h"
  #include "polly/ManualOptimizer.h"
+#include "polly/MatmulOptimizer.h"
  #include "polly/Options.h"
  #include "polly/ScheduleTreeTransform.h"
-#include "polly/ScopInfo.h"
-#include "polly/ScopPass.h"
-#include "polly/Simplify.h"
  #include "polly/Support/ISLOStream.h"
  #include "llvm/ADT/Sequence.h"
  #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
  #include "llvm/InitializePasses.h"
  #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "isl/ctx.h"
  #include "isl/options.h"
-#include "isl/printer.h"
-#include "isl/schedule.h"
-#include "isl/schedule_node.h"
-#include "isl/union_map.h"
-#include "isl/union_set.h"
-#include <algorithm>
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <cstdlib>
-#include <string>
-#include <vector>
  
  using namespace llvm;
  using namespace polly;
  
+namespace llvm {
+class Loop;
+class Module;
+} // namespace llvm
+
  #define DEBUG_TYPE "polly-opt-isl"
  
  static cl::opt<std::string>
@@ -132,77 +117,6 @@ static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                        cl::init(true), cl::ZeroOrMore,
                                        cl::cat(PollyCategory));
  
-static cl::opt<int> LatencyVectorFma(
-    "polly-target-latency-vector-fma",
-    cl::desc("The minimal number of cycles between issuing two "
-             "dependent consecutive vector fused multiply-add "
-             "instructions."),
-    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> ThroughputVectorFma(
-    "polly-target-throughput-vector-fma",
-    cl::desc("A throughput of the processor floating-point arithmetic units "
-             "expressed in the number of vector fused multiply-add "
-             "instructions per clock cycle."),
-    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-// This option, along with --polly-target-2nd-cache-level-associativity,
-// --polly-target-1st-cache-level-size, and --polly-target-2st-cache-level-size
-// represent the parameters of the target cache, which do not have typical
-// values that can be used by default. However, to apply the pattern matching
-// optimizations, we use the values of the parameters of Intel Core i7-3820
-// SandyBridge in case the parameters are not specified or not provided by the
-// TargetTransformInfo.
-static cl::opt<int> FirstCacheLevelAssociativity(
-    "polly-target-1st-cache-level-associativity",
-    cl::desc("The associativity of the first cache level."), cl::Hidden,
-    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> FirstCacheLevelDefaultAssociativity(
-    "polly-target-1st-cache-level-default-associativity",
-    cl::desc("The default associativity of the first cache level"
-             " (if not enough were provided by the TargetTransformInfo)."),
-    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelAssociativity(
-    "polly-target-2nd-cache-level-associativity",
-    cl::desc("The associativity of the second cache level."), cl::Hidden,
-    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelDefaultAssociativity(
-    "polly-target-2nd-cache-level-default-associativity",
-    cl::desc("The default associativity of the second cache level"
-             " (if not enough were provided by the TargetTransformInfo)."),
-    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> FirstCacheLevelSize(
-    "polly-target-1st-cache-level-size",
-    cl::desc("The size of the first cache level specified in bytes."),
-    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> FirstCacheLevelDefaultSize(
-    "polly-target-1st-cache-level-default-size",
-    cl::desc("The default size of the first cache level specified in bytes"
-             " (if not enough were provided by the TargetTransformInfo)."),
-    cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelSize(
-    "polly-target-2nd-cache-level-size",
-    cl::desc("The size of the second level specified in bytes."), cl::Hidden,
-    cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> SecondCacheLevelDefaultSize(
-    "polly-target-2nd-cache-level-default-size",
-    cl::desc("The default size of the second cache level specified in bytes"
-             " (if not enough were provided by the TargetTransformInfo)."),
-    cl::Hidden, cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
-
-static cl::opt<int> VectorRegisterBitwidth(
-    "polly-target-vector-register-bitwidth",
-    cl::desc("The size in bits of a vector register (if not set, this "
-             "information is taken from LLVM's target information."),
-    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
-
  static cl::opt<int> FirstLevelDefaultTileSize(
      "polly-default-tile-size",
      cl::desc("The default tile size (if not enough were provided by"
@@ -245,12 +159,6 @@ static cl::opt<int> RegisterDefaultTileSize(
               " --polly-register-tile-sizes)"),
      cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
  
-static cl::opt<int> PollyPatternMatchingNcQuotient(
-    "polly-pattern-matching-nc-quotient",
-    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the"
-             "macro-kernel, by Nr, the parameter of the micro-kernel"),
-    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
-
  static cl::list<int>
      RegisterTileSizes("polly-register-tile-sizes",
                        cl::desc("A tile size for each loop dimension, filled "
@@ -303,25 +211,6 @@ STATISTIC(MatMulOpts,
            "Number of matrix multiplication patterns detected and optimized");
  
  namespace {
-/// Parameters of the micro kernel.
-///
-/// Parameters, which determine sizes of rank-1 (i.e., outer product) update
-/// used in the optimized matrix multiplication.
-struct MicroKernelParamsTy {
-  int Mr;
-  int Nr;
-};
-
-/// Parameters of the macro kernel.
-///
-/// Parameters, which determine sizes of blocks of partitioned matrices
-/// used in the optimized matrix multiplication.
-struct MacroKernelParamsTy {
-  int Mc;
-  int Nc;
-  int Kc;
-};
-
  /// Additional parameters of the schedule optimizer.
  ///
  /// Target Transform Info and the SCoP dependencies used by the schedule
@@ -331,20 +220,6 @@ struct OptimizerAdditionalInfoTy {
    const Dependences *D;
  };
  
-/// Parameters of the matrix multiplication operands.
-///
-/// Parameters, which describe access relations that represent operands of the
-/// matrix multiplication.
-struct MatMulInfoTy {
-  MemoryAccess *A = nullptr;
-  MemoryAccess *B = nullptr;
-  MemoryAccess *ReadFromC = nullptr;
-  MemoryAccess *WriteToC = nullptr;
-  int i = -1;
-  int j = -1;
-  int k = -1;
-};
-
  class ScheduleTreeOptimizer {
  public:
    /// Apply schedule tree transformations.
@@ -400,81 +275,6 @@ public:
                                                      int VectorWidth);
  
  private:
-  /// Tile a schedule node.
-  ///
-  /// @param Node            The node to tile.
-  /// @param Identifier      A name that identifies this kind of tiling and
-  ///                        that is used to mark the tiled loops in the
-  ///                        generated AST.
-  /// @param TileSizes       A vector of tile sizes that should be used for
-  ///                        tiling.
-  /// @param DefaultTileSize A default tile size that is used for dimensions
-  ///                        that are not covered by the TileSizes vector.
-  static isl::schedule_node tileNode(isl::schedule_node Node,
-                                     const char *Identifier,
-                                     llvm::ArrayRef<int> TileSizes,
-                                     int DefaultTileSize);
-
-  /// Tile a schedule node and unroll point loops.
-  ///
-  /// @param Node            The node to register tile.
-  /// @param TileSizes       A vector of tile sizes that should be used for
-  ///                        tiling.
-  /// @param DefaultTileSize Tile size for dimensions not covered by TileSizes.
-  static isl::schedule_node applyRegisterTiling(isl::schedule_node Node,
-                                                llvm::ArrayRef<int> TileSizes,
-                                                int DefaultTileSize);
-
-  /// Apply the BLIS matmul optimization pattern.
-  ///
-  /// Make the loops containing the matrix multiplication be the innermost
-  /// loops and apply the BLIS matmul optimization pattern. BLIS implements
-  /// gemm as three nested loops around a macro-kernel, plus two packing
-  /// routines. The macro-kernel is implemented in terms of two additional
-  /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1
-  /// (i.e., outer product) update.
-  ///
-  /// For a detailed description please see [1].
-  ///
-  /// The order of the loops defines the data reused in the BLIS implementation
-  /// of gemm ([1]). In particular, elements of the matrix B, the second
-  /// operand of matrix multiplication, are reused between iterations of the
-  /// innermost loop. To keep the reused data in cache, only elements of matrix
-  /// A, the first operand of matrix multiplication, should be evicted during
-  /// an iteration of the innermost loop. To provide such a cache replacement
-  /// policy, elements of the matrix A can, in particular, be loaded first and,
-  /// consequently, be least-recently-used.
-  ///
-  /// In our case matrices are stored in row-major order instead of
-  /// column-major order used in the BLIS implementation ([1]). It affects only
-  /// the form of the BLIS micro kernel and the computation of its
-  /// parameters. In particular, reused elements of the matrix B are
-  /// successively multiplied by specific elements of the matrix A.
-  ///
-  /// Refs.:
-  /// [1] - Analytical Modeling is Enough for High Performance BLIS
-  /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti
-  /// Technical Report, 2014
-  /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
-  ///
-  /// @see ScheduleTreeOptimizer::createMicroKernel
-  /// @see ScheduleTreeOptimizer::createMacroKernel
-  /// @see getMicroKernelParams
-  /// @see getMacroKernelParams
-  ///
-  /// TODO: Implement the packing transformation.
-  ///
-  /// @param Node The node that contains a band to be optimized. The node
-  ///             is required to successfully pass
-  ///             ScheduleTreeOptimizer::isMatrMultPattern.
-  /// @param TTI  Target Transform Info.
-  /// @param MMI  Parameters of the matrix multiplication operands.
-  /// @returns    The transformed schedule.
-  static isl::schedule_node
-  optimizeMatMulPattern(isl::schedule_node Node,
-                        const llvm::TargetTransformInfo *TTI,
-                        MatMulInfoTy &MMI);
-
    /// Check if this node is a band node we want to tile.
    ///
    /// We look for innermost band nodes where individual dimensions are marked as
@@ -551,145 +351,8 @@ private:
    ///        (currently unused).
    static isl::schedule_node standardBandOpts(isl::schedule_node Node,
                                               void *User);
-
-  /// Check if this node contains a partial schedule that could
-  ///        probably be optimized with analytical modeling.
-  ///
-  /// isMatrMultPattern tries to determine whether the following conditions
-  /// are true:
-  /// 1. the partial schedule contains only one statement.
-  /// 2. there are exactly three input dimensions.
-  /// 3. all memory accesses of the statement will have stride 0 or 1, if we
-  ///    interchange loops (switch the variable used in the inner loop to
-  ///    the outer loop).
-  /// 4. all memory accesses of the statement except for the last one are read
-  ///    accesses, and the last one is a write access.
-  /// 5. all subscripts of the last memory access of the statement don't
-  ///    contain the variable used in the inner loop.
-  /// If this is the case, we could try to use an approach that is similar to
-  /// the one used to get close-to-peak performance of matrix multiplications.
-  ///
-  /// @param Node The node to check.
-  /// @param D    The SCoP dependencies.
-  /// @param MMI  Parameters of the matrix multiplication operands.
-  static bool isMatrMultPattern(isl::schedule_node Node,
-                                const polly::Dependences *D, MatMulInfoTy &MMI);
-
-  /// Create the BLIS macro-kernel.
-  ///
-  /// We create the BLIS macro-kernel by applying a combination of tiling
-  /// of dimensions of the band node and interchanging of two innermost
-  /// modified dimensions. The values of MacroKernelParams's fields are used
-  /// as tile sizes.
-  ///
-  /// @param Node The schedule node to be modified.
-  /// @param MacroKernelParams Parameters of the macro kernel
-  ///                          to be used as tile sizes.
-  static isl::schedule_node
-  createMacroKernel(isl::schedule_node Node,
-                    MacroKernelParamsTy MacroKernelParams);
-
-  /// Create the BLIS micro-kernel.
-  ///
-  /// We create the BLIS micro-kernel by applying a combination of tiling
-  /// of dimensions of the band node and interchanging of two innermost
-  /// modified dimensions. The values passed in MicroKernelParams are used
-  /// as tile sizes.
-  ///
-  /// @param Node The schedule node to be modified.
-  /// @param MicroKernelParams Parameters of the micro kernel
-  ///                          to be used as tile sizes.
-  /// @see MicroKernelParamsTy
-  static isl::schedule_node
-  createMicroKernel(isl::schedule_node Node,
-                    MicroKernelParamsTy MicroKernelParams);
  };
  
-/// Create an isl::union_set, which describes the isolate option based on
-/// IsolateDomain.
-///
-/// @param IsolateDomain An isl::set whose @p OutDimsNum last dimensions should
-///                      belong to the current band node.
-/// @param OutDimsNum    A number of dimensions that should belong to
-///                      the current band node.
-static isl::union_set getIsolateOptions(isl::set IsolateDomain,
-                                        isl_size OutDimsNum) {
-  isl_size Dims = IsolateDomain.dim(isl::dim::set);
-  assert(OutDimsNum <= Dims &&
-         "The isl::set IsolateDomain is used to describe the range of schedule "
-         "dimensions values, which should be isolated. Consequently, the "
-         "number of its dimensions should be greater than or equal to the "
-         "number of the schedule dimensions.");
-  isl::map IsolateRelation = isl::map::from_domain(IsolateDomain);
-  IsolateRelation = IsolateRelation.move_dims(isl::dim::out, 0, isl::dim::in,
-                                              Dims - OutDimsNum, OutDimsNum);
-  isl::set IsolateOption = IsolateRelation.wrap();
-  isl::id Id = isl::id::alloc(IsolateOption.get_ctx(), "isolate", nullptr);
-  IsolateOption = IsolateOption.set_tuple_id(Id);
-  return isl::union_set(IsolateOption);
-}
-
-/// Create an isl::union_set, which describes the specified option for the
-/// dimension of the current node.
-///
-/// @param Ctx    An isl::ctx, which is used to create the isl::union_set.
-/// @param Option The name of the option.
-isl::union_set getDimOptions(isl::ctx Ctx, const char *Option) {
-  isl::space Space(Ctx, 0, 1);
-  auto DimOption = isl::set::universe(Space);
-  auto Id = isl::id::alloc(Ctx, Option, nullptr);
-  DimOption = DimOption.set_tuple_id(Id);
-  return isl::union_set(DimOption);
-}
-
-/// Create an isl::union_set, which describes the option of the form
-/// [isolate[] -> unroll[x]].
-///
-/// @param Ctx An isl::ctx, which is used to create the isl::union_set.
-static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) {
-  isl::space Space = isl::space(Ctx, 0, 0, 1);
-  isl::map UnrollIsolatedSetOption = isl::map::universe(Space);
-  isl::id DimInId = isl::id::alloc(Ctx, "isolate", nullptr);
-  isl::id DimOutId = isl::id::alloc(Ctx, "unroll", nullptr);
-  UnrollIsolatedSetOption =
-      UnrollIsolatedSetOption.set_tuple_id(isl::dim::in, DimInId);
-  UnrollIsolatedSetOption =
-      UnrollIsolatedSetOption.set_tuple_id(isl::dim::out, DimOutId);
-  return UnrollIsolatedSetOption.wrap();
-}
-
-/// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
-///
-/// @param Set         A set, which should be modified.
-/// @param VectorWidth A parameter, which determines the constraint.
-static isl::set addExtentConstraints(isl::set Set, int VectorWidth) {
-  unsigned Dims = Set.dim(isl::dim::set);
-  isl::space Space = Set.get_space();
-  isl::local_space LocalSpace = isl::local_space(Space);
-  isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
-  ExtConstr = ExtConstr.set_constant_si(0);
-  ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, 1);
-  Set = Set.add_constraint(ExtConstr);
-  ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
-  ExtConstr = ExtConstr.set_constant_si(VectorWidth - 1);
-  ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
-  return Set.add_constraint(ExtConstr);
-}
-} // namespace
-
-isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange,
-                                       int VectorWidth) {
-  isl_size Dims = ScheduleRange.dim(isl::dim::set);
-  isl::set LoopPrefixes =
-      ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1);
-  auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth);
-  isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange);
-  BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1);
-  LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1);
-  return LoopPrefixes.subtract(BadPrefixes);
-}
-
-namespace {
  isl::schedule_node
  ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
                                                 int VectorWidth) {
@@ -742,41 +405,6 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
    return Node.insert_mark(LoopMarker);
  }
  
-isl::schedule_node ScheduleTreeOptimizer::tileNode(isl::schedule_node Node,
-                                                   const char *Identifier,
-                                                   ArrayRef<int> TileSizes,
-                                                   int DefaultTileSize) {
-  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
-  auto Dims = Space.dim(isl::dim::set);
-  auto Sizes = isl::multi_val::zero(Space);
-  std::string IdentifierString(Identifier);
-  for (auto i : seq<isl_size>(0, Dims)) {
-    auto tileSize =
-        i < (isl_size)TileSizes.size() ? TileSizes[i] : DefaultTileSize;
-    Sizes = Sizes.set_val(i, isl::val(Node.get_ctx(), tileSize));
-  }
-  auto TileLoopMarkerStr = IdentifierString + " - Tiles";
-  auto TileLoopMarker =
-      isl::id::alloc(Node.get_ctx(), TileLoopMarkerStr, nullptr);
-  Node = Node.insert_mark(TileLoopMarker);
-  Node = Node.child(0);
-  Node =
-      isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
-  Node = Node.child(0);
-  auto PointLoopMarkerStr = IdentifierString + " - Points";
-  auto PointLoopMarker =
-      isl::id::alloc(Node.get_ctx(), PointLoopMarkerStr, nullptr);
-  Node = Node.insert_mark(PointLoopMarker);
-  return Node.child(0);
-}
-
-isl::schedule_node ScheduleTreeOptimizer::applyRegisterTiling(
-    isl::schedule_node Node, ArrayRef<int> TileSizes, int DefaultTileSize) {
-  Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
-  auto Ctx = Node.get_ctx();
-  return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}"));
-}
-
  static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
    assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
    assert(isl_schedule_node_n_children(Node.get()) == 1);
@@ -857,787 +485,6 @@ ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) {
    return Node;
  }
  
-/// Permute the two dimensions of the isl map.
-///
-/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
-/// have type @p DimType.
-///
-/// @param Map     The isl map to be modified.
-/// @param DimType The type of the dimensions.
-/// @param DstPos  The first dimension.
-/// @param SrcPos  The second dimension.
-/// @return        The modified map.
-isl::map permuteDimensions(isl::map Map, isl::dim DimType, unsigned DstPos,
-                           unsigned SrcPos) {
-  assert((isl_size)DstPos < Map.dim(DimType) &&
-         (isl_size)SrcPos < Map.dim(DimType));
-  if (DstPos == SrcPos)
-    return Map;
-  isl::id DimId;
-  if (Map.has_tuple_id(DimType))
-    DimId = Map.get_tuple_id(DimType);
-  auto FreeDim = DimType == isl::dim::in ? isl::dim::out : isl::dim::in;
-  isl::id FreeDimId;
-  if (Map.has_tuple_id(FreeDim))
-    FreeDimId = Map.get_tuple_id(FreeDim);
-  auto MaxDim = std::max(DstPos, SrcPos);
-  auto MinDim = std::min(DstPos, SrcPos);
-  Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1);
-  Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1);
-  Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1);
-  Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1);
-  if (DimId)
-    Map = Map.set_tuple_id(DimType, DimId);
-  if (FreeDimId)
-    Map = Map.set_tuple_id(FreeDim, FreeDimId);
-  return Map;
-}
-
-/// Check the form of the access relation.
-///
-/// Check that the access relation @p AccMap has the form M[i][j], where i
-/// is a @p FirstPos and j is a @p SecondPos.
-///
-/// @param AccMap    The access relation to be checked.
-/// @param FirstPos  The index of the input dimension that is mapped to
-///                  the first output dimension.
-/// @param SecondPos The index of the input dimension that is mapped to the
-///                  second output dimension.
-/// @return          True in case @p AccMap has the expected form and false,
-///                  otherwise.
-static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
-                               int &SecondPos) {
-  isl::space Space = AccMap.get_space();
-  isl::map Universe = isl::map::universe(Space);
-
-  if (Space.dim(isl::dim::out) != 2)
-    return false;
-
-  // MatMul has the form:
-  // for (i = 0; i < N; i++)
-  //   for (j = 0; j < M; j++)
-  //     for (k = 0; k < P; k++)
-  //       C[i, j] += A[i, k] * B[k, j]
-  //
-  // Permutation of three outer loops: 3! = 6 possibilities.
-  int FirstDims[] = {0, 0, 1, 1, 2, 2};
-  int SecondDims[] = {1, 2, 2, 0, 0, 1};
-  for (int i = 0; i < 6; i += 1) {
-    auto PossibleMatMul =
-        Universe.equate(isl::dim::in, FirstDims[i], isl::dim::out, 0)
-            .equate(isl::dim::in, SecondDims[i], isl::dim::out, 1);
-
-    AccMap = AccMap.intersect_domain(Domain);
-    PossibleMatMul = PossibleMatMul.intersect_domain(Domain);
-
-    // If AccMap spans entire domain (Non-partial write),
-    // compute FirstPos and SecondPos.
-    // If AccMap != PossibleMatMul here (the two maps have been gisted at
-    // this point), it means that the writes are not complete, or in other
-    // words, it is a Partial write and Partial writes must be rejected.
-    if (AccMap.is_equal(PossibleMatMul)) {
-      if (FirstPos != -1 && FirstPos != FirstDims[i])
-        continue;
-      FirstPos = FirstDims[i];
-      if (SecondPos != -1 && SecondPos != SecondDims[i])
-        continue;
-      SecondPos = SecondDims[i];
-      return true;
-    }
-  }
-
-  return false;
-}
-
-/// Does the memory access represent a non-scalar operand of the matrix
-/// multiplication.
-///
-/// Check that the memory access @p MemAccess is the read access to a non-scalar
-/// operand of the matrix multiplication or its result.
-///
-/// @param MemAccess The memory access to be checked.
-/// @param MMI       Parameters of the matrix multiplication operands.
-/// @return          True in case the memory access represents the read access
-///                  to a non-scalar operand of the matrix multiplication and
-///                  false, otherwise.
-static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
-                                        MatMulInfoTy &MMI) {
-  if (!MemAccess->isLatestArrayKind() || !MemAccess->isRead())
-    return false;
-  auto AccMap = MemAccess->getLatestAccessRelation();
-  isl::set StmtDomain = MemAccess->getStatement()->getDomain();
-  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
-    MMI.ReadFromC = MemAccess;
-    return true;
-  }
-  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
-    MMI.A = MemAccess;
-    return true;
-  }
-  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
-    MMI.B = MemAccess;
-    return true;
-  }
-  return false;
-}
-
-/// Check accesses to operands of the matrix multiplication.
-///
-/// Check that accesses of the SCoP statement, which corresponds to
-/// the partial schedule @p PartialSchedule, are scalar in terms of loops
-/// containing the matrix multiplication, in case they do not represent
-/// accesses to the non-scalar operands of the matrix multiplication or
-/// its result.
-///
-/// @param  PartialSchedule The partial schedule of the SCoP statement.
-/// @param  MMI             Parameters of the matrix multiplication operands.
-/// @return                 True in case the corresponding SCoP statement
-///                         represents matrix multiplication and false,
-///                         otherwise.
-static bool containsOnlyMatrMultAcc(isl::map PartialSchedule,
-                                    MatMulInfoTy &MMI) {
-  auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in);
-  auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user());
-  isl_size OutDimNum = PartialSchedule.dim(isl::dim::out);
-  assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
-                          "and, consequently, the corresponding scheduling "
-                          "functions have at least three dimensions.");
-  auto MapI =
-      permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1);
-  auto MapJ =
-      permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
-  auto MapK =
-      permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
-
-  auto Accesses = getAccessesInOrder(*Stmt);
-  for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
-    auto *MemAccessPtr = *MemA;
-    if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
-        !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
-        !(MemAccessPtr->isStrideZero(MapI)) &&
-        MemAccessPtr->isStrideZero(MapJ) && MemAccessPtr->isStrideZero(MapK))
-      return false;
-  }
-  return true;
-}
-
-/// Check for dependencies corresponding to the matrix multiplication.
-///
-/// Check that there is only true dependence of the form
-/// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement
-/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
-/// to the dependency produced by the matrix multiplication.
-///
-/// @param  Schedule The schedule of the SCoP statement.
-/// @param  D The SCoP dependencies.
-/// @param  Pos The parameter to describe an acceptable true dependence.
-///             In case it has a negative value, try to determine its
-///             acceptable value.
-/// @return True in case dependencies correspond to the matrix multiplication
-///         and false, otherwise.
-static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D,
-                                  int &Pos) {
-  isl::union_map Dep = D->getDependences(Dependences::TYPE_RAW);
-  isl::union_map Red = D->getDependences(Dependences::TYPE_RED);
-  if (Red)
-    Dep = Dep.unite(Red);
-  auto DomainSpace = Schedule.get_space().domain();
-  auto Space = DomainSpace.map_from_domain_and_range(DomainSpace);
-  auto Deltas = Dep.extract_map(Space).deltas();
-  isl_size DeltasDimNum = Deltas.dim(isl::dim::set);
-  for (int i = 0; i < DeltasDimNum; i++) {
-    auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i);
-    Pos = Pos < 0 && Val.is_one() ? i : Pos;
-    if (Val.is_nan() || !(Val.is_zero() || (i == Pos && Val.is_one())))
-      return false;
-  }
-  if (DeltasDimNum == 0 || Pos < 0)
-    return false;
-  return true;
-}
-
-/// Check if the SCoP statement could probably be optimized with analytical
-/// modeling.
-///
-/// containsMatrMult tries to determine whether the following conditions
-/// are true:
-/// 1. The last memory access modeling an array, MA1, represents writing to
-///    memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
-///    S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
-///    under consideration.
-/// 2. There is only one loop-carried true dependency, and it has the
-///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
-///    other loop-carried or anti dependencies.
-/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
-///    reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
-///    S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
-///    and all memory accesses of the SCoP that are different from MA1, MA2,
-///    MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
-///    of loops i1, i2 and i3.
-///
-/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
-///        to check.
-/// @param D   The SCoP dependencies.
-/// @param MMI Parameters of the matrix multiplication operands.
-static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D,
-                             MatMulInfoTy &MMI) {
-  auto InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
-  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
-  if (Stmt->size() <= 1)
-    return false;
-
-  auto Accesses = getAccessesInOrder(*Stmt);
-  for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
-    auto *MemAccessPtr = *MemA;
-    if (!MemAccessPtr->isLatestArrayKind())
-      continue;
-    if (!MemAccessPtr->isWrite())
-      return false;
-    auto AccMap = MemAccessPtr->getLatestAccessRelation();
-    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
-      return false;
-    MMI.WriteToC = MemAccessPtr;
-    break;
-  }
-
-  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
-    return false;
-
-  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
-    return false;
-
-  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
-    return false;
-  return true;
-}
-
-/// Permute two dimensions of the band node.
-///
-/// Permute FirstDim and SecondDim dimensions of the Node.
-///
-/// @param Node The band node to be modified.
-/// @param FirstDim The first dimension to be permuted.
-/// @param SecondDim The second dimension to be permuted.
-static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
-                                                    unsigned FirstDim,
-                                                    unsigned SecondDim) {
-  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band &&
-         (unsigned)isl_schedule_node_band_n_member(Node.get()) >
-             std::max(FirstDim, SecondDim));
-  auto PartialSchedule =
-      isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
-  auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim);
-  auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim);
-  PartialSchedule =
-      PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
-  PartialSchedule =
-      PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
-  Node = isl::manage(isl_schedule_node_delete(Node.release()));
-  return Node.insert_partial_schedule(PartialSchedule);
-}
-
-isl::schedule_node ScheduleTreeOptimizer::createMicroKernel(
-    isl::schedule_node Node, MicroKernelParamsTy MicroKernelParams) {
-  Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
-                             1);
-  Node = Node.parent().parent();
-  return permuteBandNodeDimensions(Node, 0, 1).child(0).child(0);
-}
-
-isl::schedule_node ScheduleTreeOptimizer::createMacroKernel(
-    isl::schedule_node Node, MacroKernelParamsTy MacroKernelParams) {
-  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
-  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
-      MacroKernelParams.Kc == 1)
-    return Node;
-  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
-  std::vector<int> TileSizes(DimOutNum, 1);
-  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
-  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
-  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
-  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
-  Node = Node.parent().parent();
-  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
-  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
-
-  // Mark the outermost loop as parallelizable.
-  Node = Node.band_member_set_coincident(0, true);
-
-  return Node.child(0).child(0);
-}
-
-/// Get the size of the widest type of the matrix multiplication operands
-/// in bytes, including alignment padding.
-///
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The size of the widest type of the matrix multiplication operands
-///         in bytes, including alignment padding.
-static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
-  auto *S = MMI.A->getStatement()->getParent();
-  auto &DL = S->getFunction().getParent()->getDataLayout();
-  auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
-  auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
-  auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
-  return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
-}
-
-/// Get the size of the widest type of the matrix multiplication operands
-/// in bits.
-///
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The size of the widest type of the matrix multiplication operands
-///         in bits.
-static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
-  auto *S = MMI.A->getStatement()->getParent();
-  auto &DL = S->getFunction().getParent()->getDataLayout();
-  auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
-  auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
-  auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
-  return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
-}
-
-/// Get parameters of the BLIS micro kernel.
-///
-/// We choose the Mr and Nr parameters of the micro kernel to be large enough
-/// such that no stalls caused by the combination of latencies and dependencies
-/// are introduced during the updates of the resulting matrix of the matrix
-/// multiplication. However, they should also be as small as possible to
-/// release more registers for entries of multiplied matrices.
-///
-/// @param TTI Target Transform Info.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The structure of type MicroKernelParamsTy.
-/// @see MicroKernelParamsTy
-static struct MicroKernelParamsTy
-getMicroKernelParams(const TargetTransformInfo *TTI, MatMulInfoTy MMI) {
-  assert(TTI && "The target transform info should be provided.");
-
-  // Nvec - Number of double-precision floating-point numbers that can be held
-  // by a vector register. Use 2 by default.
-  long RegisterBitwidth = VectorRegisterBitwidth;
-
-  if (RegisterBitwidth == -1)
-    RegisterBitwidth =
-        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
-  auto ElementSize = getMatMulTypeSize(MMI);
-  assert(ElementSize > 0 && "The element size of the matrix multiplication "
-                            "operands should be greater than zero.");
-  auto Nvec = RegisterBitwidth / ElementSize;
-  if (Nvec == 0)
-    Nvec = 2;
-  int Nr = ceil(sqrt((double)(Nvec * LatencyVectorFma * ThroughputVectorFma)) /
-                Nvec) *
-           Nvec;
-  int Mr = ceil((double)(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr));
-  return {Mr, Nr};
-}
-
-/// Determine parameters of the target cache.
-///
-/// @param TTI Target Transform Info.
-void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
-  auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
-  auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
-  if (FirstCacheLevelSize == -1) {
-    if (TTI->getCacheSize(L1DCache).hasValue())
-      FirstCacheLevelSize = TTI->getCacheSize(L1DCache).getValue();
-    else
-      FirstCacheLevelSize = static_cast<int>(FirstCacheLevelDefaultSize);
-  }
-  if (SecondCacheLevelSize == -1) {
-    if (TTI->getCacheSize(L2DCache).hasValue())
-      SecondCacheLevelSize = TTI->getCacheSize(L2DCache).getValue();
-    else
-      SecondCacheLevelSize = static_cast<int>(SecondCacheLevelDefaultSize);
-  }
-  if (FirstCacheLevelAssociativity == -1) {
-    if (TTI->getCacheAssociativity(L1DCache).hasValue())
-      FirstCacheLevelAssociativity =
-          TTI->getCacheAssociativity(L1DCache).getValue();
-    else
-      FirstCacheLevelAssociativity =
-          static_cast<int>(FirstCacheLevelDefaultAssociativity);
-  }
-  if (SecondCacheLevelAssociativity == -1) {
-    if (TTI->getCacheAssociativity(L2DCache).hasValue())
-      SecondCacheLevelAssociativity =
-          TTI->getCacheAssociativity(L2DCache).getValue();
-    else
-      SecondCacheLevelAssociativity =
-          static_cast<int>(SecondCacheLevelDefaultAssociativity);
-  }
-}
-
-/// Get parameters of the BLIS macro kernel.
-///
-/// During the computation of matrix multiplication, blocks of partitioned
-/// matrices are mapped to different layers of the memory hierarchy.
-/// To optimize data reuse, blocks should be ideally kept in cache between
-/// iterations. Since parameters of the macro kernel determine sizes of these
-/// blocks, there are upper and lower bounds on these parameters.
-///
-/// @param TTI Target Transform Info.
-/// @param MicroKernelParams Parameters of the micro-kernel
-///                          to be taken into account.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The structure of type MacroKernelParamsTy.
-/// @see MacroKernelParamsTy
-/// @see MicroKernelParamsTy
-static struct MacroKernelParamsTy
-getMacroKernelParams(const llvm::TargetTransformInfo *TTI,
-                     const MicroKernelParamsTy &MicroKernelParams,
-                     MatMulInfoTy MMI) {
-  getTargetCacheParameters(TTI);
-  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
-  // it requires information about the first two levels of a cache to determine
-  // all the parameters of a macro-kernel. It also checks that an associativity
-  // degree of a cache level is greater than two. Otherwise, another algorithm
-  // for determination of the parameters should be used.
-  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
-        FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
-        FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
-    return {1, 1, 1};
-  // The quotient should be greater than zero.
-  if (PollyPatternMatchingNcQuotient <= 0)
-    return {1, 1, 1};
-  int Car = floor(
-      (FirstCacheLevelAssociativity - 1) /
-      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
-
-  // Car can be computed to be zero since it is floor to int.
-  // On Mac OS, division by 0 does not raise a signal. This causes negative
-  // tile sizes to be computed. Prevent division by Cac==0 by early returning
-  // if this happens.
-  if (Car == 0)
-    return {1, 1, 1};
-
-  auto ElementSize = getMatMulAlignTypeSize(MMI);
-  assert(ElementSize > 0 && "The element size of the matrix multiplication "
-                            "operands should be greater than zero.");
-  int Kc = (Car * FirstCacheLevelSize) /
-           (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize);
-  double Cac =
-      static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) /
-      SecondCacheLevelSize;
-  int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
-  int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
-
-  assert(Mc > 0 && Nc > 0 && Kc > 0 &&
-         "Matrix block sizes should be  greater than zero");
-  return {Mc, Nc, Kc};
-}
-
-/// Create an access relation that is specific to
-///        the matrix multiplication pattern.
-///
-/// Create an access relation of the following form:
-/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
-/// where I is @p FirstDim, J is @p SecondDim.
-///
-/// It can be used, for example, to create relations that helps to consequently
-/// access elements of operands of a matrix multiplication after creation of
-/// the BLIS micro and macro kernels.
-///
-/// @see ScheduleTreeOptimizer::createMicroKernel
-/// @see ScheduleTreeOptimizer::createMacroKernel
-///
-/// Subsequently, the described access relation is applied to the range of
-/// @p MapOldIndVar, that is used to map original induction variables to
-/// the ones, which are produced by schedule transformations. It helps to
-/// define relations using a new space and, at the same time, keep them
-/// in the original one.
-///
-/// @param MapOldIndVar The relation, which maps original induction variables
-///                     to the ones, which are produced by schedule
-///                     transformations.
-/// @param FirstDim, SecondDim The input dimensions that are used to define
-///        the specified access relation.
-/// @return The specified access relation.
-isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
-                         unsigned SecondDim) {
-  auto AccessRelSpace = isl::space(MapOldIndVar.get_ctx(), 0, 9, 3);
-  auto AccessRel = isl::map::universe(AccessRelSpace);
-  AccessRel = AccessRel.equate(isl::dim::in, FirstDim, isl::dim::out, 0);
-  AccessRel = AccessRel.equate(isl::dim::in, 5, isl::dim::out, 1);
-  AccessRel = AccessRel.equate(isl::dim::in, SecondDim, isl::dim::out, 2);
-  return MapOldIndVar.apply_range(AccessRel);
-}
-
-isl::schedule_node createExtensionNode(isl::schedule_node Node,
-                                       isl::map ExtensionMap) {
-  auto Extension = isl::union_map(ExtensionMap);
-  auto NewNode = isl::schedule_node::from_extension(Extension);
-  return Node.graft_before(NewNode);
-}
-
-/// Apply the packing transformation.
-///
-/// The packing transformation can be described as a data-layout
-/// transformation that requires to introduce a new array, copy data
-/// to the array, and change memory access locations to reference the array.
-/// It can be used to ensure that elements of the new array are read in-stride
-/// access, aligned to cache lines boundaries, and preloaded into certain cache
-/// levels.
-///
-/// As an example let us consider the packing of the array A that would help
-/// to read its elements with in-stride access. An access to the array A
-/// is represented by an access relation that has the form
-/// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
-/// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
-/// k mod Kc, j mod Nr, i mod Mr].
-///
-/// To ensure that elements of the array A are read in-stride access, we add
-/// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
-/// Scop::createScopArrayInfo, change the access relation
-/// S[i, j, k] -> A[i, k] to
-/// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
-/// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
-/// the copy statement created by Scop::addScopStmt.
-///
-/// @param Node The schedule node to be optimized.
-/// @param MapOldIndVar The relation, which maps original induction variables
-///                     to the ones, which are produced by schedule
-///                     transformations.
-/// @param MicroParams, MacroParams Parameters of the BLIS kernel
-///                                 to be taken into account.
-/// @param MMI Parameters of the matrix multiplication operands.
-/// @return The optimized schedule node.
-static isl::schedule_node
-optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
-                                 MicroKernelParamsTy MicroParams,
-                                 MacroKernelParamsTy MacroParams,
-                                 MatMulInfoTy &MMI) {
-  auto InputDimsId = MapOldIndVar.get_tuple_id(isl::dim::in);
-  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
-
-  // Create a copy statement that corresponds to the memory access to the
-  // matrix B, the second operand of the matrix multiplication.
-  Node = Node.parent().parent().parent().parent().parent().parent();
-  Node = isl::manage(isl_schedule_node_band_split(Node.release(), 2)).child(0);
-  auto AccRel = getMatMulAccRel(MapOldIndVar, 3, 7);
-  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
-  unsigned SecondDimSize = MacroParams.Kc;
-  unsigned ThirdDimSize = MicroParams.Nr;
-  auto *SAI = Stmt->getParent()->createScopArrayInfo(
-      MMI.B->getElementType(), "Packed_B",
-      {FirstDimSize, SecondDimSize, ThirdDimSize});
-  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
-  auto OldAcc = MMI.B->getLatestAccessRelation();
-  MMI.B->setNewAccessRelation(AccRel);
-  auto ExtMap = MapOldIndVar.project_out(isl::dim::out, 2,
-                                         MapOldIndVar.dim(isl::dim::out) - 2);
-  ExtMap = ExtMap.reverse();
-  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0);
-  auto Domain = Stmt->getDomain();
-
-  // Restrict the domains of the copy statements to only execute when also its
-  // originating statement is executed.
-  auto DomainId = Domain.get_tuple_id();
-  auto *NewStmt = Stmt->getParent()->addScopStmt(
-      OldAcc, MMI.B->getLatestAccessRelation(), Domain);
-  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
-  ExtMap = ExtMap.intersect_range(Domain);
-  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
-  Node = createExtensionNode(Node, ExtMap);
-
-  // Create a copy statement that corresponds to the memory access
-  // to the matrix A, the first operand of the matrix multiplication.
-  Node = Node.child(0);
-  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
-  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
-  ThirdDimSize = MicroParams.Mr;
-  SAI = Stmt->getParent()->createScopArrayInfo(
-      MMI.A->getElementType(), "Packed_A",
-      {FirstDimSize, SecondDimSize, ThirdDimSize});
-  AccRel = AccRel.set_tuple_id(isl::dim::out, SAI->getBasePtrId());
-  OldAcc = MMI.A->getLatestAccessRelation();
-  MMI.A->setNewAccessRelation(AccRel);
-  ExtMap = MapOldIndVar.project_out(isl::dim::out, 3,
-                                    MapOldIndVar.dim(isl::dim::out) - 3);
-  ExtMap = ExtMap.reverse();
-  ExtMap = ExtMap.fix_si(isl::dim::out, MMI.j, 0);
-  NewStmt = Stmt->getParent()->addScopStmt(
-      OldAcc, MMI.A->getLatestAccessRelation(), Domain);
-
-  // Restrict the domains of the copy statements to only execute when also its
-  // originating statement is executed.
-  ExtMap = ExtMap.set_tuple_id(isl::dim::out, DomainId);
-  ExtMap = ExtMap.intersect_range(Domain);
-  ExtMap = ExtMap.set_tuple_id(isl::dim::out, NewStmt->getDomainId());
-  Node = createExtensionNode(Node, ExtMap);
-  return Node.child(0).child(0).child(0).child(0).child(0);
-}
-
-/// Get a relation mapping induction variables produced by schedule
-/// transformations to the original ones.
-///
-/// @param Node The schedule node produced as the result of creation
-///        of the BLIS kernels.
-/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
-///                                             to be taken into account.
-/// @return  The relation mapping original induction variables to the ones
-///          produced by schedule transformation.
-/// @see ScheduleTreeOptimizer::createMicroKernel
-/// @see ScheduleTreeOptimizer::createMacroKernel
-/// @see getMacroKernelParams
-isl::map
-getInductionVariablesSubstitution(isl::schedule_node Node,
-                                  MicroKernelParamsTy MicroKernelParams,
-                                  MacroKernelParamsTy MacroKernelParams) {
-  auto Child = Node.child(0);
-  auto UnMapOldIndVar = Child.get_prefix_schedule_union_map();
-  auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar);
-  if (MapOldIndVar.dim(isl::dim::out) > 9)
-    return MapOldIndVar.project_out(isl::dim::out, 0,
-                                    MapOldIndVar.dim(isl::dim::out) - 9);
-  return MapOldIndVar;
-}
-
-/// Isolate a set of partial tile prefixes and unroll the isolated part.
-///
-/// The set should ensure that it contains only partial tile prefixes that have
-/// exactly Mr x Nr iterations of the two innermost loops produced by
-/// the optimization of the matrix multiplication. Mr and Nr are parameters of
-/// the micro-kernel.
-///
-/// In case of parametric bounds, this helps to auto-vectorize the unrolled
-/// innermost loops, using the SLP vectorizer.
-///
-/// @param Node              The schedule node to be modified.
-/// @param MicroKernelParams Parameters of the micro-kernel
-///                          to be taken into account.
-/// @return The modified isl_schedule_node.
-static isl::schedule_node
-isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
-                                 struct MicroKernelParamsTy MicroKernelParams) {
-  isl::schedule_node Child = Node.get_child(0);
-  isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation();
-  isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range();
-  isl_size Dims = Prefix.dim(isl::dim::set);
-  Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1);
-  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
-  Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);
-
-  isl::union_set IsolateOption =
-      getIsolateOptions(Prefix.add_dims(isl::dim::set, 3), 3);
-  isl::ctx Ctx = Node.get_ctx();
-  auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll"));
-  Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
-  Node = Node.band_set_ast_build_options(Options);
-  Node = Node.parent().parent().parent();
-  IsolateOption = getIsolateOptions(Prefix, 3);
-  Options = IsolateOption.unite(getDimOptions(Ctx, "separate"));
-  Node = Node.band_set_ast_build_options(Options);
-  Node = Node.child(0).child(0).child(0);
-  return Node;
-}
-
-/// Mark @p BasePtr with "Inter iteration alias-free" mark node.
-///
-/// @param Node The child of the mark node to be inserted.
-/// @param BasePtr The pointer to be marked.
-/// @return The modified isl_schedule_node.
-static isl::schedule_node markInterIterationAliasFree(isl::schedule_node Node,
-                                                      Value *BasePtr) {
-  if (!BasePtr)
-    return Node;
-
-  auto Id =
-      isl::id::alloc(Node.get_ctx(), "Inter iteration alias-free", BasePtr);
-  return Node.insert_mark(Id).child(0);
-}
-
-/// Insert "Loop Vectorizer Disabled" mark node.
-///
-/// @param Node The child of the mark node to be inserted.
-/// @return The modified isl_schedule_node.
-static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
-  auto Id = isl::id::alloc(Node.get_ctx(), "Loop Vectorizer Disabled", nullptr);
-  return Node.insert_mark(Id).child(0);
-}
-
-/// Restore the initial ordering of dimensions of the band node
-///
-/// In case the band node represents all the dimensions of the iteration
-/// domain, recreate the band node to restore the initial ordering of the
-/// dimensions.
-///
-/// @param Node The band node to be modified.
-/// @return The modified schedule node.
-static isl::schedule_node
-getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
-  assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
-  if (isl_schedule_node_get_type(Node.child(0).get()) != isl_schedule_node_leaf)
-    return Node;
-  auto Domain = Node.get_universe_domain();
-  assert(isl_union_set_n_set(Domain.get()) == 1);
-  if (Node.get_schedule_depth() != 0 ||
-      (isl::set(Domain).dim(isl::dim::set) !=
-       isl_schedule_node_band_n_member(Node.get())))
-    return Node;
-  Node = isl::manage(isl_schedule_node_delete(Node.copy()));
-  auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff();
-  auto PartialScheduleMultiPwAff =
-      isl::multi_union_pw_aff(PartialSchedulePwAff);
-  PartialScheduleMultiPwAff =
-      PartialScheduleMultiPwAff.reset_tuple_id(isl::dim::set);
-  return Node.insert_partial_schedule(PartialScheduleMultiPwAff);
-}
-
-isl::schedule_node
-ScheduleTreeOptimizer::optimizeMatMulPattern(isl::schedule_node Node,
-                                             const TargetTransformInfo *TTI,
-                                             MatMulInfoTy &MMI) {
-  assert(TTI && "The target transform info should be provided.");
-  Node = markInterIterationAliasFree(
-      Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr());
-  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
-  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
-                          "and, consequently, the corresponding scheduling "
-                          "functions have at least three dimensions.");
-  Node = getBandNodeWithOriginDimOrder(Node);
-  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
-  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
-  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
-  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
-  NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
-  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
-  auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
-  auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI);
-  Node = createMacroKernel(Node, MacroKernelParams);
-  Node = createMicroKernel(Node, MicroKernelParams);
-  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
-      MacroKernelParams.Kc == 1)
-    return Node;
-  auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams,
-                                                        MacroKernelParams);
-  if (!MapOldIndVar)
-    return Node;
-  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
-  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
-  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
-                                          MacroKernelParams, MMI);
-}
-
-bool ScheduleTreeOptimizer::isMatrMultPattern(isl::schedule_node Node,
-                                              const Dependences *D,
-                                              MatMulInfoTy &MMI) {
-  auto PartialSchedule = isl::manage(
-      isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
-  Node = Node.child(0);
-  auto LeafType = isl_schedule_node_get_type(Node.get());
-  Node = Node.parent();
-  if (LeafType != isl_schedule_node_leaf ||
-      isl_schedule_node_band_n_member(Node.get()) < 3 ||
-      Node.get_schedule_depth() != 0 ||
-      isl_union_map_n_map(PartialSchedule.get()) != 1)
-    return false;
-  auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule);
-  if (containsMatrMult(NewPartialSchedule, D, MMI))
-    return true;
-  return false;
-}
-
  __isl_give isl_schedule_node *
  ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
                                      void *User) {
@@ -1647,12 +494,13 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
    const OptimizerAdditionalInfoTy *OAI =
        static_cast<const OptimizerAdditionalInfoTy *>(User);
  
-  MatMulInfoTy MMI;
-  if (PMBasedOpts && User &&
-      isMatrMultPattern(isl::manage_copy(Node), OAI->D, MMI)) {
-    LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
-    MatMulOpts++;
-    return optimizeMatMulPattern(isl::manage(Node), OAI->TTI, MMI).release();
+  if (PMBasedOpts && User) {
+    if (isl::schedule_node PatternOptimizedSchedule = tryOptimizeMatMulPattern(
+            isl::manage_copy(Node), OAI->TTI, OAI->D)) {
+      MatMulOpts++;
+      isl_schedule_node_free(Node);
+      return PatternOptimizedSchedule.release();
+    }
    }
  
    return standardBandOpts(isl::manage(Node), User).release();
diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp

index 7397f3b26da50e86ca1913aa78f82f8b359720ff..f689b346495961d142af85592521580617982b6b 100644 (file)
--- a/polly/lib/Transform/ScheduleTreeTransform.cpp
+++ b/polly/lib/Transform/ScheduleTreeTransform.cpp
@@ -480,6 +480,23 @@ static isl::basic_set isDivisibleBySet(isl::ctx &Ctx, long Factor,
    return Modulo.domain();
  }
  
+/// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
+///
+/// @param Set         A set, which should be modified.
+/// @param VectorWidth A parameter, which determines the constraint.
+static isl::set addExtentConstraints(isl::set Set, int VectorWidth) {
+  unsigned Dims = Set.dim(isl::dim::set);
+  isl::space Space = Set.get_space();
+  isl::local_space LocalSpace = isl::local_space(Space);
+  isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
+  ExtConstr = ExtConstr.set_constant_si(0);
+  ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, 1);
+  Set = Set.add_constraint(ExtConstr);
+  ExtConstr = isl::constraint::alloc_inequality(LocalSpace);
+  ExtConstr = ExtConstr.set_constant_si(VectorWidth - 1);
+  ExtConstr = ExtConstr.set_coefficient_si(isl::dim::set, Dims - 1, -1);
+  return Set.add_constraint(ExtConstr);
+}
  } // namespace
  
  bool polly::isBandMark(const isl::schedule_node &Node) {
@@ -631,3 +648,76 @@ isl::schedule polly::applyPartialUnroll(isl::schedule_node BandToUnroll,
  
    return NewLoop.get_schedule();
  }
+
+isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange,
+                                       int VectorWidth) {
+  isl_size Dims = ScheduleRange.dim(isl::dim::set);
+  isl::set LoopPrefixes =
+      ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1);
+  auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth);
+  isl::set BadPrefixes = ExtentPrefixes.subtract(ScheduleRange);
+  BadPrefixes = BadPrefixes.project_out(isl::dim::set, Dims - 1, 1);
+  LoopPrefixes = LoopPrefixes.project_out(isl::dim::set, Dims - 1, 1);
+  return LoopPrefixes.subtract(BadPrefixes);
+}
+
+isl::union_set polly::getIsolateOptions(isl::set IsolateDomain,
+                                        isl_size OutDimsNum) {
+  isl_size Dims = IsolateDomain.dim(isl::dim::set);
+  assert(OutDimsNum <= Dims &&
+         "The isl::set IsolateDomain is used to describe the range of schedule "
+         "dimensions values, which should be isolated. Consequently, the "
+         "number of its dimensions should be greater than or equal to the "
+         "number of the schedule dimensions.");
+  isl::map IsolateRelation = isl::map::from_domain(IsolateDomain);
+  IsolateRelation = IsolateRelation.move_dims(isl::dim::out, 0, isl::dim::in,
+                                              Dims - OutDimsNum, OutDimsNum);
+  isl::set IsolateOption = IsolateRelation.wrap();
+  isl::id Id = isl::id::alloc(IsolateOption.get_ctx(), "isolate", nullptr);
+  IsolateOption = IsolateOption.set_tuple_id(Id);
+  return isl::union_set(IsolateOption);
+}
+
+isl::union_set polly::getDimOptions(isl::ctx Ctx, const char *Option) {
+  isl::space Space(Ctx, 0, 1);
+  auto DimOption = isl::set::universe(Space);
+  auto Id = isl::id::alloc(Ctx, Option, nullptr);
+  DimOption = DimOption.set_tuple_id(Id);
+  return isl::union_set(DimOption);
+}
+
+isl::schedule_node polly::tileNode(isl::schedule_node Node,
+                                   const char *Identifier,
+                                   ArrayRef<int> TileSizes,
+                                   int DefaultTileSize) {
+  auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get()));
+  auto Dims = Space.dim(isl::dim::set);
+  auto Sizes = isl::multi_val::zero(Space);
+  std::string IdentifierString(Identifier);
+  for (auto i : seq<isl_size>(0, Dims)) {
+    auto tileSize =
+        i < (isl_size)TileSizes.size() ? TileSizes[i] : DefaultTileSize;
+    Sizes = Sizes.set_val(i, isl::val(Node.get_ctx(), tileSize));
+  }
+  auto TileLoopMarkerStr = IdentifierString + " - Tiles";
+  auto TileLoopMarker =
+      isl::id::alloc(Node.get_ctx(), TileLoopMarkerStr, nullptr);
+  Node = Node.insert_mark(TileLoopMarker);
+  Node = Node.child(0);
+  Node =
+      isl::manage(isl_schedule_node_band_tile(Node.release(), Sizes.release()));
+  Node = Node.child(0);
+  auto PointLoopMarkerStr = IdentifierString + " - Points";
+  auto PointLoopMarker =
+      isl::id::alloc(Node.get_ctx(), PointLoopMarkerStr, nullptr);
+  Node = Node.insert_mark(PointLoopMarker);
+  return Node.child(0);
+}
+
+isl::schedule_node polly::applyRegisterTiling(isl::schedule_node Node,
+                                              ArrayRef<int> TileSizes,
+                                              int DefaultTileSize) {
+  Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
+  auto Ctx = Node.get_ctx();
+  return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}"));
+}
diff --git a/polly/unittests/ScheduleOptimizer/CMakeLists.txt b/polly/unittests/ScheduleOptimizer/CMakeLists.txt

index 75adea671517814caa36f87d069c1cfc262e6148..2b4736027aab0fdd23268a3302cf7a4883506374 100644 (file)
--- a/polly/unittests/ScheduleOptimizer/CMakeLists.txt
+++ b/polly/unittests/ScheduleOptimizer/CMakeLists.txt
@@ -1,3 +1,3 @@
  add_polly_unittest(ScheduleOptimizerTests
-    ScheduleOptimizerTest.cpp
+    ScheduleTreeTransformTest.cpp
    )
diff --git a/polly/unittests/ScheduleOptimizer/ScheduleOptimizerTest.cpp b/polly/unittests/ScheduleOptimizer/ScheduleOptimizerTest.cpp

deleted file mode 100644 (file)

index daa59cd..0000000
--- a/polly/unittests/ScheduleOptimizer/ScheduleOptimizerTest.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- ScheduleOptimizerTest.cpp ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "polly/ScheduleOptimizer.h"
-#include "gtest/gtest.h"
-#include "isl/stream.h"
-#include "isl/val.h"
-
-using namespace isl;
-using namespace polly;
-
-namespace {
-
-TEST(ScheduleOptimizer, getPartialTilePrefixes) {
-
-  isl_ctx *ctx = isl_ctx_alloc();
-
-  {
-    // Verify that for a loop with 3 iterations starting at 0 that is
-    // pre-vectorized (strip-mined with a factor of 2), we correctly identify
-    // that only the first two iterations are full vector iterations.
-    isl::map Schedule(
-        ctx, "{[i] -> [floor(i/2), i - 2 * floor(i/2)] : 0 <= i < 3 }");
-    isl::set ScheduleRange = Schedule.range();
-    isl::set Result = getPartialTilePrefixes(ScheduleRange, 2);
-
-    EXPECT_TRUE(Result.is_equal(isl::set(ctx, "{[0]}")));
-  }
-
-  {
-    // Verify that for a loop with 3 iterations starting at 1 that is
-    // pre-vectorized (strip-mined with a factor of 2), we correctly identify
-    // that only the last two iterations are full vector iterations.
-    isl::map Schedule(
-        ctx, "{[i] -> [floor(i/2), i - 2 * floor(i/2)] : 1 <= i < 4 }");
-    isl::set ScheduleRange = Schedule.range();
-    isl::set Result = getPartialTilePrefixes(ScheduleRange, 2);
-
-    EXPECT_TRUE(Result.is_equal(isl::set(ctx, "{[1]}")));
-  }
-
-  {
-    // Verify that for a loop with 6 iterations starting at 1 that is
-    // pre-vectorized (strip-mined with a factor of 2), we correctly identify
-    // that all but the first and the last iteration are full vector iterations.
-    isl::map Schedule(
-        ctx, "{[i] -> [floor(i/2), i - 2 * floor(i/2)] : 1 <= i < 6 }");
-    isl::set ScheduleRange = Schedule.range();
-    isl::set Result = getPartialTilePrefixes(ScheduleRange, 2);
-
-    EXPECT_TRUE(Result.is_equal(isl::set(ctx, "{[1]; [2]}")));
-  }
-
-  isl_ctx_free(ctx);
-}
-} // anonymous namespace
diff --git a/polly/unittests/ScheduleOptimizer/ScheduleTreeTransformTest.cpp b/polly/unittests/ScheduleOptimizer/ScheduleTreeTransformTest.cpp

new file mode 100644 (file)

index 0000000..a2f3479
--- /dev/null
+++ b/polly/unittests/ScheduleOptimizer/ScheduleTreeTransformTest.cpp
@@ -0,0 +1,59 @@
+//===- ScheduleTreeTransformTest.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/ScheduleTreeTransform.h"
+#include "gtest/gtest.h"
+#include "isl/ctx.h"
+
+using namespace isl;
+using namespace polly;
+
+namespace {
+
+TEST(ScheduleTreeTransform, getPartialTilePrefixes) {
+  isl_ctx *ctx = isl_ctx_alloc();
+
+  {
+    // Verify that for a loop with 3 iterations starting at 0 that is
+    // pre-vectorized (strip-mined with a factor of 2), we correctly identify
+    // that only the first two iterations are full vector iterations.
+    isl::map Schedule(
+        ctx, "{[i] -> [floor(i/2), i - 2 * floor(i/2)] : 0 <= i < 3 }");
+    isl::set ScheduleRange = Schedule.range();
+    isl::set Result = getPartialTilePrefixes(ScheduleRange, 2);
+
+    EXPECT_TRUE(Result.is_equal(isl::set(ctx, "{[0]}")));
+  }
+
+  {
+    // Verify that for a loop with 3 iterations starting at 1 that is
+    // pre-vectorized (strip-mined with a factor of 2), we correctly identify
+    // that only the last two iterations are full vector iterations.
+    isl::map Schedule(
+        ctx, "{[i] -> [floor(i/2), i - 2 * floor(i/2)] : 1 <= i < 4 }");
+    isl::set ScheduleRange = Schedule.range();
+    isl::set Result = getPartialTilePrefixes(ScheduleRange, 2);
+
+    EXPECT_TRUE(Result.is_equal(isl::set(ctx, "{[1]}")));
+  }
+
+  {
+    // Verify that for a loop with 6 iterations starting at 1 that is
+    // pre-vectorized (strip-mined with a factor of 2), we correctly identify
+    // that all but the first and the last iteration are full vector iterations.
+    isl::map Schedule(
+        ctx, "{[i] -> [floor(i/2), i - 2 * floor(i/2)] : 1 <= i < 6 }");
+    isl::set ScheduleRange = Schedule.range();
+    isl::set Result = getPartialTilePrefixes(ScheduleRange, 2);
+
+    EXPECT_TRUE(Result.is_equal(isl::set(ctx, "{[1]; [2]}")));
+  }
+
+  isl_ctx_free(ctx);
+}
+} // anonymous namespace
author	Michael Kruse <llvm-project@meinersbur.de>
	Sat, 5 Jun 2021 04:17:41 +0000 (23:17 -0500)
committer	Michael Kruse <llvm-project@meinersbur.de>
	Sat, 5 Jun 2021 04:22:30 +0000 (23:22 -0500)
polly/include/polly/MatmulOptimizer.h	[new file with mode: 0644]	patch \| blob
polly/include/polly/ScheduleOptimizer.h		patch \| blob \| history
polly/include/polly/ScheduleTreeTransform.h		patch \| blob \| history
polly/lib/CMakeLists.txt		patch \| blob \| history
polly/lib/Transform/MatmulOptimizer.cpp	[new file with mode: 0644]	patch \| blob
polly/lib/Transform/ScheduleOptimizer.cpp		patch \| blob \| history
polly/lib/Transform/ScheduleTreeTransform.cpp		patch \| blob \| history
polly/unittests/ScheduleOptimizer/CMakeLists.txt		patch \| blob \| history
polly/unittests/ScheduleOptimizer/ScheduleOptimizerTest.cpp	[deleted file]	patch \| blob \| history
polly/unittests/ScheduleOptimizer/ScheduleTreeTransformTest.cpp	[new file with mode: 0644]	patch \| blob