From a0e4081cf5e11556c8e3d3e022a17afca991b3fe Mon Sep 17 00:00:00 2001
From: Sanjoy Das
Date: Mon, 21 May 2018 11:11:48 -0700
Subject: [PATCH] Add a kernel usable as a GEBP inner loop for an LLVM IR GEMM

This is not used in any real code path, but I've added an escape hatch that
runs regular matrix multiplies through this kernel for testing purposes.

As far as I can tell this is functionally correct, but I don't yet have a
proper apples-to-apples performance comparison -- that'll have to wait till
the implementation is complete.

PiperOrigin-RevId: 197422075
---
 tensorflow/compiler/xla/service/cpu/cpu_options.cc |   8 +
 tensorflow/compiler/xla/service/cpu/cpu_options.h  |   1 +
 .../compiler/xla/service/cpu/dot_op_emitter.cc     | 292 ++++++++++++++++++++-
 .../compiler/xla/service/cpu/dot_op_emitter.h      |  16 ++
 .../xla/service/cpu/vector_support_library.h       |   6 +
 .../xla/service/llvm_ir/kernel_support_library.h   |   9 +
 6 files changed, 328 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index f9c51f2..e75fcb6 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -22,6 +22,8 @@ namespace {
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
+const char* const kXlaEnableExperimentalLlvmIrGemm =
+    "xla_enable_experimental_llvm_ir_gemm";
 
 }  // namespace
 
@@ -54,6 +56,12 @@ tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
   return tensorflow::gtl::nullopt;
 }
 
+bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
+  const auto& extra_options_map =
+      config.debug_options().xla_backend_extra_options();
+  return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
+}
+
 }  // namespace options
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index be62ff3..106dfbb 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -26,6 +26,7 @@ namespace options {
 
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
+bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
 tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
     const HloModuleConfig& config);
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 5cdfc11..a0b1146 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -521,6 +521,233 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
   }
 }
 
+// This class implements a tiled matrix multiplication algorithm, intended for
+// use as the innermost GEBP loop in a GEMM kernel (GEBP is described in "Goto,
+// Kazushige, and Robert Van De Geijn. "High-performance implementation of the
+// level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 (2008):
+// 4).
+//
+// This only supports canonical dot operations (i.e. where the lhs contraction
+// dimension is 1 and the rhs contraction dimension is 0) over row major
+// matrices.
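+//
+// Concretely, for a row major LHS of shape [m, k], a row major RHS of shape
+// [k, n] and a row major result of shape [m, n], the emitted kernel computes
+//
+//   result[i][j] += sum over l of (lhs[i][l] * rhs[l][j])
+//
+// i.e. it accumulates into `result`, which the caller is expected to have
+// zero initialized (EmitExperimentalGebpDotIfEnabled below memsets the target
+// buffer before invoking the emitter).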
+class MatrixMatrixBlockPanelEmitter {
+ public:
+  // Creates an instance of MatrixMatrixBlockPanelEmitter that matrix-multiplies
+  // `lhs` with `rhs` and stores the result in `result`.
+  //
+  // `m`, `k` and `n` are the matrix multiplication dimensions.
+  //
+  // `max_vectorization_width` is the maximum vector width (i.e. the width of
+  // the largest vector register we will use).  This can be larger than the
+  // largest vector register supported by the machine -- LLVM will legalize
+  // these large vector widths into legally sized vectors.
+  // `min_vectorization_width` is the smallest vector width the emitter will use
+  // -- below that it will devolve to using a scalar loop.
+  //
+  // `k_tiling_factor` is the number of elements along the reduction dimension
+  // that we will attempt to process at once.
+  explicit MatrixMatrixBlockPanelEmitter(
+      llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result, int64 m, int64 k,
+      int64 n, int max_vectorization_width, int min_vectorization_width,
+      int k_tiling_factor, const TargetMachineFeatures& target_machine_features,
+      llvm::IRBuilder<>* ir_builder, PrimitiveType primitive_type)
+      : lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        m_(m),
+        k_(k),
+        n_(n),
+        max_vectorization_width_(max_vectorization_width),
+        min_vectorization_width_(min_vectorization_width),
+        k_tiling_factor_(k_tiling_factor),
+        target_machine_features_(target_machine_features),
+        ir_builder_(ir_builder),
+        primitive_type_(primitive_type),
+        ksl_(ir_builder_) {
+    CHECK(max_vectorization_width > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width)));
+    CHECK(min_vectorization_width > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width)));
+    CHECK_GT(k_tiling_factor, 0);
+  }
+
+  void Emit();
+
+ private:
+  // We can only iterate the `n` dimension for an extent that is divisible by
+  // the vectorization width.  So we emit an outer loop that first processes the
+  // largest extent in `n` that is divisible by max_vectorization_width, then
+  // the largest remaining extent that is divisible by
+  // max_vectorization_width / 2 etc.  This function emits that outermost loop.
+  void EmitChunkedLoopOverN();
+
+  // This emits a loop that iterates over the `k` dimension in multiples of
+  // `k_tiling_factor` as much as possible and then emits a remainder epilogue.
+  void EmitLoopOverK(VectorSupportLibrary* vsl, llvm::Value* n_start,
+                     llvm::Value* n_end);
+
+  // This emits the inner reduction loop, which processes all indices in the
+  // `m` dimension, [`k_start`, `k_end`) in the `k` dimension and
+  // [`n_start`, `n_end`) in the `n` dimension.
+  void EmitInnerLoop(int64 k_tiling_factor, llvm::Value* k_start,
+                     llvm::Value* k_end, llvm::Value* n_start,
+                     llvm::Value* n_end, VectorSupportLibrary* vsl);
+
+  llvm::Value* getInt64(int64 value) { return ir_builder_->getInt64(value); }
+
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  int64 m_;
+  int64 k_;
+  int64 n_;
+
+  int64 max_vectorization_width_;
+  int64 min_vectorization_width_;
+  int64 k_tiling_factor_;
+
+  const TargetMachineFeatures& target_machine_features_;
+  llvm::IRBuilder<>* ir_builder_;
+  PrimitiveType primitive_type_;
+  KernelSupportLibrary ksl_;
+};
+
+void MatrixMatrixBlockPanelEmitter::Emit() { EmitChunkedLoopOverN(); }
+
+void MatrixMatrixBlockPanelEmitter::EmitChunkedLoopOverN() {
+  int64 current_vectorization_width = max_vectorization_width_;
+  int64 n_start = 0;
+  while (n_start != n_ &&
+         current_vectorization_width >= min_vectorization_width_) {
+    int64 n_end = n_ - (n_ % current_vectorization_width);
+    if (n_start != n_end) {
+      VectorSupportLibrary vsl(primitive_type_, current_vectorization_width,
+                               ir_builder_, "gebp");
+      EmitLoopOverK(&vsl, getInt64(n_start), getInt64(n_end));
+      n_start = n_end;
+    }
+    current_vectorization_width /= 2;
+  }
+
+  if (n_start != n_) {
+    VectorSupportLibrary vsl(primitive_type_, 1, ir_builder_, "gebp");
+    ksl_.For("epi.n", n_start, n_, 1, [&](llvm::Value* n_i) {
+      llvm::Value* n_i_next =
+          ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1));
+      EmitLoopOverK(&vsl, n_i, n_i_next);
+    });
+  }
+}
+
+void MatrixMatrixBlockPanelEmitter::EmitLoopOverK(VectorSupportLibrary* vsl,
+                                                  llvm::Value* n_start,
+                                                  llvm::Value* n_end) {
+  int64 k_start = 0;
+  int64 k_end = k_ - (k_ % k_tiling_factor_);
+  if (k_end != k_start) {
+    EmitInnerLoop(k_tiling_factor_, getInt64(k_start), getInt64(k_end), n_start,
+                  n_end, vsl);
+    k_start = k_end;
+  }
+
+  if (k_start != k_) {
+    EmitInnerLoop(k_ - k_start, getInt64(k_start), getInt64(k_), n_start, n_end,
+                  vsl);
+  }
+}
+
+// The tiling scheme is as follows:
+//
+// Let the LHS be:
+//
+//   +---+---+---+
+//   | a | b | c | .
+//   +---+---+---+ .
+//   |   |   |   | .
+//   +---+---+---+
+//     ..      ..
+//
+// and the RHS be:
+//
+//   +----+----+----+----+
+//   | p0 | p1 | p2 | p3 | .
+//   +----+----+----+----+ .
+//   | q0 | q1 | q2 | q3 | .
+//   +----+----+----+----+
+//   | r0 | r1 | r2 | r3 | .
+//   +----+----+----+----+ .
+//   ......        ......
+//
+// and let k_tiling_factor be 3 and the vector width (implicitly denoted by
+// `vsl`) be 4.
+//
+// Then we
+//
+//   1. broadcast the first row in LHS to 3 vectors of width 4
+//   2. elementwise multiply the RHS rows with these broadcasted vectors
+//   3. elementwise add them:
+//
+//   +---+---+---+---+   +----+----+----+----+
+//   | a | a | a | a | * | p0 | p1 | p2 | p3 |   +
+//   +---+---+---+---+   +----+----+----+----+
+//
+//   +---+---+---+---+   +----+----+----+----+
+//   | b | b | b | b | * | q0 | q1 | q2 | q3 |   +
+//   +---+---+---+---+   +----+----+----+----+
+//
+//   +---+---+---+---+   +----+----+----+----+
+//   | c | c | c | c | * | r0 | r1 | r2 | r3 |
+//   +---+---+---+---+   +----+----+----+----+
+//
+// to get:
+//
+//   +----------------+----------------+----------------+----------------+
+//   | a*p0+b*q0+c*r0 | a*p1+b*q1+c*r1 | a*p2+b*q2+c*r2 | a*p3+b*q3+c*r3 |
+//   +----------------+----------------+----------------+----------------+
+//
+// which we increment into the appropriate region in the result.
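+//
+// In scalar terms the emitted loop nest is equivalent to the following
+// (modulo the epilogue loops that handle the leftover iterations in the `k`
+// and `n` dimensions):
+//
+//   for (i = 0; i < m; i++)                     // "dot.m"
+//     for (l = 0; l < k; l += k_tiling_factor)  // "dot.k"
+//       for (j = 0; j < n; j += vector_width)   // "dot.n"
+//         for (t = 0; t < k_tiling_factor; t++)
+//           for (v = 0; v < vector_width; v++)
+//             result[i][j + v] += lhs[i][l + t] * rhs[l + t][j + v];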
+void MatrixMatrixBlockPanelEmitter::EmitInnerLoop(
+    int64 k_tiling_factor, llvm::Value* k_start, llvm::Value* k_end,
+    llvm::Value* n_start, llvm::Value* n_end, VectorSupportLibrary* vsl) {
+  ksl_.For("dot.m", 0, m_, 1, [&](llvm::Value* m_i) {
+    // This outer loop iterates over all of the M dimension.
+    llvm::Value* result_row_begin = vsl->ComputeOffsetPointer(
+        result_, /*offset_elements=*/m_i, /*scale=*/n_);
+    llvm::Value* lhs_row_begin =
+        vsl->ComputeOffsetPointer(lhs_, /*offset_elements=*/m_i, /*scale=*/k_);
+
+    ksl_.For("dot.k", k_start, k_end, k_tiling_factor, [&](llvm::Value* k_i) {
+      // broadcasted_a is the broadcasted set of vectors denoted as
+      // <a, a, a, a>, <b, b, b, b> etc. in the diagram.
+      std::vector<llvm::Value*> broadcasted_a;
+      broadcasted_a.reserve(k_tiling_factor);
+      for (int i = 0; i < k_tiling_factor; i++) {
+        broadcasted_a.push_back(vsl->LoadBroadcast(
+            lhs_row_begin, ir_builder_->CreateAdd(getInt64(i), k_i)));
+      }
+
+      // rhs_loader will be used to load the tile off of the RHS, denoted as
+      // <<p0, p1, p2, p3>, <q0, q1, q2, q3>, ...> in the diagram.
+      TileLoader rhs_loader(vsl, ir_builder_, rhs_, n_, k_i, k_tiling_factor);
+      ksl_.For(
+          "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+            // This loop iterates over the N dimension.  It loads the tile from
+            // the RHS, does the FMA resulting in the <a*p0+b*q0+c*r0, ...>
+            // vector in the diagram, and increments the result.
+            std::vector<llvm::Value*> tile = rhs_loader.LoadTile(n_i);
+            llvm::Value* result_accumulator =
+                vsl->LoadVector(result_row_begin, n_i);
+            for (int i = 0; i < tile.size(); i++) {
+              result_accumulator =
+                  vsl->MulAdd(tile[i], broadcasted_a[i], result_accumulator);
+            }
+            vsl->StoreVector(result_accumulator, result_row_begin, n_i);
+          });
+    });
+  });
+}
+
 }  // namespace
 
 DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
   return dot_emitter.Emit();
 }
 
+bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
+    const DotOpEmitter::MatMultDims& mat_mult_dims) {
+  if (!EnableExperimentalLlvmIrGemm() || ShouldUseMultiThreadedEigen()) {
+    return false;
+  }
+
+  if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) {
+    return false;
+  }
+
+  PrimitiveType primitive_type = dot_.shape().element_type();
+
+  switch (primitive_type) {
+    default:
+      return false;
+
+    case F32:
+    case F64:
+    case S32:
+    case S64:
+      break;
+  }
+
+  if (!(mat_mult_dims.lhs_column_major == mat_mult_dims.rhs_column_major &&
+        mat_mult_dims.rhs_column_major == mat_mult_dims.target_column_major)) {
+    return false;
+  }
+
+  VLOG(2) << "Emitting GEBP kernel in LLVM IR";
+
+  llvm::Value* lhs = lhs_array_.GetBasePointer();
+  llvm::Value* rhs = rhs_array_.GetBasePointer();
+  llvm::Value* target = target_array_.GetBasePointer();
+  int64 m = mat_mult_dims.m;
+  int64 k = mat_mult_dims.k;
+  int64 n = mat_mult_dims.n;
+
+  if (mat_mult_dims.lhs_column_major) {
+    std::swap(lhs, rhs);
+    std::swap(m, n);
+  }
+
+  int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
+  ir_builder_->CreateMemSet(
+      target, ir_builder_->getInt8(0), size_bytes,
+      target_machine_features_.minimum_alignment_for_allocation(size_bytes));
+
+  MatrixMatrixBlockPanelEmitter gebp_emitter(
+      /*lhs=*/lhs, /*rhs=*/rhs, /*result=*/target, /*m=*/m, /*k=*/k, /*n=*/n,
+      /*max_vectorization_width=*/8, /*min_vectorization_width=*/4,
+      /*k_tiling_factor=*/8, target_machine_features_, ir_builder_,
+      primitive_type);
+  gebp_emitter.Emit();
+  return true;
+}
+
 bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   if (dot_.shape().dimensions_size() != 2) {
     return false;
   }
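The layout handling in EmitExperimentalGebpDotIfEnabled above rests on a
transpose identity: a column major matrix reinterpreted as a row major buffer
is its transpose, so when `lhs`, `rhs` and the target all share a column major
layout the row major kernel can still be used by computing

    result^T = rhs^T * lhs^T

over the row major views -- which is exactly what swapping `lhs` with `rhs`
and `m` with `n` before constructing the emitter achieves (the contraction
dimension `k` is unchanged).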
@@ -610,7 +893,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   }
 
   if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
-    return false;
+    return EmitExperimentalGebpDotIfEnabled(mat_mult_dims);
   }
 
   int64 tiling_factor = GetGemvTilingFactor();
@@ -909,8 +1192,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded =
-      hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool multi_threaded = ShouldUseMultiThreadedEigen();
   bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
@@ -1019,7 +1301,9 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
       /*lhs_column_major=*/LayoutUtil::Minor(lhs_shape.layout(), 0) == 0,
       /*lhs_non_canonical=*/dim_nums.lhs_contracting_dimensions(0) == 0,
       /*rhs_column_major=*/LayoutUtil::Minor(rhs_shape.layout(), 0) == 0,
-      /*rhs_non_canonical=*/dim_nums.rhs_contracting_dimensions(0) == 1};
+      /*rhs_non_canonical=*/dim_nums.rhs_contracting_dimensions(0) == 1,
+      /*target_column_major=*/
+      LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
 llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index a75b8ff..d88ccea 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -123,6 +123,9 @@ class DotOpEmitter {
 
     // True if the RHS contraction dimension is not 0.
     bool rhs_non_canonical;
+
+    // True if the result matrix is column major.
+    bool target_column_major;
   };
 
   // Get the MatMultDims instance for the dot product this DotOpEmitter
@@ -130,6 +133,8 @@ class DotOpEmitter {
   // of rank 2 as well).
   MatMultDims GetMatMultDims() const;
 
+  bool EmitExperimentalGebpDotIfEnabled(const MatMultDims& mat_mult_dims);
+
   // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
   // registers.
   int64 GetGemvTilingFactor() const {
@@ -138,6 +143,17 @@
         .value_or(kDefaultTilingFactor);
   }
 
+  // Returns true if we should use an experimental implementation of GEMM
+  // (general matrix matrix multiplication) if possible.
+  bool EnableExperimentalLlvmIrGemm() const {
+    return options::EnableExperimentalLlvmIrGemm(hlo_module_config_);
+  }
+
+  // Returns true if we should call into multi-threaded Eigen routines.
+  bool ShouldUseMultiThreadedEigen() {
+    return hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  }
+
   const HloInstruction& dot_;
   const llvm_ir::IrArray& target_array_;
   const llvm_ir::IrArray& lhs_array_;
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 6479bf7..edcaec5 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -144,6 +144,12 @@ class VectorSupportLibrary {
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     llvm::Value* offset_elements);
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
+                                    llvm::Value* offset_elements, int64 scale) {
+    return ComputeOffsetPointer(
+        base_pointer,
+        ir_builder_->CreateMul(ir_builder_->getInt64(scale), offset_elements));
+  }
+  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     int64 offset_elements) {
     return ComputeOffsetPointer(base_pointer,
                                 ir_builder()->getInt64(offset_elements));
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 1c00b2a..64b935b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -101,6 +101,15 @@ class KernelSupportLibrary {
   }
 
   void For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      int64 step,
+      const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
+    For(name, start, end, ir_builder_->getInt64(step),
+        /*peel_first_iteration=*/false,
+        [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
+  }
+
+  void For(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
     For(name, /*start=*/ir_builder_->getInt64(start),
-- 
2.7.4
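A note on exercising the escape hatch: EnableExperimentalLlvmIrGemm only checks
for the presence of the "xla_enable_experimental_llvm_ir_gemm" key in the
backend extra options map, so any value (including the empty string) turns it
on. Below is a minimal sketch of flipping it programmatically, assuming the
standard protobuf map accessor generated for
DebugOptions::xla_backend_extra_options and HloModuleConfig's set_debug_options;
the surrounding setup is illustrative and not part of this change.

  // Illustrative only: enable the experimental LLVM IR GEMM kernel.
  xla::DebugOptions debug_options;
  (*debug_options.mutable_xla_backend_extra_options())
      ["xla_enable_experimental_llvm_ir_gemm"] = "";

  xla::HloModuleConfig config;
  config.set_debug_options(debug_options);
  // options::EnableExperimentalLlvmIrGemm(config) now returns true, so
  // eligible dots (canonical, consistent layouts, F32/F64/S32/S64, and not
  // routed to multi-threaded Eigen) go through the GEBP kernel.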