From 5d45f758f0fba3174126bda24b315006b8b48f1f Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Thu, 29 Oct 2020 14:28:01 -0700
Subject: [PATCH] [mlir][vector] Improve vector distribute integration test and fix block distribution

Fix the semantics of the distribute integration test based on offline
feedback. This exposed a bug in block distribution: the distribution id
must be multiplied by the stride of the distributed vector. For example,
distributing a vector<64xf32> op with multiplicity 32 gives each id a
vector<2xf32> slice, so memory indices must be scaled by 2. Fix the
transformation and the unit test.

Differential Revision: https://reviews.llvm.org/D89291
---
 .../Dialect/Vector/CPU/test-vector-distribute.mlir | 42 +++++++++------
 mlir/lib/Dialect/Vector/VectorTransforms.cpp       | 12 ++++-
 mlir/test/Dialect/Vector/vector-distribution.mlir  | 21 ++++++--
 mlir/test/lib/Transforms/TestVectorTransforms.cpp  | 62 ++++++++++++++++++++++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/mlir/integration_test/Dialect/Vector/CPU/test-vector-distribute.mlir b/mlir/integration_test/Dialect/Vector/CPU/test-vector-distribute.mlir
index befbcd8..b83b1f6 100644
--- a/mlir/integration_test/Dialect/Vector/CPU/test-vector-distribute.mlir
+++ b/mlir/integration_test/Dialect/Vector/CPU/test-vector-distribute.mlir
@@ -1,9 +1,18 @@
-// RUN: mlir-opt %s -test-vector-distribute-patterns=distribution-multiplicity=32 \
-// RUN: -convert-vector-to-scf -lower-affine -convert-scf-to-std -convert-vector-to-llvm | \
+// RUN: mlir-opt %s -test-vector-to-forloop -convert-vector-to-scf \
+// RUN: -lower-affine -convert-scf-to-std -convert-vector-to-llvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine \
+// RUN: -convert-scf-to-std -convert-vector-to-llvm | mlir-cpu-runner -e main \
+// RUN: -entry-point-result=void \
+// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+// RUN: mlir-opt %s -test-vector-to-forloop | FileCheck %s -check-prefix=TRANSFORM
+
+
 func @print_memref_f32(memref<*xf32>)
 
 func @alloc_1d_filled_inc_f32(%arg0: index, %arg1: f32) -> memref<?xf32> {
@@ -19,30 +28,29 @@ func @alloc_1d_filled_inc_f32(%arg0: index, %arg1: f32) -> memref<?xf32> {
   return %0 : memref<?xf32>
 }
 
-func @vector_add_cycle(%id : index, %A: memref<?xf32>, %B: memref<?xf32>, %C: memref<?xf32>) {
-  %c0 = constant 0 : index
-  %cf0 = constant 0.0 : f32
-  %a = vector.transfer_read %A[%c0], %cf0: memref<?xf32>, vector<64xf32>
-  %b = vector.transfer_read %B[%c0], %cf0: memref<?xf32>, vector<64xf32>
-  %acc = addf %a, %b: vector<64xf32>
-  vector.transfer_write %acc, %C[%c0]: vector<64xf32>, memref<?xf32>
-  return
-}
-
-// Loop over a function containinng a large add vector and distribute it so that
-// each iteration of the loop process part of the vector operation.
+// Large vector addf that can be broken down into a loop of smaller vector addf.
 func @main() {
+  %cf0 = constant 0.0 : f32
   %cf1 = constant 1.0 : f32
   %cf2 = constant 2.0 : f32
   %c0 = constant 0 : index
   %c1 = constant 1 : index
+  %c32 = constant 32 : index
   %c64 = constant 64 : index
   %out = alloc(%c64) : memref<?xf32>
   %in1 = call @alloc_1d_filled_inc_f32(%c64, %cf1) : (index, f32) -> memref<?xf32>
   %in2 = call @alloc_1d_filled_inc_f32(%c64, %cf2) : (index, f32) -> memref<?xf32>
-  scf.for %arg5 = %c0 to %c64 step %c1 {
-    call @vector_add_cycle(%arg5, %in1, %in2, %out) : (index, memref<?xf32>, memref<?xf32>, memref<?xf32>) -> ()
-  }
+  // Check that the transformation correctly happened.
+  // TRANSFORM: scf.for
+  // TRANSFORM: vector.transfer_read {{.*}} : memref<?xf32>, vector<2xf32>
+  // TRANSFORM: vector.transfer_read {{.*}} : memref<?xf32>, vector<2xf32>
+  // TRANSFORM: %{{.*}} = addf %{{.*}}, %{{.*}} : vector<2xf32>
+  // TRANSFORM: vector.transfer_write {{.*}} : vector<2xf32>, memref<?xf32>
+  // TRANSFORM: }
+  %a = vector.transfer_read %in1[%c0], %cf0: memref<?xf32>, vector<64xf32>
+  %b = vector.transfer_read %in2[%c0], %cf0: memref<?xf32>, vector<64xf32>
+  %acc = addf %a, %b: vector<64xf32>
+  vector.transfer_write %acc, %out[%c0]: vector<64xf32>, memref<?xf32>
   %converted = memref_cast %out : memref<?xf32> to memref<*xf32>
   call @print_memref_f32(%converted): (memref<*xf32>) -> ()
   // CHECK: Unranked{{.*}}data =
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
index 1d8d0c6..c24a1d5 100644
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -2526,9 +2526,13 @@ struct TransferReadExtractPattern
       return failure();
     edsc::ScopedContext scope(rewriter, read.getLoc());
     using mlir::edsc::op::operator+;
+    using mlir::edsc::op::operator*;
    using namespace mlir::edsc::intrinsics;
     SmallVector<Value, 4> indices(read.indices().begin(), read.indices().end());
-    indices.back() = indices.back() + extract.id();
+    indices.back() =
+        indices.back() +
+        (extract.id() *
+         std_constant_index(extract.getResultType().getDimSize(0)));
     Value newRead = vector_transfer_read(extract.getType(), read.memref(),
                                          indices, read.permutation_map(),
                                          read.padding(), ArrayAttr());
@@ -2552,10 +2556,14 @@ struct TransferWriteInsertPattern
       return failure();
     edsc::ScopedContext scope(rewriter, write.getLoc());
     using mlir::edsc::op::operator+;
+    using mlir::edsc::op::operator*;
     using namespace mlir::edsc::intrinsics;
     SmallVector<Value, 4> indices(write.indices().begin(), write.indices().end());
-    indices.back() = indices.back() + insert.id();
+    indices.back() =
+        indices.back() +
+        (insert.id() *
+         std_constant_index(insert.getSourceVectorType().getDimSize(0)));
     vector_transfer_write(insert.vector(), write.memref(), indices,
                           write.permutation_map(), ArrayAttr());
     rewriter.eraseOp(write);
diff --git a/mlir/test/Dialect/Vector/vector-distribution.mlir b/mlir/test/Dialect/Vector/vector-distribution.mlir
index 5fb32da..f93e96b 100644
--- a/mlir/test/Dialect/Vector/vector-distribution.mlir
+++ b/mlir/test/Dialect/Vector/vector-distribution.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -test-vector-distribute-patterns=distribution-multiplicity=32 | FileCheck %s
+// RUN: mlir-opt %s -test-vector-distribute-patterns=distribution-multiplicity=32 -split-input-file | FileCheck %s
 
 // CHECK-LABEL: func @distribute_vector_add
 // CHECK-SAME: (%[[ID:.*]]: index
@@ -13,6 +13,8 @@ func @distribute_vector_add(%id : index, %A: vector<32xf32>, %B: vector<32xf32>)
   return %0: vector<32xf32>
 }
 
+// -----
+
 // CHECK-LABEL: func @vector_add_read_write
 // CHECK-SAME: (%[[ID:.*]]: index
 // CHECK: %[[EXA:.*]] = vector.transfer_read %{{.*}}[%[[ID]]], %{{.*}} : memref<32xf32>, vector<1xf32>
@@ -34,12 +36,19 @@ func @vector_add_read_write(%id : index, %A: memref<32xf32>, %B: memref<32xf32>,
   return
 }
 
-// CHECK-LABEL: func @vector_add_cycle
+// -----
+
+// CHECK-DAG: #[[MAP0:map[0-9]+]] = affine_map<()[s0] -> (s0 * 2)>
+
+// CHECK: func @vector_add_cycle
 // CHECK-SAME: (%[[ID:.*]]: index
-// CHECK: %[[EXA:.*]] = vector.transfer_read %{{.*}}[%[[ID]]], %{{.*}} : memref<64xf32>, vector<2xf32>
-// CHECK-NEXT: %[[EXB:.*]] = vector.transfer_read %{{.*}}[%[[ID]]], %{{.*}} : memref<64xf32>, vector<2xf32>
+// CHECK: %[[ID1:.*]] = affine.apply #[[MAP0]]()[%[[ID]]]
+// CHECK-NEXT: %[[EXA:.*]] = vector.transfer_read %{{.*}}[%[[ID1]]], %{{.*}} : memref<64xf32>, vector<2xf32>
+// CHECK-NEXT: %[[ID2:.*]] = affine.apply #[[MAP0]]()[%[[ID]]]
+// CHECK-NEXT: %[[EXB:.*]] = vector.transfer_read %{{.*}}[%[[ID2]]], %{{.*}} : memref<64xf32>, vector<2xf32>
 // CHECK-NEXT: %[[ADD:.*]] = addf %[[EXA]], %[[EXB]] : vector<2xf32>
-// CHECK-NEXT: vector.transfer_write %[[ADD]], %{{.*}}[%[[ID]]] : vector<2xf32>, memref<64xf32>
+// CHECK-NEXT: %[[ID3:.*]] = affine.apply #[[MAP0]]()[%[[ID]]]
+// CHECK-NEXT: vector.transfer_write %[[ADD]], %{{.*}}[%[[ID3]]] : vector<2xf32>, memref<64xf32>
 // CHECK-NEXT: return
 func @vector_add_cycle(%id : index, %A: memref<64xf32>, %B: memref<64xf32>, %C: memref<64xf32>) {
   %c0 = constant 0 : index
@@ -51,6 +60,8 @@ func @vector_add_cycle(%id : index, %A: memref<64xf32>, %B: memref<64xf32>, %C:
   return
 }
 
+// -----
+
 // Negative test to make sure nothing is done in case the vector size is not a
 // multiple of multiplicity.
 // CHECK-LABEL: func @vector_negative_test
diff --git a/mlir/test/lib/Transforms/TestVectorTransforms.cpp b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
index 20903b30..e2a507f 100644
--- a/mlir/test/lib/Transforms/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestVectorTransforms.cpp
@@ -8,6 +8,7 @@
 
 #include <type_traits>
 
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
@@ -185,6 +186,64 @@
   }
 };
 
+struct TestVectorToLoopPatterns
+    : public PassWrapper<TestVectorToLoopPatterns, FunctionPass> {
+  TestVectorToLoopPatterns() = default;
+  TestVectorToLoopPatterns(const TestVectorToLoopPatterns &pass) {}
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<VectorDialect>();
+    registry.insert<AffineDialect>();
+  }
+  Option<int32_t> multiplicity{
+      *this, "distribution-multiplicity",
+      llvm::cl::desc("Set the multiplicity used for distributing vector"),
+      llvm::cl::init(32)};
+  void runOnFunction() override {
+    MLIRContext *ctx = &getContext();
+    OwningRewritePatternList patterns;
+    FuncOp func = getFunction();
+    func.walk([&](AddFOp op) {
+      // Check that the operation type can be broken down into a loop.
+      VectorType type = op.getType().dyn_cast<VectorType>();
+      if (!type || type.getRank() != 1 ||
+          type.getNumElements() % multiplicity != 0)
+        return mlir::WalkResult::advance();
+      auto filterAlloc = [](Operation *op) {
+        if (isa<ConstantOp, AllocOp, CallOp>(op))
+          return false;
+        return true;
+      };
+      auto dependentOps = getSlice(op, filterAlloc);
+      // Create a loop and move instructions from the Op slice into the loop.
+      OpBuilder builder(op);
+      auto zero = builder.create<ConstantOp>(
+          op.getLoc(), builder.getIndexType(),
+          builder.getIntegerAttr(builder.getIndexType(), 0));
+      auto one = builder.create<ConstantOp>(
+          op.getLoc(), builder.getIndexType(),
+          builder.getIntegerAttr(builder.getIndexType(), 1));
+      auto numIter = builder.create<ConstantOp>(
+          op.getLoc(), builder.getIndexType(),
+          builder.getIntegerAttr(builder.getIndexType(), multiplicity));
+      auto forOp = builder.create<scf::ForOp>(op.getLoc(), zero, numIter, one);
+      for (Operation *it : dependentOps) {
+        it->moveBefore(forOp.getBody()->getTerminator());
+      }
+      // Break up the original op and let the patterns propagate.
+      Optional<DistributeOps> ops = distributPointwiseVectorOp(
+          builder, op.getOperation(), forOp.getInductionVar(), multiplicity);
+      if (ops.hasValue()) {
+        SmallPtrSet<Operation *, 1> extractOp({ops->extract, ops->insert});
+        op.getResult().replaceAllUsesExcept(ops->insert.getResult(), extractOp);
+      }
+      return mlir::WalkResult::interrupt();
+    });
+    patterns.insert<PointwiseExtractPattern>(ctx);
+    populateVectorToVectorTransformationPatterns(patterns, ctx);
+    applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
+  }
+};
+
 struct TestVectorTransferUnrollingPatterns
     : public PassWrapper<TestVectorTransferUnrollingPatterns, FunctionPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -264,5 +323,8 @@ void registerTestVectorConversions() {
       "test-vector-distribute-patterns",
       "Test conversion patterns to distribute vector ops in the vector "
       "dialect");
+  PassRegistration<TestVectorToLoopPatterns> vectorToForLoop(
+      "test-vector-to-forloop",
+      "Test conversion patterns to break up a vector op into a for loop");
 }
 } // namespace mlir
-- 
2.7.4
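
Illustration (a sketch for reference, not part of the patch above; the
function and value names are hypothetical): with the default multiplicity
of 32, a vector<64xf32> op is distributed into vector<2xf32> slices, so
lane %id must address memory at offset %id * 2. That stride is exactly
what the affine map checked by the unit test encodes, and what the fixed
TransferReadExtractPattern/TransferWriteInsertPattern now emit:

#map0 = affine_map<()[s0] -> (s0 * 2)>

func @distributed_add_slice(%id : index, %A: memref<64xf32>, %C: memref<64xf32>) {
  %cf0 = constant 0.0 : f32
  // Scale the lane id by the per-lane vector size (the stride).
  %off = affine.apply #map0()[%id]
  // Each lane now reads and writes its own disjoint 2-element slice.
  // Before this fix the index was plain %id, so the 32 lanes overlapped
  // and touched only elements [0, 33) instead of tiling all 64.
  %a = vector.transfer_read %A[%off], %cf0 : memref<64xf32>, vector<2xf32>
  vector.transfer_write %a, %C[%off] : vector<2xf32>, memref<64xf32>
  return
}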