From 79da91c59aeea3b126ec9a0d6f403a8c0f59e4dc Mon Sep 17 00:00:00 2001
From: Alex Zinenko
Date: Wed, 10 Mar 2021 20:25:49 +0100
Subject: [PATCH] Revert "[mlir][Vector][Affine] Improve affine vectorizer
 algorithm"

This reverts commit 95db7b4aeaad590f37720898e339a6d54313422f.

This breaks the vectorize_2d.mlir and vectorize_3d.mlir tests under ASAN
(use-after-free).
---
 mlir/include/mlir/Dialect/Vector/VectorUtils.h     |   12 +-
 .../Dialect/Affine/Transforms/SuperVectorize.cpp   | 1026 ++++++++++----------
 mlir/lib/Dialect/Vector/VectorUtils.cpp            |   28 +-
 .../Affine/SuperVectorize/vectorize_1d.mlir        |  115 +--
 4 files changed, 542 insertions(+), 639 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/VectorUtils.h
index 56f8f62..6666032 100644
--- a/mlir/include/mlir/Dialect/Vector/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/VectorUtils.h
@@ -19,7 +19,6 @@ namespace mlir {
 class AffineApplyOp;
 class AffineForOp;
 class AffineMap;
-class Block;
 class Location;
 class OpBuilder;
 class Operation;
@@ -99,10 +98,8 @@ Optional<SmallVector<int64_t, 4>> shapeRatio(VectorType superVectorType,
 /// Note that loopToVectorDim is a whole function map from which only enclosing
 /// loop information is extracted.
 ///
-/// Prerequisites: `indices` belong to a vectorizable load or store operation
-/// (i.e. at most one invariant index along each AffineForOp of
-/// `loopToVectorDim`). `insertPoint` is the insertion point for the vectorized
-/// load or store operation.
+/// Prerequisites: `opInst` is a vectorizable load or store operation (i.e. at
+/// most one invariant index along each AffineForOp of `loopToVectorDim`).
 ///
 /// Example 1:
 /// The following MLIR snippet:
@@ -154,10 +151,7 @@ Optional<SmallVector<int64_t, 4>> shapeRatio(VectorType superVectorType,
 /// `%arg0[%c0, %c0]` into vector<128xf32> which needs a 1-D vector broadcast.
 ///
 AffineMap
-makePermutationMap(Block *insertPoint, ArrayRef<Value> indices,
-                   const DenseMap<Operation *, unsigned> &loopToVectorDim);
-AffineMap
-makePermutationMap(Operation *insertPoint, ArrayRef<Value> indices,
+makePermutationMap(Operation *op, ArrayRef<Value> indices,
                    const DenseMap<Operation *, unsigned> &loopToVectorDim);
 
 /// Build the default minor identity map suitable for a vector transfer. This
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index c27b3e3..3540b51 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -14,13 +14,28 @@
 #include "PassDetail.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Analysis/NestedMatcher.h"
+#include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Dialect/Vector/VectorUtils.h"
-#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Types.h"
 #include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/FoldUtils.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 
 using namespace mlir;
 using namespace vector;
@@ -237,38 +252,61 @@ using namespace vector;
 ///      fastest varying) ;
 ///   2. analyzing those patterns for profitability (TODO: and
 ///      interference);
-///   3. then, for each pattern in order:
-///     a. applying iterative rewriting of the loops and all their nested
-///        operations in topological order. Rewriting is implemented by
-///        coarsening the loops and converting operations and operands to their
-///        vector forms. Processing operations in topological order is
-///        relatively simple due to the structured nature of the control-flow
-///        representation. This order ensures that all the operands of a given
-///        operation have been vectorized before the operation itself in a
-///        single traversal, except for operands defined outside of the loop
-///        nest. The algorithm can convert the following operations to their
-///        vector form:
-///          * Affine load and store operations are converted to opaque vector
-///            transfer read and write operations.
-///          * Scalar constant operations/operands are converted to vector
-///            constant operations (splat).
-///          * Uniform operands (only operands defined outside of the loop
-///            nest, for now) are broadcasted to a vector.
-///            TODO: Support more uniform cases.
-///          * The remaining operations in the loop nest are vectorized by
-///            widening their scalar types to vector types.
-///          * TODO: Add vectorization support for loops with 'iter_args' and
-///            more complex loops with divergent lbs and/or ubs.
-///     b. if everything under the root AffineForOp in the current pattern
-///        is vectorized properly, we commit that loop to the IR and remove the
-///        scalar loop. Otherwise, we discard the vectorized loop and keep the
-///        original scalar loop.
-///     c. vectorization is applied on the next pattern in the list. Because
+///   3. Then, for each pattern in order:
+///     a. applying iterative rewriting of the loop and the load operations in
+///        inner-to-outer order. Rewriting is implemented by coarsening the
+///        loops and turning load operations into opaque vector.transfer_read
+///        ops;
+///     b. keeping track of the load operations encountered as "roots" and the
+///        store operations as "terminals";
+///     c. traversing the use-def chains starting from the roots and
+///        iteratively propagating vectorized values. Scalar values that are
+///        encountered during this process must come from outside the scope of
+///        the current pattern (TODO: enforce this and generalize). Such a
+///        scalar value is vectorized only if it is a constant (into a vector
+///        splat). The non-constant case is not supported for now and results
+///        in the pattern failing to vectorize;
+///     d. performing a second traversal on the terminals (store ops) to
+///        rewrite the scalar value they write to memory into vector form.
+///        If the scalar value has been vectorized previously, we simply
+///        replace it by its vector form. Otherwise, if the scalar value is a
+///        constant, it is vectorized into a splat. In all other cases,
+///        vectorization for the pattern currently fails.
+///     e. if everything under the root AffineForOp in the current pattern
+///        vectorizes properly, we commit that loop to the IR. Otherwise we
+///        discard it and restore a previously cloned version of the loop.
+///        Thanks to the recursive scoping nature of matchers and captured
+///        patterns, this is transparently achieved by a simple RAII
+///        implementation.
+///     f. vectorization is applied on the next pattern in the list. Because
+///        pattern interference avoidance is not yet implemented and that we
+///        do not support further vectorizing an already vector load we need
+///        to re-verify that the pattern is still vectorizable. This is
+///        expected to make cost models more difficult to write and is subject
+///        to improvement in the future.
 ///
+/// Points c. and d. above are worth additional comment. In most passes that
+/// do not change the type of operands, it is usually preferred to eagerly
+/// `replaceAllUsesWith`. Unfortunately this does not work for vectorization
+/// because during the use-def chain traversal, all the operands of an
+/// operation must be available in vector form. Trying to propagate eagerly
+/// makes the IR temporarily invalid and results in errors such as:
+///   `vectorize.mlir:308:13: error: 'addf' op requires the same type for all
+///   operands and results
+///   %s5 = addf %a5, %b5 : f32`
+///
+/// Lastly, we show a minimal example for which use-def chains rooted in load /
+/// vector.transfer_read are not enough. This is what motivated splitting
+/// terminal processing out of the use-def chains starting from loads. In the
+/// following snippet, there is simply no load:
+/// ```mlir
+/// func @fill(%A : memref<128xf32>) -> () {
+///   %f1 = constant 1.0 : f32
+///   affine.for %i0 = 0 to 32 {
+///     affine.store %f1, %A[%i0] : memref<128xf32, 0>
+///   }
+///   return
+/// }
+/// ```
+///
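For illustration, vectorizing `@fill` above by 128 along `%i0` rewrites the terminal store into a `vector.transfer_write` of a splat constant. The following is a hand-written sketch of the expected output, not verbatim compiler output; note that the super-vector step coarsening is applied even though the trip count is smaller than the vector size:

```mlir
func @fill(%A : memref<128xf32>) -> () {
  affine.for %i0 = 0 to 32 step 128 {
    // The stored scalar constant is vectorized into a splat ...
    %cst = constant dense<1.0> : vector<128xf32>
    // ... and the terminal store becomes an opaque vector transfer.
    vector.transfer_write %cst, %A[%i0] : vector<128xf32>, memref<128xf32>
  }
  return
}
```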
 /// Choice of loop transformation to support the algorithm:
 /// =======================================================
 /// The choice of loop transformation to apply for coarsening vectorized loops
@@ -489,6 +527,7 @@ using namespace vector;
 #define DEBUG_TYPE "early-vect"
 
 using llvm::dbgs;
+using llvm::SetVector;
 
 /// Forward declaration.
 static FilterFunctionType
@@ -593,196 +632,199 @@ static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches,
 namespace {
 
 struct VectorizationState {
-
-  VectorizationState(MLIRContext *context) : builder(context) {}
-
-  /// Registers the vector replacement of a scalar operation and its result
-  /// values. Both operations must have the same number of results.
-  ///
-  /// This utility is used to register the replacement for the vast majority
-  /// of the vectorized operations.
-  ///
-  /// Example:
-  ///   * 'replaced': %0 = addf %1, %2 : f32
-  ///   * 'replacement': %0 = addf %1, %2 : vector<128xf32>
-  void registerOpVectorReplacement(Operation *replaced,
-                                   Operation *replacement);
-
-  /// Registers the vector replacement of a scalar value. The replacement
-  /// operation should have a single result, which replaces the scalar value.
-  ///
-  /// This utility is used to register the vector replacement of block
-  /// arguments and operation results which are not directly vectorized (i.e.,
-  /// their scalar version still exists after vectorization), like uniforms.
-  ///
-  /// Example:
-  ///   * 'replaced': block argument or operation outside of the vectorized
-  ///     loop.
-  ///   * 'replacement': %0 = vector.broadcast %1 : f32 to vector<128xf32>
-  void registerValueVectorReplacement(Value replaced, Operation *replacement);
-
-  /// Registers the scalar replacement of a scalar value. 'replacement' must
-  /// be scalar. Both values must be block arguments. Operation results should
-  /// be replaced using the 'registerOp*' utilities.
-  ///
-  /// This utility is used to register the replacement of block arguments
-  /// that are within the loop to be vectorized and will continue being scalar
-  /// within the vector loop.
-  ///
-  /// Example:
-  ///   * 'replaced': induction variable of a loop to be vectorized.
-  ///   * 'replacement': new induction variable in the new vector loop.
-  void registerValueScalarReplacement(BlockArgument replaced,
-                                      BlockArgument replacement);
-
-  /// Returns in 'replacedVals' the scalar replacement for values in
-  /// 'inputVals'.
-  void getScalarValueReplacementsFor(ValueRange inputVals,
-                                     SmallVectorImpl<Value> &replacedVals);
-
-  /// Erases the scalar loop nest after its successful vectorization.
-  void finishVectorizationPattern(AffineForOp rootLoop);
-
-  // Used to build and insert all the new operations created. The insertion
-  // point is preserved and updated along the vectorization process.
-  OpBuilder builder;
-
-  // Maps input scalar operations to their vector counterparts.
-  DenseMap<Operation *, Operation *> opVectorReplacement;
-  // Maps input scalar values to their vector counterparts.
-  BlockAndValueMapping valueVectorReplacement;
-  // Maps input scalar values to their new scalar counterparts in the vector
-  // loop nest.
-  BlockAndValueMapping valueScalarReplacement;
-
-  // Maps the newly created vector loops to their vector dimension.
-  DenseMap<Operation *, unsigned> vecLoopToVecDim;
-
+  /// Adds an entry of pre/post vectorization operations in the state.
+  void registerReplacement(Operation *key, Operation *value);
+  /// When the current vectorization pattern is successful, this erases the
+  /// operations that were marked for erasure in the proper order and resets
+  /// the internal state for the next pattern.
+  void finishVectorizationPattern();
+
+  // In-order tracking of original Operations that have been vectorized.
+  // Erase in reverse order.
+  SmallVector<Operation *, 16> toErase;
+  // Set of Operations that have been vectorized (the values in the
+  // vectorizationMap for hashed access). The vectorizedSet is used in
+  // particular to filter the operations that have already been vectorized by
+  // this pattern, when iterating over nested loops in this pattern.
+  DenseSet<Operation *> vectorizedSet;
+  // Map of old scalar Operation to new vectorized Operation.
+  DenseMap<Operation *, Operation *> vectorizationMap;
+  // Map of old scalar Value to new vectorized Value.
+  DenseMap<Value, Value> replacementMap;
   // The strategy drives which loop to vectorize by which amount.
   const VectorizationStrategy *strategy;
+  // Use-def roots. These represent the starting points for the worklist in
+  // the vectorizeNonTerminals function. They consist of the subset of load
+  // operations that have been vectorized. They can be retrieved from
+  // `vectorizationMap` but it is convenient to keep track of them in a
+  // separate data structure.
+  DenseSet<Operation *> roots;
+  // Terminal operations for the worklist in the vectorizeNonTerminals
+  // function. They consist of the subset of store operations that have been
+  // vectorized. They can be retrieved from `vectorizationMap` but it is
+  // convenient to keep track of them in a separate data structure. Since they
+  // do not necessarily belong to use-def chains starting from loads (e.g
+  // storing a constant), we need to handle them in a post-pass.
+  DenseSet<Operation *> terminals;
+  // Checks that the type of `op` is AffineStoreOp and adds it to the
+  // terminals set.
+  void registerTerminal(Operation *op);
+  // Folder used to factor out constant creation.
+  OperationFolder *folder;
 
 private:
-  /// Internal implementation to map input scalar values to new vector or
-  /// scalar values.
-  void registerValueVectorReplacementImpl(Value replaced, Value replacement);
-  void registerValueScalarReplacementImpl(Value replaced, Value replacement);
+  void registerReplacement(Value key, Value value);
 };
 
 } // end namespace
 
-/// Registers the vector replacement of a scalar operation and its result
-/// values. Both operations must have the same number of results.
-///
-/// This utility is used to register the replacement for the vast majority of
-/// the vectorized operations.
-///
-/// Example:
-///   * 'replaced': %0 = addf %1, %2 : f32
-///   * 'replacement': %0 = addf %1, %2 : vector<128xf32>
-void VectorizationState::registerOpVectorReplacement(Operation *replaced,
-                                                     Operation *replacement) {
-  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op:\n");
-  LLVM_DEBUG(dbgs() << *replaced << "\n");
-  LLVM_DEBUG(dbgs() << "into\n");
-  LLVM_DEBUG(dbgs() << *replacement << "\n");
-
-  assert(replaced->getNumResults() <= 1 && "Unsupported multi-result op");
-  assert(replaced->getNumResults() == replacement->getNumResults() &&
-         "Unexpected replaced and replacement results");
-  assert(opVectorReplacement.count(replaced) == 0 && "already registered");
-  opVectorReplacement[replaced] = replacement;
-
-  if (replaced->getNumResults() > 0)
-    registerValueVectorReplacementImpl(replaced->getResult(0),
-                                       replacement->getResult(0));
-}
-
-/// Registers the vector replacement of a scalar value. The replacement
-/// operation should have a single result, which replaces the scalar value.
-///
-/// This utility is used to register the vector replacement of block arguments
-/// and operation results which are not directly vectorized (i.e., their
-/// scalar version still exists after vectorization), like uniforms.
-///
-/// Example:
-///   * 'replaced': block argument or operation outside of the vectorized loop.
-///   * 'replacement': %0 = vector.broadcast %1 : f32 to vector<128xf32>
-void VectorizationState::registerValueVectorReplacement(
-    Value replaced, Operation *replacement) {
-  assert(replacement->getNumResults() == 1 &&
-         "Expected single-result replacement");
-  if (Operation *defOp = replaced.getDefiningOp())
-    registerOpVectorReplacement(defOp, replacement);
-  else
-    registerValueVectorReplacementImpl(replaced, replacement->getResult(0));
-}
-
-void VectorizationState::registerValueVectorReplacementImpl(
-    Value replaced, Value replacement) {
-  assert(!valueVectorReplacement.contains(replaced) &&
-         "Vector replacement already registered");
-  assert(replacement.getType().isa<VectorType>() &&
-         "Expected vector type in vector replacement");
-  valueVectorReplacement.map(replaced, replacement);
-}
-
-/// Registers the scalar replacement of a scalar value. 'replacement' must be
-/// scalar. Both values must be block arguments. Operation results should be
-/// replaced using the 'registerOp*' utilities.
-///
-/// This utility is used to register the replacement of block arguments
-/// that are within the loop to be vectorized and will continue being scalar
-/// within the vector loop.
-///
-/// Example:
-///   * 'replaced': induction variable of a loop to be vectorized.
-///   * 'replacement': new induction variable in the new vector loop.
-void VectorizationState::registerValueScalarReplacement(
-    BlockArgument replaced, BlockArgument replacement) {
-  registerValueScalarReplacementImpl(replaced, replacement);
-}
-
-void VectorizationState::registerValueScalarReplacementImpl(
-    Value replaced, Value replacement) {
-  assert(!valueScalarReplacement.contains(replaced) &&
-         "Scalar value replacement already registered");
-  assert(!replacement.getType().isa<VectorType>() &&
-         "Expected scalar type in scalar replacement");
-  valueScalarReplacement.map(replaced, replacement);
+void VectorizationState::registerReplacement(Operation *key,
+                                             Operation *value) {
+  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: ");
+  LLVM_DEBUG(key->print(dbgs()));
+  LLVM_DEBUG(dbgs() << " into ");
+  LLVM_DEBUG(value->print(dbgs()));
+  assert(key->getNumResults() == 1 && "expected single result");
+  assert(value->getNumResults() == 1 && "expected single result");
+  assert(vectorizedSet.count(value) == 0 && "already registered");
+  assert(vectorizationMap.count(key) == 0 && "already registered");
+  toErase.push_back(key);
+  vectorizedSet.insert(value);
+  vectorizationMap.insert(std::make_pair(key, value));
+  registerReplacement(key->getResult(0), value->getResult(0));
+  if (isa<AffineLoadOp>(key)) {
+    assert(roots.count(key) == 0 && "root was already inserted previously");
+    roots.insert(key);
+  }
 }
 
-/// Returns in 'replacedVals' the scalar replacement for values in 'inputVals'.
-void VectorizationState::getScalarValueReplacementsFor(
-    ValueRange inputVals, SmallVectorImpl<Value> &replacedVals) {
-  for (Value inputVal : inputVals)
-    replacedVals.push_back(valueScalarReplacement.lookupOrDefault(inputVal));
+void VectorizationState::registerTerminal(Operation *op) {
+  assert(isa<AffineStoreOp>(op) && "terminal must be an AffineStoreOp");
+  assert(terminals.count(op) == 0 &&
+         "terminal was already inserted previously");
+  terminals.insert(op);
 }
 
-/// Erases a loop nest, including all its nested operations.
-static void eraseLoopNest(AffineForOp forOp) {
-  LLVM_DEBUG(dbgs() << "[early-vect]+++++ erasing:\n" << forOp << "\n");
-  forOp.erase();
+void VectorizationState::finishVectorizationPattern() {
+  while (!toErase.empty()) {
    auto *op = toErase.pop_back_val();
    LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: ");
    LLVM_DEBUG(op->print(dbgs()));
    op->erase();
  }
 }
 
-/// Erases the scalar loop nest after its successful vectorization.
-void VectorizationState::finishVectorizationPattern(AffineForOp rootLoop) {
-  LLVM_DEBUG(dbgs() << "\n[early-vect] Finalizing vectorization\n");
-  eraseLoopNest(rootLoop);
+void VectorizationState::registerReplacement(Value key, Value value) {
+  assert(replacementMap.count(key) == 0 && "replacement already registered");
+  replacementMap.insert(std::make_pair(key, value));
 }
 
 // Apply 'map' with 'mapOperands' returning resulting values in 'results'.
 static void computeMemoryOpIndices(Operation *op, AffineMap map,
                                    ValueRange mapOperands,
-                                   VectorizationState &state,
                                    SmallVectorImpl<Value> &results) {
+  OpBuilder builder(op);
   for (auto resultExpr : map.getResults()) {
     auto singleResMap =
         AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
-    auto afOp = state.builder.create<AffineApplyOp>(op->getLoc(), singleResMap,
-                                                    mapOperands);
+    auto afOp =
+        builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands);
     results.push_back(afOp);
   }
 }
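To make the index computation above concrete: when an access goes through a non-identity affine map, each map result is materialized as a separate affine.apply before the transfer op is built. A hand-written sketch (the names and the `%pad` padding value are illustrative, not from this patch):

```mlir
// Scalar form: the load goes through the non-identity map (d0) -> (d0 + 3).
%0 = affine.load %A[%i0 + 3] : memref<?xf32>
// After index materialization, the same access feeding a vector transfer:
%idx = affine.apply affine_map<(d0) -> (d0 + 3)>(%i0)
%v = vector.transfer_read %A[%idx], %pad : memref<?xf32>, vector<128xf32>
```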
+////// TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ////
+
+/// Handles the vectorization of load and store MLIR operations.
+///
+/// AffineLoadOp operations are the roots of the vectorizeNonTerminals call.
+/// They are vectorized immediately. The resulting vector.transfer_read is
+/// immediately registered to replace all uses of the AffineLoadOp in this
+/// pattern's scope.
+///
+/// AffineStoreOp operations are the terminals of the vectorizeNonTerminals
+/// call. They need to be vectorized late once all the use-def chains have
+/// been traversed. Additionally, they may have SSA value operands which come
+/// from outside the scope of the current pattern.
+/// Such special cases force us to delay the vectorization of the stores until
+/// the last step. Here we merely register the store operation.
+template <typename LoadOrStoreOpPointer>
+static LogicalResult vectorizeRootOrTerminal(Value iv,
+                                             LoadOrStoreOpPointer memoryOp,
+                                             VectorizationState *state) {
+  auto memRefType = memoryOp.getMemRef().getType().template cast<MemRefType>();
+
+  auto elementType = memRefType.getElementType();
+  // TODO: ponder whether we want to further vectorize a vector value.
+  assert(VectorType::isValidElementType(elementType) &&
+         "Not a valid vector element type");
+  auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType);
+
+  // Materialize a MemRef with 1 vector.
+  auto *opInst = memoryOp.getOperation();
+  // For now, vector.transfers must be aligned, operate only on indices with
+  // an identity subset of AffineMap and do not change layout.
+  // TODO: increase the expressiveness power of vector.transfer operations
+  // as needed by various targets.
+  if (auto load = dyn_cast<AffineLoadOp>(opInst)) {
+    OpBuilder b(opInst);
+    ValueRange mapOperands = load.getMapOperands();
+    SmallVector<Value, 8> indices;
+    indices.reserve(load.getMemRefType().getRank());
+    if (load.getAffineMap() !=
+        b.getMultiDimIdentityMap(load.getMemRefType().getRank())) {
+      computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands,
+                             indices);
+    } else {
+      indices.append(mapOperands.begin(), mapOperands.end());
+    }
+    auto permutationMap =
+        makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
+    if (!permutationMap)
+      return failure();
+    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
+    LLVM_DEBUG(permutationMap.print(dbgs()));
+    auto transfer = b.create<vector::TransferReadOp>(
+        opInst->getLoc(), vectorType, memoryOp.getMemRef(), indices,
+        permutationMap);
+    state->registerReplacement(opInst, transfer.getOperation());
+  } else {
+    state->registerTerminal(opInst);
+  }
+  return success();
+}
+/// end TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ///
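In MLIR terms, the root rewriting performed by vectorizeRootOrTerminal looks roughly as follows. This is a sketch under the 128-wide strategy used in the tests; `%pad` stands for the transfer padding value and is not part of this patch:

```mlir
func @load_root(%A : memref<?xf32>, %M : index, %pad : f32) {
  affine.for %i0 = 0 to %M step 128 {
    // was: %a = affine.load %A[%i0] : memref<?xf32>
    // The load is replaced in place by an opaque vector transfer; its uses
    // are reconnected later during use-def chain propagation.
    %a = vector.transfer_read %A[%i0], %pad : memref<?xf32>, vector<128xf32>
  }
  return
}
```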
+
+/// Coarsens the loop bounds and transforms all remaining load and store
+/// operations into the appropriate vector.transfer.
+static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
+                                          VectorizationState *state) {
+  loop.setStep(step);
+
+  FilterFunctionType notVectorizedThisPattern = [state](Operation &op) {
+    if (!matcher::isLoadOrStore(op)) {
+      return false;
+    }
+    return state->vectorizationMap.count(&op) == 0 &&
+           state->vectorizedSet.count(&op) == 0 &&
+           state->roots.count(&op) == 0 && state->terminals.count(&op) == 0;
+  };
+  auto loadAndStores = matcher::Op(notVectorizedThisPattern);
+  SmallVector<NestedMatch, 8> loadAndStoresMatches;
+  loadAndStores.match(loop.getOperation(), &loadAndStoresMatches);
+  for (auto ls : loadAndStoresMatches) {
+    auto *opInst = ls.getMatchedOperation();
+    auto load = dyn_cast<AffineLoadOp>(opInst);
+    auto store = dyn_cast<AffineStoreOp>(opInst);
+    LLVM_DEBUG(opInst->print(dbgs()));
+    LogicalResult result =
+        load ? vectorizeRootOrTerminal(loop.getInductionVar(), load, state)
+             : vectorizeRootOrTerminal(loop.getInductionVar(), store, state);
+    if (failed(result)) {
+      return failure();
+    }
+  }
+  return success();
+}
+
 /// Returns a FilterFunctionType that can be used in NestedPattern to match a
 /// loop whose underlying load/store accesses are either invariant or all
 //  varying along the `fastestVaryingMemRefDimension`.
@@ -804,6 +846,68 @@ isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
   };
 }
 
+/// Applies vectorization of `loops` according to `state`. Loops are processed
+/// in inner-to-outer order so that all child loops have already been
+/// vectorized before their parent loop.
+static LogicalResult
+vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
+                       VectorizationState *state) {
+  // Vectorize loops in inner-to-outer order. If any child fails, the parent
+  // will fail too.
+  for (auto &loopsInLevel : llvm::reverse(loops)) {
+    for (AffineForOp loop : loopsInLevel) {
+      // 1. This loop may have been omitted from vectorization for various
+      //    reasons (e.g. due to the performance model or pattern depth >
+      //    vector size).
+      auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
+      if (it == state->strategy->loopToVectorDim.end())
+        continue;
+
+      // 2. Actual inner-to-outer transformation.
+      auto vectorDim = it->second;
+      assert(vectorDim < state->strategy->vectorSizes.size() &&
+             "vector dim overflow");
+      //   a. get actual vector size
+      auto vectorSize = state->strategy->vectorSizes[vectorDim];
+      //   b. loop transformation for early vectorization is still subject to
+      //      exploratory tradeoffs (see top of the file). Apply coarsening,
+      //      i.e.:
+      //         | ub -> ub
+      //         | step -> step * vectorSize
+      LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
                        << " : \n"
                        << loop);
+      if (failed(
+              vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
+        return failure();
+    } // end for.
+  }
+
+  return success();
+}
+
+/// Tries to transform a scalar constant into a vector splat of that constant.
+/// Returns the vectorized splat operation if the constant is a valid vector
+/// element type.
+/// If `type` is not a valid vector type or if the scalar constant is not a
+/// valid vector element type, returns nullptr.
+static Value vectorizeConstant(Operation *op, ConstantOp constant, Type type) {
+  if (!type || !type.isa<VectorType>() ||
+      !VectorType::isValidElementType(constant.getType())) {
+    return nullptr;
+  }
+  OpBuilder b(op);
+  Location loc = op->getLoc();
+  auto vectorType = type.cast<VectorType>();
+  auto attr = DenseElementsAttr::get(vectorType, constant.getValue());
+  auto *constantOpInst = constant.getOperation();
+
+  OperationState state(loc, constantOpInst->getName().getStringRef(), {},
+                       {vectorType}, {b.getNamedAttr("value", attr)});
+
+  return b.createOperation(state)->getResult(0);
+}
+
 /// Returns the vector type resulting from applying the provided vectorization
 /// strategy on the scalar type.
 static VectorType getVectorType(Type scalarTy,
@@ -812,24 +916,6 @@ static VectorType getVectorType(Type scalarTy,
   return VectorType::get(strategy->vectorSizes, scalarTy);
 }
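The splat produced by vectorizeConstant can be pictured as follows; the vector shape comes from the strategy's vectorSizes (a sketch, not compiler output):

```mlir
// Scalar constant feeding a vectorized use:
%f1 = constant 1.000000e+00 : f32
// Vectorized form, built from a DenseElementsAttr splat of the same value:
%v1 = constant dense<1.000000e+00> : vector<128xf32>
```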
-/// Tries to transform a scalar constant into a vector constant. Returns the
-/// vector constant if the scalar type is a valid vector element type. Returns
-/// nullptr otherwise.
-static ConstantOp vectorizeConstant(ConstantOp constOp,
-                                    VectorizationState &state) {
-  Type scalarTy = constOp.getType();
-  if (!VectorType::isValidElementType(scalarTy))
-    return nullptr;
-
-  auto vecTy = getVectorType(scalarTy, state.strategy);
-  auto vecAttr = DenseElementsAttr::get(vecTy, constOp.getValue());
-  auto newConstOp =
-      state.builder.create<ConstantOp>(constOp.getLoc(), vecAttr);
-
-  // Register vector replacement for future uses in the scope.
-  state.registerOpVectorReplacement(constOp, newConstOp);
-  return newConstOp;
-}
-
 /// Returns true if the provided value is vector uniform given the
 /// vectorization strategy.
 // TODO: For now, only values that are invariants to all the loops in the
@@ -846,27 +932,32 @@ static bool isUniformDefinition(Value value,
 
 /// Generates a broadcast op for the provided uniform value using the
 /// vectorization strategy in 'state'.
-static Operation *vectorizeUniform(Value uniformVal,
-                                   VectorizationState &state) {
-  OpBuilder::InsertionGuard guard(state.builder);
-  state.builder.setInsertionPointAfterValue(uniformVal);
-
-  auto vectorTy = getVectorType(uniformVal.getType(), state.strategy);
-  auto bcastOp = state.builder.create<BroadcastOp>(uniformVal.getLoc(),
-                                                   vectorTy, uniformVal);
-  state.registerValueVectorReplacement(uniformVal, bcastOp);
-  return bcastOp;
+static Value vectorizeUniform(Value value, VectorizationState *state) {
+  OpBuilder builder(value.getContext());
+  builder.setInsertionPointAfterValue(value);
+
+  auto vectorTy = getVectorType(value.getType(), state->strategy);
+  auto bcast = builder.create<BroadcastOp>(value.getLoc(), vectorTy, value);
+
+  // Add broadcast to the replacement map to reuse it for other uses.
+  state->replacementMap[value] = bcast;
+  return bcast;
 }
 
-/// Tries to vectorize a given `operand` by applying the following logic:
-/// 1. if the defining operation has been already vectorized, `operand` is
-///    already in the proper vector form;
-/// 2. if the `operand` is a constant, returns the vectorized form of the
-///    constant;
-/// 3. if the `operand` is uniform, returns a vector broadcast of it;
-/// 4. otherwise, the vectorization of `operand` is not supported.
-/// Newly created vector operations are registered in `state` as replacement
-/// for their scalar counterparts.
+/// Tries to vectorize a given `operand` of Operation `op` during def-chain
+/// propagation or during terminal vectorization, by applying the following
+/// logic:
+/// 1. if the defining operation is part of the vectorizedSet (i.e. vectorized
+///    use-def propagation), `operand` is already in the proper vector form;
+/// 2. otherwise, the `operand` may be in some other vector form that fails to
+///    vectorize atm (i.e. broadcasting required), returns nullptr to indicate
+///    failure;
+/// 3. if the `operand` is a constant, returns the vectorized form of the
+///    constant;
+/// 4. if the `operand` is uniform, returns a vector broadcast of it;
+/// 5. non-constant scalars are currently non-vectorizable, in particular to
+///    guard against vectorizing an index which may be loop-variant and needs
+///    special handling.
+///
 /// In particular this logic captures some of the use cases where definitions
 /// that are not scoped under the current pattern are needed to vectorize.
 /// One such example is top level function constants that need to be splatted.
 ///
@@ -875,213 +966,112 @@ static Operation *vectorizeUniform(Value uniformVal,
 /// vectorization is possible with the above logic. Returns nullptr otherwise.
 ///
 /// TODO: handle more complex cases.
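For case 4. above, a value that is invariant to all the loops of the strategy is broadcast right after its definition, e.g. (a sketch, assuming `%s` is a scalar defined outside the loop nest being vectorized):

```mlir
// %s : f32 is defined above the loop nest being vectorized.
%vs = vector.broadcast %s : f32 to vector<128xf32>
// All vectorized uses of %s inside the pattern now read %vs instead.
```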
-static Value vectorizeOperand(Value operand, VectorizationState &state) {
-  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorize operand: " << operand);
-  // If this value is already vectorized, we are done.
-  if (Value vecRepl = state.valueVectorReplacement.lookupOrNull(operand)) {
-    LLVM_DEBUG(dbgs() << " -> already vectorized: " << vecRepl);
-    return vecRepl;
-  }
-
-  // A vector operand that is not in the replacement map should never reach
-  // this point. Reaching this point could mean that the code was already
-  // vectorized and we shouldn't try to vectorize already vectorized code.
-  assert(!operand.getType().isa<VectorType>() &&
-         "Vector op not found in replacement map");
-
-  // Vectorize constant.
-  if (auto constOp = operand.getDefiningOp<ConstantOp>()) {
-    ConstantOp vecConstant = vectorizeConstant(constOp, state);
-    LLVM_DEBUG(dbgs() << "-> constant: " << vecConstant);
-    return vecConstant.getResult();
-  }
-
-  // Vectorize uniform values.
-  if (isUniformDefinition(operand, state.strategy)) {
-    Operation *vecUniform = vectorizeUniform(operand, state);
-    LLVM_DEBUG(dbgs() << "-> uniform: " << *vecUniform);
-    return vecUniform->getResult(0);
-  }
-
-  // Check for unsupported block argument scenarios. A supported block
-  // argument should have been vectorized already.
-  if (!operand.getDefiningOp())
-    LLVM_DEBUG(dbgs() << "-> unsupported block argument\n");
-  else
-    // Generic unsupported case.
-    LLVM_DEBUG(dbgs() << "-> non-vectorizable\n");
-
-  return nullptr;
-}
+static Value vectorizeOperand(Value operand, Operation *op,
+                              VectorizationState *state) {
+  LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: " << operand);
+  // 1. If this value has already been vectorized this round, we are done.
+  if (state->vectorizedSet.count(operand.getDefiningOp()) > 0) {
+    LLVM_DEBUG(dbgs() << " -> already vector operand");
+    return operand;
+  }
+  // 1.b. Delayed on-demand replacement of a use.
+  //    Note that we cannot just call replaceAllUsesWith because it may
+  //    result in ops with mixed types, for ops whose operands have not all
+  //    yet been vectorized. This would be invalid IR.
+  auto it = state->replacementMap.find(operand);
+  if (it != state->replacementMap.end()) {
+    auto res = it->second;
+    LLVM_DEBUG(dbgs() << "-> delayed replacement by: " << res);
+    return res;
+  }
+  // 2. TODO: broadcast needed.
+  if (operand.getType().isa<VectorType>()) {
+    LLVM_DEBUG(dbgs() << "-> non-vectorizable");
+    return nullptr;
+  }
+  // 3. vectorize constant.
+  if (auto constant = operand.getDefiningOp<ConstantOp>())
+    return vectorizeConstant(
+        op, constant, getVectorType(operand.getType(), state->strategy));
+
+  // 4. Uniform values.
+  if (isUniformDefinition(operand, state->strategy))
+    return vectorizeUniform(operand, state);
+
+  // 5. currently non-vectorizable.
+  LLVM_DEBUG(dbgs() << "-> non-vectorizable: " << operand);
+  return nullptr;
+}
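When an access is invariant along the vectorized loop, the inferred permutation map contains a constant 0 result, matching `#[[$map_proj_d0d1_0]]` in vectorize_1d.mlir further down in this patch. A sketch with assumed shapes (`%pad` is the padding value, not from this patch):

```mlir
func @invariant_load(%A : memref<?x?xf32>, %N : index, %pad : f32) {
  %c0 = constant 0 : index
  affine.for %i1 = 0 to %N step 128 {
    // Both indices are invariant along %i1: the read broadcasts.
    %a = vector.transfer_read %A[%c0, %c0], %pad
        {permutation_map = affine_map<(d0, d1) -> (0)>}
        : memref<?x?xf32>, vector<128xf32>
  }
  return
}
```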
 
-/// Vectorizes an affine load with the vectorization strategy in 'state' by
-/// generating a 'vector.transfer_read' op with the proper permutation map
-/// inferred from the indices of the load. The new 'vector.transfer_read' is
-/// registered as replacement of the scalar load. Returns the newly created
-/// 'vector.transfer_read' if vectorization was successful. Returns nullptr,
-/// otherwise.
-static Operation *vectorizeAffineLoad(AffineLoadOp loadOp,
-                                      VectorizationState &state) {
-  MemRefType memRefType = loadOp.getMemRefType();
-  Type elementType = memRefType.getElementType();
-  auto vectorType = VectorType::get(state.strategy->vectorSizes, elementType);
-
-  // Replace map operands with operands from the vector loop nest.
-  SmallVector<Value, 8> mapOperands;
-  state.getScalarValueReplacementsFor(loadOp.getMapOperands(), mapOperands);
-
-  // Compute indices for the transfer op. AffineApplyOp's may be generated.
-  SmallVector<Value, 8> indices;
-  indices.reserve(memRefType.getRank());
-  if (loadOp.getAffineMap() !=
-      state.builder.getMultiDimIdentityMap(memRefType.getRank()))
-    computeMemoryOpIndices(loadOp, loadOp.getAffineMap(), mapOperands, state,
-                           indices);
-  else
-    indices.append(mapOperands.begin(), mapOperands.end());
-
-  // Compute permutation map using the information of new vector loops.
-  auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),
-                                           indices, state.vecLoopToVecDim);
-  if (!permutationMap) {
-    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ can't compute permutationMap\n");
-    return nullptr;
-  }
-  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
-  LLVM_DEBUG(permutationMap.print(dbgs()));
-
-  auto transfer = state.builder.create<vector::TransferReadOp>(
-      loadOp.getLoc(), vectorType, loadOp.getMemRef(), indices,
-      permutationMap);
-
-  // Register replacement for future uses in the scope.
-  state.registerOpVectorReplacement(loadOp, transfer);
-  return transfer;
-}
-
-/// Vectorizes an affine store with the vectorization strategy in 'state' by
-/// generating a 'vector.transfer_write' op with the proper permutation map
-/// inferred from the indices of the store. The new 'vector.transfer_write' is
-/// registered as replacement of the scalar store. Returns the newly created
-/// 'vector.transfer_write' if vectorization was successful. Returns nullptr,
-/// otherwise.
-static Operation *vectorizeAffineStore(AffineStoreOp storeOp,
-                                       VectorizationState &state) {
-  MemRefType memRefType = storeOp.getMemRefType();
-  Value vectorValue = vectorizeOperand(storeOp.getValueToStore(), state);
-  if (!vectorValue)
-    return nullptr;
-
-  // Replace map operands with operands from the vector loop nest.
-  SmallVector<Value, 8> mapOperands;
-  state.getScalarValueReplacementsFor(storeOp.getMapOperands(), mapOperands);
-
-  // Compute indices for the transfer op. AffineApplyOp's may be generated.
-  SmallVector<Value, 8> indices;
-  indices.reserve(memRefType.getRank());
-  if (storeOp.getAffineMap() !=
-      state.builder.getMultiDimIdentityMap(memRefType.getRank()))
-    computeMemoryOpIndices(storeOp, storeOp.getAffineMap(), mapOperands, state,
-                           indices);
-  else
-    indices.append(mapOperands.begin(), mapOperands.end());
-
-  // Compute permutation map using the information of new vector loops.
-  auto permutationMap = makePermutationMap(state.builder.getInsertionBlock(),
-                                           indices, state.vecLoopToVecDim);
-  if (!permutationMap)
-    return nullptr;
-  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
-  LLVM_DEBUG(permutationMap.print(dbgs()));
-
-  auto transfer = state.builder.create<vector::TransferWriteOp>(
-      storeOp.getLoc(), vectorValue, storeOp.getMemRef(), indices,
-      permutationMap);
-  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << transfer);
-
-  // Register replacement for future uses in the scope.
-  state.registerOpVectorReplacement(storeOp, transfer);
-  return transfer;
-}
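Symmetrically to loads, a vectorized store becomes a vector.transfer_write once the stored value has a vector form, e.g. (a sketch; `%vval` stands for the previously vectorized value):

```mlir
// was: affine.store %val, %C[%i0] : memref<?xf32>
vector.transfer_write %vval, %C[%i0] : vector<128xf32>, memref<?xf32>
```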
 
-/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop
-/// is created and registered as replacement for the scalar loop. The
-/// builder's insertion point is set to the new loop's body so that subsequent
-/// vectorized operations are inserted into the new loop. If the loop is a
-/// vector dimension, the step of the newly created loop will reflect the
-/// vectorization factor used to vectorize that dimension.
-// TODO: Add support for 'iter_args'. Related operands and results will be
-// vectorized at this point.
-static Operation *vectorizeAffineForOp(AffineForOp forOp,
-                                       VectorizationState &state) {
-  // 'iter_args' not supported yet.
-  if (forOp.getNumIterOperands() > 0)
-    return nullptr;
-
-  // If we are vectorizing a vector dimension, compute a new step for the new
-  // vectorized loop using the vectorization factor for the vector dimension.
-  // Otherwise, propagate the step of the scalar loop.
-  const VectorizationStrategy &strategy = *state.strategy;
-  auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
-  bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();
-  unsigned newStep;
-  if (isLoopVecDim) {
-    unsigned vectorDim = loopToVecDimIt->second;
-    assert(vectorDim < strategy.vectorSizes.size() && "vector dim overflow");
-    int64_t forOpVecFactor = strategy.vectorSizes[vectorDim];
-    newStep = forOp.getStep() * forOpVecFactor;
-  } else {
-    newStep = forOp.getStep();
-  }
 
+/// Encodes Operation-specific behavior for vectorization. In general we
+/// assume that all operands of an op must be vectorized but this is not
+/// always true. In the future, it would be nice to have a trait that
+/// describes how a particular operation vectorizes. For now we implement the
+/// case distinction here.
+/// Returns a vectorized form of an operation or nullptr if vectorization
+/// fails.
+// TODO: consider adding a trait to Op to describe how it gets vectorized.
+// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
+// do one-off logic here; ideally it would be TableGen'd.
+static Operation *vectorizeOneOperation(Operation *opInst,
+                                        VectorizationState *state) {
+  // Sanity checks.
+  assert(!isa<AffineLoadOp>(opInst) &&
+         "all loads must have already been fully vectorized independently");
+  assert(!isa<vector::TransferReadOp>(opInst) &&
+         "vector.transfer_read cannot be further vectorized");
+  assert(!isa<vector::TransferWriteOp>(opInst) &&
+         "vector.transfer_write cannot be further vectorized");
 
+  if (auto store = dyn_cast<AffineStoreOp>(opInst)) {
+    OpBuilder b(opInst);
+    auto memRef = store.getMemRef();
+    auto value = store.getValueToStore();
+    auto vectorValue = vectorizeOperand(value, opInst, state);
+    if (!vectorValue)
+      return nullptr;
+
+    ValueRange mapOperands = store.getMapOperands();
+    SmallVector<Value, 8> indices;
+    indices.reserve(store.getMemRefType().getRank());
+    if (store.getAffineMap() !=
+        b.getMultiDimIdentityMap(store.getMemRefType().getRank())) {
+      computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands,
+                             indices);
+    } else {
+      indices.append(mapOperands.begin(), mapOperands.end());
+    }
+
+    auto permutationMap =
+        makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
+    if (!permutationMap)
+      return nullptr;
+    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
+    LLVM_DEBUG(permutationMap.print(dbgs()));
+    auto transfer = b.create<vector::TransferWriteOp>(
+        opInst->getLoc(), vectorValue, memRef, indices, permutationMap);
+    auto *res = transfer.getOperation();
+    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res);
+    // "Terminals" (i.e. AffineStoreOps) are erased on the spot.
+    opInst->erase();
+    return res;
+  }
+  if (opInst->getNumRegions() != 0)
+    return nullptr;
 
-  auto vecForOp = state.builder.create<AffineForOp>(
-      forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
-      forOp.getUpperBoundOperands(), forOp.getUpperBoundMap(), newStep,
-      forOp.getIterOperands(),
-      /*bodyBuilder=*/[](OpBuilder &, Location, Value, ValueRange) {
-        // Make sure we don't create a default terminator in the loop body as
-        // the proper terminator will be added during vectorization.
-        return;
-      });
-
-  // Register loop-related replacements:
-  //   1) The new vectorized loop is registered as vector replacement of the
-  //      scalar loop.
-  //      TODO: Support reductions along the vector dimension.
-  //   2) The new iv of the vectorized loop is registered as scalar
-  //      replacement since a scalar copy of the iv will prevail in the
-  //      vectorized loop.
-  //      TODO: A vector replacement will also be added in the future when
-  //      vectorization of linear ops is supported.
-  //   3) TODO: Support 'iter_args' along non-vector dimensions.
-  state.registerOpVectorReplacement(forOp, vecForOp);
-  state.registerValueScalarReplacement(forOp.getInductionVar(),
-                                       vecForOp.getInductionVar());
-  // Map the new vectorized loop to its vector dimension.
-  if (isLoopVecDim)
-    state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;
-
-  // Change insertion point so that upcoming vectorized instructions are
-  // inserted into the vectorized loop's body.
-  state.builder.setInsertionPointToStart(vecForOp.getBody());
-  return vecForOp;
-}
-
-/// Vectorizes arbitrary operation by plain widening. We apply generic type
-/// widening of all its results and retrieve the vector counterparts for all
-/// its operands.
-static Operation *widenOp(Operation *op, VectorizationState &state) {
   SmallVector<Type, 8> vectorTypes;
-  for (Value result : op->getResults())
+  for (auto v : opInst->getResults()) {
     vectorTypes.push_back(
-        VectorType::get(state.strategy->vectorSizes, result.getType()));
-
+        VectorType::get(state->strategy->vectorSizes, v.getType()));
+  }
   SmallVector<Value, 8> vectorOperands;
-  for (Value operand : op->getOperands()) {
-    Value vecOperand = vectorizeOperand(operand, state);
-    if (!vecOperand) {
-      LLVM_DEBUG(dbgs()
-                 << "\n[early-vect]+++++ an operand failed vectorize\n");
-      return nullptr;
-    }
-    vectorOperands.push_back(vecOperand);
+  for (auto v : opInst->getOperands()) {
+    vectorOperands.push_back(vectorizeOperand(v, opInst, state));
+  }
+  // Check whether a single operand is null. If so, vectorization failed.
+  bool success = llvm::all_of(vectorOperands, [](Value op) { return op; });
+  if (!success) {
+    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize");
+    return nullptr;
   }
 
   // Create a clone of the op with the proper operands and return types.
@@ -1089,64 +1079,59 @@ static Operation *widenOp(Operation *op, VectorizationState &state) {
   // This is not the case with fully mapped operations; we need to look up the
   // name that works both in scalar mode and vector mode.
   // TODO: Is it worth considering an Operation.clone operation which
   // changes the type so we can promote an Operation with less boilerplate?
-  OperationState vecOpState(op->getLoc(), op->getName().getStringRef(),
-                            vectorOperands, vectorTypes, op->getAttrs(),
-                            /*successors=*/{}, /*regions=*/{});
-  Operation *vecOp = state.builder.createOperation(vecOpState);
-  state.registerOpVectorReplacement(op, vecOp);
-  return vecOp;
+  OpBuilder b(opInst);
+  OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(),
+                       vectorOperands, vectorTypes, opInst->getAttrs(),
+                       /*successors=*/{}, /*regions=*/{});
+  return b.createOperation(newOp);
 }
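The generic path above clones an op under the same name with widened types; for a pointwise op this amounts to (a sketch, with `%va`/`%vb` standing for operands already looked up in the state):

```mlir
// Scalar form inside the loop nest:
%s = addf %a, %b : f32
// Widened clone with vectorized operands and result type:
%v = addf %va, %vb : vector<128xf32>
```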
 
-/// Vectorizes a yield operation by widening its types. The builder's
-/// insertion point is set after the vectorized parent op to continue
-/// vectorizing the operations after the parent op.
-static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
-                                         VectorizationState &state) {
-  // 'iter_args' not supported yet.
-  if (yieldOp.getNumOperands() > 0)
-    return nullptr;
-
-  // Vectorize the yield op and change the insertion point right after the
-  // new parent op.
-  Operation *newYieldOp = widenOp(yieldOp, state);
-  Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();
-  state.builder.setInsertionPointAfter(newParentOp);
-  return newYieldOp;
-}
-
-/// Encodes Operation-specific behavior for vectorization. In general we
-/// assume that all operands of an op must be vectorized but this is not
-/// always true. In the future, it would be nice to have a trait that
-/// describes how a particular operation vectorizes. For now we implement the
-/// case distinction here. Returns a vectorized form of an operation or
-/// nullptr if vectorization fails.
-// TODO: consider adding a trait to Op to describe how it gets vectorized.
-// Maybe some Ops are not vectorizable or require some tricky logic, we cannot
-// do one-off logic here; ideally it would be TableGen'd.
-static Operation *vectorizeOneOperation(Operation *op,
-                                        VectorizationState &state) {
-  // Sanity checks.
-  assert(!isa<vector::TransferReadOp>(op) &&
-         "vector.transfer_read cannot be further vectorized");
-  assert(!isa<vector::TransferWriteOp>(op) &&
-         "vector.transfer_write cannot be further vectorized");
-
-  if (auto loadOp = dyn_cast<AffineLoadOp>(op))
-    return vectorizeAffineLoad(loadOp, state);
-  if (auto storeOp = dyn_cast<AffineStoreOp>(op))
-    return vectorizeAffineStore(storeOp, state);
-  if (auto forOp = dyn_cast<AffineForOp>(op))
-    return vectorizeAffineForOp(forOp, state);
-  if (auto yieldOp = dyn_cast<AffineYieldOp>(op))
-    return vectorizeAffineYieldOp(yieldOp, state);
-  if (auto constant = dyn_cast<ConstantOp>(op))
-    return vectorizeConstant(constant, state);
-
-  // Other ops with regions are not supported.
-  if (op->getNumRegions() != 0)
-    return nullptr;
+/// Iterates over the forward slice from the loads in the vectorization
+/// pattern and rewrites them using their vectorized counterpart by:
+///   1. creating the forward slice starting from the loads in the
+///      vectorization pattern;
+///   2. topologically sorting the forward slice;
+///   3. for each operation in the slice, creating the vector form of the
+///      operation and replacing each of its operands with the vectorized
+///      replacement retrieved from replacementMap. If any such replacement
+///      is missing, vectorization fails.
+static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
+  // 1. create initial worklist with the uses of the roots.
+  SetVector<Operation *> worklist;
+  // Note: state->roots have already been vectorized and must not be
+  // vectorized again. This fits `getForwardSlice` which does not insert `op`
+  // in the result.
+  // Note: we have to exclude terminals because some of their defs may not be
+  // nested under the vectorization pattern (e.g. constants defined in an
+  // encompassing scope).
+  // TODO: Use a backward slice for terminals, avoid special casing and
+  // merge implementations.
+  for (auto *op : state->roots) {
+    getForwardSlice(op, &worklist, [state](Operation *op) {
+      return state->terminals.count(op) == 0; // propagate if not terminal
+    });
+  }
+  // We merged multiple slices, topological order may not hold anymore.
+  worklist = topologicalSort(worklist);
+
+  for (unsigned i = 0; i < worklist.size(); ++i) {
+    auto *op = worklist[i];
+    LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: ");
+    LLVM_DEBUG(op->print(dbgs()));
+
+    // Create vector form of the operation.
+    // Insert it just before op, on success register op as replaced.
+    auto *vectorizedInst = vectorizeOneOperation(op, state);
+    if (!vectorizedInst) {
+      return failure();
+    }
 
-  return widenOp(op, state);
+    // 3. Register replacement for future uses in the scope.
+    //    Note that we cannot just call replaceAllUsesWith because it may
+    //    result in ops with mixed types, for ops whose operands have not all
+    //    yet been vectorized. This would be invalid IR.
+    state->registerReplacement(op, vectorizedInst);
+  }
+  return success();
 }
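Putting roots, slices and terminals together on a small use-def chain (a sketch of the intermediate steps, assuming a 128-wide strategy):

```mlir
// Original chain in the pattern: load -> addf -> store.
%a = affine.load %A[%i0] : memref<?xf32>
%r = addf %a, %cst : f32
affine.store %r, %B[%i0] : memref<?xf32>
// 1. the root load is vectorized eagerly into a vector.transfer_read;
// 2. addf is reached by the forward slice and widened to vector<128xf32>,
//    with %cst splatted on demand by vectorizeOperand;
// 3. the terminal store is rewritten last, into a vector.transfer_write.
```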
 
 /// Recursive implementation to convert all the nested loops in 'match' to a 2D
@@ -1186,9 +1171,10 @@ vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
                   const VectorizationStrategy &strategy) {
   assert(loops[0].size() == 1 && "Expected single root loop");
   AffineForOp rootLoop = loops[0][0];
-  VectorizationState state(rootLoop.getContext());
-  state.builder.setInsertionPointAfter(rootLoop);
+  OperationFolder folder(rootLoop.getContext());
+  VectorizationState state;
   state.strategy = &strategy;
+  state.folder = &folder;
 
   // Since patterns are recursive, they can very well intersect.
   // Since we do not want a fully greedy strategy in general, we decouple
@@ -1202,48 +1188,70 @@ vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
     return failure();
   }
 
+  /// Sets up error handling for this root loop. This is how the root match
+  /// maintains a clone for handling failure and restores the proper state via
+  /// RAII.
+  auto *loopInst = rootLoop.getOperation();
+  OpBuilder builder(loopInst);
+  auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
+  struct Guard {
+    LogicalResult failure() {
+      loop.getInductionVar().replaceAllUsesWith(clonedLoop.getInductionVar());
+      loop.erase();
+      return mlir::failure();
+    }
+    LogicalResult success() {
+      clonedLoop.erase();
+      return mlir::success();
+    }
+    AffineForOp loop;
+    AffineForOp clonedLoop;
+  } guard{rootLoop, clonedLoop};
+
  //////////////////////////////////////////////////////////////////////////////
-  // Vectorize the scalar loop nest following a topological order. A new
-  // vector loop nest with the vectorized operations is created along the
-  // process. If vectorization succeeds, the scalar loop nest is erased. If
-  // vectorization fails, the vector loop nest is erased and the scalar loop
-  // nest is not modified.
+  // Start vectorizing.
+  // From now on, any error triggers the scope guard above.
  //////////////////////////////////////////////////////////////////////////////
 
+  // 1. Vectorize all the loop candidates, in inner-to-outer order.
+  // This also vectorizes the roots (AffineLoadOp) as well as registers the
+  // terminals (AffineStoreOp) for post-processing vectorization (we need to
+  // wait for all use-def chains into them to be vectorized first).
+  if (failed(vectorizeLoopsAndLoads(loops, &state))) {
+    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
+    return guard.failure();
+  }
 
-  auto opVecResult = rootLoop.walk([&](Operation *op) {
-    LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
-    Operation *vectorOp = vectorizeOneOperation(op, state);
-    if (!vectorOp)
-      return WalkResult::interrupt();
-
-    return WalkResult::advance();
-  });
-
-  if (opVecResult.wasInterrupted()) {
-    LLVM_DEBUG(dbgs() << "[early-vect]+++++ failed vectorization for: "
-                      << rootLoop << "\n");
-    // Erase vector loop nest if it was created.
-    auto vecRootLoopIt = state.opVectorReplacement.find(rootLoop);
-    if (vecRootLoopIt != state.opVectorReplacement.end())
-      eraseLoopNest(cast<AffineForOp>(vecRootLoopIt->second));
+  // 2. Vectorize operations reached by use-def chains from root except the
+  // terminals (store operations) that need to be post-processed separately.
+  // TODO: add more as we expand.
+  if (failed(vectorizeNonTerminals(&state))) {
+    LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals");
+    return guard.failure();
+  }
 
-    return failure();
+  // 3. Post-process terminals.
+  // Note: we have to post-process terminals because some of their defs may
+  // not be nested under the vectorization pattern (e.g. constants defined in
+  // an encompassing scope).
+  // TODO: Use a backward slice for terminals, avoid special casing and
+  // merge implementations.
+  for (auto *op : state.terminals) {
+    if (!vectorizeOneOperation(op, &state)) { // nullptr == failure
+      LLVM_DEBUG(dbgs()
                 << "\n[early-vect]+++++ failed to vectorize terminals");
+      return guard.failure();
+    }
   }
 
-  assert(state.opVectorReplacement.count(rootLoop) == 1 &&
-         "Expected vector replacement for loop nest");
+  // 4. Finish this vectorization pattern.
   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
-  LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorization result:\n"
-                    << *state.opVectorReplacement[rootLoop]);
-
-  // Finish this vectorization pattern.
-  state.finishVectorizationPattern(rootLoop);
-  return success();
+  state.finishVectorizationPattern();
+  return guard.success();
 }
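An example of the rollback path: if any operand fails to vectorize, the partially rewritten loop is erased and the clone reinstated. The test @vec_rejected_unsupported_block_arg, removed further down in this patch, exercises a similar failure; a condensed sketch:

```mlir
affine.for %i = 0 to 512 {
  // index_cast of the induction variable has no vector form under the rules
  // above (not a constant, not uniform): the pattern fails and the cloned
  // scalar loop is restored.
  %idx = std.index_cast %i : index to i32
  affine.store %idx, %A[%i] : memref<512xi32>
}
```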
 
-/// Extracts the matched loops and vectorizes them following a topological
-/// order. A new vector loop nest will be created if vectorization succeeds.
-/// The original loop nest won't be modified in any case.
+/// Vectorization is a recursive procedure where anything below can fail. The
+/// root match thus needs to maintain a clone for handling failure. Each root
+/// may succeed independently but will otherwise clean after itself if
+/// anything below it fails.
 static LogicalResult vectorizeRootMatch(NestedMatch m,
                                         const VectorizationStrategy &strategy) {
   std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
@@ -1264,7 +1272,7 @@ static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
   LLVM_DEBUG(dbgs() << "\n******************************************");
   LLVM_DEBUG(dbgs() << "\n******************************************");
   LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n");
-  LLVM_DEBUG(dbgs() << *parentOp << "\n");
+  LLVM_DEBUG(parentOp->print(dbgs()));
 
   unsigned patternDepth = pat.getDepth();
 
diff --git a/mlir/lib/Dialect/Vector/VectorUtils.cpp b/mlir/lib/Dialect/Vector/VectorUtils.cpp
index b68886c..ef3ef3d 100644
--- a/mlir/lib/Dialect/Vector/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/VectorUtils.cpp
@@ -211,29 +211,29 @@ static AffineMap makePermutationMap(
 /// TODO: could also be implemented as a collect parents followed by a
 /// filter and made available outside this file.
 template <typename T>
-static SetVector<Operation *> getParentsOfType(Block *block) {
+static SetVector<Operation *> getParentsOfType(Operation *op) {
   SetVector<Operation *> res;
-  auto *current = block->getParentOp();
-  while (current) {
-    if (auto typedParent = dyn_cast<T>(current)) {
-      assert(res.count(current) == 0 && "Already inserted");
-      res.insert(current);
+  auto *current = op;
+  while (auto *parent = current->getParentOp()) {
+    if (auto typedParent = dyn_cast<T>(parent)) {
+      assert(res.count(parent) == 0 && "Already inserted");
+      res.insert(parent);
     }
-    current = current->getParentOp();
+    current = parent;
   }
   return res;
 }
 
 /// Returns the enclosing AffineForOp, from closest to farthest.
-static SetVector<Operation *> getEnclosingforOps(Block *block) {
-  return getParentsOfType<AffineForOp>(block);
+static SetVector<Operation *> getEnclosingforOps(Operation *op) {
+  return getParentsOfType<AffineForOp>(op);
 }
 
 AffineMap mlir::makePermutationMap(
-    Block *insertPoint, ArrayRef<Value> indices,
+    Operation *op, ArrayRef<Value> indices,
     const DenseMap<Operation *, unsigned> &loopToVectorDim) {
   DenseMap<Operation *, unsigned> enclosingLoopToVectorDim;
-  auto enclosingLoops = getEnclosingforOps(insertPoint);
+  auto enclosingLoops = getEnclosingforOps(op);
   for (auto *forInst : enclosingLoops) {
     auto it = loopToVectorDim.find(forInst);
     if (it != loopToVectorDim.end()) {
@@ -243,12 +243,6 @@ AffineMap mlir::makePermutationMap(
   return ::makePermutationMap(indices, enclosingLoopToVectorDim);
 }
 
-AffineMap mlir::makePermutationMap(
-    Operation *op, ArrayRef<Value> indices,
-    const DenseMap<Operation *, unsigned> &loopToVectorDim) {
-  return makePermutationMap(op->getBlock(), indices, loopToVectorDim);
-}
-
 AffineMap mlir::getTransferMinorIdentityMap(ShapedType shapedType,
                                             VectorType vectorType) {
   int64_t elementVectorRank = 0;
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
index 528169e..86749e2 100644
--- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
+++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir
@@ -1,8 +1,16 @@
-// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s
 
-// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)>
+// Permutation maps used in vectorization.
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> +// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> + +#map0 = affine_map<(d0) -> (d0)> +#mapadd1 = affine_map<(d0) -> (d0 + 1)> +#mapadd2 = affine_map<(d0) -> (d0 + 2)> +#mapadd3 = affine_map<(d0) -> (d0 + 3)> +#set0 = affine_set<(i) : (i >= 0)> +// Maps introduced to vectorize fastest varying memory index. // CHECK-LABEL: func @vec1d_1 func @vec1d_1(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -29,8 +37,6 @@ func @vec1d_1(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec1d_2 func @vec1d_2(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -55,8 +61,6 @@ func @vec1d_2(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec1d_3 func @vec1d_3(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -86,8 +90,6 @@ func @vec1d_3(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vector_add_2d func @vector_add_2d(%M : index, %N : index) -> f32 { %A = alloc (%M, %N) : memref @@ -140,8 +142,6 @@ func @vector_add_2d(%M : index, %N : index) -> f32 { return %res : f32 } -// ----- - // CHECK-LABEL: func @vec_rejected_1 func @vec_rejected_1(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -164,8 +164,6 @@ func @vec_rejected_1(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_2 func @vec_rejected_2(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -188,8 +186,6 @@ func @vec_rejected_2(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_3 func @vec_rejected_3(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -217,8 +213,6 @@ func @vec_rejected_3(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_4 func @vec_rejected_4(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -244,8 +238,6 @@ func @vec_rejected_4(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_5 func @vec_rejected_5(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -272,8 +264,6 @@ func @vec_rejected_5(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_6 func @vec_rejected_6(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -302,8 +292,6 @@ func @vec_rejected_6(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_7 func @vec_rejected_7(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -327,11 +315,6 @@ func @vec_rejected_7(%A : memref, %B : memref) { return } -// ----- - -// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> -// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> - // CHECK-LABEL: func @vec_rejected_8 func @vec_rejected_8(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -361,11 +344,6 @@ func @vec_rejected_8(%A : memref, %B : memref) { return } -// ----- - -// CHECK-DAG: #[[$map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> -// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> - // CHECK-LABEL: func @vec_rejected_9 func @vec_rejected_9(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -395,10 +373,6 @@ func @vec_rejected_9(%A : memref, %B : memref) { return } -// ----- - -#set0 = affine_set<(i) : (i >= 0)> - // 
CHECK-LABEL: func @vec_rejected_10 func @vec_rejected_10(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -423,8 +397,6 @@ func @vec_rejected_10(%A : memref, %B : memref) { return } -// ----- - // CHECK-LABEL: func @vec_rejected_11 func @vec_rejected_11(%A : memref, %B : memref) { // CHECK-DAG: %[[C0:.*]] = constant 0 : index @@ -452,9 +424,7 @@ func @vec_rejected_11(%A : memref, %B : memref) { return } -// ----- - -// This should not vectorize due to the sequential dependence in the loop. +// This should not vectorize due to the sequential dependence in the scf. // CHECK-LABEL: @vec_rejected_sequential func @vec_rejected_sequential(%A : memref) { %c0 = constant 0 : index @@ -467,66 +437,3 @@ func @vec_rejected_sequential(%A : memref) { } return } - -// ----- - -// CHECK-LABEL: @vec_no_load_store_ops -func @vec_no_load_store_ops(%a: f32, %b: f32) { - %cst = constant 0.000000e+00 : f32 - affine.for %i = 0 to 128 { - %add = addf %a, %b : f32 - } - // CHECK-DAG: %[[bc1:.*]] = vector.broadcast - // CHECK-DAG: %[[bc0:.*]] = vector.broadcast - // CHECK: affine.for %{{.*}} = 0 to 128 step - // CHECK-NEXT: [[add:.*]] addf %[[bc0]], %[[bc1]] - - return -} - -// ----- - -// This should not be vectorized due to the unsupported block argument (%i). -// Support for operands with linear evolution is needed. -// CHECK-LABEL: @vec_rejected_unsupported_block_arg -func @vec_rejected_unsupported_block_arg(%A : memref<512xi32>) { - affine.for %i = 0 to 512 { - // CHECK-NOT: vector - %idx = std.index_cast %i : index to i32 - affine.store %idx, %A[%i] : memref<512xi32> - } - return -} - -// ----- - -// CHECK-LABEL: @vec_rejected_unsupported_reduction -func @vec_rejected_unsupported_reduction(%in: memref<128x256xf32>, %out: memref<256xf32>) { - %cst = constant 0.000000e+00 : f32 - affine.for %i = 0 to 256 { - // CHECK-NOT: vector - %final_red = affine.for %j = 0 to 128 iter_args(%red_iter = %cst) -> (f32) { - %ld = affine.load %in[%j, %i] : memref<128x256xf32> - %add = addf %red_iter, %ld : f32 - affine.yield %add : f32 - } - affine.store %final_red, %out[%i] : memref<256xf32> - } - return -} - -// ----- - -// CHECK-LABEL: @vec_rejected_unsupported_last_value -func @vec_rejected_unsupported_last_value(%in: memref<128x256xf32>, %out: memref<256xf32>) { - %cst = constant 0.000000e+00 : f32 - affine.for %i = 0 to 256 { - // CHECK-NOT: vector - %last_val = affine.for %j = 0 to 128 iter_args(%last_iter = %cst) -> (f32) { - %ld = affine.load %in[%j, %i] : memref<128x256xf32> - affine.yield %ld : f32 - } - affine.store %last_val, %out[%i] : memref<256xf32> - } - return -} -- 2.7.4