From a2c4ca50caf43a3924a37580451ebe9fa3daa128 Mon Sep 17 00:00:00 2001
From: Stella Stamenova
Date: Mon, 7 Nov 2022 08:48:52 -0800
Subject: [PATCH] Revert "[mlir][sparse] support Parallel for/reduction."

This reverts commit 838389780e56f1a198a94f66ea436359466bf5ed.

This broke the windows mlir buildbot:
https://lab.llvm.org/buildbot/#/builders/13/builds/27934
---
 .../SparseTensor/Transforms/CodegenUtils.cpp       | 146 +++++----------------
 .../Dialect/SparseTensor/Transforms/CodegenUtils.h |  32 +----
 .../SparseTensor/Transforms/Sparsification.cpp     | 133 +++++++++++--------
 .../test/Dialect/SparseTensor/sparse_parallel.mlir |  20 +--
 .../SparseTensor/sparse_parallel_reduce.mlir       |  63 ---------
 .../Dialect/SparseTensor/CPU/sparse_matmul.mlir    |   8 --
 .../Dialect/SparseTensor/CPU/sparse_matvec.mlir    |  10 --
 7 files changed, 127 insertions(+), 285 deletions(-)
 delete mode 100644 mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
index 27b7acb..032d802 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -219,12 +219,9 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
     OpBuilder &builder, Location loc, size_t tid, size_t dim,
     MutableArrayRef reduc, bool isParallel, ArrayRef extraTids,
     ArrayRef extraDims) {
-  assert(dimTypes[tid].size() > dim);
   // We can not re-enter the same level.
   assert(!coord[tid][dim]);
-  // TODO: support multiple return on parallel for?
-  assert(!isParallel || reduc.empty() <= 1);

   Value step = constantIndex(builder, loc, 1);
   auto dimType = dimTypes[tid][dim];
@@ -235,38 +232,11 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
   Value lo = isSparseInput ? pidxs[tid][dim]      // current offset
                            : loopSeqStack.back(); // univeral tid
   Value hi = highs[tid][dim];
-  Operation *loop = nullptr;
-  Value iv;
-  if (isParallel) {
-    scf::ParallelOp parOp =
-        builder.create(loc, lo, hi, step, reduc);
-    builder.setInsertionPointToStart(parOp.getBody());
-    assert(parOp.getNumReductions() == reduc.size());
-    iv = parOp.getInductionVars()[0];
-
-    // In-place update on the reduction variable vector.
-    // Note that the init vals is not the actual reduction variables but instead
-    // used as a `special handle` to (temporarily) represent them. The
-    // expression on init vals will be moved into scf.reduce and replaced with
-    // the block arguments when exiting the loop (see exitForLoop). This is
-    // needed as we can not build the actual reduction block and get the actual
-    // reduction varaible before users fill parallel loop body.
-    for (int i = 0, e = reduc.size(); i < e; i++)
-      reduc[i] = parOp.getInitVals()[i];
-    loop = parOp;
-  } else {
-    scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc);
-    builder.setInsertionPointToStart(forOp.getBody());
-    iv = forOp.getInductionVar();
-
-    // In-place update on the reduction variable vector.
-    assert(forOp.getNumRegionIterArgs() == reduc.size());
-    for (int i = 0, e = reduc.size(); i < e; i++)
-      reduc[i] = forOp.getRegionIterArg(i);
-    loop = forOp;
-  }
-  assert(loop && iv);
+  scf::ForOp forOp = builder.create(loc, lo, hi, step, reduc);
+  builder.setInsertionPointToStart(forOp.getBody());
+  Value iv = forOp.getInductionVar();
+  assert(iv);

   if (isSparseInput) {
     pidxs[tid][dim] = iv;
     // Generating a load on the indices array yields the coordinate.
@@ -283,12 +253,16 @@ Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(

   // NOTE: we can also prepares for next dim here in advance
   // Push the loop into stack
-  loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), loop,
+  loopStack.emplace_back(ArrayRef(tid), ArrayRef(dim), forOp,
                          coord[tid][dim]);
   // Emit extra locals.
   emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);

-  return loop;
+  // In-place update on the reduction variable vector.
+  assert(forOp.getNumRegionIterArgs() == reduc.size());
+  for (int i = 0, e = reduc.size(); i < e; i++)
+    reduc[i] = forOp.getRegionIterArg(i);
+  return forOp;
 }

 Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
@@ -460,73 +434,17 @@ void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims(
   }
 }

-void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
-                                          MutableArrayRef reduc) {
+SmallVector
+SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
+                                     ArrayRef reduc) {
   LoopLevelInfo &loopInfo = loopStack.back();
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
-  auto forOp = llvm::dyn_cast(loopInfo.loop);
-  if (forOp) {
-    if (!reduc.empty()) {
-      assert(reduc.size() == forOp.getNumResults());
-      rewriter.setInsertionPointToEnd(forOp.getBody());
-      rewriter.create(loc, reduc);
-    }
-    // Exit the loop.
-    rewriter.setInsertionPointAfter(forOp);
-    // In-place update reduction variables.
-    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
-      reduc[i] = forOp.getResult(i);
-  } else {
-    auto parOp = llvm::cast(loopInfo.loop);
-    if (!reduc.empty()) {
-      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
-      Operation *redExp = reduc.front().getDefiningOp();
-      // Reduction expression should have no use.
-      assert(redExp->getUses().empty());
-      // This must be a binary operation.
-      // NOTE: This is users' responsibilty to ensure the operation are
-      // commutative.
-      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
-
-      Value redVal = parOp.getInitVals().front();
-      Value curVal;
-      if (redExp->getOperand(0) == redVal)
-        curVal = redExp->getOperand(1);
-      else if (redExp->getOperand(1) == redVal)
-        curVal = redExp->getOperand(0);
-      // One of the operands must be the init value (which is also the
-      // previous reduction value).
-      assert(curVal);
-      // The reduction expression should be the only user of the reduction val
-      // inside the parallel for.
-      unsigned numUsers = 0;
-      for (Operation *op : redVal.getUsers()) {
-        if (op->getParentOp() == parOp)
-          numUsers++;
-      }
-      assert(numUsers == 1);
-      (void)numUsers; // to silence unused variable warning in release build
-
-      rewriter.setInsertionPointAfter(redExp);
-      auto redOp = rewriter.create(loc, curVal);
-      // Attach to the reduction op.
-      Block *redBlock = &redOp.getRegion().getBlocks().front();
-      rewriter.setInsertionPointToEnd(redBlock);
-      Operation *newRed = rewriter.clone(*redExp);
-      // Replaces arguments of the reduction expression by using the block
-      // arguments from scf.reduce.
-      rewriter.updateRootInPlace(
-          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
-      // Erases the out-dated reduction expression.
-      rewriter.eraseOp(redExp);
-      rewriter.setInsertionPointToEnd(redBlock);
-      rewriter.create(loc, newRed->getResult(0));
-    }
-    rewriter.setInsertionPointAfter(parOp);
-    // In-place update reduction variables.
-    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
-      reduc[i] = parOp.getResult(i);
+  auto forOp = llvm::cast(loopInfo.loop);
+  if (!reduc.empty()) {
+    assert(reduc.size() == forOp.getNumResults());
+    builder.setInsertionPointToEnd(forOp.getBody());
+    builder.create(loc, reduc);
   }

   // Finished iterating a tensor, clean up
@@ -540,10 +458,14 @@ void SparseTensorLoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
     if (!isDenseDLT(dimTypes[tid][dim]))
       highs[tid][dim] = Value();
   }
+  // exit the loop
+  builder.setInsertionPointAfter(forOp);
+  return forOp.getResults();
 }

-void SparseTensorLoopEmitter::exitCoIterationLoop(
-    OpBuilder &builder, Location loc, MutableArrayRef reduc) {
+SmallVector
+SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
+                                             ArrayRef reduc) {
   auto whileOp = llvm::cast(loopStack.back().loop);
   auto &dims = loopStack.back().dims;
   auto &tids = loopStack.back().tids;
@@ -577,10 +499,10 @@ void SparseTensorLoopEmitter::exitCoIterationLoop(
   }

   // Reduction value from users.
-  for (unsigned i = 0, e = reduc.size(); i < e; i++) {
-    operands.push_back(reduc[i]);
-    // In place update reduction variable.
-    reduc[i] = whileOp->getResult(o++);
+  SmallVector ret;
+  for (auto red : reduc) {
+    operands.push_back(red);
+    ret.push_back(whileOp->getResult(o++));
   }

   // An (optional) universal index.
@@ -595,24 +517,26 @@ void SparseTensorLoopEmitter::exitCoIterationLoop(
   assert(o == operands.size());
   builder.create(loc, operands);
   builder.setInsertionPointAfter(whileOp);
+  return ret;
 }

-void SparseTensorLoopEmitter::exitCurrentLoop(RewriterBase &rewriter,
-                                              Location loc,
-                                              MutableArrayRef reduc) {
+SmallVector
+SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
+                                         ArrayRef reduc) {
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
   LoopLevelInfo &loopInfo = loopStack.back();
   assert(loopInfo.tids.size() == loopInfo.dims.size());
   SmallVector red;
   if (llvm::isa(loopInfo.loop)) {
-    exitCoIterationLoop(rewriter, loc, reduc);
+    red = exitCoiterationLoop(builder, loc, reduc);
   } else {
-    exitForLoop(rewriter, loc, reduc);
+    red = exitForLoop(builder, loc, reduc);
   }

   assert(loopStack.size() == loopSeqStack.size());
   loopStack.pop_back();
+  return red;
 }

 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
index a75d392..3228eb4 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -380,8 +380,8 @@ public:
                              ArrayRef dims, bool needsUniv,
                              MutableArrayRef reduc = {},
                              ArrayRef extraTids = {}, ArrayRef extraDims = {});
-  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
-                       MutableArrayRef reduc = {});
+  SmallVector exitCurrentLoop(OpBuilder &builder, Location loc,
+                              ArrayRef reduc = {});

   /// Returns the array of coordinate for all the loop generated till now.
   void getCoordinateArray(SmallVectorImpl &coords) const {
@@ -452,35 +452,17 @@ private:
                                            ArrayRef dims);

   /// Exits a for loop, returns the reduction results, e.g.,
-  /// For sequential for loops:
   /// %ret = for () {
   ///   ...
-  ///   %val = addi %args, %c
   ///   yield %val
   /// }
-  /// For parallel loops, the following generated code by users:
-  /// %ret = parallel () init(%args) {
-  ///   ...
-  ///   %val = op %args, %c
-  /// }
-  /// will be transformed into
-  /// %ret = parallel () init(%args) {
-  ///   ...
-  ///   scf.reduce(%c) bb0(%0, %1){
-  ///     %val = op %0, %1
-  ///     scf.reduce.return %val
-  ///   }
-  /// }
-  /// NOTE: only one instruction will be moved into reduce block, transformation
-  /// will fail if multiple instructions are used to compute the reduction
-  /// value.
-  /// Return %ret to user, while %val is provided by users (`reduc`).
-  void exitForLoop(RewriterBase &rewriter, Location loc,
-                   MutableArrayRef reduc);
+  /// Return %ret to user, while %val is provided by users (`reduc`)
+  SmallVector exitForLoop(OpBuilder &builder, Location loc,
+                          ArrayRef reduc);

   /// Exits a while loop, returns the reduction results.
-  void exitCoIterationLoop(OpBuilder &builder, Location loc,
-                           MutableArrayRef reduc);
+  SmallVector exitCoiterationLoop(OpBuilder &builder, Location loc,
+                                  ArrayRef reduc);

   // Whether the loop emitter needs to treat the last tensor as the output
   // tensor.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index 533d31f..9f01731 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -410,34 +410,6 @@ static Value getCustomRedId(Operation *op) {
 // Sparse compiler synthesis methods (statements and expressions).
 //===----------------------------------------------------------------------===//

-/// Generates loop boundary statements (entering/exiting loops). The function
-/// passes and updates the reduction value.
-static Optional genLoopBoundary(
-    CodeGen &codegen, Merger &merger,
-    function_ref(MutableArrayRef reduc)>
-        callback) {
-  SmallVector reduc;
-  if (codegen.redVal)
-    reduc.push_back(codegen.redVal);
-  if (codegen.expValues)
-    reduc.push_back(codegen.expCount);
-  if (codegen.insChain)
-    reduc.push_back(codegen.insChain);
-
-  auto r = callback(reduc);
-
-  // Callback should do in-place update on reduction value vector.
-  unsigned i = 0;
-  if (codegen.redVal)
-    updateReduc(merger, codegen, reduc[i++]);
-  if (codegen.expValues)
-    codegen.expCount = reduc[i++];
-  if (codegen.insChain)
-    codegen.insChain = reduc[i];
-
-  return r;
-}
-
 /// Local bufferization of all dense and sparse data structures.
 static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                        linalg::GenericOp op) {
@@ -897,25 +869,23 @@ static void genExpansion(Merger &merger, CodeGen &codegen, OpBuilder &builder,
 /// Returns parallelization strategy. Any implicit loop in the Linalg
 /// operation that is marked "parallel" is a candidate. Whether it is actually
 /// converted to a parallel operation depends on the requested strategy.
-static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isSparse) {
+static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
+                          bool isSparse) {
   // Reject parallelization of sparse output.
   if (codegen.sparseOut)
     return false;
-  // Parallel loops on tensor expansion can cause data races.
-  if (codegen.expCount)
-    return false;
   // Inspect strategy.
   switch (codegen.options.parallelizationStrategy) {
   case SparseParallelizationStrategy::kNone:
     return false;
   case SparseParallelizationStrategy::kDenseOuterLoop:
-    return isOuter && !isSparse;
+    return isOuter && !isSparse && !isReduction;
   case SparseParallelizationStrategy::kAnyStorageOuterLoop:
-    return isOuter;
+    return isOuter && !isReduction;
   case SparseParallelizationStrategy::kDenseAnyLoop:
-    return !isSparse;
+    return !isSparse && !isReduction;
   case SparseParallelizationStrategy::kAnyStorageAnyLoop:
-    return true;
+    return !isReduction;
   }
   llvm_unreachable("unexpected parallelization strategy");
 }
@@ -928,16 +898,33 @@ static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                          ArrayRef extraDims) {
   Location loc = op.getLoc();
   auto iteratorTypes = op.getIteratorTypesArray();
+  bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
   bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) ||
                   isSingletonDLT(merger.getDimLevelType(tid, idx));
-  bool isParallel = isParallelFor(codegen, isOuter, isSparse);
-
-  Operation *loop =
-      genLoopBoundary(codegen, merger, [&](MutableArrayRef reduc) {
-        return codegen.loopEmitter.enterLoopOverTensorAtDim(
-            builder, loc, tid, dim, reduc, isParallel, extraTids, extraDims);
-      }).value();
-  assert(loop);
+  bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
+  assert(!isParallel);
+
+  // Emit a sequential for loop.
+  SmallVector operands;
+  if (codegen.redVal)
+    operands.push_back(codegen.redVal);
+  if (codegen.expValues)
+    operands.push_back(codegen.expCount);
+  if (codegen.insChain)
+    operands.push_back(codegen.insChain);
+
+  Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
+      builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);
+
+  unsigned o = 0;
+  if (codegen.redVal)
+    updateReduc(merger, codegen, operands[o++]);
+  if (codegen.expValues)
+    codegen.expCount = operands[o++];
+  if (codegen.insChain)
+    codegen.insChain = operands[o++];
+  assert(o == operands.size());
+
   return loop;
 }

@@ -947,15 +934,29 @@ static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                            ArrayRef condTids, ArrayRef condDims,
                            ArrayRef extraTids, ArrayRef extraDims) {
+  SmallVector operands;
+
+  // Construct the while-loop with a parameter for each index.
+  if (codegen.redVal)
+    operands.push_back(codegen.redVal);
+  if (codegen.expValues)
+    operands.push_back(codegen.expCount);
+  if (codegen.insChain)
+    operands.push_back(codegen.insChain);
+
+  Operation *loop = codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
+      builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids,
+      extraDims);
+
+  unsigned o = 0;
+  if (codegen.redVal)
+    updateReduc(merger, codegen, operands[o++]);
+  if (codegen.expValues)
+    codegen.expCount = operands[o++];
+  if (codegen.insChain)
+    codegen.insChain = operands[o++];
+  assert(o == operands.size());
-  Operation *loop =
-      genLoopBoundary(codegen, merger, [&](MutableArrayRef reduc) {
-        // Construct the while-loop with a parameter for each index.
-        return codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
-            builder, op.getLoc(), condTids, condDims, needsUniv, reduc,
-            extraTids, extraDims);
-      }).value();
-  assert(loop);

   return loop;
 }

@@ -1185,21 +1186,37 @@ static Operation *startLoop(Merger &merger, CodeGen &codegen,
 }

 /// Ends a single loop in current sequence. Returns new values for needsUniv.
-static bool endLoop(Merger &merger, CodeGen &codegen, RewriterBase &rewriter,
+static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
                     linalg::GenericOp op, Operation *loop, unsigned idx,
                     unsigned li, bool needsUniv) {
   // End a while-loop.
   if (auto whileOp = dyn_cast(loop)) {
-    finalizeWhileOp(merger, codegen, rewriter, op, idx, needsUniv,
+    finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
                     merger.lat(li).bits, whileOp);
   } else {
     needsUniv = false;
   }

-  genLoopBoundary(codegen, merger, [&](MutableArrayRef reduc) {
-    codegen.loopEmitter.exitCurrentLoop(rewriter, op.getLoc(), reduc);
-    return llvm::None;
-  });
+  SmallVector reduc;
+  if (codegen.redVal)
+    reduc.push_back(codegen.redVal);
+  if (codegen.expValues)
+    reduc.push_back(codegen.expCount);
+  if (codegen.insChain)
+    reduc.push_back(codegen.insChain);
+
+  auto loopRet =
+      codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
+  assert(reduc.size() == loopRet.size());
+
+  unsigned o = 0;
+  if (codegen.redVal)
+    updateReduc(merger, codegen, loopRet[o++]);
+  if (codegen.expValues)
+    codegen.expCount = loopRet[o++];
+  if (codegen.insChain)
+    codegen.insChain = loopRet[o++];
+  assert(o == loopRet.size());

   return needsUniv;
 }
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
index f38865c..38766b0 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel.mlir
@@ -1,13 +1,14 @@
 // RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
 // RUN: FileCheck %s --check-prefix=CHECK-PAR0
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR1
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR2
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR3
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR4
+// FIXME: we do not support vectorization/parallel loops in loop emitter right now
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR1
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR2
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR3
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR4

 #DenseMatrix = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "dense" ]
@@ -150,8 +151,7 @@ func.func @scale_ss(%scale: f32,
 //
 // CHECK-PAR4-LABEL: func @matvec
 // CHECK-PAR4: scf.parallel
-// CHECK-PAR4: scf.parallel
-// CHECK-PAR4: scf.reduce
+// CHECK-PAR4: scf.for
 // CHECK-PAR4: return
 //
 func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
deleted file mode 100644
index 8ba66d2..0000000
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
+++ /dev/null
@@ -1,63 +0,0 @@
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// RUN: FileCheck %s
-
-#CSR = #sparse_tensor.encoding<{
-  dimLevelType = [ "dense", "compressed" ]
-}>
-
-#trait_matvec = {
-  indexing_maps = [
-    affine_map<(i,j) -> (i,j)>,  // A
-    affine_map<(i,j) -> (j)>,    // b
-    affine_map<(i,j) -> (i)>     // x (out)
-  ],
-  iterator_types = ["parallel", "reduction"],
-  doc = "x(i) += A(i,j) * b(j)"
-}
-// CHECK-LABEL:  func.func @matvec(
-// CHECK-SAME:     %[[TMP_arg0:.*]]: tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>>,
-// CHECK-SAME:     %[[TMP_arg1:.*]]: tensor<32xf32>,
-// CHECK-SAME:     %[[TMP_arg2:.*]]: tensor<16xf32>) -> tensor<16xf32> {
-// CHECK-DAG:    %[[TMP_c16:.*]] = arith.constant 16 : index
-// CHECK-DAG:    %[[TMP_c0:.*]] = arith.constant 0 : index
-// CHECK-DAG:    %[[TMP_c1:.*]] = arith.constant 1 : index
-// CHECK:        %[[TMP_0:.*]] = sparse_tensor.pointers %[[TMP_arg0]] {dimension = 1 : index}
-// CHECK:        %[[TMP_1:.*]] = sparse_tensor.indices %[[TMP_arg0]] {dimension = 1 : index}
-// CHECK:        %[[TMP_2:.*]] = sparse_tensor.values %[[TMP_arg0]]
-// CHECK:        %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : memref<32xf32>
-// CHECK:        %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : memref<16xf32>
-// CHECK:        scf.parallel (%[[TMP_arg3:.*]]) = (%[[TMP_c0]]) to (%[[TMP_c16]]) step (%[[TMP_c1]]) {
-// CHECK:          %[[TMP_6:.*]] = memref.load %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
-// CHECK:          %[[TMP_7:.*]] = memref.load %[[TMP_0]][%[[TMP_arg3]]] : memref
-// CHECK:          %[[TMP_8:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-// CHECK:          %[[TMP_9:.*]] = memref.load %[[TMP_0]][%[[TMP_8]]] : memref
-// CHECK:          %[[TMP_10:.*]] = scf.parallel (%[[TMP_arg4:.*]]) = (%[[TMP_7]]) to (%[[TMP_9]]) step (%[[TMP_c1]]) init (%[[TMP_6]]) -> f32 {
-// CHECK:            %[[TMP_11:.*]] = memref.load %[[TMP_1]][%[[TMP_arg4]]] : memref
-// CHECK:            %[[TMP_12:.*]] = memref.load %[[TMP_2]][%[[TMP_arg4]]] : memref
-// CHECK:            %[[TMP_13:.*]] = memref.load %[[TMP_3]][%[[TMP_11]]] : memref<32xf32>
-// CHECK:            %[[TMP_14:.*]] = arith.mulf %[[TMP_12]], %[[TMP_13]] : f32
-// CHECK:            scf.reduce(%[[TMP_14]]) : f32 {
-// CHECK:            ^bb0(%[[TMP_arg5:.*]]: f32, %[[TMP_arg6:.*]]: f32):
-// CHECK:              %[[TMP_15:.*]] = arith.addf %[[TMP_arg5]], %[[TMP_arg6]] : f32
-// CHECK:              scf.reduce.return %[[TMP_15]] : f32
-// CHECK:            }
-// CHECK:            scf.yield
-// CHECK:          }
-// CHECK:          memref.store %[[TMP_10]], %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
-// CHECK:          scf.yield
-// CHECK:        }
-// CHECK:        %[[TMP_5:.*]] = bufferization.to_tensor %[[TMP_4]] : memref<16xf32>
-// CHECK:        return %[[TMP_5]] : tensor<16xf32>
-func.func @matvec(%arga: tensor<16x32xf32, #CSR>,
-                  %argb: tensor<32xf32>,
-                  %argx: tensor<16xf32>) -> tensor<16xf32> {
-  %0 = linalg.generic #trait_matvec
-    ins(%arga, %argb : tensor<16x32xf32, #CSR>, tensor<32xf32>)
-    outs(%argx: tensor<16xf32>) {
-    ^bb(%A: f32, %b: f32, %x: f32):
-      %0 = arith.mulf %A, %b : f32
-      %1 = arith.addf %0, %x : f32
-      linalg.yield %1 : f32
-  } -> tensor<16xf32>
-  return %0 : tensor<16xf32>
-}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
index 459b0e1..c12d2b9 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -2,14 +2,6 @@
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
-//
-// Do the same run, but now with parallelization.
-//
-// RUN: mlir-opt %s --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
-
 #CSR = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "compressed" ],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
index adc0b26..59e7f33 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matvec.mlir
@@ -4,16 +4,6 @@
 // RUN:  -e entry -entry-point-result=void \
 // RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
-//
-// Do the same run, but now with parallelization.
-//
-// RUN: mlir-opt %s \
-// RUN:   --sparse-compiler="parallelization-strategy=any-storage-any-loop" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN:  -e entry -entry-point-result=void \
-// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s

 !Filename = !llvm.ptr
-- 
2.7.4