From 44cfea0279a4fb9ea8cb0c68a2b5ee7a81654071 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nicolas.vasilache@gmail.com>
Date: Mon, 10 Oct 2022 02:05:14 -0700
Subject: [PATCH] [mlir][Linalg] Retire LinalgStrategyTilePass and filter-based
 pattern.

Context: https://discourse.llvm.org/t/psa-retire-linalg-filter-based-patterns/63785

Uses of `LinalgTilingPattern::returningMatchAndRewrite` are replaced by a top-level `tileWithLinalgTilingOptions` function that is marked obsolete and serves
as a temporary means to transition away from `LinalgTilingOptions`-based tiling.
LinalgTilingOptions supports too many options that have been orthogonalized with the use of the transform dialect.

Additionally, the revision introduces a `transform.structured.tile_to_scf_for` structured transform operation that is needed to properly tile `tensor.pad`
via the TilingInterface. Uses of `transform.structured.tile` will be deprecated and replaced by this new op.
This will achieve the deprecation of `linalg::tileLinalgOp`.
Context: https://discourse.llvm.org/t/psa-retire-tileandfuselinalgops-method/63850

In the process of transitioning, tests that were performing tile and distribute on tensors are retired: transformations should be orthogonalized better in the future.
In particular, tiling to specific loop types and tileAndDistribute behavior are not available via the transform ops.
The behavior is still available as part of the `tileWithLinalgTilingOptions` method to allow downstream clients to transition without breakages but is meant to be retired soon.

As more tests were ported to the transform dialect, it became necessary to introduce a test-transform-dialect-erase-schedule-pass that discards the transform specification
once it has been applied, so that e2e lowering and execution are possible.

Lastly, a number of redundant tests that were testing composition of patterns are retired as they are available with a better mechanism via the transform dialect.

Differential Revision: https://reviews.llvm.org/D135573
---
 mlir/include/mlir/Dialect/Linalg/Passes.h          |   5 -
 mlir/include/mlir/Dialect/Linalg/Passes.td         |  29 --
 .../Linalg/TransformOps/LinalgTransformOps.td      |  57 ++++
 .../Dialect/Linalg/Transforms/CodegenStrategy.h    |  33 --
 .../mlir/Dialect/Linalg/Transforms/Transforms.h    |  90 ++----
 .../Linalg/TransformOps/LinalgTransformOps.cpp     | 165 +++++++++-
 .../Linalg/Transforms/LinalgStrategyPasses.cpp     |  42 ---
 mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp      |  69 -----
 mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp  |  24 +-
 mlir/test/Dialect/Linalg/tile-and-distribute.mlir  | 219 --------------
 .../test/Dialect/Linalg/tile-and-peel-tensors.mlir | 110 -------
 mlir/test/Dialect/Linalg/tile-conv.mlir            |   8 +-
 mlir/test/Dialect/Linalg/tile-indexed.mlir         |  53 +---
 mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir   | 251 +++++++++-------
 mlir/test/Dialect/Linalg/tile-parallel-reduce.mlir | 113 -------
 mlir/test/Dialect/Linalg/tile-parallel.mlir        |  68 -----
 .../Linalg/tile-scalarize-dynamic-dims.mlir        |  74 -----
 mlir/test/Dialect/Linalg/tile-tensors.mlir         |  19 +-
 mlir/test/Dialect/Linalg/tile-zero.mlir            |  12 -
 mlir/test/Dialect/Linalg/tile.mlir                 | 331 ---------------------
 mlir/test/Dialect/Linalg/transform-patterns.mlir   | 118 +++++---
 .../Dialect/Linalg/CPU/test-conv-1d-call.mlir      |  10 +-
 .../Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir      |  10 +-
 .../Dialect/Linalg/CPU/test-conv-2d-call.mlir      |  10 +-
 .../Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir    |  10 +-
 .../Dialect/Linalg/CPU/test-conv-3d-call.mlir      |  10 +-
 .../Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir  |   9 +-
 .../Dialect/Linalg/CPU/test-tensor-matmul.mlir     |  10 +-
 .../lib/Dialect/Linalg/TestLinalgTransforms.cpp    | 303 +------------------
 .../Transform/TestTransformDialectInterpreter.cpp  |  29 ++
 mlir/tools/mlir-opt/mlir-opt.cpp                   |   2 +
 31 files changed, 588 insertions(+), 1705 deletions(-)
 delete mode 100644 mlir/test/Dialect/Linalg/tile-and-distribute.mlir
 delete mode 100644 mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
 delete mode 100644 mlir/test/Dialect/Linalg/tile-parallel-reduce.mlir
 delete mode 100644 mlir/test/Dialect/Linalg/tile-parallel.mlir
 delete mode 100644 mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
 delete mode 100644 mlir/test/Dialect/Linalg/tile-zero.mlir
 delete mode 100644 mlir/test/Dialect/Linalg/tile.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index 6e41f05..40ca027 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -39,11 +39,6 @@ std::unique_ptr<Pass> createFoldReshapeOpsByLinearizationPass();
 std::unique_ptr<Pass> createLinalgNamedOpConversionPass();
 
 std::unique_ptr<OperationPass<func::FuncOp>>
-createLinalgTilingPass(ArrayRef<int64_t> tileSizes = {},
-                       linalg::LinalgTilingLoopType loopType =
-                           linalg::LinalgTilingLoopType::Loops);
-
-std::unique_ptr<OperationPass<func::FuncOp>>
 createLinalgInlineScalarOperandsPass();
 
 /// Create a pass to convert Linalg operations to scf.for loops and
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 40a2f11..73fd30b 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -102,22 +102,6 @@ def LinalgBufferize : Pass<"linalg-bufferize", "func::FuncOp"> {
   ];
 }
 
-def LinalgTilingPass : Pass<"linalg-tile", "func::FuncOp"> {
-  let summary = "Tile operations in the linalg dialect";
-  let constructor = "mlir::createLinalgTilingPass()";
-  let dependentDialects = [
-    "AffineDialect",
-    "linalg::LinalgDialect",
-    "memref::MemRefDialect",
-    "scf::SCFDialect"
-  ];
-  let options = [
-    ListOption<"tileSizes", "tile-sizes", "int64_t", "Tile sizes">,
-    Option<"loopType", "loop-type", "std::string", /*default=*/"\"for\"",
-           "Specify the type of loops to generate: for, parallel">
-  ];
-}
-
 def LinalgGeneralization : Pass<"linalg-generalize-named-ops", "func::FuncOp"> {
   let summary = "Convert named ops into generic ops";
   let constructor = "mlir::createLinalgGeneralizationPass()";
@@ -162,19 +146,6 @@ def LinalgDetensorize : Pass<"linalg-detensorize", ""> {
   ];
 }
 
-def LinalgStrategyTilePass
-    : Pass<"linalg-strategy-tile-pass", "func::FuncOp"> {
-  let summary = "Configurable pass to apply pattern-based linalg tiling.";
-  let constructor = "mlir::createLinalgStrategyTilePass()";
-  let dependentDialects = ["linalg::LinalgDialect"];
-  let options = [
-    Option<"anchorFuncName", "anchor-func", "std::string", /*default=*/"",
-      "Which func op is the anchor to latch on.">,
-    Option<"anchorOpName", "anchor-op", "std::string", /*default=*/"",
-      "Which linalg op within the func is the anchor to latch on.">,
-  ];
-}
-
 def LinalgStrategyRemoveMarkersPass
     : Pass<"linalg-strategy-remove-markers-pass", "func::FuncOp"> {
   let summary = "Cleanup pass that drops markers.";
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 491c5a8..be4efaa 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -751,6 +751,63 @@ def TileToForeachThreadOp :
   }];
 }
 
+def TileToScfForOp : Op<Transform_Dialect, "structured.tile_to_scf_for",
+       [DeclareOpInterfaceMethods<TransformOpInterface>,
+        DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+  let description = [{
+    Indicates that the given `target` op should be tiled with the given sizes.
+    This transform generates a loop nest with a smaller ("tiled") target
+    operation in its body. The target must implement TilingInterface.
+
+    Tile sizes may be known at transformation time, in which case they are
+    expected to be provided in the `static_sizes` attribute, or not, in which
+    case the tile value must be computed by the payload IR and the handle to the
+    operation computing it must be provided through `dynamic_sizes`. When the
+    sizes are not known statically, the corresponding entry in the
+    `static_sizes` attribute must be set to `ShapedType::kDynamicSize`. Only
+    the dynamic sizes must be provided in `dynamic_sizes`, i.e., there should
+    be as many handles as `ShapedType::kDynamicSize` values in the
+    `static_sizes` attribute. A static size of `0` indicates that the dimension
+    should not be tiled. No loop will be generated for such dimensions. If all
+    tile sizes are `0`, this transform is effectively a no-op.
+
+    This op returns handles to the tiled op (in the generated loop nest) and the
+    generated loops. The number of loops is the number of tile sizes that are
+    not statically known to be zero.
+
+    #### Return modes
+
+    On success, the resulting handles are associated with co-indexed lists of
+    tiled operations and loops around them.
+
+    This operation only supports TilingInterface ops and produces a silenceable
+    failure if the input contains any non-TilingInterface ops. The ops preceding
+    it in the list associated with the `target` handle will have been tiled.
+
+    This operation produces a silenceable failure if the `dynamic_sizes` handles
+    are associated with lists of payload operations of a size different than
+    that of the list associated with the `target` handle.
+
+    If the internal implementation of tiling for any of the operations fails,
+    produces a definite failure.
+  }];
+
+  let arguments = (ins PDL_Operation:$target,
+                   Variadic<PDL_Operation>:$dynamic_sizes,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$static_sizes,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$interchange);
+  let results = (outs PDL_Operation:$tiled_linalg_op,
+                      Variadic<PDL_Operation>:$loops);
+
+  let hasCustomAssemblyFormat = 1;
+
+  let extraClassDeclaration = [{
+    /// Returns the list of tile sizes, which may be static (Attribute) or
+    /// dynamic (Value).
+    SmallVector<OpFoldResult> getMixedSizes();
+  }];
+}
+
 def VectorizeOp : Op<Transform_Dialect, "structured.vectorize",
     [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
      TransformEachOpTrait, TransformOpInterface]> {
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h b/mlir/include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h
index d7c0d22..ae3df32 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h
@@ -30,41 +30,8 @@ struct Transformation {
   LinalgTransformationFilter::FilterFunction filter = nullptr;
 };
 
-/// Represent one application of LinalgStrategyTilePass.
-struct Tile : public Transformation {
-  Tile(StringRef name, linalg::LinalgTilingOptions options,
-       LinalgTransformationFilter::FilterFunction f = nullptr)
-      : Transformation(std::move(f)), opName(name),
-        options(std::move(options)) {}
-
-  void addToPassPipeline(OpPassManager &pm,
-                         LinalgTransformationFilter m) const override {
-    pm.addPass(createLinalgStrategyTilePass(opName, options, m));
-  }
-
-private:
-  std::string opName;
-  linalg::LinalgTilingOptions options;
-};
-
 /// Codegen strategy controls how a Linalg op is progressively lowered.
 struct CodegenStrategy {
-  /// Append a pattern to add a level of tiling for Op `opName` with tiling
-  /// `options`.
-  CodegenStrategy &
-  tile(StringRef opName, const linalg::LinalgTilingOptions &options,
-       const LinalgTransformationFilter::FilterFunction &f = nullptr) {
-    transformationSequence.emplace_back(
-        std::make_unique<Tile>(opName, options, f));
-    return *this;
-  }
-  /// Conditionally append a pattern to add a level of tiling for
-  /// `LinalgOpType` with tiling `options`.
-  CodegenStrategy &
-  tileIf(bool b, StringRef opName, linalg::LinalgTilingOptions options,
-         LinalgTransformationFilter::FilterFunction f = nullptr) {
-    return b ? tile(opName, std::move(options), std::move(f)) : *this;
-  }
   /// Configure the post staged-patterns global enabling passes options.
   CodegenStrategy &
   setVectorTransferToSCFOptions(LinalgEnablingOptions options) {
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index fb37c6f..044ce8d 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -696,57 +696,26 @@ struct LinalgTilingOptions {
 RewritePatternSet getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx);
 void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns);
 
-///
-/// Linalg tiling pattern.
-///
-/// Apply the `tiling` transformation as a pattern.
-/// `filter` controls LinalgTransformMarker matching and update when specified.
-/// See `tiling` for more details.
-// TODO: TiledOpInterface
-struct LinalgTilingPattern : public OpInterfaceRewritePattern<LinalgOp> {
-  /// Construct a generic pattern applied to all LinalgOp that verify `filter`.
-  LinalgTilingPattern(
-      MLIRContext *context, LinalgTilingOptions options,
-      LinalgTransformationFilter f = LinalgTransformationFilter(),
-      PatternBenefit benefit = 1);
-
-  /// Construct a pattern specifically applied to `opName`.
-  LinalgTilingPattern(
-      StringRef opName, MLIRContext *context, LinalgTilingOptions options,
-      LinalgTransformationFilter f = LinalgTransformationFilter(),
-      PatternBenefit benefit = 1);
-
-  /// `matchAndRewrite` implementation that returns the significant transformed
-  /// pieces of IR.
-  FailureOr<TiledLinalgOp>
-  returningMatchAndRewrite(LinalgOp op, PatternRewriter &rewriter) const;
-
-  LogicalResult matchAndRewrite(LinalgOp op,
-                                PatternRewriter &rewriter) const override {
-    return returningMatchAndRewrite(op, rewriter);
-  }
-
-private:
-  /// LinalgTransformMarker handles special attribute manipulations.
-  LinalgTransformationFilter filter;
-  /// Options to control tiling;
-  LinalgTilingOptions options;
-};
+/// Perform tiling using LinalgTilingOptions.
+/// Note: this only works on LinalgOp and is on a path to deprecation.
+/// Clients should favor using `tileUsingSCFForOp`, which works more generally
+/// on TilingInterface ops.
+FailureOr<TiledLinalgOp>
+tileWithLinalgTilingOptions(RewriterBase &rewriter, LinalgOp op,
+                            const LinalgTilingOptions &options);
 
 ///
 /// Linalg padding pattern.
 ///
 /// Apply the `padding` transformation as a pattern.
-/// `filter` controls LinalgTransformMarker matching and update when specified.
 /// See `padding` for more details.
 struct LinalgPaddingPattern : public OpInterfaceRewritePattern<LinalgOp> {
-  /// Construct a generic pattern applied to all LinalgOp that verify `filter`.
   LinalgPaddingPattern(MLIRContext *context,
                        LinalgPaddingOptions options = LinalgPaddingOptions(),
                        PatternBenefit benefit = 1);
 
-  /// `matchAndRewrite` implementation that returns the significant transformed
-  /// pieces of IR.
+  /// `matchAndRewrite` implementation that returns the significant
+  /// transformed pieces of IR.
   FailureOr<LinalgOp> returningMatchAndRewrite(LinalgOp op,
                                                PatternRewriter &rewriter) const;
 
@@ -954,9 +923,9 @@ void populateLinalgNamedOpsGeneralizationPatterns(
 
 /// Linalg decompose convolutions patterns
 
-/// Populates patterns to decompose high-D convolution ops into low-D ones. This
-/// is a step in progressive lowering for convolution ops, afterwards we can
-/// vectorize the low-D convolution ops.
+/// Populates patterns to decompose high-D convolution ops into low-D ones.
+/// This is a step in progressive lowering for convolution ops, afterwards we
+/// can vectorize the low-D convolution ops.
 void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
                                           PatternBenefit benefit = 1);
 
@@ -977,8 +946,8 @@ struct PadOpTransformationPattern : public OpRewritePattern<tensor::PadOp> {
 /// a static bounding box. Use `paddingValues` and `packPaddings` to set padding
 /// value and nofold attribute of the created tensor::PadOps, respectively.
 /// Update `paddedOp` to the cloned operation with statically shaped
-/// `paddingDimensions` and return the extracted dynamically shaped results. If
-/// padding fails, return failure.
+/// `paddingDimensions` and return the extracted dynamically shaped results.
+/// If padding fails, return failure.
 FailureOr<SmallVector<Value>>
 rewriteAsPaddedOp(OpBuilder &b, LinalgOp opToPad,
                   ArrayRef<int64_t> paddingDimensions,
@@ -1132,29 +1101,6 @@ public:
                      const LinalgTransformationFilter &f) {}
 };
 
-template <typename... OpTypes>
-class TilingPatterns;
-
-template <>
-class TilingPatterns<> {
-public:
-  static void insert(RewritePatternSet &patterns,
-                     const LinalgTilingOptions &options,
-                     const LinalgTransformationFilter &f) {}
-};
-
-template <typename OpTy, typename... OpTypes>
-class TilingPatterns<OpTy, OpTypes...> {
-public:
-  static void insert(RewritePatternSet &patterns,
-                     const LinalgTilingOptions &options,
-                     const LinalgTransformationFilter &f) {
-    patterns.add<LinalgTilingPattern>(OpTy::getOperationName(),
-                                      patterns.getContext(), options, f);
-    TilingPatterns<OpTypes...>::insert(patterns, options, f);
-  }
-};
-
 /// Split Reduction options.
 struct SplitReductionOptions {
   // Ratio used to split the reduction dimension.  If the ratio is <= 1, nothing
@@ -1181,8 +1127,10 @@ void populateSplitReductionPattern(
 
 /// Apply transformation to split the single linalg op reduction into a parallel
 /// and reduction dimension. Then create a new linalg.generic op doing the rest
-/// of the reduction. Return the new linalg op with an extra parallel dimension
-/// or failure if the transformation didn't happen.
+/// of the reduction.
+/// Return the new linalg op with an extra parallel dimension or failure if the
+/// transformation didn't happen.
+///
 /// Example:
 /// ```
 ///  %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
@@ -1265,7 +1213,7 @@ splitReduction(PatternRewriter &b, LinalgOp op,
 ///  %3 = linalg.generic {indexing_maps = [#map0, #map1, #map2, #map3],
 ///    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
 ///    ins(%A, %B, %2 : tensor<16x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
-///    outs(%1 : tensor<16x32x64xf32>) {
+///   outs(%1 : tensor<16x32x64xf32>) {
 ///      ^bb0(%arg3: f32, %arg4: f32, %arg5: i1, %arg6: f32):
 ///        %5 = arith.mulf %arg3, %arg4 : f32
 ///        %6 = arith.addf %arg6, %5 : f32
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 5b82520..ed74de7 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -37,6 +37,16 @@ static SmallVector<unsigned> extractUIntArray(ArrayAttr attr) {
   return result;
 }
 
+/// Extracts a vector of int64_t from an array attribute. Asserts if the
+/// attribute contains values other than integers.
+static SmallVector<int64_t> extractI64Array(ArrayAttr attr) {
+  SmallVector<int64_t> result;
+  result.reserve(attr.size());
+  for (APInt value : attr.getAsValueRange<IntegerAttr>())
+    result.push_back(value.getSExtValue());
+  return result;
+}
+
 namespace {
 /// A simple pattern rewriter that implements no special logic.
 class SimpleRewriter : public PatternRewriter {
@@ -858,11 +868,10 @@ transform::ScalarizeOp::applyToOne(linalg::LinalgOp target,
   // Tiling with "scalarize_dyn_dims" actually sets the same lambda as the
   // tile sizes and asserts that it is not already set.
   SmallVector<int64_t> emptyTileSizes;
-  LinalgTilingPattern pattern(getContext(), tilingOptions);
   SimpleRewriter rewriter(getContext());
   rewriter.setInsertionPoint(target);
   FailureOr<TiledLinalgOp> result =
-      pattern.returningMatchAndRewrite(target, rewriter);
+      tileWithLinalgTilingOptions(rewriter, target, tilingOptions);
   if (failed(result))
     return DiagnosedSilenceableFailure(reportUnknownTransformError(target));
 
@@ -1052,7 +1061,6 @@ transform::SplitReductionOp::applyToOne(linalg::LinalgOp target,
 DiagnosedSilenceableFailure
 transform::TileOp::apply(TransformResults &transformResults,
                          TransformState &state) {
-  LinalgTilingOptions tilingOptions;
   SmallVector<int64_t> tileSizes = extractFromI64ArrayAttr(getStaticSizes());
 
   ArrayRef<Operation *> targets = state.getPayloadOps(getTarget());
@@ -1097,6 +1105,7 @@ transform::TileOp::apply(TransformResults &transformResults,
       return diag;
     }
 
+    LinalgTilingOptions tilingOptions;
     unsigned index = en.index();
     if (!tileSizes.empty()) {
       tilingOptions.setTileSizeComputationFunction(
@@ -1118,10 +1127,9 @@ transform::TileOp::apply(TransformResults &transformResults,
     }
 
     tilingOptions.setInterchange(extractUIntArray(getInterchange()));
-    LinalgTilingPattern pattern(getContext(), tilingOptions);
     SimpleRewriter rewriter(linalgOp.getContext());
     FailureOr<TiledLinalgOp> tiledOp =
-        pattern.returningMatchAndRewrite(linalgOp, rewriter);
+        tileWithLinalgTilingOptions(rewriter, linalgOp, tilingOptions);
     if (failed(tiledOp))
       return DiagnosedSilenceableFailure::definiteFailure();
 
@@ -1341,6 +1349,153 @@ LogicalResult TileToForeachThreadOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
+// TileToScfForOp
+//===----------------------------------------------------------------------===//
+
+DiagnosedSilenceableFailure
+transform::TileToScfForOp::apply(TransformResults &transformResults,
+                                 TransformState &state) {
+  SmallVector<int64_t> tileSizes = extractFromI64ArrayAttr(getStaticSizes());
+
+  ArrayRef<Operation *> targets = state.getPayloadOps(getTarget());
+  SmallVector<ArrayRef<Operation *>> dynamicSizeProducers;
+  dynamicSizeProducers.reserve(getDynamicSizes().size());
+  for (Value dynamicSizeProducerHandle : getDynamicSizes()) {
+    dynamicSizeProducers.push_back(
+        state.getPayloadOps(dynamicSizeProducerHandle));
+
+    if (dynamicSizeProducers.back().size() != targets.size()) {
+      DiagnosedSilenceableFailure diag =
+          emitSilenceableError()
+          << "expected as many dynamic size-producing operations ("
+          << dynamicSizeProducers.back().size() << ") as target ops ("
+          << targets.size() << ")";
+      diag.attachNote(dynamicSizeProducerHandle.getLoc()) << "for this handle";
+      return diag;
+    }
+
+    for (Operation *op : dynamicSizeProducers.back()) {
+      if (op->getNumResults() == 1 &&
+          op->getResult(0).getType().isa<IndexType>())
+        continue;
+      DiagnosedSilenceableFailure diag =
+          emitSilenceableError() << "expected sizes to be produced by ops "
+                                    "with a single index-type result";
+      diag.attachNote(op->getLoc()) << "size producer op";
+      diag.attachNote(dynamicSizeProducerHandle.getLoc()) << "for this handle";
+      return diag;
+    }
+  }
+
+  SmallVector<Operation *> tiled;
+  SmallVector<SmallVector<Operation *, 4>, 4> loops;
+  loops.resize(getLoops().size());
+  for (auto &en : llvm::enumerate(targets)) {
+    auto tilingInterfaceOp = dyn_cast<TilingInterface>(en.value());
+    if (!tilingInterfaceOp) {
+      DiagnosedSilenceableFailure diag =
+          emitSilenceableError() << "only TilingInterface ops are supported";
+      diag.attachNote(en.value()->getLoc()) << "target op";
+      return diag;
+    }
+
+    scf::SCFTilingOptions tilingOptions;
+    unsigned index = en.index();
+    if (!tileSizes.empty()) {
+      tilingOptions.setTileSizeComputationFunction(
+          [&, index](OpBuilder &b, Operation *) {
+            SmallVector<Value, 4> sizes;
+            sizes.reserve(tileSizes.size());
+            unsigned dynamicIdx = 0;
+            for (OpFoldResult ofr : getMixedSizes()) {
+              if (auto attr = ofr.dyn_cast<Attribute>()) {
+                sizes.push_back(b.create<arith::ConstantIndexOp>(
+                    getLoc(), attr.cast<IntegerAttr>().getInt()));
+              } else {
+                sizes.push_back(
+                    dynamicSizeProducers[dynamicIdx++][index]->getResult(0));
+              }
+            }
+            return sizes;
+          });
+    }
+
+    tilingOptions.setInterchange(extractI64Array(getInterchange()));
+    SimpleRewriter rewriter(tilingInterfaceOp.getContext());
+    FailureOr<scf::SCFTilingResult> tilingResult =
+        tileUsingSCFForOp(rewriter, tilingInterfaceOp, tilingOptions);
+    if (failed(tilingResult))
+      return DiagnosedSilenceableFailure::definiteFailure();
+
+    rewriter.replaceOp(tilingInterfaceOp, tilingResult->replacements);
+
+    tiled.push_back(tilingResult->tiledOp);
+    for (const auto &en2 : llvm::enumerate(tilingResult->loops))
+      loops[en2.index()].push_back(en2.value());
+  }
+
+  transformResults.set(getTiledLinalgOp().cast<OpResult>(), tiled);
+  for (const auto &en : llvm::enumerate(loops))
+    transformResults.set(getLoops()[en.index()].cast<OpResult>(), en.value());
+
+  return DiagnosedSilenceableFailure::success();
+}
+
+SmallVector<OpFoldResult> transform::TileToScfForOp::getMixedSizes() {
+  ValueRange dynamic = getDynamicSizes();
+  SmallVector<int64_t> tileSizes = extractFromI64ArrayAttr(getStaticSizes());
+  SmallVector<OpFoldResult> results;
+  results.reserve(tileSizes.size());
+  unsigned dynamicPos = 0;
+  Builder builder(getContext());
+  for (int64_t size : tileSizes) {
+    if (size == ShapedType::kDynamicSize) {
+      results.push_back(dynamic[dynamicPos++]);
+    } else {
+      results.push_back(builder.getIndexAttr(size));
+    }
+  }
+  return results;
+}
+
+ParseResult transform::TileToScfForOp::parse(OpAsmParser &parser,
+                                             OperationState &result) {
+  OpAsmParser::UnresolvedOperand target;
+  SmallVector<OpAsmParser::UnresolvedOperand> dynamicSizes;
+  ArrayAttr staticSizes;
+  auto pdlOperationType = pdl::OperationType::get(parser.getContext());
+  if (parser.parseOperand(target) ||
+      parser.resolveOperand(target, pdlOperationType, result.operands) ||
+      parseDynamicIndexList(parser, dynamicSizes, staticSizes,
+                            ShapedType::kDynamicSize) ||
+      parser.resolveOperands(dynamicSizes, pdlOperationType, result.operands) ||
+      parser.parseOptionalAttrDict(result.attributes))
+    return ParseResult::failure();
+
+  result.addAttribute(getStaticSizesAttrName(result.name), staticSizes);
+  size_t numExpectedLoops =
+      staticSizes.size() - llvm::count(extractFromI64ArrayAttr(staticSizes), 0);
+  result.addTypes(SmallVector<Type>(numExpectedLoops + 1, pdlOperationType));
+  return success();
+}
+
+void TileToScfForOp::print(OpAsmPrinter &p) {
+  p << ' ' << getTarget();
+  printDynamicIndexList(p, getOperation(), getDynamicSizes(), getStaticSizes(),
+                        ShapedType::kDynamicSize);
+  p.printOptionalAttrDict((*this)->getAttrs(), {getStaticSizesAttrName()});
+}
+
+void transform::TileToScfForOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  consumesHandle(getTarget(), effects);
+  onlyReadsHandle(getDynamicSizes(), effects);
+  producesHandle(getTiledLinalgOp(), effects);
+  producesHandle(getLoops(), effects);
+  modifiesPayload(effects);
+}
+
+//===----------------------------------------------------------------------===//
 // VectorizeOp
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
index 162e74f..39a9c7f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp
@@ -51,40 +51,6 @@ using namespace linalg;
 
 namespace {
 
-/// Configurable pass to apply pattern-based linalg tiling.
-struct LinalgStrategyTilePass
-    : public impl::LinalgStrategyTilePassBase<LinalgStrategyTilePass> {
-
-  LinalgStrategyTilePass() = default;
-
-  LinalgStrategyTilePass(StringRef opName,
-                         mlir::linalg::LinalgTilingOptions opt,
-                         LinalgTransformationFilter filt)
-      : options(std::move(opt)), filter(std::move(filt)) {
-    this->anchorOpName.setValue(opName.str());
-  }
-
-  void runOnOperation() override {
-    auto funcOp = getOperation();
-    if (!anchorFuncName.empty() && funcOp.getName() != anchorFuncName)
-      return;
-
-    MLIRContext *ctx = funcOp.getContext();
-    RewritePatternSet tilingPattern(ctx);
-    if (!anchorOpName.empty())
-      tilingPattern.add<LinalgTilingPattern>(anchorOpName, ctx, options,
-                                             filter);
-    else
-      tilingPattern.add<LinalgTilingPattern>(ctx, options, filter);
-    if (anchorOpName == tensor::PadOp::getOperationName())
-      populatePadTensorTilingPatterns(tilingPattern, options);
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(tilingPattern));
-  }
-
-  mlir::linalg::LinalgTilingOptions options;
-  LinalgTransformationFilter filter;
-};
-
 /// Configurable pass to lower vector operations.
 struct LinalgStrategyRemoveMarkersPass
     : public impl::LinalgStrategyRemoveMarkersPassBase<
@@ -101,14 +67,6 @@ struct LinalgStrategyRemoveMarkersPass
 };
 } // namespace
 
-/// Create a LinalgStrategyTilePass.
-std::unique_ptr<OperationPass<func::FuncOp>>
-mlir::createLinalgStrategyTilePass(StringRef opName,
-                                   const LinalgTilingOptions &opt,
-                                   const LinalgTransformationFilter &filter) {
-  return std::make_unique<LinalgStrategyTilePass>(opName, opt, filter);
-}
-
 /// Create a LinalgStrategyRemoveMarkersPass.
 std::unique_ptr<OperationPass<func::FuncOp>>
 mlir::createLinalgStrategyRemoveMarkersPass() {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index d377906..c0ff3e0 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -732,77 +732,8 @@ void mlir::linalg::populateLinalgTilingCanonicalizationPatterns(
       >::insert(patterns);
 }
 
-/// Populate the given list with patterns that apply Linalg tiling.
-static void insertTilingPatterns(RewritePatternSet &patterns,
-                                 const LinalgTilingOptions &options) {
-  auto *ctx = patterns.getContext();
-  LinalgTransformationFilter f(ArrayRef<StringAttr>{},
-                               StringAttr::get(ctx, "tiled"));
-  TilingPatterns<GenericOp,
-#define GET_OP_LIST
-#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
-                 >::insert(patterns, options, f);
-  patterns.add<PadOpTilingPattern>(ctx, options);
-}
-
 void mlir::linalg::populatePadTensorTilingPatterns(
     RewritePatternSet &patterns, const LinalgTilingOptions &options) {
   auto *ctx = patterns.getContext();
   patterns.add<PadOpTilingPattern>(ctx, options);
 }
-
-static void applyExtractSliceOfPadTensorSwapPattern(func::FuncOp funcOp) {
-  MLIRContext *ctx = funcOp.getContext();
-  RewritePatternSet patterns(ctx);
-  patterns.add<ExtractSliceOfPadTensorSwapPattern>(patterns.getContext());
-  (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
-  (void)applyPatternsAndFoldGreedily(
-      funcOp, getLinalgTilingCanonicalizationPatterns(ctx));
-}
-
-namespace {
-struct LinalgTilingPass : public impl::LinalgTilingPassBase<LinalgTilingPass> {
-  LinalgTilingPass() = default;
-  LinalgTilingPass(ArrayRef<int64_t> tileSizes, LinalgTilingLoopType loopType) {
-    this->tileSizes = tileSizes;
-    this->loopType = "";
-    this->loopTypeEnum = loopType;
-  }
-
-  void runOnOperation() override {
-    func::FuncOp funcOp = getOperation();
-    LinalgTilingLoopType type =
-        llvm::StringSwitch<LinalgTilingLoopType>(loopType)
-            .Case("for", LinalgTilingLoopType::Loops)
-            .Case("affine", LinalgTilingLoopType::AffineLoops)
-            .Case("parallel", LinalgTilingLoopType::ParallelLoops)
-            .Default(loopTypeEnum);
-    auto options =
-        LinalgTilingOptions().setTileSizes(tileSizes).setLoopType(type);
-    MLIRContext *ctx = funcOp.getContext();
-    RewritePatternSet patterns(ctx);
-    insertTilingPatterns(patterns, options);
-    scf::populateSCFForLoopCanonicalizationPatterns(patterns);
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
-    (void)applyPatternsAndFoldGreedily(
-        funcOp, getLinalgTilingCanonicalizationPatterns(ctx));
-    // Drop the marker.
-    funcOp.walk([](LinalgOp op) {
-      op->removeAttr(LinalgTransforms::kLinalgTransformMarker);
-    });
-
-    // Apply swap pattern after generating loop nest and running
-    // canonicalizations.
-    applyExtractSliceOfPadTensorSwapPattern(funcOp);
-  }
-
-  LinalgTilingLoopType loopTypeEnum;
-};
-
-} // namespace
-
-std::unique_ptr<OperationPass<func::FuncOp>>
-mlir::createLinalgTilingPass(ArrayRef<int64_t> tileSizes,
-                             linalg::LinalgTilingLoopType loopType) {
-  return std::make_unique<LinalgTilingPass>(tileSizes, loopType);
-}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 938b9e7..58923bc 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -356,33 +356,13 @@ void mlir::linalg::peelTiledLinalgOp(RewriterBase &rewriter, TiledLinalgOp &res,
   }
 }
 
-/// Linalg tiling pattern.
-mlir::linalg::LinalgTilingPattern::LinalgTilingPattern(
-    MLIRContext *context, LinalgTilingOptions options,
-    LinalgTransformationFilter f, PatternBenefit benefit)
-    : OpInterfaceRewritePattern<LinalgOp>(context, benefit),
-      filter(std::move(f)), options(std::move(options)) {}
-
-mlir::linalg::LinalgTilingPattern::LinalgTilingPattern(
-    StringRef opName, MLIRContext *context, LinalgTilingOptions options,
-    LinalgTransformationFilter f, PatternBenefit benefit)
-    : OpInterfaceRewritePattern<LinalgOp>(context, benefit),
-      filter(f.addOpNameFilter(opName)), options(std::move(options)) {}
-
 FailureOr<TiledLinalgOp>
-mlir::linalg::LinalgTilingPattern::returningMatchAndRewrite(
-    LinalgOp op, PatternRewriter &rewriter) const {
-  if (failed(filter.checkAndNotify(rewriter, op)))
-    return failure();
-
+mlir::linalg::tileWithLinalgTilingOptions(RewriterBase &rewriter, LinalgOp op,
+                                          const LinalgTilingOptions &options) {
   FailureOr<TiledLinalgOp> res = tileLinalgOp(rewriter, op, options);
   if (failed(res))
     return failure();
 
-  // Clear filter to stop recursive pattern application.
-  // This must be done here to properly propagate to peeling branches.
-  filter.replaceLinalgTransformationFilter(rewriter, res->op);
-
   // Peel the loops of the TiledLinalgOp.
   peelTiledLinalgOp(rewriter, *res, options.peeledLoops, options.loopType);
 
diff --git a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
deleted file mode 100644
index 6178aa3..0000000
--- a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir
+++ /dev/null
@@ -1,219 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-tile-and-distribute-options -split-input-file | FileCheck %s
-
-func.func @gemm1(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
-{
-  linalg.matmul {__internal_linalg_transform__ = "distribute1"}
-    ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%c: memref<?x?xf32>)
-  return
-}
-//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
-//      CHECK: func @gemm1(
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//      CHECK: scf.for %[[ARG3:.*]] =
-//      CHECK:   %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:   %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:   %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:   %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:   %[[SV1:.*]] = memref.subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]]
-//      CHECK:   %[[SV2:.*]] = memref.subview %[[ARG1]][%[[ARG3]], %[[OFFSETX]]]
-//      CHECK:   %[[SV3:.*]] = memref.subview %[[ARG2]][%[[OFFSETY_2]], %[[OFFSETX_2]]]
-//      CHECK:   linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-func.func @gemm2(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
-{
-  linalg.matmul  {__internal_linalg_transform__ = "distribute2"}
-    ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%c:memref<?x?xf32>)
-  return
-}
-//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
-//      CHECK: func @gemm2(
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//      CHECK: %[[ITERY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK: %[[ITERX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK: %[[INBOUNDSY:.*]] = arith.cmpi slt, %[[ITERY]], %{{.*}}
-//      CHECK: %[[INBOUNDSX:.*]] = arith.cmpi slt, %[[ITERX]], %{{.*}}
-//      CHECK: %[[INBOUNDS:.*]] = arith.andi %[[INBOUNDSY]], %[[INBOUNDSX]]
-//      CHECK: scf.if %[[INBOUNDS]]
-//      CHECK:   scf.for %[[ARG3:.*]] =
-//      CHECK:     %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:     %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:     %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:     %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:     %[[SV1:.*]] = memref.subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]]
-//      CHECK:     %[[SV2:.*]] = memref.subview %[[ARG1]][%[[ARG3]], %[[OFFSETX]]]
-//      CHECK:     %[[SV3:.*]] = memref.subview %[[ARG2]][%[[OFFSETY_2]], %[[OFFSETX_2]]]
-//      CHECK:     linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-func.func @gemm3(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
-{
-  linalg.matmul {__internal_linalg_transform__ = "distribute3"}
-    ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%c: memref<?x?xf32>)
-  return
-}
-//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
-//      CHECK: func @gemm3(
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[NBLOCKSY:.*]] = gpu.grid_dim y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//  CHECK-DAG: %[[NBLOCKSX:.*]] = gpu.grid_dim x
-//      CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]]
-//      CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK: %[[STEPX:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSX]]]
-//      CHECK: scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) = (%[[LBY]], %[[LBX]]) to (%{{.*}}, %{{.*}}) step (%[[STEPY]], %[[STEPX]])
-//      CHECK:   scf.for %[[ARG5:.*]] =
-//      CHECK:     %[[SV1:.*]] = memref.subview %[[ARG0]][%[[ARG3]], %[[ARG5]]]
-//      CHECK:     %[[SV2:.*]] = memref.subview %[[ARG1]][%[[ARG5]], %[[ARG4]]]
-//      CHECK:     %[[SV3:.*]] = memref.subview %[[ARG2]][%[[ARG3]], %[[ARG4]]]
-//      CHECK:     linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-func.func @gemm4(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
-{
-  linalg.matmul {__internal_linalg_transform__ = "distribute4"}
-    ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%c: memref<?x?xf32>)
-  return
-}
-//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
-//      CHECK: func @gemm4(
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//      CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK: %[[INBOUNDS:.*]] = arith.cmpi slt, %[[LBX]], %{{.*}}
-//      CHECK: scf.if %[[INBOUNDS]]
-//      CHECK:   scf.for %[[ARG3:.*]] =
-//      CHECK:     %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:     %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:     %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:     %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:     %[[SV1:.*]] = memref.subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]]
-//      CHECK:     %[[SV2:.*]] = memref.subview %[[ARG1]][%[[ARG3]], %[[OFFSETX]]]
-//      CHECK:     %[[SV3:.*]] = memref.subview %[[ARG2]][%[[OFFSETY_2]], %[[OFFSETX_2]]]
-//      CHECK:     linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-func.func @gemm5(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
-{
-  linalg.matmul {__internal_linalg_transform__ = "distribute5"}
-    ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%c: memref<?x?xf32>)
-  return
-}
-//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
-//      CHECK: func @gemm5(
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//  CHECK-DAG: %[[NBLOCKSX:.*]] = gpu.grid_dim x
-//      CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK: %[[STEPX:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSX]]]
-//      CHECK: %[[INBOUNDS:.*]] = arith.cmpi slt, %[[LBY]], %{{.*}}
-//      CHECK: scf.if %[[INBOUNDS]]
-//      CHECK:   scf.parallel (%[[ARG3:.*]]) = (%[[LBX]]) to (%{{.*}}) step (%[[STEPX]])
-//      CHECK:     scf.for %[[ARG4:.*]] =
-//      CHECK:      %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:       %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK:       %[[SV1:.*]] = memref.subview %[[ARG0]][%[[OFFSETY]], %[[ARG4]]]
-//      CHECK:       %[[SV2:.*]] = memref.subview %[[ARG1]][%[[ARG4]], %[[ARG3]]]
-//      CHECK:       %[[SV3:.*]] = memref.subview %[[ARG2]][%[[OFFSETY_2]], %[[ARG3]]]
-//      CHECK:       linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-func.func @gemm6(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
-{
-  linalg.matmul {__internal_linalg_transform__ = "distribute6"}
-    ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%c: memref<?x?xf32>)
-  return
-}
-//  CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
-//      CHECK: func @gemm6(
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[NBLOCKSY:.*]] = gpu.grid_dim y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//      CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
-//      CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]]
-//      CHECK: scf.parallel (%[[ARG3:.*]]) = (%[[LBY]]) to (%{{.*}}) step (%[[STEPY]])
-//      CHECK:   scf.for %[[ARG4:.*]] =
-//      CHECK:     %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:     %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
-//      CHECK:     %[[SV1:.*]] = memref.subview %[[ARG0]][%[[ARG3]], %[[ARG4]]]
-//      CHECK:     %[[SV2:.*]] = memref.subview %[[ARG1]][%[[ARG4]], %[[OFFSETX]]]
-//      CHECK:     %[[SV3:.*]] = memref.subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]]
-//      CHECK:     linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-//      CHECK: #[[MULMAP:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
-//      CHECK: #[[ADDMAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-//      CHECK: func @matmul_tensors(
-// CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
-// CHECK-SAME:    %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
-// CHECK-SAME:    %[[TC:[0-9a-z]+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
-func.func @matmul_tensors(
-  %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
-    -> tensor<?x?xf32> {
-//  CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
-//  CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//  CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y
-//  CHECK-DAG: %[[NBLOCKSY:.*]] = gpu.grid_dim y
-//  CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x
-//  CHECK-DAG: %[[NBLOCKSX:.*]] = gpu.grid_dim x
-//      CHECK: %[[MUL:.+]] = affine.apply #[[MULMAP]]()[%[[BIDY]], %[[C8]]]
-//      CHECK: %[[LBY:.+]] = affine.apply #[[ADDMAP]]()[%[[MUL]], %[[C0]]]
-//      CHECK: %[[STEPY:.+]] = affine.apply #[[MULMAP]]()[%[[NBLOCKSY]], %[[C8]]]
-//      CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
-//      CHECK: %[[MUL:.+]] = affine.apply #[[MULMAP]]()[%[[BIDX]], %[[C8]]]
-//      CHECK: %[[LBX:.+]] = affine.apply #[[ADDMAP]]()[%[[MUL]], %[[C0]]]
-//      CHECK: %[[STEPX:.+]] = affine.apply #[[MULMAP]]()[%[[NBLOCKSX]], %[[C8]]]
-//      CHECK:   %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
-//      CHECK:     %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
-//      CHECK:       %[[sTA:.*]] = tensor.extract_slice %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
-//      CHECK:       %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
-//      CHECK:       %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
-//      CHECK:       %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME:                                  outs(%[[sTC]] : tensor<?x?xf32>)  -> tensor<?x?xf32>
-//      CHECK:       %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}]  : tensor<?x?xf32> into tensor<?x?xf32>
-//      CHECK:       scf.yield %[[TD]] : tensor<?x?xf32>
-//      CHECK:     scf.yield %[[TD2]] : tensor<?x?xf32>
-//      CHECK:   scf.yield %[[TD1]] : tensor<?x?xf32>
-  %0 = linalg.matmul {__internal_linalg_transform__ = "tensors_distribute1"}
-       ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
-      outs(%arg2: tensor<?x?xf32>)
-    -> tensor<?x?xf32>
-
-//      CHECK: return %[[TD0]] : tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-
diff --git a/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
deleted file mode 100644
index f8f102e..0000000
--- a/mlir/test/Dialect/Linalg/tile-and-peel-tensors.mlir
+++ /dev/null
@@ -1,110 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-tile-pattern tile-sizes=256,128,512 peeled-loops=0" -canonicalize | \
-// RUN:     FileCheck %s -check-prefix=CHECK-PEEL-0
-
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-tile-pattern tile-sizes=256,128,512 peeled-loops=1,2" -canonicalize | \
-// RUN:     FileCheck %s -check-prefix=CHECK-PEEL-12
-
-//     CHECK-PEEL-0: func @matmul_static_tensor
-// CHECK-PEEL-0-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-PEEL-0-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-PEEL-0-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-PEEL-0-DAG:   %[[c512:.*]] = arith.constant 512 : index
-// CHECK-PEEL-0-DAG:   %[[c1280:.*]] = arith.constant 1280 : index
-// CHECK-PEEL-0-DAG:   %[[c1600:.*]] = arith.constant 1600 : index
-// CHECK-PEEL-0-DAG:   %[[c1700:.*]] = arith.constant 1700 : index
-//     CHECK-PEEL-0:   scf.for %{{.*}} = %[[c0]] to %[[c1280]] step %[[c256]] {{.*}} {
-//     CHECK-PEEL-0:     scf.for %{{.*}} = %[[c0]] to %[[c1700]] step %[[c128]] {{.*}} {
-//     CHECK-PEEL-0:       scf.for %{{.*}} = %[[c0]] to %[[c1600]] step %[[c512]] {{.*}} {
-//     CHECK-PEEL-0:         linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<256x?xf32>)
-//     CHECK-PEEL-0:       }
-//     CHECK-PEEL-0:     }
-//     CHECK-PEEL-0:   }
-//     CHECK-PEEL-0:   scf.for %{{.*}} = %[[c0]] to %[[c1700]] step %[[c128]] {{.*}} {
-//     CHECK-PEEL-0:     scf.for %{{.*}} = %[[c0]] to %[[c1600]] step %[[c512]] {{.*}} {
-//     CHECK-PEEL-0:       linalg.matmul ins({{.*}} : tensor<220x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<220x?xf32>)
-//     CHECK-PEEL-0:     }
-//     CHECK-PEEL-0:   }
-
-//     CHECK-PEEL-12: func @matmul_static_tensor
-// CHECK-PEEL-12-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-PEEL-12-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-PEEL-12-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-PEEL-12-DAG:   %[[c512:.*]] = arith.constant 512 : index
-// CHECK-PEEL-12-DAG:   %[[c1500:.*]] = arith.constant 1500 : index
-// CHECK-PEEL-12-DAG:   %[[c1536:.*]] = arith.constant 1536 : index
-// CHECK-PEEL-12-DAG:   %[[c1600:.*]] = arith.constant 1600 : index
-// CHECK-PEEL-12-DAG:   %[[c1664:.*]] = arith.constant 1664 : index
-//     CHECK-PEEL-12:   scf.for %{{.*}} = %[[c0]] to %[[c1500]] step %[[c256]] {{.*}} {
-//     CHECK-PEEL-12:     scf.for %{{.*}} = %[[c0]] to %[[c1664]] step %[[c128]] {{.*}} {
-//     CHECK-PEEL-12:       scf.for %{{.*}} = %[[c0]] to %[[c1536]] step %[[c512]] {{.*}} {
-//     CHECK-PEEL-12:         linalg.matmul ins({{.*}} : tensor<?x512xf32>, tensor<512x128xf32>) outs({{.*}} : tensor<?x128xf32>)
-//     CHECK-PEEL-12:       }
-//     CHECK-PEEL-12:       linalg.matmul ins({{.*}} : tensor<?x64xf32>, tensor<64x128xf32>) outs({{.*}} : tensor<?x128xf32>)
-//     CHECK-PEEL-12:     }
-//     CHECK-PEEL-12:     scf.for %{{.*}} = %[[c0]] to %[[c1600]] step %[[c512]] {{.*}} {
-//     CHECK-PEEL-12:       linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x36xf32>) outs({{.*}} : tensor<?x36xf32>)
-//     CHECK-PEEL-12:     }
-//     CHECK-PEEL-12:   }
-func.func @matmul_static_tensor(%arg0: tensor<1500x1600xf32>, %arg1: tensor<1600x1700xf32>)
-    -> tensor<1500x1700xf32> {
-  %out = tensor.empty() : tensor<1500x1700xf32>
-  %r = linalg.matmul {__internal_linalg_transform__ = "tile"}
-      ins(%arg0, %arg1: tensor<1500x1600xf32>, tensor<1600x1700xf32>)
-      outs(%out: tensor<1500x1700xf32>) -> tensor<1500x1700xf32>
-  return %r : tensor<1500x1700xf32>
-}
-
-// -----
-
-//     CHECK-PEEL-0: func @matmul_dynamic_tensor
-// CHECK-PEEL-0-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-PEEL-0-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-PEEL-0-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-PEEL-0-DAG:   %[[c512:.*]] = arith.constant 512 : index
-//     CHECK-PEEL-0:   scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c256]] {{.*}} {
-//     CHECK-PEEL-0:     scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c128]] {{.*}} {
-//     CHECK-PEEL-0:       scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c512]] {{.*}} {
-//     CHECK-PEEL-0:         linalg.matmul ins({{.*}} : tensor<256x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<256x?xf32>)
-//     CHECK-PEEL-0:       }
-//     CHECK-PEEL-0:     }
-//     CHECK-PEEL-0:   }
-//     CHECK-PEEL-0:   scf.for %{{.*}} {
-//     CHECK-PEEL-0:     scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c128]] {{.*}} {
-//     CHECK-PEEL-0:       scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c512]] {{.*}} {
-//     CHECK-PEEL-0:         linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<?x?xf32>)
-//     CHECK-PEEL-0:       }
-//     CHECK-PEEL-0:     }
-//     CHECK-PEEL-0:   }
-
-//     CHECK-PEEL-12: func @matmul_dynamic_tensor
-// CHECK-PEEL-12-DAG:   %[[c0:.*]] = arith.constant 0 : index
-// CHECK-PEEL-12-DAG:   %[[c128:.*]] = arith.constant 128 : index
-// CHECK-PEEL-12-DAG:   %[[c256:.*]] = arith.constant 256 : index
-// CHECK-PEEL-12-DAG:   %[[c512:.*]] = arith.constant 512 : index
-//     CHECK-PEEL-12:   scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c256]] {{.*}} {
-//     CHECK-PEEL-12:     scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c128]] {{.*}} {
-//     CHECK-PEEL-12:       scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c512]] {{.*}} {
-//     CHECK-PEEL-12:         linalg.matmul ins({{.*}} : tensor<?x512xf32>, tensor<512x128xf32>) outs({{.*}} : tensor<?x128xf32>)
-//     CHECK-PEEL-12:       }
-//     CHECK-PEEL-12:       scf.for %{{.*}} {
-//     CHECK-PEEL-12:         linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x128xf32>) outs({{.*}} : tensor<?x128xf32>)
-//     CHECK-PEEL-12:       }
-//     CHECK-PEEL-12:     }
-//     CHECK-PEEL-12:     scf.for %{{.*}} {
-//     CHECK-PEEL-12:       scf.for %{{.*}} = %[[c0]] to %{{.*}} step %[[c512]] {{.*}} {
-//     CHECK-PEEL-12:         linalg.matmul ins({{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs({{.*}} : tensor<?x?xf32>)
-//     CHECK-PEEL-12:       }
-//     CHECK-PEEL-12:     }
-//     CHECK-PEEL-12:   }
-func.func @matmul_dynamic_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>)
-    -> tensor<?x?xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
-  %out = tensor.empty(%d0, %d1) : tensor<?x?xf32>
-  %r = linalg.matmul {__internal_linalg_transform__ = "tile"}
-      ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x?xf32>)
-      outs(%out: tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %r : tensor<?x?xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/tile-conv.mlir b/mlir/test/Dialect/Linalg/tile-conv.mlir
index 028c93a..f8b1064 100644
--- a/mlir/test/Dialect/Linalg/tile-conv.mlir
+++ b/mlir/test/Dialect/Linalg/tile-conv.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3" | FileCheck %s
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -canonicalize | FileCheck %s
 
 //  CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s1 + 1)>
 //  CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (-d0 + s0 + s1 - 1, s1 + 2)>
@@ -10,6 +10,12 @@ func.func @conv(%arg0 : memref<?x?xf32>, %arg1 : memref<?x?xf32>, %arg2 : memref
   return
 }
 
+transform.sequence failures(propagate) {
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_2d"]} in %arg1
+    %1, %loop:2 = transform.structured.tile %0 [2, 3]
+}
+
 //       CHECK: func @conv
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
diff --git a/mlir/test/Dialect/Linalg/tile-indexed.mlir b/mlir/test/Dialect/Linalg/tile-indexed.mlir
index fdca6fb..d6e9c0e 100644
--- a/mlir/test/Dialect/Linalg/tile-indexed.mlir
+++ b/mlir/test/Dialect/Linalg/tile-indexed.mlir
@@ -1,6 +1,4 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=10,25" -split-input-file | FileCheck %s -check-prefix=TILE-10n25
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=25,0" -split-input-file | FileCheck %s -check-prefix=TILE-25n0
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,25" -split-input-file | FileCheck %s -check-prefix=TILE-0n25
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -canonicalize -split-input-file | FileCheck %s -check-prefix=TILE-10n25
 
 func.func @indexed_vector(%arg0: memref<50xindex>) {
   linalg.generic {indexing_maps = [affine_map<(i) -> (i)>],
@@ -12,6 +10,13 @@ func.func @indexed_vector(%arg0: memref<50xindex>) {
   }
   return
 }
+
+transform.sequence failures(propagate) {
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1
+    %1, %loop:2 = transform.structured.tile %0 [10, 25]
+}
+
 // TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)>
 // TILE-10n25-LABEL: func @indexed_vector
 // TILE-10n25: %[[C10:.*]] = arith.constant 10 : index
@@ -21,19 +26,6 @@ func.func @indexed_vector(%arg0: memref<50xindex>) {
 // TILE-10n25:     %[[NEW_I:.*]] = affine.apply [[$MAP]](%[[I]], %[[J]])
 // TILE-10n25:     linalg.yield %[[NEW_I]] : index
 
-// TILE-25n0-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)>
-// TILE-25n0-LABEL: func @indexed_vector
-// TILE-25n0: %[[C25:.*]] = arith.constant 25 : index
-// TILE-25n0: scf.for %[[J:.*]] = {{.*}} step %[[C25]]
-// TILE-25n0:   linalg.generic
-// TILE-25n0:     %[[I:.*]] = linalg.index 0 : index
-// TILE-25n0:     %[[NEW_I:.*]] = affine.apply [[$MAP]](%[[I]], %[[J]])
-// TILE-25n0:     linalg.yield %[[NEW_I]] : index
-
-// TILE-0n25-LABEL: func @indexed_vector
-// TILE-0n25-NOT: scf.for %[[J:.*]] = {{.*}} step %
-// TILE-0n25: linalg.generic
-
 // -----
 
 func.func @indexed_matrix(%arg0: memref<50x50xindex>) {
@@ -48,6 +40,13 @@ func.func @indexed_matrix(%arg0: memref<50x50xindex>) {
   }
   return
 }
+
+transform.sequence failures(propagate) {
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1
+    %1, %loop:2 = transform.structured.tile %0 [10, 25]
+}
+
 // TILE-10n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)>
 // TILE-10n25-LABEL: func @indexed_matrix
 // TILE-10n25-DAG: %[[C25:.*]] = arith.constant 25 : index
@@ -61,25 +60,3 @@ func.func @indexed_matrix(%arg0: memref<50x50xindex>) {
 // TILE-10n25:       %[[NEW_J:.*]] = affine.apply [[$MAP]](%[[J]], %[[L]])
 // TILE-10n25:       %[[SUM:.*]] = arith.addi %[[NEW_I]], %[[NEW_J]] : index
 // TILE-10n25:       linalg.yield %[[SUM]] : index
-
-// TILE-25n0-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)>
-// TILE-25n0-LABEL: func @indexed_matrix
-// TILE-25n0: %[[C25:.*]] = arith.constant 25 : index
-// TILE-25n0: scf.for %[[L:.*]] = {{.*}} step %[[C25]]
-// TILE-25n0:   linalg.generic
-// TILE-25n0:     %[[I:.*]] = linalg.index 0 : index
-// TILE-25n0:     %[[NEW_I:.*]] = affine.apply [[$MAP]](%[[I]], %[[L]])
-// TILE-25n0:     %[[J:.*]] = linalg.index 1 : index
-// TILE-25n0:     %[[SUM:.*]] = arith.addi %[[NEW_I]], %[[J]] : index
-// TILE-25n0:     linalg.yield %[[SUM]] : index
-
-// TILE-0n25-DAG: [[$MAP:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0 + d1)>
-// TILE-0n25-LABEL: func @indexed_matrix
-// TILE-0n25: %[[C25:.*]] = arith.constant 25 : index
-// TILE-0n25: scf.for %[[L:.*]] = {{.*}} step %[[C25]]
-// TILE-0n25:   linalg.generic
-// TILE-0n25:     %[[I:.*]] = linalg.index 0 : index
-// TILE-0n25:     %[[J:.*]] = linalg.index 1 : index
-// TILE-0n25:     %[[NEW_J:.*]] = affine.apply [[$MAP]](%[[J]], %[[L]])
-// TILE-0n25:     %[[SUM:.*]] = arith.addi %[[I]], %[[NEW_J]] : index
-// TILE-0n25:     linalg.yield %[[SUM]] : index
diff --git a/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir b/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir
index 6295f91..74e8ebb 100644
--- a/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir
+++ b/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir
@@ -1,53 +1,65 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3" -cse -split-input-file | \
-// RUN: FileCheck %s -check-prefix=TILE2
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,3" -resolve-shaped-type-result-dims -cse -split-input-file | \
-// RUN: FileCheck %s -check-prefix=TILE1
-// This test only checks that tiling does not crash.
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2" -resolve-shaped-type-result-dims -cse -split-input-file
-
-//  TILE2-DAG:  #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 8)>
-//  TILE2-DAG:  #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 7)>
-//       TILE2: func @dynamic_pad_tensor(
-//  TILE2-SAME:     %[[IN:.*]]: tensor<?x?xf32>
-//   TILE2-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   TILE2-DAG:   %[[C1:.*]] = arith.constant 1 : index
-//   TILE2-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//   TILE2-DAG:   %[[C3:.*]] = arith.constant 3 : index
-//       TILE2:   %[[DIM_IN0:.*]] = tensor.dim %[[IN]], %[[C0]]
-//       TILE2:   %[[DIM0:.*]] = affine.apply #[[MAP0]]()[%[[DIM_IN0]]]
-//       TILE2:   %[[DIM_IN1:.*]] = tensor.dim %[[IN]], %[[C1]]
-//       TILE2:   %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[DIM_IN1]]]
-//       TILE2:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[DIM0]] step %[[C2]]
-//       TILE2:     scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
-//       TILE2:       %[[SWAP_RESULT:.*]] = scf.if
-//       TILE2:         tensor.generate
-//       TILE2:       else
-//       TILE2:         %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
-//       TILE2:         %[[PAD:.*]] = tensor.pad %[[SLICE]]
-//       TILE2:       tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
-//       TILE2:   return %[[RESULT]]
-
-//   TILE1-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 7)>
-//   TILE1-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
-//       TILE1: func @dynamic_pad_tensor(
-//  TILE1-SAME:     %[[IN:.*]]: tensor<?x?xf32>
-//   TILE1-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   TILE1-DAG:   %[[C1:.*]] = arith.constant 1 : index
-//   TILE1-DAG:   %[[C3:.*]] = arith.constant 3 : index
-//       TILE1:   %[[DIM_IN1:.*]] = tensor.dim %[[IN]], %[[C1]]
-//       TILE1:   %[[DIM1:.*]] = affine.apply #[[MAP0]]()[%[[DIM_IN1]]]
-//       TILE1:   %[[DIM_IN0:.*]] = tensor.dim %[[IN]], %[[C0]]
-//       TILE1:   %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[DIM_IN0]]]
-//       TILE1:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
-//       TILE1:     %[[SWAP_RESULT:.*]] = scf.if
-//       TILE1:       tensor.generate
-//       TILE1:     else
-//       TILE1:       %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
-//       TILE1:       %[[PAD:.*]] = tensor.pad %[[SLICE]] low[3, %{{.*}}] high[{{.*}}, {{.*}}]
-//       TILE1:     tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][0, {{.*}}] [%[[DIM0]], {{.*}}] [1, 1]
-//       TILE1:   return %[[RESULT]]
-
-func.func @dynamic_pad_tensor(%input_tensor: tensor<?x?xf32>,
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -canonicalize -cse -split-input-file | FileCheck %s
+
+//  CHECK-DAG:  #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 8)>
+//  CHECK-DAG:  #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 7)>
+//       CHECK: func @dynamic_pad_tensor_3_4(
+//  CHECK-SAME:     %[[IN:.*]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[DIM_IN0:.*]] = tensor.dim %[[IN]], %[[C0]]
+//   CHECK-DAG:   %[[DIM_IN1:.*]] = tensor.dim %[[IN]], %[[C1]]
+//   CHECK-DAG:   %[[DIM0:.*]] = affine.apply #[[MAP0]]()[%[[DIM_IN0]]]
+//   CHECK-DAG:   %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[DIM_IN1]]]
+//       CHECK:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[DIM0]] step %[[C2]]
+//       CHECK:     scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
+//       CHECK:       %[[SWAP_RESULT:.*]] = scf.if
+//       CHECK:         tensor.generate
+//       CHECK:       else
+//       CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
+//       CHECK:         %[[PAD:.*]] = tensor.pad %[[SLICE]]
+//       CHECK:       tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
+//       CHECK:   return %[[RESULT]]
+
+func.func @dynamic_pad_tensor_3_4(%input_tensor: tensor<?x?xf32>,
+                         %pad_value: f32) -> tensor<?x?xf32> {
+  %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] { // dim 0 grows by 3+5, dim 1 by 4+3
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %pad_value : f32 // every padded element is %pad_value
+    } : tensor<?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+transform.sequence failures(propagate) { // transform script for the func above
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 // select the tensor.pad op(s)
+    %1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 3] // tile sizes 2 and 3; results: tiled op + 2 loops
+}
+
+// -----
+
+//   CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 7)>
+//   CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
+//       CHECK: func @dynamic_pad_tensor_0_3(
+//  CHECK-SAME:     %[[IN:.*]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[DIM_IN1:.*]] = tensor.dim %[[IN]], %[[C1]]
+//   CHECK-DAG:   %[[DIM1:.*]] = affine.apply #[[MAP0]]()[%[[DIM_IN1]]]
+//   CHECK-DAG:   %[[DIM_IN0:.*]] = tensor.dim %[[IN]], %[[C0]]
+//   CHECK-DAG:   %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[DIM_IN0]]]
+//       CHECK:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
+//       CHECK:     %[[SWAP_RESULT:.*]] = scf.if
+//       CHECK:       tensor.generate
+//       CHECK:     else
+//       CHECK:       %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
+//       CHECK:       %[[PAD:.*]] = tensor.pad %[[SLICE]] low[3, %{{.*}}] high[{{.*}}, {{.*}}]
+//       CHECK:     tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][0, {{.*}}] [%[[DIM0]], {{.*}}] [1, 1]
+//       CHECK:   return %[[RESULT]]
+
+func.func @dynamic_pad_tensor_0_3(%input_tensor: tensor<?x?xf32>,
                          %pad_value: f32) -> tensor<?x?xf32> {
   %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] {
     ^bb0(%arg1: index, %arg2: index):
@@ -56,41 +68,64 @@ func.func @dynamic_pad_tensor(%input_tensor: tensor<?x?xf32>,
   return %0 : tensor<?x?xf32>
 }
 
+transform.sequence failures(propagate) { // transform script for the func above
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 // select the tensor.pad op(s)
+    %1, %loop = transform.structured.tile_to_scf_for %0 [0, 3] // size 0 leaves dim 0 untiled, hence a single loop result
+}
+
+// -----
+
+// CHECK-LABEL: func @static_pad_tensor_3_4(
+//  CHECK-SAME:     %[[IN:.*]]: tensor<7x9xf32>
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[C15:.*]] = arith.constant 15 : index
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//       CHECK:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C15]] step %[[C2]]
+//       CHECK:     scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
+//       CHECK:       %[[SWAP_RESULT:.*]] = scf.if
+//       CHECK:         tensor.generate
+//       CHECK:       else
+//       CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
+//       CHECK:         %[[PAD:.*]] = tensor.pad %[[SLICE]]
+//       CHECK:       tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
+//       CHECK:   return %[[RESULT]]
+
+func.func @static_pad_tensor_3_4(%input_tensor: tensor<7x9xf32>,
+                        %pad_value: f32) -> tensor<15x16xf32> {
+  %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] { // 7+3+5 = 15 rows, 9+4+3 = 16 columns
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %pad_value : f32 // every padded element is %pad_value
+    } : tensor<7x9xf32> to tensor<15x16xf32>
+  return %0 : tensor<15x16xf32>
+}
+
+transform.sequence failures(propagate) { // transform script for the func above
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 // select the tensor.pad op(s)
+    %1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 3] // tile sizes 2 and 3; results: tiled op + 2 loops
+}
+
 // -----
 
-// TILE2-LABEL: func @static_pad_tensor(
-//  TILE2-SAME:     %[[IN:.*]]: tensor<7x9xf32>
-//   TILE2-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   TILE2-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//   TILE2-DAG:   %[[C3:.*]] = arith.constant 3 : index
-//   TILE2-DAG:   %[[C15:.*]] = arith.constant 15 : index
-//   TILE2-DAG:   %[[C16:.*]] = arith.constant 16 : index
-//       TILE2:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C15]] step %[[C2]]
-//       TILE2:     scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
-//       TILE2:       %[[SWAP_RESULT:.*]] = scf.if
-//       TILE2:         tensor.generate
-//       TILE2:       else
-//       TILE2:         %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
-//       TILE2:         %[[PAD:.*]] = tensor.pad %[[SLICE]]
-//       TILE2:       tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1]
-//       TILE2:   return %[[RESULT]]
-
-
-// TILE1-LABEL: func @static_pad_tensor(
-//  TILE1-SAME:     %[[IN:.*]]: tensor<7x9xf32>
-//   TILE1-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   TILE1-DAG:   %[[C3:.*]] = arith.constant 3 : index
-//   TILE1-DAG:   %[[C16:.*]] = arith.constant 16 : index
-//       TILE1:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
-//       TILE1:     %[[SWAP_RESULT:.*]] = scf.if
-//       TILE1:       tensor.generate
-//       TILE1:     else
-//       TILE1:       %[[SLICE:.*]] = tensor.extract_slice %[[IN]][0, {{.*}}] [7, {{.*}}] [1, 1]
-//       TILE1:       %[[PAD:.*]] = tensor.pad %[[SLICE]] low[3, %{{.*}}] high[5, {{.*}}]
-//       TILE1:     tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][0, {{.*}}] [15, {{.*}}] [1, 1]
-//       TILE1:   return %[[RESULT]]
-
-func.func @static_pad_tensor(%input_tensor: tensor<7x9xf32>,
+// CHECK-LABEL: func @static_pad_tensor_0_3(
+//  CHECK-SAME:     %[[IN:.*]]: tensor<7x9xf32>
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//       CHECK:   %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
+//       CHECK:     %[[SWAP_RESULT:.*]] = scf.if
+//       CHECK:       tensor.generate
+//       CHECK:     else
+//       CHECK:       %[[SLICE:.*]] = tensor.extract_slice %[[IN]][0, {{.*}}] [7, {{.*}}] [1, 1]
+//       CHECK:       %[[PAD:.*]] = tensor.pad %[[SLICE]] low[3, %{{.*}}] high[5, {{.*}}]
+//       CHECK:     %[[CAST_SWAP_RESULT:.*]] = tensor.cast %[[SWAP_RESULT]] : tensor<?x?xf32> to tensor<15x?xf32>
+//       CHECK:     tensor.insert_slice %[[CAST_SWAP_RESULT]] into %[[INNER_OUT]][0, {{.*}}] [15, {{.*}}] [1, 1]
+//       CHECK:   return %[[RESULT]]
+
+func.func @static_pad_tensor_0_3(%input_tensor: tensor<7x9xf32>,
                         %pad_value: f32) -> tensor<15x16xf32> {
   %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] {
     ^bb0(%arg1: index, %arg2: index):
@@ -99,25 +134,35 @@ func.func @static_pad_tensor(%input_tensor: tensor<7x9xf32>,
   return %0 : tensor<15x16xf32>
 }
 
+transform.sequence failures(propagate) { // transform script for the func above
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 // select the tensor.pad op(s)
+    %1, %loop = transform.structured.tile_to_scf_for %0 [0, 3] // size 0 leaves dim 0 untiled, hence a single loop result
+}
+
 // -----
 
-// TILE1-LABEL: func @static_pad_tile_evenly(
-//  TILE1-SAME:     %[[IN:.*]]: tensor<7x9xf32>, %[[OUT:.*]]: tensor<14x15xf32>
-//   TILE1-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   TILE1-DAG:   %[[C3:.*]] = arith.constant 3 : index
-//   TILE1-DAG:   %[[C15:.*]] = arith.constant 15 : index
-//       TILE1:   %[[RESULT:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C15]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
-//       TILE1:     %[[R2:.*]] = scf.if
-//       TILE1:       %[[GEN:.*]] = tensor.generate
-//       TILE1:       scf.yield %[[GEN]] : tensor<14x3xf32>
-//       TILE1:     else
-//       TILE1:       %[[SLICE:.*]] = tensor.extract_slice %arg0[0, %{{.*}}] [7, %{{.*}}] [1, 1] : tensor<7x9xf32> to tensor<7x?xf32>
-//       TILE1:       %[[PAD:.*]] = tensor.pad %[[SLICE]] low[0, 0] high[7, %{{.*}}]
-//       TILE1:       scf.yield %[[PAD]] : tensor<14x3xf32>
-//       TILE1:     %[[R3:.*]] = tensor.insert_slice %[[R2]] into %[[INNER_OUT]][0, %[[IV]]] [14, 3] [1, 1] : tensor<14x3xf32> into tensor<14x15xf32>
-//       TILE1:     scf.yield %[[R3]] : tensor<14x15xf32>
-//       TILE1:   return %[[RESULT]] : tensor<14x15xf32>
-func.func @static_pad_tile_evenly(%input_tensor: tensor<7x9xf32>,
+// CHECK-LABEL: func @static_pad_tile_evenly_0_3(
+//  CHECK-SAME:     %[[IN:.*]]: tensor<7x9xf32>, %[[OUT:.*]]: tensor<14x15xf32>
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[C15:.*]] = arith.constant 15 : index
+//       CHECK:   %[[RESULT:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C15]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
+//       CHECK:     %[[R2:.*]] = scf.if
+//       CHECK:       %[[GEN:.*]] = tensor.generate
+//       CHECK:       %[[cast_0:.*]] = tensor.cast %[[GEN]] : tensor<14x3xf32> to tensor<?x?xf32>
+//       CHECK:       scf.yield %[[cast_0]] : tensor<?x?xf32>
+//       CHECK:     else
+//       CHECK:       %[[SLICE:.*]] = tensor.extract_slice %arg0[0, %{{.*}}] [7, %{{.*}}] [1, 1] : tensor<7x9xf32> to tensor<7x?xf32>
+//       CHECK:       %[[PAD:.*]] = tensor.pad %[[SLICE]] low[0, 0] high[7, %{{.*}}]
+//       CHECK:       %[[cast_1:.*]] = tensor.cast %[[PAD]] : tensor<14x?xf32> to tensor<?x?xf32>
+//       CHECK:       scf.yield %[[cast_1]] : tensor<?x?xf32>
+//       CHECK:     %[[cast:.*]] = tensor.cast %[[R2]] : tensor<?x?xf32> to tensor<14x3xf32>
+//       CHECK:     %[[R3:.*]] = tensor.insert_slice %[[cast]] into %[[INNER_OUT]][0, %[[IV]]] [14, 3] [1, 1] : tensor<14x3xf32> into tensor<14x15xf32>
+//       CHECK:     scf.yield %[[R3]] : tensor<14x15xf32>
+//       CHECK:   return %[[RESULT]] : tensor<14x15xf32>
+
+func.func @static_pad_tile_evenly_0_3(%input_tensor: tensor<7x9xf32>,
                              %output_tensor: tensor<14x15xf32>,
                              %pad_value: f32) -> tensor<14x15xf32> {
   %0 = tensor.pad %input_tensor low[0, 0] high[7, 6] {
@@ -126,3 +171,9 @@ func.func @static_pad_tile_evenly(%input_tensor: tensor<7x9xf32>,
     } : tensor<7x9xf32> to tensor<14x15xf32>
   return %0 : tensor<14x15xf32>
 }
+
+transform.sequence failures(propagate) { // transform script for the func above
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 // select the tensor.pad op(s)
+    %1, %loop = transform.structured.tile_to_scf_for %0 [0, 3] // size 0 leaves dim 0 untiled, hence a single loop result
+}
diff --git a/mlir/test/Dialect/Linalg/tile-parallel-reduce.mlir b/mlir/test/Dialect/Linalg/tile-parallel-reduce.mlir
deleted file mode 100644
index dcad7a0..0000000
--- a/mlir/test/Dialect/Linalg/tile-parallel-reduce.mlir
+++ /dev/null
@@ -1,113 +0,0 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,4,8 loop-type=parallel" -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2 loop-type=parallel" -split-input-file | FileCheck %s -check-prefix=TILE1
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,4 loop-type=parallel" -split-input-file | FileCheck %s -check-prefix=TILE2
-
-func.func @gemm(%arg0 : memref<?x?xf32>,
-           %arg1 : memref<?x?xf32>,
-           %arg2 : memref<?x?xf32>)
-{
-  linalg.matmul ins(%arg0, %arg1: memref<?x?xf32>, memref<?x?xf32>)
-               outs(%arg2: memref<?x?xf32>)
-  return
-}
-// CHECK-LABEL: func @gemm
-//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
-//       CHECK:   scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) =
-//  CHECK-SAME:     step (%[[C2]], %[[C4]])
-//       CHECK:     scf.for %[[ARG5:.*]] =
-//  CHECK-SAME:       step %[[C8]]
-//       CHECK:       %[[SV1:.*]] = memref.subview %{{.*}}[%[[ARG3]], %[[ARG5]]]
-//       CHECK:       %[[SV2:.*]] = memref.subview %{{.*}}[%[[ARG5]], %[[ARG4]]]
-//       CHECK:       %[[SV3:.*]] = memref.subview %{{.*}}[%[[ARG3]], %[[ARG4]]]
-//       CHECK:       linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// TILE1-LABEL: func @gemm
-//   TILE1-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       TILE1:   scf.parallel (%[[ARG3:.*]]) =
-//  TILE1-SAME:     step (%[[C2]])
-//       TILE1:     %[[SV1:.*]] = memref.subview %{{.*}}[%[[ARG3]], 0]
-//       TILE1:     %[[SV3:.*]] = memref.subview %{{.*}}[%[[ARG3]], 0]
-//   TILE1-NOT:     memref.subview
-//       TILE1:     linalg.matmul ins(%[[SV1]], %{{.*}} outs(%[[SV3]]
-
-// TILE2-LABEL: func @gemm
-//   TILE2-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//   TILE2-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//       TILE2:   scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) =
-//  TILE2-SAME:     step (%[[C2]], %[[C4]])
-//       TILE2:       %[[SV1:.*]] = memref.subview %{{.*}}[%[[ARG3]], 0]
-//       TILE2:       %[[SV2:.*]] = memref.subview %{{.*}}[0, %[[ARG4]]]
-//       TILE2:       %[[SV3:.*]] = memref.subview %{{.*}}[%[[ARG3]], %[[ARG4]]]
-//       TILE2:       linalg.matmul ins(%[[SV1]], %[[SV2]]{{.*}} outs(%[[SV3]]
-
-// -----
-
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d1)>
-#accesses = [#map0, #map1, #map2]
-#trait = {
-  args_in = 2 : i64,
-  args_out = 1 : i64,
-  iterator_types = ["reduction", "parallel", "reduction"],
-  indexing_maps = #accesses
-}
-
-func.func @reduction(%arg0 : memref<?x?x?xf32>,
-                %arg1 : memref<?x?xf32>,
-                %arg2 : memref<?xf32>)
-{
-  linalg.generic #trait
-    ins(%arg0, %arg1 : memref<?x?x?xf32>, memref<?x?xf32>)
-   outs(%arg2 : memref<?xf32>) {
-  ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32):
-    %0 = arith.addf %arg3, %arg4 : f32
-    %1 = arith.addf %0, %arg5 : f32
-    linalg.yield %1 : f32
-  }
-  return
-}
-
-// CHECK-LABEL: func @reduction
-//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
-//       CHECK:   scf.for %[[ARG3:.*]] =
-//  CHECK-SAME:     step %[[C2]]
-//       CHECK:     scf.parallel (%[[ARG4:.*]]) =
-//  CHECK-SAME:       step (%[[C4]])
-//       CHECK:       scf.for %[[ARG5:.*]] =
-//  CHECK-SAME:         step %[[C8]]
-//       CHECK:         %[[SV1:.*]] = memref.subview %{{.*}}[%[[ARG3]], %[[ARG4]], %[[ARG5]]]
-//       CHECK:         %[[SV2:.*]] = memref.subview %{{.*}}[%[[ARG3]], %[[ARG5]]]
-//       CHECK:         %[[SV3:.*]] = memref.subview %{{.*}}[%[[ARG4]]]
-//       CHECK:         linalg.generic
-//  CHECK-SAME:           ins(%[[SV1]], %[[SV2]]
-//  CHECK-SAME:          outs(%[[SV3]]
-
-// TILE1-LABEL: func @reduction
-//   TILE1-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       TILE1:   scf.for %[[ARG3:.*]] =
-//  TILE1-SAME:     step %[[C2]]
-//       TILE1:         %[[SV1:.*]] = memref.subview %{{.*}}[%[[ARG3]], 0, 0]
-//       TILE1:         %[[SV2:.*]] = memref.subview %{{.*}}[%[[ARG3]], 0]
-//   TILE1-NOT:         memref.subview
-//       TILE1:         linalg.generic
-//  TILE1-SAME:           ins(%[[SV1]], %[[SV2]]
-//  TILE1-SAME:          outs(%{{.*}}
-
-// TILE2-LABEL: func @reduction
-//   TILE2-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//   TILE2-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//       TILE2:   scf.for %[[ARG3:.*]] =
-//  TILE2-SAME:     step %[[C2]]
-//       TILE2:     scf.parallel (%[[ARG4:.*]]) =
-//  TILE2-SAME:       step (%[[C4]])
-//       TILE2:         %[[SV1:.*]] = memref.subview %{{.*}}[%[[ARG3]], %[[ARG4]], 0]
-//       TILE2:         %[[SV2:.*]] = memref.subview %{{.*}}[%[[ARG3]], 0]
-//       TILE2:         %[[SV3:.*]] = memref.subview %{{.*}}[%[[ARG4]]]
-//       TILE2:         linalg.generic
-//  TILE2-SAME:           ins(%[[SV1]], %[[SV2]]
-//  TILE2-SAME:          outs(%[[SV3]]
diff --git a/mlir/test/Dialect/Linalg/tile-parallel.mlir b/mlir/test/Dialect/Linalg/tile-parallel.mlir
deleted file mode 100644
index cf346f9..0000000
--- a/mlir/test/Dialect/Linalg/tile-parallel.mlir
+++ /dev/null
@@ -1,68 +0,0 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2 loop-type=parallel" | FileCheck %s -check-prefix=TILE-2
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,2 loop-type=parallel" | FileCheck %s -check-prefix=TILE-02
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,0,2 loop-type=parallel" | FileCheck %s -check-prefix=TILE-002
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3,4 loop-type=parallel" | FileCheck %s -check-prefix=TILE-234
-
-#id_2d = affine_map<(i, j) -> (i, j)>
-#pointwise_2d_trait = {
-  args_in = 2,
-  args_out = 1,
-  indexing_maps = [#id_2d, #id_2d, #id_2d],
-  iterator_types = ["parallel", "parallel"]
-}
-
-func.func @sum(%lhs: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-          %rhs: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-          %sum: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  linalg.generic #pointwise_2d_trait
-     ins(%lhs, %rhs: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                     memref<?x?xf32, strided<[?, 1], offset: ?>>)
-    outs(%sum : memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  ^bb0(%lhs_in: f32, %rhs_in: f32, %sum_out: f32):
-    %result = arith.addf %lhs_in, %rhs_in : f32
-    linalg.yield %result : f32
-  }
-  return
-}
-// TILE-2-LABEL: func @sum(
-// TILE-2-SAME:    [[LHS:%.*]]: memref{{.*}}, [[RHS:%.*]]: memref{{.*}}, [[SUM:%.*]]: memref{{.*}}) {
-// TILE-2-DAG: [[C0:%.*]] = arith.constant 0 : index
-// TILE-2-DAG: [[C2:%.*]] = arith.constant 2 : index
-// TILE-2: [[LHS_ROWS:%.*]] = memref.dim [[LHS]], %c0
-// TILE-2: scf.parallel ([[I:%.*]]) = ([[C0]]) to ([[LHS_ROWS]]) step ([[C2]]) {
-// TILE-2-NO: scf.parallel
-// TILE-2:   [[LHS_SUBVIEW:%.*]] = memref.subview [[LHS]]
-// TILE-2:   [[RHS_SUBVIEW:%.*]] = memref.subview [[RHS]]
-// TILE-2:   [[SUM_SUBVIEW:%.*]] = memref.subview [[SUM]]
-// TILE-2:   linalg.generic {{.*}} ins([[LHS_SUBVIEW]], [[RHS_SUBVIEW]]{{.*}} outs([[SUM_SUBVIEW]]
-
-// TILE-02-LABEL: func @sum(
-// TILE-02-SAME:    [[LHS:%.*]]: memref{{.*}}, [[RHS:%.*]]: memref{{.*}}, [[SUM:%.*]]: memref{{.*}}) {
-// TILE-02-DAG: [[C0:%.*]] = arith.constant 0 : index
-// TILE-02-DAG: [[C2:%.*]] = arith.constant 2 : index
-// TILE-02: [[LHS_COLS:%.*]] = memref.dim [[LHS]], %c1
-// TILE-02: scf.parallel ([[I:%.*]]) = ([[C0]]) to ([[LHS_COLS]]) step ([[C2]]) {
-// TILE-02-NO: scf.parallel
-// TILE-02:   [[LHS_SUBVIEW:%.*]] = memref.subview [[LHS]]
-// TILE-02:   [[RHS_SUBVIEW:%.*]] = memref.subview [[RHS]]
-// TILE-02:   [[SUM_SUBVIEW:%.*]] = memref.subview [[SUM]]
-// TILE-02:    linalg.generic {{.*}} ins([[LHS_SUBVIEW]], [[RHS_SUBVIEW]]{{.*}} outs([[SUM_SUBVIEW]]
-
-// TILE-002-LABEL: func @sum(
-// TILE-002-SAME:    [[LHS:%.*]]: memref{{.*}}, [[RHS:%.*]]: memref{{.*}}, [[SUM:%.*]]: memref{{.*}}) {
-// TILE-002-NO: scf.parallel
-// TILE-002:   linalg.generic {{.*}} ins([[LHS]], [[RHS]]{{.*}} outs([[SUM]]
-
-// TILE-234-LABEL: func @sum(
-// TILE-234-SAME:    [[LHS:%.*]]: memref{{.*}}, [[RHS:%.*]]: memref{{.*}}, [[SUM:%.*]]: memref{{.*}}) {
-// TILE-234-DAG: [[C0:%.*]] = arith.constant 0 : index
-// TILE-234-DAG: [[C2:%.*]] = arith.constant 2 : index
-// TILE-234-DAG: [[C3:%.*]] = arith.constant 3 : index
-// TILE-234: [[LHS_ROWS:%.*]] = memref.dim [[LHS]], %c0
-// TILE-234: [[LHS_COLS:%.*]] = memref.dim [[LHS]], %c1
-// TILE-234: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) to ([[LHS_ROWS]], [[LHS_COLS]]) step ([[C2]], [[C3]]) {
-// TILE-234-NO: scf.parallel
-// TILE-234:   [[LHS_SUBVIEW:%.*]] = memref.subview [[LHS]]
-// TILE-234:   [[RHS_SUBVIEW:%.*]] = memref.subview [[RHS]]
-// TILE-234:   [[SUM_SUBVIEW:%.*]] = memref.subview [[SUM]]
-// TILE-234:   linalg.generic {{.*}} ins([[LHS_SUBVIEW]], [[RHS_SUBVIEW]]{{.*}} outs([[SUM_SUBVIEW]]
diff --git a/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir b/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
deleted file mode 100644
index 9697adf..0000000
--- a/mlir/test/Dialect/Linalg/tile-scalarize-dynamic-dims.mlir
+++ /dev/null
@@ -1,74 +0,0 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns="test-tile-scalarize-dynamic-dims" -scf-for-loop-canonicalization -canonicalize -split-input-file | \
-// RUN:     FileCheck %s
-
-// CHECK-LABEL: func @matmul_partly_dynamic_tensor(
-//  CHECK-SAME:     %[[ARG0:.*]]: tensor<?x?xf32>, %[[ARG1:.*]]: tensor<?x2000xf32>
-//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
-//       CHECK:   tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
-//       CHECK:   %[[UB1:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xf32>
-//       CHECK:   %[[UB2:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
-//       CHECK:   scf.for %[[IV0:.*]] = %[[C0]] to %[[UB1]] step %[[C1]]
-//       CHECK:     scf.for %[[IV1:.*]] = %[[C0]] to %[[UB2]] step %[[C1]]
-//       CHECK:       %[[S1:.*]] = tensor.extract_slice %[[ARG0]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1] : tensor<?x?xf32> to tensor<1x1xf32>
-//       CHECK:       %[[S2:.*]] = tensor.extract_slice %[[ARG1]][%[[IV1]], 0] [1, 2000] [1, 1] : tensor<?x2000xf32> to tensor<1x2000xf32>
-//       CHECK:       %[[S3:.*]] = tensor.extract_slice %{{.*}}[%[[IV0]], 0] [1, 2000] [1, 1] : tensor<?x2000xf32> to tensor<1x2000xf32>
-//       CHECK:       linalg.matmul ins(%[[S1]], %[[S2]] : tensor<1x1xf32>, tensor<1x2000xf32>) outs(%[[S3]] : tensor<1x2000xf32>) -> tensor<1x2000xf32>
-func.func @matmul_partly_dynamic_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x2000xf32>)
-    -> tensor<?x2000xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
-  %out = tensor.empty(%d0) : tensor<?x2000xf32>
-  %r = linalg.matmul {__internal_linalg_transform__ = "tile"}
-      ins(%arg0, %arg1: tensor<?x?xf32>, tensor<?x2000xf32>)
-      outs(%out: tensor<?x2000xf32>) -> tensor<?x2000xf32>
-  return %r : tensor<?x2000xf32>
-}
-
-// -----
-
-// The input IR of this test case is a tiled and peeled linalg.matmul op.
-
-// CHECK-LABEL: func @tiled_and_peeled_matmul(
-//       CHECK:   linalg.matmul ins({{.*}} : tensor<32x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<32x258xf32>) -> tensor<32x258xf32>
-//       CHECK:   linalg.matmul ins({{.*}} : tensor<1x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<1x258xf32>) -> tensor<1x258xf32>
-#map0 = affine_map<(d0) -> (64, -d0 + 257)>
-#map1 = affine_map<()[s0] -> ((s0 floordiv 32) * 32)>
-#map2 = affine_map<(d0)[s0] -> (d0 - (s0 floordiv 32) * 32)>
-
-func.func @tiled_and_peeled_matmul(%arg0: tensor<257x259xf32>, %arg1: tensor<259x258xf32>, %arg2: tensor<257x258xf32>) -> tensor<257x258xf32> {
-  %c257 = arith.constant 257 : index
-  %c64 = arith.constant 64 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %c0 = arith.constant 0 : index
-  %c32 = arith.constant 32 : index
-  %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<257x258xf32>) -> tensor<257x258xf32>
-  %1 = scf.for %arg3 = %c0 to %c257 step %c64 iter_args(%arg4 = %0) -> (tensor<257x258xf32>) {
-    %2 = affine.min #map0(%arg3)
-    %3 = tensor.extract_slice %arg0[%arg3, 0] [%2, 259] [1, 1] : tensor<257x259xf32> to tensor<?x259xf32>
-    %4 = tensor.extract_slice %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<257x258xf32> to tensor<?x258xf32>
-    %5 = affine.apply #map1()[%2]
-    %6 = scf.for %arg5 = %c0 to %5 step %c32 iter_args(%arg6 = %4) -> (tensor<?x258xf32>) {
-      %10 = tensor.extract_slice %3[%arg5, 0] [32, 259] [1, 1] : tensor<?x259xf32> to tensor<32x259xf32>
-      %11 = tensor.extract_slice %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<?x258xf32> to tensor<32x258xf32>
-      %12 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%10, %arg1 : tensor<32x259xf32>, tensor<259x258xf32>) outs(%11 : tensor<32x258xf32>) -> tensor<32x258xf32>
-      %13 = tensor.insert_slice %12 into %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<32x258xf32> into tensor<?x258xf32>
-      scf.yield %13 : tensor<?x258xf32>
-    }
-    %7 = arith.cmpi slt, %5, %2 : index
-    %8 = scf.if %7 -> (tensor<?x258xf32>) {
-      %10 = affine.apply #map2(%2)[%2]
-      %11 = tensor.extract_slice %3[%5, 0] [%10, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>
-      %12 = tensor.extract_slice %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> to tensor<?x258xf32>
-      %13 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%11, %arg1 : tensor<?x259xf32>, tensor<259x258xf32>) outs(%12 : tensor<?x258xf32>) -> tensor<?x258xf32>
-      %14 = tensor.insert_slice %13 into %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> into tensor<?x258xf32>
-      scf.yield %14 : tensor<?x258xf32>
-    } else {
-      scf.yield %6 : tensor<?x258xf32>
-    }
-    %9 = tensor.insert_slice %8 into %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<?x258xf32> into tensor<257x258xf32>
-    scf.yield %9 : tensor<257x258xf32>
-  }
-  return %1 : tensor<257x258xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
index 736a0e9..b87d728 100644
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3,4" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -split-input-file | FileCheck %s
 
 // CHECK-LABEL: func @matmul_tensors(
 // CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
@@ -27,6 +27,12 @@ func.func @matmul_tensors(
   return %0 : tensor<?x?xf32>
 }
 
+transform.sequence failures(propagate) { // replaces the former -linalg-tile="tile-sizes=2,3,4" pass option
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 // select the linalg.matmul op(s)
+    %1, %loops:3 = transform.structured.tile %0 [2, 3, 4] // tile sizes 2, 3, 4; results: tiled op + 3 loops
+}
+
 // -----
 
 func.func @generic_op_tensors(
@@ -52,6 +58,12 @@ func.func @generic_op_tensors(
   return %4 : tensor<?x?x?xf32>
 }
 
+transform.sequence failures(propagate) { // replaces the former -linalg-tile="tile-sizes=2,3,4" pass option
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 // select the linalg.generic op(s)
+    %1, %loops:3 = transform.structured.tile %0 [2, 3, 4] // tile sizes 2, 3, 4; results: tiled op + 3 loops
+}
+
 // CHECK-LABEL: func @generic_op_tensors
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
@@ -117,3 +129,8 @@ func.func @fold_extract_slice(
   return %2 : tensor<?x42xf32>
 }
 
+transform.sequence failures(propagate) { // replaces the former -linalg-tile="tile-sizes=2,3,4" pass option
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 // select the linalg.generic op(s)
+    %1, %loops:3 = transform.structured.tile %0 [2, 3, 4] // tile sizes 2, 3, 4; results: tiled op + 3 loops
+}
diff --git a/mlir/test/Dialect/Linalg/tile-zero.mlir b/mlir/test/Dialect/Linalg/tile-zero.mlir
deleted file mode 100644
index 147b7c7..0000000
--- a/mlir/test/Dialect/Linalg/tile-zero.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-opt -test-linalg-transform-patterns=test-tile-pattern %s | FileCheck %s
-
-func.func @matmul_zero_tile(
-  %arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = linalg.matmul {__internal_linalg_transform__ = "tile"}
-      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
-      outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-// CHECK-LABEL: matmul_zero_tile
-//       CHECK:   linalg.matmul
-//   CHECK-NOT:   __internal_linalg_transform__
diff --git a/mlir/test/Dialect/Linalg/tile.mlir b/mlir/test/Dialect/Linalg/tile.mlir
deleted file mode 100644
index 0fc2ca6..0000000
--- a/mlir/test/Dialect/Linalg/tile.mlir
+++ /dev/null
@@ -1,331 +0,0 @@
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2" -mlir-disable-threading=true | FileCheck %s -check-prefix=TILE-2
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,2" -mlir-disable-threading=true | FileCheck %s -check-prefix=TILE-02
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,0,2" -mlir-disable-threading=true | FileCheck %s -check-prefix=TILE-002
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3,4" -mlir-disable-threading=true | FileCheck %s -check-prefix=TILE-234
-
-//   TILE-2-DAG: #[[$bound_map:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
-//  TILE-02-DAG: #[[$bound_map:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
-// TILE-002-DAG: #[[$bound_map:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
-// TILE-234-DAG: #[[$bound_map_2:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
-// TILE-234-DAG: #[[$bound_map_3:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 3)>
-// TILE-234-DAG: #[[$bound_map_4:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
-
-func.func @matmul(%arg0: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-             %arg1: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-             %arg2: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  linalg.matmul
-    ins(%arg0, %arg1: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                      memref<?x?xf32, strided<[?, 1], offset: ?>>)
-   outs(%arg2: memref<?x?xf32, strided<[?, 1], offset: ?>>)
-  return
-}
-// TILE-2-LABEL: func @matmul(
-//       TILE-2-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-2-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-2: %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2: scf.for %[[I:.*]] = %{{.*}}{{.*}} to %[[M]] step %{{.*}} {
-//       TILE-2:   %[[szM:.*]] = affine.min #[[$bound_map]](%[[I]])[%[[M]]]
-//       TILE-2:   %[[K:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   %[[szK:.*]] = affine.min #[[$bound_map]](%[[I]])[%[[M]]]
-//       TILE-2:   %[[N:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   %[[sAi:.*]] = memref.subview %{{.*}}[%[[I]], 0] [%[[szM]], %[[K]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   %[[sCi:.*]] = memref.subview %{{.*}}[%[[I]], 0] [%[[szK]], %[[N]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   linalg.matmul ins(%[[sAi]]{{.*}} outs(%[[sCi]]
-
-// TILE-02-LABEL: func @matmul(
-//       TILE-02-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-02-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-02: %[[N:.*]] = memref.dim %arg1, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02: scf.for %[[J:.*]] = %{{.*}} to %[[N]] step %{{.*}} {
-//       TILE-02:   %[[K:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   %[[szN:.*]] = affine.min #[[$bound_map]](%[[J]])[%[[N]]]
-//       TILE-02:   %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   %[[szK:.*]] = affine.min #[[$bound_map]](%[[J]])[%[[N]]]
-//       TILE-02:   %[[sBj:.*]] = memref.subview %{{.*}}[0, %[[J]]] [%[[K]], %[[szN]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   %[[sCj:.*]] = memref.subview %{{.*}}[0, %[[J]]] [%[[M]], %[[szK]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   linalg.matmul ins(%{{.*}}, %[[sBj]]{{.*}} outs(%[[sCj]]
-
-// TILE-002-LABEL: func @matmul(
-//       TILE-002-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-002-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-002: %[[ubK:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-002: scf.for %[[K:.*]] = %{{.*}}{{.*}} to %[[ubK]] step %{{.*}} {
-//       TILE-002:   %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-002:   %[[szK:.*]] = affine.min #[[$bound_map]](%[[K]])[%[[ubK]]]
-//       TILE-002:   %[[szK_1:.*]] = affine.min #[[$bound_map]](%[[K]])[%[[ubK]]]
-//       TILE-002:   %[[N:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-002:   %[[sAj:.*]] = memref.subview %{{.*}}[0, %[[K]]] [%[[M]], %[[szK]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-002:   %[[sBj:.*]] = memref.subview %{{.*}}[%[[K]], 0] [%[[szK_1]], %[[N]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-002:   linalg.matmul ins(%[[sAj]], %[[sBj]]{{.*}} outs(%{{.*}}
-
-// TILE-234-LABEL: func @matmul(
-//       TILE-234-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-234-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-234-DAG: %[[C3:.*]] = arith.constant 3 : index
-//       TILE-234-DAG: %[[C4:.*]] = arith.constant 4 : index
-//       TILE-234: %[[ubM:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234: %[[ubK:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234: %[[ubN:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:  scf.for %[[I:.*]] = %{{.*}}{{.*}} to %[[ubM]] step %{{.*}} {
-//       TILE-234:    scf.for %[[J:.*]] = %{{.*}}{{.*}} to %[[ubN]] step %{{.*}} {
-//       TILE-234:      scf.for %[[K:.*]] = %{{.*}}{{.*}} to %[[ubK]] step %{{.*}} {
-//       TILE-234:        %[[szM:.*]] = affine.min #[[$bound_map_2]](%[[I]])[%[[ubM]]]
-//       TILE-234:        %[[szK:.*]] = affine.min #[[$bound_map_4]](%[[K]])[%[[ubK]]]
-//       TILE-234:        %[[szK_1:.*]] = affine.min #[[$bound_map_4]](%[[K]])[%[[ubK]]]
-//       TILE-234:        %[[szN:.*]] = affine.min #[[$bound_map_3]](%[[J]])[%[[ubN]]]
-//       TILE-234:        %[[szM_1:.*]] = affine.min #[[$bound_map_2]](%[[I]])[%[[ubM]]]
-//       TILE-234:        %[[szN_1:.*]] = affine.min #[[$bound_map_3]](%[[J]])[%[[ubN]]]
-//       TILE-234:        %[[sAik:.*]] = memref.subview %{{.*}}[%[[I]], %[[K]]] [%[[szM]], %[[szK]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:        %[[sBkj:.*]] = memref.subview %{{.*}}[%[[K]], %[[J]]] [%[[szK_1]], %[[szN]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:        %[[sCij:.*]] = memref.subview %{{.*}}[%[[I]], %[[J]]] [%[[szM_1]], %[[szN_1]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//
-//       TILE-234:        linalg.matmul ins(%[[sAik]], %[[sBkj]]{{.*}} outs(%[[sCij]]
-
-// When the buffer shapes are known at compile time, it is possible to avoid
-// the "min" in subview size computation. This test uses buffer sizes divisible
-// by respective tile sizes (M=10 divisble by 2, N=12 divisible by 2 and 3,
-// K=16 divisble by 2 and 4).
-func.func @matmul_static(%arg0: memref<10x16xf32, strided<[?, 1], offset: ?>>,
-                    %arg1: memref<16x12xf32, strided<[?, 1], offset: ?>>,
-                    %arg2: memref<10x12xf32, strided<[?, 1], offset: ?>>) {
-  linalg.matmul
-    ins(%arg0, %arg1: memref<10x16xf32, strided<[?, 1], offset: ?>>,
-                      memref<16x12xf32, strided<[?, 1], offset: ?>>)
-   outs(%arg2: memref<10x12xf32, strided<[?, 1], offset: ?>>)
-  return
-}
-// TILE-2-LABEL: func @matmul_static(
-//  TILE-2-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
-//  TILE-2-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-//  TILE-2-SAME: %[[ARG2:[0-9a-zA-Z]*]]: memref
-//       TILE-2-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-2-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-2-DAG: %[[M:.*]] = arith.constant 10 : index
-//       TILE-2: scf.for %[[I:.*]] = %{{.*}} to %[[M]] step %{{.*}} {
-//       TILE-2:   %[[sAi:.*]] = memref.subview %{{.*}}[%[[I]], 0] [2, 16] [1, 1] : memref<10x16xf32, strided<[?, 1], offset: ?>> to memref<2x16xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   %[[sCi:.*]] = memref.subview %{{.*}}[%[[I]], 0] [2, 12] [1, 1] : memref<10x12xf32, strided<[?, 1], offset: ?>> to memref<2x12xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   linalg.matmul ins(%[[sAi]], %{{.*}}{{.*}} outs(%[[sCi]]
-
-// TILE-02-LABEL: func @matmul_static(
-//       TILE-02-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-02-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-02-DAG: %[[N:.*]] = arith.constant 12 : index
-//       TILE-02: scf.for %[[J:.*]] = %{{.*}} to %[[N]] step %{{.*}} {
-//       TILE-02:   %[[sBj:.*]] = memref.subview %{{.*}}[0, %[[J]]] [16, 2] [1, 1] : memref<16x12xf32, strided<[?, 1], offset: ?>> to memref<16x2xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   %[[sCj:.*]] = memref.subview %{{.*}}[0, %[[J]]] [10, 2] [1, 1] : memref<10x12xf32, strided<[?, 1], offset: ?>> to memref<10x2xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   linalg.matmul ins(%{{.*}}, %[[sBj]]{{.*}} outs(%[[sCj]]
-
-// TILE-002-LABEL: func @matmul_static(
-//       TILE-002-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-002-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-002-DAG: %[[C16:.*]] = arith.constant 16 : index
-//       TILE-002: scf.for %[[K:.*]] = %{{.*}}{{.*}} to %[[C16]] step %{{.*}} {
-//       TILE-002:   %[[sAj:.*]] = memref.subview %{{.*}}[0, %[[K]]] [10, 2] [1, 1] : memref<10x16xf32, strided<[?, 1], offset: ?>> to memref<10x2xf32, strided<[?, 1], offset: ?>>
-//       TILE-002:   %[[sBj:.*]] = memref.subview %{{.*}}[%[[K]], 0] [2, 12] [1, 1] : memref<16x12xf32, strided<[?, 1], offset: ?>> to memref<2x12xf32, strided<[?, 1], offset: ?>>
-//       TILE-002:   linalg.matmul ins(%[[sAj]], %[[sBj]]{{.*}} outs(%{{.*}}
-
-// TILE-234-LABEL: func @matmul_static(
-//       TILE-234-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-234-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-234-DAG: %[[C3:.*]] = arith.constant 3 : index
-//       TILE-234-DAG: %[[C4:.*]] = arith.constant 4 : index
-//       TILE-234-DAG: %[[C10:.*]] = arith.constant 10 : index
-//       TILE-234-DAG: %[[C16:.*]] = arith.constant 16 : index
-//       TILE-234-DAG: %[[C12:.*]] = arith.constant 12 : index
-//       TILE-234:  scf.for %[[I:.*]] = %{{.*}}{{.*}} to %[[C10]] step %{{.*}} {
-//       TILE-234:    scf.for %[[J:.*]] = %{{.*}}{{.*}} to %[[C12]] step %{{.*}} {
-//       TILE-234:      scf.for %[[K:.*]] = %{{.*}}{{.*}} to %[[C16]] step %{{.*}} {
-//       TILE-234:        %[[sAik:.*]] = memref.subview %{{.*}}[%[[I]], %[[K]]] [2, 4] [1, 1] : memref<10x16xf32, strided<[?, 1], offset: ?>> to memref<2x4xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:        %[[sBkj:.*]] = memref.subview %{{.*}}[%[[K]], %[[J]]] [4, 3] [1, 1] : memref<16x12xf32, strided<[?, 1], offset: ?>> to memref<4x3xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:        %[[sCij:.*]] = memref.subview %{{.*}}[%[[I]], %[[J]]] [2, 3] [1, 1] : memref<10x12xf32, strided<[?, 1], offset: ?>> to memref<2x3xf32, strided<[?, 1], offset: ?>>
-//
-//       TILE-234:        linalg.matmul ins(%[[sAik]], %[[sBkj]]{{.*}} outs(%[[sCij]]
-
-func.func @matvec(%arg0: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg1: memref<?xf32, strided<[1], offset: ?>>, %arg2: memref<?xf32, strided<[1], offset: ?>>) {
-  linalg.matvec
-    ins(%arg0, %arg1: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                      memref<?xf32, strided<[1], offset: ?>>)
-   outs(%arg2: memref<?xf32, strided<[1], offset: ?>>)
-  return
-}
-// TILE-2-LABEL: func @matvec(
-//  TILE-2-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
-//  TILE-2-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-//  TILE-2-SAME: %[[ARG2:[0-9a-zA-Z]*]]: memref
-//       TILE-2-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-2-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-2: %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2: scf.for %[[I:.*]] = %{{.*}}{{.*}} to %[[M]] step %{{.*}} {
-//       TILE-2:   %[[szM:.*]] = affine.min #[[$bound_map]](%[[I]])[%[[M]]]
-//       TILE-2:   %[[N:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   %[[szN:.*]] = affine.min #[[$bound_map]](%[[I]])[%[[M]]]
-//       TILE-2:   %[[sAi:.*]] = memref.subview %{{.*}}[%[[I]], 0] [%[[szM]], %[[N]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-2:   %[[sCi:.*]] = memref.subview %{{.*}}[%[[I]]] [%[[szN]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-2:   linalg.matvec ins(%[[sAi]], %{{.*}} outs(%[[sCi]]
-
-// TILE-02-LABEL: func @matvec(
-// TILE-02-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
-// TILE-02-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-// TILE-02-SAME: %[[ARG2:[0-9a-zA-Z]*]]: memref
-//       TILE-02-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-02-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-02: %[[K:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02: scf.for %[[J:.*]] = %{{.*}}{{.*}} to %[[K]] step %{{.*}} {
-//       TILE-02:   %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   %[[szN:.*]] = affine.min #[[$bound_map]](%[[J]])[%[[K]]]
-//       TILE-02:   %[[szN_1:.*]] = affine.min #[[$bound_map]](%[[J]])[%[[K]]]
-//       TILE-02:   %[[sAj:.*]] = memref.subview %{{.*}}[0, %[[J]]] [%[[M]], %[[szN]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-02:   %[[sBj:.*]] = memref.subview %{{.*}}[%[[J]]] [%[[szN_1]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-02:   linalg.matvec ins(%[[sAj]], %[[sBj]]{{.*}} outs(%{{.*}}
-
-// TILE-002-LABEL: func @matvec(
-// TILE-002-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
-// TILE-002-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-// TILE-002-SAME: %[[ARG2:[0-9a-zA-Z]*]]: memref
-//   TILE-002-NOT: scf.for
-
-// TILE-234-LABEL: func @matvec(
-// TILE-234-SAME: %[[ARG0:[0-9a-zA-Z]*]]: memref
-// TILE-234-SAME: %[[ARG1:[0-9a-zA-Z]*]]: memref
-// TILE-234-SAME: %[[ARG2:[0-9a-zA-Z]*]]: memref
-//       TILE-234-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-234-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-234-DAG: %[[C3:.*]] = arith.constant 3 : index
-//       TILE-234: %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234: %[[K:.*]] = memref.dim %{{.*}}, %c1 : memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:  scf.for %[[I:.*]] = %{{.*}}{{.*}} to %[[M]] step %{{.*}} {
-//       TILE-234:    scf.for %[[J:.*]] = %{{.*}}{{.*}} to %[[K]] step %{{.*}} {
-//       TILE-234:      %[[szM:.*]] = affine.min #[[$bound_map_2]](%[[I]])[%[[M]]]
-//       TILE-234:      %[[szN:.*]] = affine.min #[[$bound_map_3]](%[[J]])[%[[K]]]
-//       TILE-234:      %[[szN_1:.*]] = affine.min #[[$bound_map_3]](%[[J]])[%[[K]]]
-//       TILE-234:      %[[szM_1:.*]] = affine.min #[[$bound_map_2]](%[[I]])[%[[M]]]
-//       TILE-234:      %[[sAij:.*]] = memref.subview %{{.*}}[%[[I]], %[[J]]] [%[[szM]], %[[szN]]] [1, 1] : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[?, 1], offset: ?>>
-//       TILE-234:      %[[sBj:.*]] = memref.subview %{{.*}}[%[[J]]] [%[[szN_1]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-234:      %[[sCi:.*]] = memref.subview %{{.*}}[%[[I]]] [%[[szM_1]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//
-//       TILE-234:      linalg.matvec ins(%[[sAij]], %[[sBj]]{{.*}} outs(%[[sCi]]
-
-func.func @dot(%arg0: memref<?xf32, strided<[1], offset: ?>>, %arg1: memref<?xf32, strided<[1], offset: ?>>, %arg2: memref<f32>) {
-  linalg.dot
-    ins(%arg0, %arg1: memref<?xf32, strided<[1], offset: ?>>, memref<?xf32, strided<[1], offset: ?>>)
-   outs(%arg2: memref<f32>)
-  return
-}
-// TILE-2-LABEL: func @dot(
-//       TILE-2-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-2-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-2: %[[M:.*]] = memref.dim %{{.*}}, %c0 : memref<?xf32, strided<[1], offset: ?>>
-//       TILE-2: scf.for %[[I:.*]] = %{{.*}}{{.*}} to %[[M]] step %{{.*}} {
-//       TILE-2:   %[[szM:.*]] = affine.min #[[$bound_map]](%[[I]])[%[[M]]]
-//       TILE-2:   %[[szM_1:.*]] = affine.min #[[$bound_map]](%[[I]])[%[[M]]]
-//       TILE-2:   %[[sAi:.*]] = memref.subview %{{.*}}[%[[I]]] [%[[szM]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-2:   %[[sBi:.*]] = memref.subview %{{.*}}[%[[I]]] [%[[szM_1]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-2:   linalg.dot ins(%[[sAi]], %[[sBi]]{{.*}} outs(
-
-// TILE-02-LABEL: func @dot(
-//   TILE-02-NOT: scf.for
-
-// TILE-002-LABEL: func @dot(
-//   TILE-002-NOT: scf.for
-
-// TILE-234-LABEL: func @dot(
-//       TILE-234-DAG: %[[C0:.*]] = arith.constant 0 : index
-//       TILE-234-DAG: %[[C2:.*]] = arith.constant 2 : index
-//       TILE-234:  %[[ubK:.*]] = memref.dim %{{.*}}, %c0 : memref<?xf32, strided<[1], offset: ?>>
-//       TILE-234:  scf.for %[[I:.*]] = %{{.*}} to %[[ubK]] step %{{.*}} {
-//       TILE-234:    %[[szM:.*]] = affine.min #[[$bound_map_2]](%[[I]])[%[[ubK]]]
-//       TILE-234:    %[[szM_1:.*]] = affine.min #[[$bound_map_2]](%[[I]])[%[[ubK]]]
-//       TILE-234:    %[[sAi:.*]] = memref.subview %{{.*}}[%[[I]]] [%[[szM]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-234:    %[[sBi:.*]] = memref.subview %{{.*}}[%[[I]]] [%[[szM_1]]] [1] : memref<?xf32, strided<[1], offset: ?>> to memref<?xf32, strided<[1], offset: ?>>
-//       TILE-234:    linalg.dot ins(%[[sAi]], %[[sBi]]{{.*}} outs(
-
-func.func @fill_static(%arg0: memref<127x99xf32>, %arg1: f32) {
-  linalg.fill ins(%arg1 : f32) outs(%arg0 : memref<127x99xf32>)
-  return
-}
-// TILE-2-LABEL: func @fill_static
-//       TILE-2:   for
-//   TILE-2-NOT:   for
-//       TILE-2:       memref.subview{{.*}} : memref<127x99xf32>
-//       TILE-2:       linalg.fill{{.*}} : memref<?x99xf32, strided<[99, 1], offset: ?>>
-
-// TILE-02-LABEL: func @fill_static
-//       TILE-02:   for
-//   TILE-02-NOT:   for
-//       TILE-02:       memref.subview{{.*}} : memref<127x99xf32>
-//       TILE-02:       linalg.fill{{.*}} : memref<127x?xf32, strided<[99, 1], offset: ?>>
-
-// TILE-002-LABEL: func @fill_static
-//   TILE-002-NOT:   for
-//       TILE-002:     linalg.fill{{.*}} : memref<127x99xf32>
-
-// TILE-234-LABEL: func @fill_static
-//       TILE-234:   for
-//       TILE-234:     for
-//   TILE-234-NOT:   for
-//       TILE-234:       memref.subview{{.*}} : memref<127x99xf32>
-//       TILE-234:       linalg.fill{{.*}} : memref<?x3xf32, strided<[99, 1], offset: ?>>
-
-
-func.func @fill(%arg0: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg1: f32) {
-  linalg.fill ins(%arg1 : f32) outs(%arg0 : memref<?x?xf32, strided<[?, 1], offset: ?>>)
-  return
-}
-// TILE-2-LABEL: func @fill
-//       TILE-2:   for
-//   TILE-2-NOT:   for
-//       TILE-2:   fill{{.*}} f32
-
-// TILE-02-LABEL: func @fill
-//       TILE-02:   for
-//   TILE-02-NOT:   for
-//       TILE-02:     fill{{.*}} f32
-
-// TILE-002-LABEL: func @fill
-//   TILE-002-NOT:   for
-//       TILE-002:     fill{{.*}} f32
-
-// TILE-234-LABEL: func @fill
-//       TILE-234:   for
-//       TILE-234:     for
-//   TILE-234-NOT:   for
-//       TILE-234:       fill{{.*}} f32
-
-#id_2d = affine_map<(i, j) -> (i, j)>
-#pointwise_2d_trait = {
-  args_in = 2,
-  args_out = 1,
-  indexing_maps = [#id_2d, #id_2d, #id_2d],
-  iterator_types = ["parallel", "parallel"]
-}
-
-func.func @pointwise(%arg0: memref<?x?xf32, strided<[?, 1], offset: ?>>, %arg1: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                %arg2: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  linalg.generic #pointwise_2d_trait
-    ins(%arg0, %arg1 : memref<?x?xf32, strided<[?, 1], offset: ?>>, memref<?x?xf32, strided<[?, 1], offset: ?>>)
-    outs(%arg2 : memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
-    %4 = arith.addf %arg4, %arg5 : f32
-    linalg.yield %4 : f32
-  }
-  return
-}
-// TILE-2-LABEL: func @pointwise
-//       TILE-2:   for
-//   TILE-2-NOT:   for
-//       TILE-2:   linalg.generic
-
-// TILE-02-LABEL: func @pointwise
-//       TILE-02:   for
-//   TILE-02-NOT:   for
-//       TILE-02:     linalg.generic
-
-// TILE-002-LABEL: func @pointwise
-//   TILE-002-NOT:   for
-//       TILE-002:     linalg.generic
-
-// TILE-234-LABEL: func @pointwise
-//       TILE-234:   for
-//       TILE-234:     for
-//   TILE-234-NOT:   for
-//       TILE-234:       linalg.generic
diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir
index 3502f99..ad3271c 100644
--- a/mlir/test/Dialect/Linalg/transform-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir
@@ -1,20 +1,22 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-patterns -split-input-file -test-transform-dialect-interpreter | FileCheck %s
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-linalg-transform-patterns=test-patterns -split-input-file | FileCheck %s
 
-// Map corresponding to a 2D memory access where the stride along the last dim is known to be 1.
-// CHECK-DAG: #[[$kn:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)>
-// CHECK-DAG: #[[$nm:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)>
-// CHECK-DAG: #[[$km:.*]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// -----
 
 func.func @dot(%x: memref<?xf32, strided<[1], offset: ?>>,
           %y: memref<?xf32, strided<[1], offset: ?>>,
           %v: memref<f32>) {
-  linalg.dot { __internal_linalg_transform__ = "MEM" }
-    ins(%x, %y: memref<?xf32, strided<[1], offset: ?>>,
-                memref<?xf32, strided<[1], offset: ?>>)
-    outs(%v: memref<f32>)
-
+  linalg.dot ins(%x, %y: memref<?xf32, strided<[1], offset: ?>>,
+                         memref<?xf32, strided<[1], offset: ?>>)
+            outs(%v: memref<f32>)
   return
 }
+
+transform.sequence failures(propagate) {  // Schedule replacing the removed __internal_linalg_transform__ = "MEM" marker on linalg.dot.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.dot"]} in %arg1  // Handle to the linalg.dot ops found under %arg1.
+    %1, %loop = transform.structured.tile %0 [8000]  // Single tile size 8000, hence a single loop handle.
+}
+
 // CHECK-LABEL: func @dot
 // CHECK-DAG:     %[[c0:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[c1:.*]] = arith.constant 1 : index
@@ -28,6 +30,8 @@ func.func @dot(%x: memref<?xf32, strided<[1], offset: ?>>,
 // CHECK:               arith.addf
 // CHECK:               store
 
+// -----
+
 func.func @matvec(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
              %x: memref<?xf32, strided<[1], offset: ?>>,
              %y: memref<?xf32, strided<[1], offset: ?>>) {
@@ -37,25 +41,43 @@ func.func @matvec(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
     outs(%y: memref<?xf32, strided<[1], offset: ?>>)
   return
 }
+
+transform.sequence failures(propagate) {  // Schedule tiling the linalg.matvec of this test case.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1  // Handle to the linalg.matvec ops found under %arg1.
+    %1, %loops:2 = transform.structured.tile %0 [5, 6]  // Tile sizes [5, 6]; the expectations below look for two scf.for loops with steps 5 and 6.
+}
+
 // CHECK-LABEL: func @matvec
 // CHECK-DAG:     %[[c0:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[c5:.*]] = arith.constant 5 : index
 // CHECK-DAG:     %[[c6:.*]] = arith.constant 6 : index
-// CHECK:         scf.parallel {{.*}} step (%[[c5]])
+// CHECK:         scf.for {{.*}} step %[[c5]]
 // CHECK:           scf.for {{.*}} step %[[c6]]
 // CHECK:             linalg.matvec
 // CHECK:               ins({{.*}}: memref<?x?xf32, strided<[?, 1], offset: ?>>, memref<?xf32, strided<[1], offset: ?>>)
 // CHECK:              outs({{.*}}: memref<?xf32, strided<[1], offset: ?>>)
 
+// -----
+
 func.func @matmul(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
              %B: memref<?x?xf32, strided<[?, 1], offset: ?>>,
              %C: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  linalg.matmul { __internal_linalg_transform__ = "MEM" }
-    ins(%A, %B: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                memref<?x?xf32, strided<[?, 1], offset: ?>>)
-    outs(%C: memref<?x?xf32, strided<[?, 1], offset: ?>>)
+  linalg.matmul ins(%A, %B: memref<?x?xf32, strided<[?, 1], offset: ?>>,
+                            memref<?x?xf32, strided<[?, 1], offset: ?>>)
+               outs(%C: memref<?x?xf32, strided<[?, 1], offset: ?>>)
   return
 }
+
+transform.sequence failures(propagate) {  // Schedule applying four successive tilings to linalg.matmul.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1  // Handle to the linalg.matmul ops found under %arg1.
+    %1, %loops:3 = transform.structured.tile %0 [2000, 3000, 4000]  // First tiling, applied to the matched op.
+    %2, %loops_2:3 = transform.structured.tile %1 [200, 300, 400]  // Second tiling, applied to the tiled op %1 from the previous step.
+    %3, %loops_3:3 = transform.structured.tile %2 [20, 30, 40]  // Third tiling, applied to %2.
+    %4, %loops_4:3 = transform.structured.tile %3 [2, 3, 4]  // Fourth tiling, applied to %3.
+}
+
 // CHECK-LABEL: func @matmul
 // CHECK-DAG:     %[[c0:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[c2:.*]] = arith.constant 2 : index
@@ -86,6 +108,13 @@ func.func @matmul(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 // CHECK:                                   ins({{.*}}: memref<?x?xf32, strided<[?, 1], offset: ?>>, memref<?x?xf32, strided<[?, 1], offset: ?>>)
 // CHECK:                                  outs({{.*}}: memref<?x?xf32, strided<[?, 1], offset: ?>>)
 
+// -----
+
+// Map corresponding to a 2D memory access where the stride along the last dim is known to be 1.
+// CHECK-DAG: #[[$kn:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK-DAG: #[[$nm:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+// CHECK-DAG: #[[$km:.*]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+
 #matmul_accesses = [
   affine_map<(m, n, k) -> (m, k)>,
   affine_map<(m, n, k) -> (k, n)>,
@@ -112,6 +141,7 @@ func.func @permute_generic(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
   }
   return
 }
+
 transform.with_pdl_patterns {
 ^bb0(%arg0: !pdl.operation):
   transform.sequence %arg0 failures(propagate) {
@@ -120,6 +150,7 @@ transform.with_pdl_patterns {
     transform.structured.interchange %0 { iterator_interchange = [1, 2, 0]}
   }
 }
+
 // CHECK-LABEL:  func @permute_generic
 // CHECK:        linalg.generic {
 // CHECK-SAME:   indexing_maps = [#[[$kn]], #[[$nm]], #[[$km]]],
@@ -129,15 +160,23 @@ transform.with_pdl_patterns {
 // CHECK-SAME:     memref<?x?xf32, strided<[?, 1], offset: ?>>
 // CHECK-SAME:     memref<?x?xf32, strided<[?, 1], offset: ?>>
 
+// -----
+
 func.func @matvec_perm(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
              %x: memref<?xf32, strided<[1], offset: ?>>,
              %y: memref<?xf32, strided<[1], offset: ?>>) {
-  linalg.matvec {__internal_linalg_transform__ = "__with_perm__"}
-    ins(%A, %x: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                memref<?xf32, strided<[1], offset: ?>>)
-   outs(%y: memref<?xf32, strided<[1], offset: ?>>)
+  linalg.matvec ins(%A, %x: memref<?x?xf32, strided<[?, 1], offset: ?>>,
+                            memref<?xf32, strided<[1], offset: ?>>)
+               outs(%y: memref<?xf32, strided<[1], offset: ?>>)
   return
 }
+
+transform.sequence failures(propagate) {  // Schedule replacing the removed "__with_perm__" marker on linalg.matvec.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1  // Handle to the linalg.matvec ops found under %arg1.
+    %1, %loops:2 = transform.structured.tile %0 [5, 6] {interchange = [1, 0]}  // Tile sizes [5, 6] with the interchange attribute set to [1, 0].
+}
+
 // CHECK-LABEL: func @matvec_perm
 // CHECK-DAG:     %[[c0:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[c5:.*]] = arith.constant 5 : index
@@ -148,15 +187,25 @@ func.func @matvec_perm(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 // CHECK:               ins({{.*}}: memref<?x?xf32, strided<[?, 1], offset: ?>>, memref<?xf32, strided<[1], offset: ?>>)
 // CHECK:              outs({{.*}}: memref<?xf32, strided<[1], offset: ?>>)
 
+// -----
+
 func.func @matmul_perm(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
              %B: memref<?x?xf32, strided<[?, 1], offset: ?>>,
              %C: memref<?x?xf32, strided<[?, 1], offset: ?>>) {
-  linalg.matmul {__internal_linalg_transform__ = "__with_perm__"}
-    ins(%A, %B: memref<?x?xf32, strided<[?, 1], offset: ?>>,
-                memref<?x?xf32, strided<[?, 1], offset: ?>>)
-   outs(%C : memref<?x?xf32, strided<[?, 1], offset: ?>>)
+  linalg.matmul ins(%A, %B: memref<?x?xf32, strided<[?, 1], offset: ?>>,
+                            memref<?x?xf32, strided<[?, 1], offset: ?>>)
+               outs(%C : memref<?x?xf32, strided<[?, 1], offset: ?>>)
   return
 }
+
+transform.sequence failures(propagate) {  // Schedule replacing the removed "__with_perm__" marker on linalg.matmul.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1  // Handle to the linalg.matmul ops found under %arg1.
+    %1, %loops:3 = transform.structured.tile %0 [2000, 3000, 4000] {interchange=[1, 2, 0]}  // First tiling, with interchange [1, 2, 0].
+    %2, %loops_2:3 = transform.structured.tile %1 [200, 300, 400] {interchange=[1, 0, 2]}  // Second tiling of %1, with interchange [1, 0, 2].
+    %3, %loops_3:3 = transform.structured.tile %2 [20, 30, 40]  // Third tiling of %2; no interchange attribute.
+}
+
 // CHECK-LABEL: func @matmul_perm
 // CHECK-DAG:     %[[c0:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[c20:.*]] = arith.constant 20 : index
@@ -180,26 +229,3 @@ func.func @matmul_perm(%A: memref<?x?xf32, strided<[?, 1], offset: ?>>,
 // CHECK:                                 linalg.matmul
 // CHECK:                                  ins({{.*}}: memref<?x?xf32, strided<[?, 1], offset: ?>>, memref<?x?xf32, strided<[?, 1], offset: ?>>)
 // CHECK:                                   outs({{.*}}: memref<?x?xf32, strided<[?, 1], offset: ?>>)
-
-func.func @tile_permute_parallel_loop(%arg0: memref<?x?xf32>,
-                                 %arg1: memref<?x?xf32>,
-                                 %arg2: memref<?x?xf32>) {
-  linalg.matmul {__internal_linalg_transform__ = "par__with_perm__"}
-    ins(%arg0, %arg1: memref<?x?xf32>, memref<?x?xf32>)
-   outs(%arg2: memref<?x?xf32>)
-  return
-}
-// CHECK-LABEL: func @tile_permute_parallel_loop
-//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
-//  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
-//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
-//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
-//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[D0:.*]] = memref.dim %[[ARG0]], %c0
-//   CHECK-DAG:   %[[D1:.*]] = memref.dim %[[ARG0]], %c1
-//   CHECK-DAG:   %[[D2:.*]] = memref.dim %[[ARG1]], %c1
-//       CHECK:   scf.parallel (%{{.*}}) = (%[[C0]]) to (%[[D2]]) step (%[[C8]])
-//       CHECK:     scf.for %{{.*}} = %[[C0]] to %[[D1]] step %[[C4]]
-//       CHECK:       scf.parallel (%{{.*}}) = (%[[C0]]) to (%[[D0]]) step (%[[C16]])
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir
index 10c9adb..d79b402 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-call.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=4" -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN:   -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
@@ -24,6 +24,12 @@ func.func @conv_1d(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_1d"]} in %arg1
+    %1, %loop = transform.structured.tile %0 [4]  // Replaces -linalg-tile="tile-sizes=4".
+}
+
 func.func @main() {
   %c3 = arith.constant 3 : index
   %c6 = arith.constant 6 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir
index 7d6e47b..cb9a033 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-wcf-call.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,4" -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN:   -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
@@ -26,6 +26,12 @@ func.func @conv_1d_nwc_wcf(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_1d_nwc_wcf"]} in %arg1
+    %1, %loops:2 = transform.structured.tile %0 [2, 4]  // Replaces -linalg-tile="tile-sizes=2,4".
+}
+
 func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir
index 21d2a19..78175ba 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-call.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,2" -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN:   -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
@@ -24,6 +24,12 @@ func.func @conv_2d(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_2d"]} in %arg1
+    %1, %loops:2 = transform.structured.tile %0 [2, 2]  // Replaces -linalg-tile="tile-sizes=2,2".
+}
+
 func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir
index 51708ea..b675f87 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-hwcf-call.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,3,3,2" -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN:   -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
@@ -26,6 +26,12 @@ func.func @conv_2d_nhwc_hwcf(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_hwcf"]} in %arg1
+    %1, %loops:4 = transform.structured.tile %0 [2, 3, 3, 2]  // Replaces -linalg-tile="tile-sizes=2,3,3,2".
+}
+
 func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir
index 15cf003..361869e 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-call.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=2,2,2" -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN:   -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
@@ -24,6 +24,12 @@ func.func @conv_3d(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: me
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_3d"]} in %arg1
+    %1, %loops:3 = transform.structured.tile %0 [2, 2, 2]  // Replaces -linalg-tile="tile-sizes=2,2,2".
+}
+
 func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir
index a4a51b8..d7245d3 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-dhwcf-call.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -linalg-tile="tile-sizes=0,5,5,5" -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN:   -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_runner_utils%shlibext \
@@ -26,6 +26,11 @@ func.func @conv_3d_ndhwc_dhwcf(%arg0: memref<?x?x?x?x?xf32>, %arg1: memref<?x?x?
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.conv_3d_ndhwc_dhwcf"]} in %arg1
+    %1, %loops:3 = transform.structured.tile %0 [0, 5, 5, 5]  // Replaces -linalg-tile="tile-sizes=0,5,5,5".
+}
 
 func.func @main() {
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
index a7f2b41..c35ad80 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
@@ -1,12 +1,12 @@
 // UNSUPPORTED: asan
-// RUN: mlir-opt %s -linalg-bufferize -arith-bufferize \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -linalg-bufferize -arith-bufferize \
 // RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --convert-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext,%mlir_lib_dir/libmlir_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s  -linalg-tile="tile-sizes=1,2,3" -linalg-bufferize \
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule -linalg-bufferize \
 // RUN: -scf-bufferize -arith-bufferize -tensor-bufferize \
 // RUN: -func-bufferize \
 // RUN: -finalizing-bufferize -convert-linalg-to-loops -convert-scf-to-cf -convert-scf-to-cf \
@@ -36,4 +36,10 @@ func.func @main() {
   return
 }
 
+transform.sequence failures(propagate) {  // Applied by -test-transform-dialect-interpreter.
+  ^bb0(%arg1: !pdl.operation):
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1
+    %1, %loops:3 = transform.structured.tile %0 [1, 2, 3]  // Replaces -linalg-tile="tile-sizes=1,2,3".
+}
+
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 781936f..ad27637 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -61,10 +61,6 @@ struct TestLinalgTransforms
   Option<bool> testPatterns{*this, "test-patterns",
                             llvm::cl::desc("Test a mixed set of patterns"),
                             llvm::cl::init(false)};
-  Option<bool> testTileAndDistributionOptions{
-      *this, "test-tile-and-distribute-options",
-      llvm::cl::desc("Test tile and distribute options"),
-      llvm::cl::init(false)};
   Option<bool> testVectorTransferForwardingPatterns{
       *this, "test-vector-transfer-forwarding-patterns",
       llvm::cl::desc(
@@ -75,13 +71,6 @@ struct TestLinalgTransforms
       llvm::cl::desc("Test a set of patterns that rewrite a linalg contraction "
                      "in vector.contract form"),
       llvm::cl::init(false)};
-  Option<bool> testTilePattern{*this, "test-tile-pattern",
-                               llvm::cl::desc("Test tile pattern"),
-                               llvm::cl::init(false)};
-  Option<bool> testTileScalarizeDynamicDims{
-      *this, "test-tile-scalarize-dynamic-dims",
-      llvm::cl::desc("Test tiling of dynamic dims by 1"),
-      llvm::cl::init(false)};
   Option<bool> testTransformPadTensor{
       *this, "test-transform-pad-tensor",
       llvm::cl::desc("Test transform pad tensor by copying with generic ops"),
@@ -136,90 +125,11 @@ static void applyPatterns(func::FuncOp funcOp) {
   RewritePatternSet patterns(ctx);
 
   //===--------------------------------------------------------------------===//
-  // Linalg tiling patterns.
-  //===--------------------------------------------------------------------===//
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({2000, 3000, 4000}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "MEM"),
-                                 StringAttr::get(ctx, "L3")));
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({200, 300, 400}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "L3"),
-                                 StringAttr::get(ctx, "L2")));
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({20, 30, 40}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "L2"),
-                                 StringAttr::get(ctx, "L1")));
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({2, 3, 4}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "L1"),
-                                 StringAttr::get(ctx, "REG")));
-
-  patterns.add<LinalgTilingPattern>(
-      MatvecOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({5, 6}).setLoopType(
-          LinalgTilingLoopType::ParallelLoops),
-      LinalgTransformationFilter(ArrayRef<StringAttr>{},
-                                 StringAttr::get(ctx, "L1")));
-
-  patterns.add<LinalgTilingPattern>(
-      DotOp::getOperationName(), ctx, LinalgTilingOptions().setTileSizes(8000),
-      LinalgTransformationFilter(
-          ArrayRef<StringAttr>{StringAttr::get(ctx, "MEM"),
-                               StringAttr::get(ctx, "L3"),
-                               StringAttr::get(ctx, "L2")},
-          StringAttr::get(ctx, "REG")));
-
-  //===--------------------------------------------------------------------===//
-  // Linalg tiling and permutation patterns.
-  //===--------------------------------------------------------------------===//
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions()
-          .setTileSizes({2000, 3000, 4000})
-          .setInterchange({1, 2, 0}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "__with_perm__"),
-                                 StringAttr::get(ctx, "L2__with_perm__")));
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions()
-          .setTileSizes({200, 300, 400})
-          .setInterchange({1, 0, 2}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "L2__with_perm__"),
-                                 StringAttr::get(ctx, "L1__with_perm__")));
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({20, 30, 40}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "L1__with_perm__"),
-                                 StringAttr::get(ctx, "REG__with_perm__")));
-
-  patterns.add<LinalgTilingPattern>(
-      MatvecOp::getOperationName(), ctx,
-      LinalgTilingOptions().setTileSizes({5, 6}).setInterchange({1, 0}),
-      LinalgTransformationFilter(StringAttr::get(ctx, "__with_perm__"),
-                                 StringAttr::get(ctx, "L1__with_perm__")));
-
-  patterns.add<LinalgTilingPattern>(
-      MatmulOp::getOperationName(), ctx,
-      LinalgTilingOptions()
-          .setTileSizes({16, 8, 4})
-          .setInterchange({1, 2, 0})
-          .setLoopType(LinalgTilingLoopType::ParallelLoops),
-      LinalgTransformationFilter(
-          StringAttr::get(ctx, "par__with_perm__"),
-          StringAttr::get(ctx, "after_par__with_perm__")));
-
-  //===--------------------------------------------------------------------===//
   // Linalg to loops patterns.
   //===--------------------------------------------------------------------===//
   patterns.add<LinalgLoweringPattern<DotOp>>(
       ctx,
-      /*loweringType=*/LinalgLoweringType::Loops,
-      LinalgTransformationFilter(StringAttr::get(ctx, "REG")));
+      /*loweringType=*/LinalgLoweringType::Loops);
 
   //===--------------------------------------------------------------------===//
   // Linalg distribution patterns.
@@ -239,178 +149,6 @@ static void applyPatterns(func::FuncOp funcOp) {
   });
 }
 
-template <typename IdOp, typename NProcsOp>
-static SmallVector<ProcInfo, 2>
-getGpuProcIds(OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges,
-              ArrayRef<linalg::DistributionMethod> distributionMethod) {
-  size_t count = std::min<size_t>(3, parallelLoopRanges.size());
-  SmallVector<ProcInfo, 2> procInfo(count);
-  Type indexType = b.getIndexType();
-  for (unsigned i = 0; i < count; ++i) {
-    gpu::Dimension dim = *gpu::symbolizeDimension(i);
-    procInfo[count - 1 - i] = {b.create<IdOp>(loc, indexType, dim),
-                               b.create<NProcsOp>(loc, indexType, dim),
-                               distributionMethod[count - 1 - i]};
-  }
-  return procInfo;
-}
-
-static void fillTileAndDistributePatterns(MLIRContext *context,
-                                          RewritePatternSet &patterns) {
-  {
-    LinalgLoopDistributionOptions cyclicNprocsEqNiters;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::CyclicNumProcsEqNumIters,
-        DistributionMethod::CyclicNumProcsEqNumIters};
-    cyclicNprocsEqNiters.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::ParallelLoops)
-            .setDistributionOptions(cyclicNprocsEqNiters),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "distribute1"),
-            StringAttr::get(context, "after_distribute1")));
-  }
-
-  {
-    LinalgLoopDistributionOptions cyclicNprocsGeNiters;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::CyclicNumProcsGeNumIters,
-        DistributionMethod::CyclicNumProcsGeNumIters};
-    cyclicNprocsGeNiters.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::ParallelLoops)
-            .setDistributionOptions(cyclicNprocsGeNiters),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "distribute2"),
-            StringAttr::get(context, "after_distribute2")));
-  }
-
-  {
-    LinalgLoopDistributionOptions cyclicNprocsDefault;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::Cyclic, DistributionMethod::Cyclic};
-    cyclicNprocsDefault.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::ParallelLoops)
-            .setDistributionOptions(cyclicNprocsDefault),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "distribute3"),
-            StringAttr::get(context, "after_distribute3")));
-  }
-
-  {
-    LinalgLoopDistributionOptions cyclicNprocsMixed1;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::CyclicNumProcsEqNumIters,
-        DistributionMethod::CyclicNumProcsGeNumIters};
-    cyclicNprocsMixed1.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::ParallelLoops)
-            .setDistributionOptions(cyclicNprocsMixed1),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "distribute4"),
-            StringAttr::get(context, "after_distribute4")));
-  }
-
-  {
-    LinalgLoopDistributionOptions cyclicNprocsMixed2;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::CyclicNumProcsGeNumIters,
-        DistributionMethod::Cyclic};
-    cyclicNprocsMixed2.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::ParallelLoops)
-            .setDistributionOptions(cyclicNprocsMixed2),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "distribute5"),
-            StringAttr::get(context, "after_distribute5")));
-  }
-
-  {
-    LinalgLoopDistributionOptions cyclicNprocsMixed3;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::Cyclic,
-        DistributionMethod::CyclicNumProcsEqNumIters};
-    cyclicNprocsMixed3.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::ParallelLoops)
-            .setDistributionOptions(cyclicNprocsMixed3),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "distribute6"),
-            StringAttr::get(context, "after_distribute6")));
-  }
-
-  {
-    LinalgLoopDistributionOptions cyclicNprocsEqNiters;
-    SmallVector<linalg::DistributionMethod> distributionMethod = {
-        DistributionMethod::Cyclic, DistributionMethod::Cyclic};
-    cyclicNprocsEqNiters.procInfo =
-        [distributionMethod](OpBuilder &b, Location loc,
-                             ArrayRef<Range> parallelLoopRanges) {
-          return getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>(
-              b, loc, parallelLoopRanges, distributionMethod);
-        };
-    patterns.add<LinalgTilingPattern>(
-        MatmulOp::getOperationName(), context,
-        LinalgTilingOptions()
-            .setTileSizes({8, 8, 4})
-            .setLoopType(LinalgTilingLoopType::Loops)
-            .setDistributionOptions(cyclicNprocsEqNiters),
-        LinalgTransformationFilter(
-            StringAttr::get(context, "tensors_distribute1"),
-            StringAttr::get(context, "tensors_after_distribute1")));
-  }
-}
-
 static void applyVectorTransferForwardingPatterns(func::FuncOp funcOp) {
   RewritePatternSet forwardPattern(funcOp.getContext());
   forwardPattern.add<LinalgCopyVTRForwardingPattern>(funcOp.getContext());
@@ -445,33 +183,6 @@ static void applyExtractSliceOfPadTensorSwapPattern(func::FuncOp funcOp) {
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
 
-static void applyTilePattern(func::FuncOp funcOp, const std::string &loopType,
-                             ArrayRef<int64_t> tileSizes,
-                             ArrayRef<int64_t> peeledLoops,
-                             bool scalarizeDynamicDims) {
-  MLIRContext *context = funcOp.getContext();
-  RewritePatternSet tilingPattern(context);
-  LinalgTilingLoopType type =
-      llvm::StringSwitch<LinalgTilingLoopType>(loopType)
-          .Case("for", LinalgTilingLoopType::Loops)
-          .Case("affine", LinalgTilingLoopType::AffineLoops)
-          .Case("parallel", LinalgTilingLoopType::ParallelLoops);
-  auto linalgTilingOptions = linalg::LinalgTilingOptions()
-                                 .setPeeledLoops(peeledLoops)
-                                 .setLoopType(type);
-  if (scalarizeDynamicDims) {
-    linalgTilingOptions.scalarizeDynamicDims();
-    assert(tileSizes.empty() &&
-           "tileSizes and scalarizeDynamicDims is mutually exclusive");
-  } else {
-    linalgTilingOptions.setTileSizes(tileSizes);
-  }
-  linalg::LinalgTransformationFilter f(StringAttr::get(context, "tile"));
-  TilingPatterns<linalg::MatmulOp, linalg::GenericOp>::insert(
-      tilingPattern, linalgTilingOptions, f);
-  (void)applyPatternsAndFoldGreedily(funcOp, std::move(tilingPattern));
-}
-
 static void applySplitReduction(func::FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
   linalg::populateSplitReductionPattern(
@@ -521,12 +232,6 @@ void TestLinalgTransforms::runOnOperation() {
   };
   std::unique_ptr<void, decltype(lambda)> cleanupGuard{(void *)1, lambda};
 
-  if (testTileAndDistributionOptions) {
-    RewritePatternSet patterns(&getContext());
-    fillTileAndDistributePatterns(&getContext(), patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
-    return;
-  }
   if (testPatterns)
     return applyPatterns(getOperation());
   if (testVectorTransferForwardingPatterns)
@@ -539,12 +244,6 @@ void TestLinalgTransforms::runOnOperation() {
     return applyGeneralizePadTensorPatterns(getOperation());
   if (testSwapSubTensorPadTensor)
     return applyExtractSliceOfPadTensorSwapPattern(getOperation());
-  if (testTilePattern)
-    return applyTilePattern(getOperation(), loopType, tileSizes, peeledLoops,
-                            /*scalarizeDynamicDims=*/false);
-  if (testTileScalarizeDynamicDims)
-    return applyTilePattern(getOperation(), loopType, tileSizes,
-                            /*peeledLoops=*/{}, /*scalarizeDynamicDims=*/true);
   if (testSplitReduction)
     return applySplitReduction(getOperation());
   if (testSplitReductionInnerParallel)
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
index e74be0d..ad5dcab 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
@@ -57,10 +57,39 @@ public:
       llvm::cl::desc("perform expensive checks to better report errors in the "
                      "transform IR")};
 };
+/// Test pass erasing every op that implements transform::TransformOpInterface.
+struct TestTransformDialectEraseSchedulePass
+    : public PassWrapper<TestTransformDialectEraseSchedulePass,
+                         OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+      TestTransformDialectEraseSchedulePass)
+
+  StringRef getArgument() const final {
+    return "test-transform-dialect-erase-schedule"; // mlir-opt flag name.
+  }
+
+  StringRef getDescription() const final {
+    return "erase transform dialect schedule from the IR";
+  }
+
+  void runOnOperation() override {
+    getOperation()->walk<WalkOrder::PreOrder>([&](Operation *nestedOp) {
+      if (isa<transform::TransformOpInterface>(nestedOp)) {
+        nestedOp->erase();
+        return WalkResult::skip(); // Op was just erased; do not walk into it.
+      }
+      return WalkResult::advance();
+    });
+  }
+};
 } // namespace
 
 namespace mlir {
 namespace test {
+/// Registers the test pass that erases the transform dialect schedule.
+void registerTestTransformDialectEraseSchedulePass() {
+  PassRegistration<TestTransformDialectEraseSchedulePass> reg;
+}
 /// Registers the test pass for applying transform dialect ops.
 void registerTestTransformDialectInterpreterPass() {
   PassRegistration<TestTransformDialectInterpreterPass> reg;
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 37d331b..9eb0a47 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -114,6 +114,7 @@ void registerTestSliceAnalysisPass();
 void registerTestTensorTransforms();
 void registerTestTilingInterface();
 void registerTestTopologicalSortAnalysisPass();
+void registerTestTransformDialectEraseSchedulePass();
 void registerTestTransformDialectInterpreterPass();
 void registerTestVectorLowerings();
 void registerTestNvgpuLowerings();
@@ -214,6 +215,7 @@ void registerTestPasses() {
   mlir::test::registerTestTensorTransforms();
   mlir::test::registerTestTilingInterface();
   mlir::test::registerTestTopologicalSortAnalysisPass();
+  mlir::test::registerTestTransformDialectEraseSchedulePass();
   mlir::test::registerTestTransformDialectInterpreterPass();
   mlir::test::registerTestVectorLowerings();
   mlir::test::registerTestNvgpuLowerings();
-- 
2.7.4