[mlir][gpu] Only transform mapped parallel loops to GPU.

author Stephan Herhut <herhut@google.com>

Thu, 12 Nov 2020 17:36:14 +0000 (18:36 +0100)

committer Stephan Herhut <herhut@google.com>

Fri, 13 Nov 2020 08:15:17 +0000 (09:15 +0100)
author Stephan Herhut <herhut@google.com>
Thu, 12 Nov 2020 17:36:14 +0000 (18:36 +0100)
committer Stephan Herhut <herhut@google.com>
Fri, 13 Nov 2020 08:15:17 +0000 (09:15 +0100)
diff --git a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h

index 900bc6e..d6316f6 100644 (file)
--- a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
+++ b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
@@ -12,9 +12,10 @@
  
  namespace mlir {
  class AffineForOp;
+class ConversionTarget;
+struct LogicalResult;
  class MLIRContext;
  class OwningRewritePatternList;
-struct LogicalResult;
  class Value;
  
  namespace scf {
@@ -44,6 +45,10 @@ LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
  void populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                         MLIRContext *ctx);
  
+/// Configures the rewrite target such that `scf.parallel` operations are legal
+/// only if they carry no GPU mapping attribute.
+void configureParallelLoopToGPULegality(ConversionTarget &target);
+
  } // namespace mlir
  
  #endif // MLIR_CONVERSION_SCFTOGPU_SCFTOGPU_H_
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp

index d494d12..b7b4e7a 100644 (file)
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -458,9 +458,10 @@ static LogicalResult processParallelLoop(
            if (!boundIsPrecise) {
              upperBound = deriveStaticUpperBound(upperBound, rewriter);
              if (!upperBound) {
-              return parallelOp.emitOpError()
-                     << "cannot derive loop-invariant upper bound for number "
-                        "of iterations";
+              return rewriter.notifyMatchFailure(
+                  parallelOp,
+                  "cannot derive loop-invariant upper bound for number of "
+                  "iterations");
              }
            }
            // Compute the number of iterations needed. We compute this as an
@@ -481,9 +482,9 @@ static LogicalResult processParallelLoop(
            // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
            // when this condition is relaxed.
            if (bounds.find(processor) != bounds.end()) {
-            return parallelOp.emitOpError()
-                   << "cannot redefine the bound for processor "
-                   << static_cast<int64_t>(processor);
+            return rewriter.notifyMatchFailure(
+                parallelOp, "cannot redefine the bound for processor " +
+                                Twine(static_cast<int64_t>(processor)));
            }
            bounds[processor] = launchBound;
          }
@@ -565,6 +566,10 @@ static LogicalResult processParallelLoop(
  LogicalResult
  ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                               PatternRewriter &rewriter) const {
+  // We can only transform starting at the outer-most loop. Launches inside of
+  // parallel loops are not supported.
+  if (parallelOp.getParentOfType<ParallelOp>())
+    return rewriter.notifyMatchFailure(parallelOp, "nested in parallel loop");
    // Create a launch operation. We start with bound one for all grid/block
    // sizes. Those will be refined later as we discover them from mappings.
    Location loc = parallelOp.getLoc();
@@ -640,3 +645,9 @@ void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                               MLIRContext *ctx) {
    patterns.insert<ParallelToGpuLaunchLowering>(ctx);
  }
+
+void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
+  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
+    return !parallelOp.getAttr(gpu::getMappingAttrName()); // legal iff unmapped
+  });
+}
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp

index d04a773..2941b40 100644 (file)
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
@@ -53,7 +53,7 @@ struct ParallelLoopToGpuPass
      target.addLegalDialect<AffineDialect>();
      target.addLegalDialect<gpu::GPUDialect>();
      target.addLegalDialect<scf::SCFDialect>();
-    target.addIllegalOp<scf::ParallelOp>();
+    configureParallelLoopToGPULegality(target);
      if (failed(applyPartialConversion(getOperation(), target,
                                        std::move(patterns))))
        signalPassFailure();
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir

index 3af50da..2454ced 100644 (file)
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -317,15 +317,13 @@ func @parallel_loop_optional_attr() {
  
  // -----
  
-// Mapping to the same processor twice.
+// Mapping to the same processor twice. Cannot be mapped.
  
  func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
                            %arg3 : index,
                            %buf : memref<?x?xf32>,
                            %res : memref<?x?xf32>) {
    %four = constant 4 : index
-  // expected-error@+2 {{cannot redefine the bound for processor 1}}
-  // expected-error@+1 {{failed to legalize operation 'scf.parallel'}}
    scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                            step (%four, %four)  {
    } { mapping = [
@@ -335,9 +333,12 @@ func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
    return
  }
  
+// CHECK-LABEL: @parallel_double_map
+// CHECK: scf.parallel
+
  // -----
  
-// Loop with loop-variant upper bound.
+// Loop with loop-variant upper bound. Cannot be mapped.
  
  func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
                                         %arg3 : index,
@@ -346,10 +347,8 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind
    %zero = constant 0 : index
    %one = constant 1 : index
    %four = constant 4 : index
-  // expected-error@+1 {{failed to legalize operation 'scf.parallel'}}
    scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                            step (%four, %four)  {
-    // expected-error@+1 {{cannot derive loop-invariant upper bound}}
      scf.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
                                              step (%one, %one)  {
        %idx0 = addi %i0, %si0 : index
@@ -366,3 +365,25 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind
      ] }
    return
  }
+
+// CHECK-LABEL: @parallel_loop_loop_variant_bound
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+// Loop without annotations. Cannot be mapped.
+
+func @parallel_no_annotations(%arg0 : index, %arg1 : index, %arg2 : index,
+                              %arg3 : index,
+                              %buf : memref<?x?xf32>,
+                              %res : memref<?x?xf32>) {
+  %four = constant 4 : index
+  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+  }
+  return
+}
+
+// CHECK-LABEL: @parallel_no_annotations
+// CHECK: scf.parallel
author	Stephan Herhut <herhut@google.com>
	Thu, 12 Nov 2020 17:36:14 +0000 (18:36 +0100)
committer	Stephan Herhut <herhut@google.com>
	Fri, 13 Nov 2020 08:15:17 +0000 (09:15 +0100)
mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h		patch \| blob \| history
mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp		patch \| blob \| history
mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp		patch \| blob \| history
mlir/test/Conversion/SCFToGPU/parallel_loop.mlir		patch \| blob \| history