Add 'gpu.terminator' operation.

author Stephan Herhut <herhut@google.com>

Wed, 29 Jan 2020 12:59:36 +0000 (13:59 +0100)

committer Stephan Herhut <herhut@google.com>

Thu, 30 Jan 2020 11:41:41 +0000 (12:41 +0100)
author Stephan Herhut <herhut@google.com>
Wed, 29 Jan 2020 12:59:36 +0000 (13:59 +0100)
committer Stephan Herhut <herhut@google.com>
Thu, 30 Jan 2020 11:41:41 +0000 (12:41 +0100)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td

index d4468ad..ece0568 100644 (file)
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -472,8 +472,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>,
    let verifier = [{ return ::verify(*this); }];
  }
  
-def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>,
-    Results<(outs)> {
+def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, Terminator]>,
+    Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
+  let summary = "Terminator for GPU functions.";
+  let description = [{
+    A terminator operation for regions that appear in the body of `gpu.func`
+    functions. The operands to the `gpu.return` are the result values returned
+    by an invocation of the `gpu.func`.
+  }];
+
+  let builders = [OpBuilder<"Builder *builder, OperationState &result", " // empty">];
+
+  let parser = [{ return parseReturnOp(parser, result); }];
+  let printer = [{ p << getOperationName(); }];
+  let verifier = [{ return ::verify(*this); }];
+}
+
+def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">, Terminator]>,
+    Arguments<(ins)>, Results<(outs)> {
    let summary = "Terminator for GPU launch regions.";
    let description = [{
      A terminator operation for regions that appear in the body of `gpu.launch`
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp

index d0cc32b..e5e4121 100644 (file)
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -306,9 +306,9 @@ createLaunchBody(OpBuilder &builder, OpTy rootForOp, gpu::LaunchOp launchOp,
                   unsigned numBlockDims, unsigned numThreadDims) {
    OpBuilder::InsertionGuard bodyInsertionGuard(builder);
    builder.setInsertionPointToEnd(&launchOp.body().front());
-  auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
+  auto terminatorOp = builder.create<gpu::TerminatorOp>(launchOp.getLoc());
  
-  rootForOp.getOperation()->moveBefore(returnOp);
+  rootForOp.getOperation()->moveBefore(terminatorOp);
    SmallVector<Value, 3> workgroupID, numWorkGroups;
    packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
                   workgroupID, numWorkGroups);
@@ -435,7 +435,7 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
    Location terminatorLoc = terminator.getLoc();
    terminator.erase();
    builder.setInsertionPointToEnd(innermostForOp.getBody());
-  builder.create<gpu::ReturnOp>(terminatorLoc);
+  builder.create<gpu::TerminatorOp>(terminatorLoc, llvm::None);
    launchOp.body().front().getOperations().splice(
        launchOp.body().front().begin(),
        innermostForOp.getBody()->getOperations());
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

index 99b73e5..d7dd699 100644 (file)
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -270,18 +270,19 @@ static LogicalResult verify(LaunchOp op) {
    }
  
    // Block terminators without successors are expected to exit the kernel region
-  // and must be `gpu.launch`.
+  // and must be `gpu.terminator`.
    for (Block &block : op.body()) {
      if (block.empty())
        continue;
      if (block.back().getNumSuccessors() != 0)
        continue;
-    if (!isa<gpu::ReturnOp>(&block.back())) {
+    if (!isa<gpu::TerminatorOp>(&block.back())) {
        return block.back()
-                 .emitError("expected 'gpu.terminator' or a terminator with "
-                            "successors")
-                 .attachNote(op.getLoc())
-             << "in '" << LaunchOp::getOperationName() << "' body region";
+          .emitError()
+          .append("expected '", gpu::TerminatorOp::getOperationName(),
+                  "' or a terminator with successors")
+          .attachNote(op.getLoc())
+          .append("in '", LaunchOp::getOperationName(), "' body region");
      }
    }
  
@@ -680,7 +681,7 @@ static ParseResult parseGPUFuncOp(OpAsmParser &parser, OperationState &result) {
             << "gpu.func requires named arguments";
  
    // Construct the function type. More types will be added to the region, but
-  // not to the functiont type.
+  // not to the function type.
    Builder &builder = parser.getBuilder();
    auto type = builder.getFunctionType(argTypes, resultTypes);
    result.addAttribute(GPUFuncOp::getTypeAttrName(), TypeAttr::get(type));
@@ -767,6 +768,10 @@ LogicalResult GPUFuncOp::verifyType() {
    if (!type.isa<FunctionType>())
      return emitOpError("requires '" + getTypeAttrName() +
                         "' attribute of function type");
+
+  if (isKernel() && getType().getNumResults() != 0)
+    return emitOpError() << "expected void return type for kernel function";
+
    return success();
  }
  
@@ -815,6 +820,45 @@ LogicalResult GPUFuncOp::verifyBody() {
  }
  
  //===----------------------------------------------------------------------===//
+// ReturnOp
+//===----------------------------------------------------------------------===//
+
+static ParseResult parseReturnOp(OpAsmParser &parser, OperationState &result) {
+  llvm::SmallVector<OpAsmParser::OperandType, 4> operands;
+  llvm::SmallVector<Type, 4> types;
+  if (parser.parseOperandList(operands) ||
+      parser.parseOptionalColonTypeList(types) ||
+      parser.resolveOperands(operands, types, parser.getCurrentLocation(),
+                             result.operands))
+    return failure();
+
+  return success();
+}
+
+static LogicalResult verify(gpu::ReturnOp returnOp) {
+  GPUFuncOp function = returnOp.getParentOfType<GPUFuncOp>();
+
+  FunctionType funType = function.getType();
+
+  if (funType.getNumResults() != returnOp.operands().size())
+    return returnOp.emitOpError()
+        .append("expected ", funType.getNumResults(), " result operands")
+        .attachNote(function.getLoc())
+        .append("return type declared here");
+
+  for (auto pair : llvm::enumerate(
+           llvm::zip(function.getType().getResults(), returnOp.operands()))) {
+    Type type;
+    Value operand;
+    std::tie(type, operand) = pair.value();
+    if (type != operand.getType())
+      return returnOp.emitOpError() << "unexpected type `" << operand.getType()
+                                    << "' for operand #" << pair.index();
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
  // GPUModuleOp
  //===----------------------------------------------------------------------===//
  
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

index 0594b5d..c17ead5 100644 (file)
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -99,7 +99,7 @@ static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
  }
  
  // Outline the `gpu.launch` operation body into a kernel function. Replace
-// `gpu.return` operations by `std.return` in the generated function.
+// `gpu.terminator` operations by `gpu.return` in the generated function.
  static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
    Location loc = launchOp.getLoc();
    // Create a builder with no insertion point, insertion will happen separately
@@ -116,6 +116,12 @@ static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
                         builder.getUnitAttr());
    outlinedFunc.body().takeBody(launchOp.body());
    injectGpuIndexOperations(loc, outlinedFunc.body());
+  outlinedFunc.walk([](gpu::TerminatorOp op) {
+    OpBuilder replacer(op);
+    replacer.create<gpu::ReturnOp>(op.getLoc());
+    op.erase();
+  });
+
    return outlinedFunc;
  }
  
diff --git a/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir b/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir

index f4567fb..81feba3 100644 (file)
--- a/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir
@@ -23,7 +23,7 @@ func @foo(%arg0: memref<?xf32>, %arg1 : index) {
        // CHECK: %[[prod_j:.*]] = muli %{{.*}}, %{{.*}} : index
        // CHECK: addi %{{.*}}, %[[prod_j]] : index
  
-      // CHECK: gpu.return
+      // CHECK: gpu.terminator
      }
    }
    return
diff --git a/mlir/test/Conversion/LoopsToGPU/step_one.mlir b/mlir/test/Conversion/LoopsToGPU/step_one.mlir

index a8db604..825b564 100644 (file)
--- a/mlir/test/Conversion/LoopsToGPU/step_one.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/step_one.mlir
@@ -73,8 +73,8 @@ func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
            // CHECK-22-NEXT:   store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
            store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
  
-          // CHECK-11: gpu.return
-          // CHECK-22: gpu.return
+          // CHECK-11: gpu.terminator
+          // CHECK-22: gpu.terminator
          }
        }
      }
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir

index 8bb170c..75644b5 100644 (file)
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -21,7 +21,7 @@ func @propagate_constant(%arg1: memref<?xf32>) {
  
      // CHECK: "bar"(%[[inner_arg]])
      "bar"(%y) : (memref<?xf32>) -> ()
-    gpu.return
+    gpu.terminator
    }
    return
  }
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir

index 8f900bf..8b8085e 100644 (file)
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -376,7 +376,7 @@ func @shuffle_unsupported_type(%arg0 : index, %arg1 : i32, %arg2 : i32) {
  // -----
  
  module {
-  module @gpu_funcs attributes {gpu.kernel_module} {
+  gpu.module @gpu_funcs {
      // expected-error @+1 {{custom op 'gpu.func' gpu.func requires named arguments}}
      gpu.func @kernel_1(f32, f32) {
      ^bb0(%arg0: f32):
@@ -428,3 +428,39 @@ module {
      }
    }
  }
+
+// -----
+
+module {
+  module @gpu_funcs attributes {gpu.kernel_module} {
+    // expected-error @+1 {{expected memory space 5 in attribution}}
+    gpu.func @kernel() private(%0: memref<4xf32>) {
+      gpu.return
+    }
+  }
+}
+
+// -----
+
+module {
+  gpu.module @gpu_funcs {
+    // expected-note @+1 {{return type declared here}}
+    gpu.func @kernel() {
+      %0 = constant 0 : index
+      // expected-error @+1 {{'gpu.return' op expected 0 result operands}}
+      gpu.return %0 : index
+    }
+  }
+}
+
+// -----
+
+module {
+  gpu.module @gpu_funcs {
+    // expected-error @+1 {{'gpu.func' op expected void return type for kernel function}}
+    gpu.func @kernel() -> index kernel {
+      %0 = constant 0 : index
+      gpu.return
+    }
+  }
+}
+\ No newline at end of file
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir

index 033e7cb..6719e40 100644 (file)
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -7,8 +7,8 @@ module attributes {gpu.container_module} {
      // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
      gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
                 threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
-      // CHECK: gpu.return
-      gpu.return
+      // CHECK: gpu.terminator
+      gpu.terminator
      }
      return
    }
@@ -19,8 +19,8 @@ module attributes {gpu.container_module} {
      gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk)
                 threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd)
                 args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
-      // CHECK: gpu.return
-      gpu.return
+      // CHECK: gpu.terminator
+      gpu.terminator
      }
      return
    }
@@ -34,8 +34,8 @@ module attributes {gpu.container_module} {
                 args(%kernel_arg0 = %float, %kernel_arg1 = %data) : f32, memref<?xf32, 1> {
        // CHECK: "use"(%{{.*}})
        "use"(%kernel_arg0): (f32) -> ()
-      // CHECK: gpu.return
-      gpu.return
+      // CHECK: gpu.terminator
+      gpu.terminator
      }
      return
    }
@@ -54,8 +54,8 @@ module attributes {gpu.container_module} {
            "use"(%val) : (index) -> ()
          }) : () -> ()
        }) : () -> ()
-      // CHECK: gpu.return
-      gpu.return
+      // CHECK: gpu.terminator
+      gpu.terminator
      }
      return
    }
@@ -118,11 +118,11 @@ module attributes {gpu.container_module} {
    }
  
    module @gpu_funcs attributes {gpu.kernel_module} {
-    // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32) -> f32
+    // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
      // CHECK:       workgroup
      // CHECK:       private
      // CHECK:       attributes
-    gpu.func @kernel_1(%arg0: f32) -> f32
+    gpu.func @kernel_1(%arg0: f32)
          workgroup(%arg1: memref<42xf32, 3>)
          private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)
          kernel
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir

index 425b4b3..e6fddb6 100644 (file)
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -31,7 +31,7 @@ func @launch() {
      "use"(%arg0): (f32) -> ()
      "some_op"(%bx, %block_x) : (index, index) -> ()
      %42 = load %arg1[%tx] : memref<?xf32, 1>
-    gpu.return
+    gpu.terminator
    }
    return
  }
@@ -68,14 +68,14 @@ func @multiple_launches() {
                                         %grid_z = %cst)
               threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
                                          %block_z = %cst) {
-    gpu.return
+    gpu.terminator
    }
    // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]]) {kernel = "multiple_launches_kernel", kernel_module = @multiple_launches_kernel_0} : (index, index, index, index, index, index) -> ()
    gpu.launch blocks(%bx2, %by2, %bz2) in (%grid_x2 = %cst, %grid_y2 = %cst,
                                            %grid_z2 = %cst)
               threads(%tx2, %ty2, %tz2) in (%block_x2 = %cst, %block_y2 = %cst,
                                             %block_z2 = %cst) {
-    gpu.return
+    gpu.terminator
    }
    return
  }
@@ -99,7 +99,7 @@ func @extra_constants(%arg0 : memref<?xf32>) {
                                          %block_z = %cst)
               args(%kernel_arg0 = %cst2, %kernel_arg1 = %arg0, %kernel_arg2 = %cst3) : index, memref<?xf32>, index {
      "use"(%kernel_arg0, %kernel_arg1, %kernel_arg2) : (index, memref<?xf32>, index) -> ()
-    gpu.return
+    gpu.terminator
    }
    return
  }
@@ -121,19 +121,19 @@ func @function_call(%arg0 : memref<?xf32>) {
      call @device_function() : () -> ()
      call @device_function() : () -> ()
      %0 = llvm.mlir.addressof @global : !llvm<"i64*">
-    gpu.return
+    gpu.terminator
    }
    return
  }
  
  func @device_function() {
    call @recursive_device_function() : () -> ()
-  gpu.return
+  return
  }
  
  func @recursive_device_function() {
    call @recursive_device_function() : () -> ()
-  gpu.return
+  return
  }
  
  // CHECK: gpu.module @function_call_kernel {
@@ -141,6 +141,7 @@ func @recursive_device_function() {
  // CHECK:     call @device_function() : () -> ()
  // CHECK:     call @device_function() : () -> ()
  // CHECK:     llvm.mlir.addressof @global : !llvm<"i64*">
+// CHECK:     gpu.return
  //
  // CHECK:   llvm.mlir.global internal @global(42 : i64) : !llvm.i64
  //
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir

index a993d0c..192bbd4 100644 (file)
--- a/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-op.mlir
@@ -20,7 +20,7 @@ func @main() {
      %val = sitofp %t3 : i32 to f32
      %sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
      store %sum, %kernel_dst[%tz, %ty, %tx] : memref<?x?x?xf32>
-    gpu.return
+    gpu.terminator
    }
    %U = memref_cast %dst : memref<?x?x?xf32> to memref<*xf32>
    call @print_memref_f32(%U) : (memref<*xf32>) -> ()
diff --git a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir

index 3839770..704a39d 100644 (file)
--- a/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
+++ b/mlir/test/mlir-cuda-runner/all-reduce-region.mlir
@@ -18,7 +18,7 @@ func @main() {
      }) : (i32) -> (i32)
      %res = sitofp %xor : i32 to f32
      store %res, %kernel_dst[%tx] : memref<?xf32>
-    gpu.return
+    gpu.terminator
    }
    %U = memref_cast %dst : memref<?xf32> to memref<*xf32>
    call @print_memref_f32(%U) : (memref<*xf32>) -> ()
diff --git a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir

index d887e73..45675aa 100644 (file)
--- a/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
+++ b/mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir
@@ -7,7 +7,7 @@ func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
               threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst)
               args(%kernel_arg0 = %arg0, %kernel_arg1 = %arg1) : f32, memref<?xf32> {
      store %kernel_arg0, %kernel_arg1[%tx] : memref<?xf32>
-    gpu.return
+    gpu.terminator
    }
    return
  }
diff --git a/mlir/test/mlir-cuda-runner/shuffle.mlir b/mlir/test/mlir-cuda-runner/shuffle.mlir

index 1b01399..0ae2e48 100644 (file)
--- a/mlir/test/mlir-cuda-runner/shuffle.mlir
+++ b/mlir/test/mlir-cuda-runner/shuffle.mlir
@@ -21,7 +21,7 @@ func @main() {
      br ^bb1(%m1 : f32)
    ^bb1(%value : f32):
      store %value, %kernel_dst[%tx] : memref<?xf32>
-    gpu.return
+    gpu.terminator
    }
    %U = memref_cast %dst : memref<?xf32> to memref<*xf32>
    call @print_memref_f32(%U) : (memref<*xf32>) -> ()
author	Stephan Herhut <herhut@google.com>
	Wed, 29 Jan 2020 12:59:36 +0000 (13:59 +0100)
committer	Stephan Herhut <herhut@google.com>
	Thu, 30 Jan 2020 11:41:41 +0000 (12:41 +0100)
mlir/include/mlir/Dialect/GPU/GPUOps.td		patch \| blob \| history
mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp		patch \| blob \| history
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp		patch \| blob \| history
mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp		patch \| blob \| history
mlir/test/Conversion/LoopsToGPU/linalg_to_gpu.mlir		patch \| blob \| history
mlir/test/Conversion/LoopsToGPU/step_one.mlir		patch \| blob \| history
mlir/test/Dialect/GPU/canonicalize.mlir		patch \| blob \| history
mlir/test/Dialect/GPU/invalid.mlir		patch \| blob \| history
mlir/test/Dialect/GPU/ops.mlir		patch \| blob \| history
mlir/test/Dialect/GPU/outlining.mlir		patch \| blob \| history
mlir/test/mlir-cuda-runner/all-reduce-op.mlir		patch \| blob \| history
mlir/test/mlir-cuda-runner/all-reduce-region.mlir		patch \| blob \| history
mlir/test/mlir-cuda-runner/gpu-to-cubin.mlir		patch \| blob \| history
mlir/test/mlir-cuda-runner/shuffle.mlir		patch \| blob \| history