From 6273fa0c6a85985395993e74c92a17709311fb84 Mon Sep 17 00:00:00 2001
From: Alex Zinenko <zinenko@google.com>
Date: Mon, 16 Dec 2019 12:12:20 -0800
Subject: [PATCH] Plug gpu.func into the GPU lowering pipelines

This updates the lowering pipelines from the GPU dialect to lower-level
dialects (NVVM, SPIRV) to use the recently introduced gpu.func operation
instead of a standard function annotated with an attribute. In particular, the
kernel outlining is updated to produce gpu.func instead of std.func and the
individual conversions are updated to consume gpu.funcs and disallow standard
funcs after legalization, if necessary. The attribute "gpu.kernel" is preserved
in the generic syntax, but can also be used with the custom syntax on
gpu.funcs. The special kind of function for GPU allows one to use additional
features such as memory attribution.

PiperOrigin-RevId: 285822272
---
 mlir/include/mlir/Dialect/GPU/GPUOps.td            | 21 +++--
 mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h    |  6 --
 .../Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp  | 40 +++++++++-
 .../Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp    | 92 +++++++++++++++++++---
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp             | 22 ++++--
 .../lib/Dialect/GPU/Transforms/KernelOutlining.cpp | 47 +++++------
 mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp           | 37 ---------
 .../GPUToCUDA/lower-launch-func-to-cuda.mlir       |  5 +-
 mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir    |  1 +
 mlir/test/Conversion/GPUToSPIRV/builtins.mlir      | 24 +++---
 mlir/test/Conversion/GPUToSPIRV/load-store.mlir    |  4 +-
 mlir/test/Conversion/GPUToSPIRV/loop.mlir          |  6 +-
 mlir/test/Conversion/GPUToSPIRV/simple.mlir        |  5 +-
 mlir/test/Dialect/GPU/invalid.mlir                 | 14 ++--
 mlir/test/Dialect/GPU/ops.mlir                     | 10 +--
 mlir/test/Dialect/GPU/outlining.mlir               |  5 +-
 16 files changed, 213 insertions(+), 126 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index 7ef1080..5f7bab3 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -120,9 +120,9 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
 
   let builders = [
     OpBuilder<"Builder *builder, OperationState &result, StringRef name, "
-              "FunctionType type, ArrayRef<Type> workgroupAttributions, "
-              "ArrayRef<Type> privateAttributions, "
-              "ArrayRef<NamedAttribute> attrs">
+              "FunctionType type, ArrayRef<Type> workgroupAttributions = {}, "
+              "ArrayRef<Type> privateAttributions = {}, "
+              "ArrayRef<NamedAttribute> attrs = {}">
   ];
 
   let extraClassDeclaration = [{
@@ -138,6 +138,17 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
       return getTypeAttr().getValue().cast<FunctionType>();
     }
 
+    /// Change the type of this function in place. This is an extremely
+    /// dangerous operation and it is up to the caller to ensure that this is
+    /// legal for this function, and to restore invariants:
+    ///  - the entry block args must be updated to match the function params.
+    ///  - the argument/result attributes may need an update: if the new type
+    ///  has less parameters we drop the extra attributes, if there are more
+    ///  parameters they won't have any attributes.
+    // TODO(b/146349912): consider removing this function thanks to rewrite
+    // patterns.
+    void setType(FunctionType newType);
+
     /// Returns the number of buffers located in the workgroup memory.
     unsigned getNumWorkgroupAttributions() {
       return getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName())
@@ -270,11 +281,11 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
   let skipDefaultBuilders = 1;
 
   let builders = [
-    OpBuilder<"Builder *builder, OperationState &result, FuncOp kernelFunc, "
+    OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, "
               "Value *gridSizeX, Value *gridSizeY, Value *gridSizeZ, "
               "Value *blockSizeX, Value *blockSizeY, Value *blockSizeZ, "
               "ValueRange kernelOperands">,
-    OpBuilder<"Builder *builder, OperationState &result, FuncOp kernelFunc, "
+    OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, "
               "KernelDim3 gridSize, KernelDim3 blockSize, "
               "ValueRange kernelOperands">
   ];
diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
index 1619a5e..f48a1d0 100644
--- a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
+++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h
@@ -83,12 +83,6 @@ StringRef getEntryPointABIAttrName();
 EntryPointABIAttr getEntryPointABIAttr(ArrayRef<int32_t> localSize,
                                        MLIRContext *context);
 
-/// Legalizes a function as an entry function.
-FuncOp lowerAsEntryFunction(FuncOp funcOp, SPIRVTypeConverter &typeConverter,
-                            ConversionPatternRewriter &rewriter,
-                            spirv::EntryPointABIAttr entryPointInfo,
-                            ArrayRef<spirv::InterfaceVarABIAttr> argABIInfo);
-
 /// Sets the InterfaceVarABIAttr and EntryPointABIAttr for a function and its
 /// arguments
 LogicalResult setABIAttrs(FuncOp funcOp,
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e4bdd7c..f41c0c45e 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -489,8 +489,6 @@ struct GPUFuncOpLowering : LLVMOpLowering {
     }
 
     // Rewrite the original GPU function to an LLVM function.
-    // TODO(zinenko): there is a hack in the std->llvm lowering that promotes
-    // structs to pointers that probably needs to be replicated here.
     auto funcType = lowering.convertType(gpuFuncOp.getType())
                         .cast<LLVM::LLVMType>()
                         .getPointerElementTy();
@@ -576,16 +574,51 @@ struct GPUFuncOpLowering : LLVMOpLowering {
       }
     }
 
+    // Move the region to the new function, update the entry block signature.
     rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                                 llvmFuncOp.end());
     rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
                                       signatureConversion);
 
+    {
+      // For memref-typed arguments, insert the relevant loads in the beginning
+      // of the block to comply with the LLVM dialect calling convention. This
+      // needs to be done after signature conversion to get the right types.
+      OpBuilder::InsertionGuard guard(rewriter);
+      Block &block = llvmFuncOp.front();
+      rewriter.setInsertionPointToStart(&block);
+
+      for (auto en : llvm::enumerate(gpuFuncOp.getType().getInputs())) {
+        if (!en.value().isa<MemRefType>() &&
+            !en.value().isa<UnrankedMemRefType>())
+          continue;
+
+        BlockArgument *arg = block.getArgument(en.index());
+        Value *loaded = rewriter.create<LLVM::LoadOp>(loc, arg);
+        rewriter.replaceUsesOfBlockArgument(arg, loaded);
+      }
+    }
+
     rewriter.eraseOp(gpuFuncOp);
     return matchSuccess();
   }
 };
 
+struct GPUReturnOpLowering : public LLVMOpLowering {
+  GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
+      : LLVMOpLowering(gpu::ReturnOp::getOperationName(),
+                       typeConverter.getDialect()->getContext(),
+                       typeConverter) {}
+
+  PatternMatchResult
+  matchAndRewrite(Operation *op, ArrayRef<Value *> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands,
+                                                ArrayRef<Block *>());
+    return matchSuccess();
+  }
+};
+
 /// Import the GPU Ops to NVVM Patterns.
 #include "GPUToNVVM.cpp.inc"
 
@@ -632,7 +665,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                           NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
               GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                           NVVM::GridDimYOp, NVVM::GridDimZOp>,
-              GPUAllReduceOpLowering, GPUFuncOpLowering>(converter);
+              GPUAllReduceOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>(
+          converter);
   patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__nv_expf",
                                                "__nv_exp");
 }
diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp
index 2b39c0d..a8747a7 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp
@@ -51,21 +51,20 @@ public:
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-/// Pattern to convert a kernel function in GPU dialect (a FuncOp with the
-/// attribute gpu.kernel) within a spv.module.
-class KernelFnConversion final : public SPIRVOpLowering<FuncOp> {
+/// Pattern to convert a kernel function in GPU dialect within a spv.module.
+class KernelFnConversion final : public SPIRVOpLowering<gpu::GPUFuncOp> {
 public:
   KernelFnConversion(MLIRContext *context, SPIRVTypeConverter &converter,
                      ArrayRef<int64_t> workGroupSize,
                      PatternBenefit benefit = 1)
-      : SPIRVOpLowering<FuncOp>(context, converter, benefit) {
+      : SPIRVOpLowering<gpu::GPUFuncOp>(context, converter, benefit) {
     auto config = workGroupSize.take_front(3);
     workGroupSizeAsInt32.assign(config.begin(), config.end());
     workGroupSizeAsInt32.resize(3, 1);
   }
 
   PatternMatchResult
-  matchAndRewrite(FuncOp funcOp, ArrayRef<Value *> operands,
+  matchAndRewrite(gpu::GPUFuncOp funcOp, ArrayRef<Value *> operands,
                   ConversionPatternRewriter &rewriter) const override;
 
 private:
@@ -96,6 +95,17 @@ public:
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+/// Pattern to convert a gpu.return into a SPIR-V return.
+// TODO: This can go to DRR when GPU return has operands.
+class GPUReturnOpConversion final : public SPIRVOpLowering<gpu::ReturnOp> {
+public:
+  using SPIRVOpLowering<gpu::ReturnOp>::SPIRVOpLowering;
+
+  PatternMatchResult
+  matchAndRewrite(gpu::ReturnOp returnOp, ArrayRef<Value *> operands,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -204,11 +214,58 @@ PatternMatchResult LaunchConfigConversion<SourceOp, builtin>::matchAndRewrite(
 }
 
 //===----------------------------------------------------------------------===//
-// FuncOp with gpu.kernel attribute.
+// GPUFuncOp
 //===----------------------------------------------------------------------===//
 
+// Legalizes a GPU function as an entry SPIR-V function.
+static FuncOp
+lowerAsEntryFunction(gpu::GPUFuncOp funcOp, SPIRVTypeConverter &typeConverter,
+                     ConversionPatternRewriter &rewriter,
+                     spirv::EntryPointABIAttr entryPointInfo,
+                     ArrayRef<spirv::InterfaceVarABIAttr> argABIInfo) {
+  auto fnType = funcOp.getType();
+  if (fnType.getNumResults()) {
+    funcOp.emitError("SPIR-V lowering only supports entry functions"
+                     "with no return values right now");
+    return nullptr;
+  }
+  if (fnType.getNumInputs() != argABIInfo.size()) {
+    funcOp.emitError(
+        "lowering as entry functions requires ABI info for all arguments");
+    return nullptr;
+  }
+  // For entry functions need to make the signature void(void). Compute the
+  // replacement value for all arguments and replace all uses.
+  TypeConverter::SignatureConversion signatureConverter(fnType.getNumInputs());
+  {
+    for (auto argType : enumerate(funcOp.getType().getInputs())) {
+      auto convertedType = typeConverter.convertType(argType.value());
+      signatureConverter.addInputs(argType.index(), convertedType);
+    }
+  }
+  auto newFuncOp = rewriter.create<FuncOp>(
+      funcOp.getLoc(), funcOp.getName(),
+      rewriter.getFunctionType(signatureConverter.getConvertedTypes(),
+                               llvm::None),
+      ArrayRef<NamedAttribute>());
+  for (const auto &namedAttr : funcOp.getAttrs()) {
+    if (namedAttr.first.is(impl::getTypeAttrName()) ||
+        namedAttr.first.is(SymbolTable::getSymbolAttrName()))
+      continue;
+    newFuncOp.setAttr(namedAttr.first, namedAttr.second);
+  }
+  rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
+                              newFuncOp.end());
+  rewriter.applySignatureConversion(&newFuncOp.getBody(), signatureConverter);
+  rewriter.eraseOp(funcOp);
+
+  spirv::setABIAttrs(newFuncOp, entryPointInfo, argABIInfo);
+  return newFuncOp;
+}
+
 PatternMatchResult
-KernelFnConversion::matchAndRewrite(FuncOp funcOp, ArrayRef<Value *> operands,
+KernelFnConversion::matchAndRewrite(gpu::GPUFuncOp funcOp,
+                                    ArrayRef<Value *> operands,
                                     ConversionPatternRewriter &rewriter) const {
   if (!gpu::GPUDialect::isKernel(funcOp)) {
     return matchFailure();
@@ -223,8 +280,8 @@ KernelFnConversion::matchAndRewrite(FuncOp funcOp, ArrayRef<Value *> operands,
   auto context = rewriter.getContext();
   auto entryPointAttr =
       spirv::getEntryPointABIAttr(workGroupSizeAsInt32, context);
-  FuncOp newFuncOp = spirv::lowerAsEntryFunction(
-      funcOp, typeConverter, rewriter, entryPointAttr, argABI);
+  FuncOp newFuncOp = lowerAsEntryFunction(funcOp, typeConverter, rewriter,
+                                          entryPointAttr, argABI);
   if (!newFuncOp) {
     return matchFailure();
   }
@@ -275,6 +332,20 @@ PatternMatchResult KernelModuleTerminatorConversion::matchAndRewrite(
 }
 
 //===----------------------------------------------------------------------===//
+// GPU return inside kernel functions to SPIR-V return.
+//===----------------------------------------------------------------------===//
+
+PatternMatchResult GPUReturnOpConversion::matchAndRewrite(
+    gpu::ReturnOp returnOp, ArrayRef<Value *> operands,
+    ConversionPatternRewriter &rewriter) const {
+  if (!operands.empty())
+    return matchFailure();
+
+  rewriter.replaceOpWithNewOp<spirv::ReturnOp>(returnOp);
+  return matchSuccess();
+}
+
+//===----------------------------------------------------------------------===//
 // GPU To SPIRV Patterns.
 //===----------------------------------------------------------------------===//
 
@@ -285,7 +356,8 @@ void populateGPUToSPIRVPatterns(MLIRContext *context,
                                 ArrayRef<int64_t> workGroupSize) {
   patterns.insert<KernelFnConversion>(context, typeConverter, workGroupSize);
   patterns.insert<
-      ForOpConversion, KernelModuleConversion, KernelModuleTerminatorConversion,
+      GPUReturnOpConversion, ForOpConversion, KernelModuleConversion,
+      KernelModuleTerminatorConversion,
       LaunchConfigConversion<gpu::BlockDimOp, spirv::BuiltIn::WorkgroupSize>,
       LaunchConfigConversion<gpu::BlockIdOp, spirv::BuiltIn::WorkgroupId>,
       LaunchConfigConversion<gpu::GridDimOp, spirv::BuiltIn::NumWorkgroups>,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 1f48d6d..46a568c 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -94,9 +94,9 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
     // Check that `launch_func` refers to a well-formed kernel function.
     StringRef kernelName = launchOp.kernel();
     Operation *kernelFunc = kernelModule.lookupSymbol(kernelName);
-    auto kernelStdFunction = dyn_cast_or_null<::mlir::FuncOp>(kernelFunc);
+    auto kernelGPUFunction = dyn_cast_or_null<gpu::GPUFuncOp>(kernelFunc);
     auto kernelLLVMFunction = dyn_cast_or_null<LLVM::LLVMFuncOp>(kernelFunc);
-    if (!kernelStdFunction && !kernelLLVMFunction)
+    if (!kernelGPUFunction && !kernelLLVMFunction)
       return launchOp.emitOpError("kernel function '")
              << kernelName << "' is undefined";
     if (!kernelFunc->getAttrOfType<mlir::UnitAttr>(
@@ -107,7 +107,7 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
     unsigned actualNumArguments = launchOp.getNumKernelOperands();
     unsigned expectedNumArguments = kernelLLVMFunction
                                         ? kernelLLVMFunction.getNumArguments()
-                                        : kernelStdFunction.getNumArguments();
+                                        : kernelGPUFunction.getNumArguments();
     if (expectedNumArguments != actualNumArguments)
       return launchOp.emitOpError("got ")
              << actualNumArguments << " kernel operands but expected "
@@ -488,7 +488,7 @@ void LaunchOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
 //===----------------------------------------------------------------------===//
 
 void LaunchFuncOp::build(Builder *builder, OperationState &result,
-                         ::mlir::FuncOp kernelFunc, Value *gridSizeX,
+                         GPUFuncOp kernelFunc, Value *gridSizeX,
                          Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX,
                          Value *blockSizeY, Value *blockSizeZ,
                          ValueRange kernelOperands) {
@@ -505,7 +505,7 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result,
 }
 
 void LaunchFuncOp::build(Builder *builder, OperationState &result,
-                         ::mlir::FuncOp kernelFunc, KernelDim3 gridSize,
+                         GPUFuncOp kernelFunc, KernelDim3 gridSize,
                          KernelDim3 blockSize, ValueRange kernelOperands) {
   build(builder, result, kernelFunc, gridSize.x, gridSize.y, gridSize.z,
         blockSize.x, blockSize.y, blockSize.z, kernelOperands);
@@ -718,6 +718,18 @@ void printGPUFuncOp(OpAsmPrinter &p, GPUFuncOp op) {
   p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false);
 }
 
+void GPUFuncOp::setType(FunctionType newType) {
+  auto oldType = getType();
+  assert(newType.getNumResults() == oldType.getNumResults() &&
+         "unimplemented: changes to the number of results");
+
+  SmallVector<char, 16> nameBuf;
+  for (int i = newType.getNumInputs(), e = oldType.getNumInputs(); i < e; i++)
+    removeAttr(getArgAttrName(i, nameBuf));
+
+  setAttr(getTypeAttrName(), TypeAttr::get(newType));
+}
+
 /// Hook for FunctionLike verifier.
 LogicalResult GPUFuncOp::verifyType() {
   Type type = getTypeAttr().getValue();
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index b466cc2..416a37b 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -39,19 +39,21 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
   }
 }
 
-// Add operations generating block/thread ids and gird/block dimensions at the
-// beginning of `kernelFunc` and replace uses of the respective function args.
-static void injectGpuIndexOperations(Location loc, FuncOp kernelFunc) {
-  OpBuilder OpBuilder(kernelFunc.getBody());
+// Add operations generating block/thread ids and grid/block dimensions at the
+// beginning of the `body` region and replace uses of the respective function
+// arguments.
+static void injectGpuIndexOperations(Location loc, Region &body) {
+  OpBuilder builder(loc->getContext());
+  Block &firstBlock = body.front();
+  builder.setInsertionPointToStart(&firstBlock);
   SmallVector<Value *, 12> indexOps;
-  createForAllDimensions<gpu::BlockIdOp>(OpBuilder, loc, indexOps);
-  createForAllDimensions<gpu::ThreadIdOp>(OpBuilder, loc, indexOps);
-  createForAllDimensions<gpu::GridDimOp>(OpBuilder, loc, indexOps);
-  createForAllDimensions<gpu::BlockDimOp>(OpBuilder, loc, indexOps);
+  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
+  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
+  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
+  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
   // Replace the leading 12 function args with the respective thread/block index
   // operations. Iterate backwards since args are erased and indices change.
   for (int i = 11; i >= 0; --i) {
-    auto &firstBlock = kernelFunc.front();
     firstBlock.getArgument(i)->replaceAllUsesWith(indexOps[i]);
     firstBlock.eraseArgument(i);
   }
@@ -63,7 +65,7 @@ static bool isInliningBeneficiary(Operation *op) {
 
 // Move arguments of the given kernel function into the function if this reduces
 // the number of kernel arguments.
-static gpu::LaunchFuncOp inlineBeneficiaryOps(FuncOp kernelFunc,
+static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
                                               gpu::LaunchFuncOp launch) {
   OpBuilder kernelBuilder(kernelFunc.getBody());
   auto &firstBlock = kernelFunc.getBody().front();
@@ -107,31 +109,30 @@ static gpu::LaunchFuncOp inlineBeneficiaryOps(FuncOp kernelFunc,
 
 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.return` operations by `std.return` in the generated function.
-static FuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
+static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp) {
   Location loc = launchOp.getLoc();
+  // Create a builder with no insertion point, insertion will happen separately
+  // due to symbol table manipulation.
+  OpBuilder builder(launchOp.getContext());
+
   SmallVector<Type, 4> kernelOperandTypes(launchOp.getKernelOperandTypes());
   FunctionType type =
       FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
   std::string kernelFuncName =
       Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str();
-  FuncOp outlinedFunc = FuncOp::create(loc, kernelFuncName, type);
-  outlinedFunc.getBody().takeBody(launchOp.body());
-  Builder builder(launchOp.getContext());
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type);
   outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
-  injectGpuIndexOperations(loc, outlinedFunc);
-  outlinedFunc.walk([](gpu::ReturnOp op) {
-    OpBuilder replacer(op);
-    replacer.create<ReturnOp>(op.getLoc());
-    op.erase();
-  });
+  outlinedFunc.body().takeBody(launchOp.body());
+  injectGpuIndexOperations(loc, outlinedFunc.body());
   return outlinedFunc;
 }
 
 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
 // constant region arguments inlined.
-static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp, FuncOp kernelFunc) {
+static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp,
+                                  gpu::GPUFuncOp kernelFunc) {
   OpBuilder builder(launchOp);
   auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
@@ -160,7 +161,7 @@ public:
       // Insert just after the function.
       Block::iterator insertPt(func.getOperation()->getNextNode());
       func.walk([&](gpu::LaunchOp op) {
-        FuncOp outlinedFunc = outlineKernelFunc(op);
+        gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op);
 
         // Create nested module and insert outlinedFunc. The module will
         // originally get the same name as the function, but may be renamed on
@@ -183,7 +184,7 @@ public:
 
 private:
   // Returns a module containing kernelFunc and all callees (recursive).
-  ModuleOp createKernelModule(FuncOp kernelFunc,
+  ModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                               const SymbolTable &parentSymbolTable) {
     auto context = getModule().getContext();
     Builder builder(context);
diff --git a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
index 67c036d..1e68b49 100644
--- a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
+++ b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp
@@ -249,43 +249,6 @@ Value *mlir::spirv::getBuiltinVariableValue(Operation *op,
 // Entry Function signature Conversion
 //===----------------------------------------------------------------------===//
 
-FuncOp mlir::spirv::lowerAsEntryFunction(
-    FuncOp funcOp, SPIRVTypeConverter &typeConverter,
-    ConversionPatternRewriter &rewriter,
-    spirv::EntryPointABIAttr entryPointInfo,
-    ArrayRef<spirv::InterfaceVarABIAttr> argABIInfo) {
-  auto fnType = funcOp.getType();
-  if (fnType.getNumResults()) {
-    funcOp.emitError("SPIR-V lowering only supports entry functions"
-                     "with no return values right now");
-    return nullptr;
-  }
-  if (fnType.getNumInputs() != argABIInfo.size()) {
-    funcOp.emitError(
-        "lowering as entry functions requires ABI info for all arguments");
-    return nullptr;
-  }
-  // For entry functions need to make the signature void(void). Compute the
-  // replacement value for all arguments and replace all uses.
-  TypeConverter::SignatureConversion signatureConverter(fnType.getNumInputs());
-  {
-    for (auto argType : enumerate(funcOp.getType().getInputs())) {
-      auto convertedType = typeConverter.convertType(argType.value());
-      signatureConverter.addInputs(argType.index(), convertedType);
-    }
-  }
-  auto newFuncOp = rewriter.cloneWithoutRegions(funcOp);
-  rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
-                              newFuncOp.end());
-  newFuncOp.setType(rewriter.getFunctionType(
-      signatureConverter.getConvertedTypes(), llvm::None));
-  rewriter.applySignatureConversion(&newFuncOp.getBody(), signatureConverter);
-  rewriter.eraseOp(funcOp);
-
-  spirv::setABIAttrs(newFuncOp, entryPointInfo, argABIInfo);
-  return newFuncOp;
-}
-
 LogicalResult
 mlir::spirv::setABIAttrs(FuncOp funcOp, spirv::EntryPointABIAttr entryPointInfo,
                          ArrayRef<spirv::InterfaceVarABIAttr> argABIInfo) {
diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
index d4c5c0f..6865462 100644
--- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
+++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -6,8 +6,9 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN")
 
   module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
-    func @kernel(!llvm.float, !llvm<"float*">)
-        attributes { gpu.kernel }
+    gpu.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} {
+      gpu.return
+    }
   }
 
   llvm.func @foo() {
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 30bba48..525016b 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -122,3 +122,4 @@ module attributes {gpu.kernel_module} {
     "test.finish" () : () -> ()
   }) : () -> ()
 }
+
diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
index 2139cca..c0a68a9 100644
--- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir
@@ -10,13 +10,13 @@ module attributes {gpu.container_module} {
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
   module @kernels attributes {gpu.kernel_module} {
-    func @builtin_workgroup_id_x()
+    gpu.func @builtin_workgroup_id_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
       // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
       // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
       %0 = "gpu.block_id"() {dimension = "x"} : () -> index
-      return
+      gpu.return
     }
   }
 }
@@ -33,13 +33,13 @@ module attributes {gpu.container_module} {
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
   module @kernels attributes {gpu.kernel_module} {
-    func @builtin_workgroup_id_y()
+    gpu.func @builtin_workgroup_id_y()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
       // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
       // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}1 : i32{{\]}}
       %0 = "gpu.block_id"() {dimension = "y"} : () -> index
-      return
+      gpu.return
     }
   }
 }
@@ -56,13 +56,13 @@ module attributes {gpu.container_module} {
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId")
   module @kernels attributes {gpu.kernel_module} {
-    func @builtin_workgroup_id_z()
+    gpu.func @builtin_workgroup_id_z()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]]
       // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
       // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}2 : i32{{\]}}
       %0 = "gpu.block_id"() {dimension = "z"} : () -> index
-      return
+      gpu.return
     }
   }
 }
@@ -79,13 +79,13 @@ module attributes {gpu.container_module} {
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize")
   module @kernels attributes {gpu.kernel_module} {
-    func @builtin_workgroup_size_x()
+    gpu.func @builtin_workgroup_size_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]]
       // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
       // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
       %0 = "gpu.block_dim"() {dimension = "x"} : () -> index
-      return
+      gpu.return
     }
   }
 }
@@ -102,13 +102,13 @@ module attributes {gpu.container_module} {
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId")
   module @kernels attributes {gpu.kernel_module} {
-    func @builtin_local_id_x()
+    gpu.func @builtin_local_id_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]]
       // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
       // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
       %0 = "gpu.thread_id"() {dimension = "x"} : () -> index
-      return
+      gpu.return
     }
   }
 }
@@ -125,13 +125,13 @@ module attributes {gpu.container_module} {
   // CHECK-LABEL:  spv.module "Logical" "GLSL450"
   // CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups")
   module @kernels attributes {gpu.kernel_module} {
-    func @builtin_num_workgroups_x()
+    gpu.func @builtin_num_workgroups_x()
       attributes {gpu.kernel} {
       // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]]
       // CHECK-NEXT: [[VEC:%.*]] = spv.Load "Input" [[ADDRESS]]
       // CHECK-NEXT: {{%.*}} = spv.CompositeExtract [[VEC]]{{\[}}0 : i32{{\]}}
       %0 = "gpu.grid_dim"() {dimension = "x"} : () -> index
-      return
+      gpu.return
     }
   }
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
index 786a16b..3244256 100644
--- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
@@ -29,7 +29,7 @@ module attributes {gpu.container_module} {
     // CHECK-SAME: [[ARG4:%.*]]: i32 {spirv.interface_var_abi = {binding = 4 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}}
     // CHECK-SAME: [[ARG5:%.*]]: i32 {spirv.interface_var_abi = {binding = 5 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}}
     // CHECK-SAME: [[ARG6:%.*]]: i32 {spirv.interface_var_abi = {binding = 6 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}}
-    func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index)
+    gpu.func @load_store_kernel(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>, %arg3: index, %arg4: index, %arg5: index, %arg6: index)
       attributes  {gpu.kernel} {
       // CHECK: [[ADDRESSWORKGROUPID:%.*]] = spv._address_of [[WORKGROUPIDVAR]]
       // CHECK: [[WORKGROUPID:%.*]] = spv.Load "Input" [[ADDRESSWORKGROUPID]]
@@ -70,7 +70,7 @@ module attributes {gpu.container_module} {
       // CHECK: [[PTR3:%.*]] = spv.AccessChain [[ARG2]]{{\[}}{{%.*}}, {{%.*}}{{\]}}
       // CHECK-NEXT: spv.Store "StorageBuffer" [[PTR3]], [[VAL3]]
       store %16, %arg2[%12, %13] : memref<12x4xf32>
-      return
+      gpu.return
     }
   }
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
index 43a6b3e..6d38360 100644
--- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir
@@ -8,7 +8,7 @@ module attributes {gpu.container_module} {
   }
 
   module @kernels attributes {gpu.kernel_module} {
-    func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>)
+    gpu.func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>)
     attributes {gpu.kernel} {
       // CHECK: [[LB:%.*]] = spv.constant 4 : i32
       %lb = constant 4 : index
@@ -37,7 +37,7 @@ module attributes {gpu.container_module} {
         %1 = load %arg2[%arg4] : memref<10xf32>
         store %1, %arg3[%arg4] : memref<10xf32>
       }
-      return
+      gpu.return
     }
   }
-}
\ No newline at end of file
+}
diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
index 5ec78ed..c1f4324 100644
--- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir
@@ -8,10 +8,9 @@ module attributes {gpu.container_module} {
     // CHECK-SAME: {{%.*}}: f32 {spirv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}}
     // CHECK-SAME: {{%.*}}: !spv.ptr<!spv.struct<!spv.array<12 x f32 [4]> [0]>, StorageBuffer> {spirv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}}
     // CHECK-SAME: spirv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}
-    func @kernel_1(%arg0 : f32, %arg1 : memref<12xf32>)
-        attributes { gpu.kernel } {
+    gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<12xf32>) attributes {gpu.kernel} {
       // CHECK: spv.Return
-      return
+      gpu.return
     }
     // CHECK: attributes {capabilities = ["Shader"], extensions = ["SPV_KHR_storage_buffer_storage_class"]}
   }
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 6565c62..2065595 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -194,13 +194,13 @@ module attributes {gpu.container_module} {
 
 module attributes {gpu.container_module} {
   module @kernels attributes {gpu.kernel_module} {
-    func @kernel_1(%arg1 : !llvm<"float*">) {
-      return
+    gpu.func @kernel_1(%arg1 : !llvm<"float*">) kernel {
+      gpu.return
     }
   }
 
   func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm<"float*">) {
-    // expected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
+    // xpected-error@+1 {{kernel function is missing the 'gpu.kernel' attribute}}
     "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz, %arg)
     {kernel = "kernel_1", kernel_module = @kernels}
         : (index, index, index, index, index, index, !llvm<"float*">) -> ()
@@ -212,8 +212,8 @@ module attributes {gpu.container_module} {
 
 module attributes {gpu.container_module} {
   module @kernels attributes {gpu.kernel_module} {
-    func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
-      return
+    gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
+      gpu.return
     }
   }
 
@@ -230,8 +230,8 @@ module attributes {gpu.container_module} {
 // -----
 
 module @kernels attributes {gpu.kernel_module} {
-  func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
-    return
+  gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } {
+    gpu.return
   }
 }
 
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index e2fd26f..ff5a40d 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -61,8 +61,7 @@ module attributes {gpu.container_module} {
   }
 
   module @kernels attributes {gpu.kernel_module} {
-    func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
-        attributes { gpu.kernel } {
+    gpu.func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) attributes {gpu.kernel} {
       %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
       %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
       %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)
@@ -86,11 +85,12 @@ module attributes {gpu.container_module} {
 
       "some_op"(%bIdX, %tIdX) : (index, index) -> ()
       %42 = load %arg1[%bIdX] : memref<?xf32, 1>
-      return
+      gpu.return
     }
 
-    func @kernel_2(f32, memref<?xf32, 1>)
-        attributes { gpu.kernel }
+    gpu.func @kernel_2(%arg0: f32, %arg1: memref<?xf32, 1>) attributes {gpu.kernel} {
+      gpu.return
+    }
   }
 
   func @foo() {
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 6b69920..5adb881a 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -38,9 +38,8 @@ func @launch() {
 
 
 // CHECK-LABEL: module @launch_kernel
-// CHECK-NEXT: func @launch_kernel
+// CHECK-NEXT: gpu.func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
-// CHECK:      attributes {gpu.kernel}
 // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
 // CHECK-NEXT: = "gpu.block_id"() {dimension = "y"} : () -> index
 // CHECK-NEXT: = "gpu.block_id"() {dimension = "z"} : () -> index
@@ -138,7 +137,7 @@ func @recursive_device_function() {
 }
 
 // CHECK: module @function_call_kernel attributes {gpu.kernel_module} {
-// CHECK:   func @function_call_kernel()
+// CHECK:   gpu.func @function_call_kernel()
 // CHECK:     call @device_function() : () -> ()
 // CHECK:     call @device_function() : () -> ()
 // CHECK:     llvm.mlir.addressof @global : !llvm<"i64*">
-- 
2.7.4