From: Christian Sigg
Date: Fri, 18 Oct 2019 07:30:14 +0000 (-0700)
Subject: Add gpu.barrier op to synchronize invocations of a local workgroup.
X-Git-Tag: llvmorg-11-init~1466^2~489
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fe0ee32da537f94602275635a637e0b5ac4ef7cd;p=platform%2Fupstream%2Fllvm.git

Add gpu.barrier op to synchronize invocations of a local workgroup.

Add gen table for rewrite patterns from GPU to NVVM dialect.

Copy missing op documentation from GPUOps.td to GPU.md.

PiperOrigin-RevId: 275419588
---

diff --git a/mlir/g3doc/Dialects/GPU.md b/mlir/g3doc/Dialects/GPU.md
index 2e0e06a..b998320 100644
--- a/mlir/g3doc/Dialects/GPU.md
+++ b/mlir/g3doc/Dialects/GPU.md
@@ -192,3 +192,55 @@ Example:
 ```mlir {.mlir}
 %tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
 ```
+
+### `gpu.yield`
+
+`gpu.yield` is a special terminator operation for blocks inside regions of GPU
+ops. It returns values to the immediately enclosing GPU op.
+
+Example:
+
+```mlir {.mlir}
+gpu.yield %f0, %f1 : f32, f32
+```
+
+### `gpu.all_reduce`
+
+The "all_reduce" op reduces the value of every work item across a local
+workgroup. The result is equal for all work items of a workgroup.
+
+For example, both
+
+```mlir {.mlir}
+%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
+%2 = "gpu.all_reduce"(%0) ({
+^bb(%lhs : f32, %rhs : f32):
+  %sum = addf %lhs, %rhs : f32
+  "gpu.yield"(%sum) : (f32) -> ()
+}) : (f32) -> (f32)
+```
+compute the sum of each work item's %0 value. The first version specifies
+the accumulation as an operation, whereas the second version specifies the
+accumulation as a code region. The accumulation operation must be either
+`add` or `mul`.
+
+Either none or all work items of a workgroup need to execute this op
+in convergence.
+
+### `gpu.barrier`
+
+The "barrier" op synchronizes all work items of a workgroup. It is used
+to coordinate communication between the work items of the workgroup.
+
+```mlir {.mlir}
+gpu.barrier
+```
+waits until all work items in the workgroup have reached this point
+and all memory accesses made by these work items prior to the op are
+visible to all work items in the workgroup. Data hazards between work items
+accessing the same memory can be avoided by synchronizing work items
+in-between these accesses.
+
+Either none or all work items of a workgroup need to execute this op
+in convergence.
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index 8b30f78..ab723ff 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -68,7 +68,9 @@ def gpu_Yield : GPU_Op<"yield", [Terminator]>,
 
     Example:
 
-    gpu.yield %f0, %f1 : f32, f32
+    ```
+    gpu.yield %f0, %f1 : f32, f32
+    ```
   }];
 }
 
@@ -79,8 +80,8 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
     Results<(outs AnyType)> {
   let summary = "Reduce values among workgroup.";
   let description = [{
-    The "all_reduce" op reduces the value of every invocation across a local
-    workgroup. The result is equal for all invocations of a local workgroup.
+    The "all_reduce" op reduces the value of every work item across a local
+    workgroup. The result is equal for all work items of a workgroup.
 
     For example, both
     ```
@@ -91,16 +92,38 @@ def gpu_AllReduce : GPU_Op<"all_reduce",
       "gpu.yield"(%sum) : (f32) -> ()
     }) : (f32) -> (f32)
     ```
-    compute the sum of each invocation's %0 value. The first version specifies
+    compute the sum of each work item's %0 value. The first version specifies
     the accumulation as an operation, whereas the second version specifies the
     accumulation as a code region. The accumulation operation must be either
     `add` or `mul`.
 
-    Either none or all invocations of a local workgroup need to execute this op
+    Either none or all work items of a workgroup need to execute this op
     in convergence.
   }];
   let regions = (region AnyRegion:$body);
   let verifier = [{ return ::verifyAllReduce(*this); }];
 }
 
+def gpu_Barrier : GPU_Op<"barrier"> {
+  let summary = "Synchronizes all work items of a workgroup.";
+  let description = [{
+    The "barrier" op synchronizes all work items of a workgroup. It is used
+    to coordinate communication between the work items of the workgroup.
+
+    ```
+    gpu.barrier
+    ```
+    waits until all work items in the workgroup have reached this point
+    and all memory accesses made by these work items prior to the op are
+    visible to all work items in the workgroup. Data hazards between work items
+    accessing the same memory can be avoided by synchronizing work items
+    in-between these accesses.
+
+    Either none or all work items of a workgroup need to execute this op
+    in convergence.
+  }];
+  let parser = [{ return success(); }];
+  let printer = [{ p << getOperationName(); }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt b/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
index 492f3a1..95bc586 100644
--- a/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
@@ -1,6 +1,14 @@
+set(LLVM_TARGET_DEFINITIONS GPUToNVVM.td)
+mlir_tablegen(GPUToNVVM.cpp.inc -gen-rewriters)
+add_public_tablegen_target(MLIRGPUToNVVMIncGen)
+
 add_llvm_library(MLIRGPUtoNVVMTransforms
   LowerGpuOpsToNVVMOps.cpp
 )
+
+add_dependencies(MLIRGPUtoNVVMTransforms
+  MLIRGPUToNVVMIncGen)
+
 target_link_libraries(MLIRGPUtoNVVMTransforms
   LLVMSupport
   MLIRGPU
diff --git a/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td b/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td
new file mode 100644
index 0000000..d7daf10
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToNVVM/GPUToNVVM.td
@@ -0,0 +1,38 @@
+//==-- GPUToNVVM.td - GPU Ops to NVVM Patterns ---------------*- tablegen -*==//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// Defines patterns to lower GPU ops to NVVM.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef MLIR_CONVERSION_GPUTONVVM_TD
+#else
+#define MLIR_CONVERSION_GPUTONVVM_TD
+
+#ifdef GPU_OPS
+#else
+include "mlir/Dialect/GPU/GPUOps.td"
+#endif // GPU_OPS
+
+#ifdef NVVMIR_OPS
+#else
+include "mlir/Dialect/LLVMIR/NVVMOps.td"
+#endif // NVVMIR_OPS
+
+def : Pat<(gpu_Barrier), (NVVM_Barrier0Op)>;
+
+#endif // MLIR_CONVERSION_GPUTONVVM_TD
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 6a3a66c..a1442d0 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -447,6 +447,9 @@ private:
   static constexpr int kWarpSize = 32;
 };
 
+/// Import the GPU Ops to NVVM Patterns.
+#include "GPUToNVVM.cpp.inc"
+
 /// A pass that replaces all occurrences of GPU device operations with their
 /// corresponding NVVM equivalent.
 ///
@@ -462,6 +465,7 @@ public:
     OwningRewritePatternList patterns;
     LLVMTypeConverter converter(m.getContext());
     populateStdToLLVMConversionPatterns(converter, patterns);
+    populateWithGenerated(&getContext(), &patterns);
 
     patterns.insert<
         GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
                                     NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 714dd88..f353934 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -68,7 +68,18 @@ module attributes {gpu.kernel_module} {
       %xor = xor %lhs, %rhs : i32
       "gpu.yield"(%xor) : (i32) -> ()
     }) : (i32) -> (i32)
+    std.return
+  }
+}
+
+// -----
+
+module attributes {gpu.kernel_module} {
+  // CHECK-LABEL: func @gpu_sync()
+  func @gpu_sync()
+      attributes { gpu.kernel } {
+    // CHECK: nvvm.barrier0
+    gpu.barrier
     std.return
   }
 }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 18ce5a6..bfc0f15 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -82,6 +82,8 @@ module attributes {gpu.container_module} {
       %one = constant 1.0 : f32
       %sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32)
 
+      "gpu.barrier"() : () -> ()
+
       "some_op"(%bIdX, %tIdX) : (index, index) -> ()
       %42 = load %arg1[%bIdX] : memref<?xf32, 1>
       return
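---

The gpu.barrier documentation above describes avoiding data hazards by synchronizing work items between accesses to the same memory. A minimal sketch of that pattern, not taken from the patch itself: the workgroup-shared buffer %shared (a memref<32xf32> in memory space 3), the value %val, and the indices %tIdX and %otherIdX are all hypothetical names introduced for illustration.

```mlir {.mlir}
// Each work item writes its own slot of the hypothetical shared buffer.
store %val, %shared[%tIdX] : memref<32xf32, 3>
// The barrier makes every write above visible to every work item below.
gpu.barrier
// A work item may now safely read a slot written by another work item.
%other = load %shared[%otherIdX] : memref<32xf32, 3>
```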