```mlir {.mlir}
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
```
+
+### `gpu.yield`
+
+`gpu.yield` is a special terminator operation for blocks inside regions in
+gpu ops. It returns values to the immediately enclosing gpu op.
+
+Example:
+
+```mlir {.mlir}
+gpu.yield %f0, %f1 : f32, f32
+```
+
+
+### `gpu.all_reduce`
+
+The "all_reduce" op reduces the value of every work item across a local
+workgroup. The result is equal for all work items of a workgroup.
+
+For example, both
+
+```mlir {.mlir}
+%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
+%2 = "gpu.all_reduce"(%0) ({
+^bb(%lhs : f32, %rhs : f32):
+ %sum = addf %lhs, %rhs : f32
+ "gpu.yield"(%sum) : (f32) -> ()
+}) : (f32) -> (f32)
+```
+compute the sum of each work item's %0 value. The first version specifies
+the accumulation as an operation, whereas the second version specifies it
+as a code region. The accumulation operation supplied via the `op` attribute
+must be either `add` or `mul`.
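+
+The accumulation region is not restricted to these two operations. For
+instance, the tests touched by this change also reduce i32 values with a
+bitwise xor; a sketch of that form (the operand name %i0 is illustrative):
+
+```mlir {.mlir}
+%3 = "gpu.all_reduce"(%i0) ({
+^bb(%lhs : i32, %rhs : i32):
+ %xor = xor %lhs, %rhs : i32
+ "gpu.yield"(%xor) : (i32) -> ()
+}) : (i32) -> (i32)
+```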
+
+Either none or all work items of a workgroup need to execute this op
+in convergence.
+
+### `gpu.barrier`
+
+The "barrier" op synchronizes all work items of a workgroup. It is used
+to coordinate communication between the work items of the workgroup.
+
+```mlir {.mlir}
+gpu.barrier
+```
+waits until all work items in the workgroup have reached this point
+and all memory accesses made by these work items prior to the op are
+visible to all work items in the workgroup. Data hazards between work items
+accessing the same memory can be avoided by synchronizing work items
+in between these accesses.
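+
+A minimal sketch of this pattern, assuming a workgroup-shared buffer
+%workgroup_buf in memory space 3 and illustrative values %val, %tIdX and
+%next, writes before the barrier and reads a neighboring element after it:
+
+```mlir {.mlir}
+store %val, %workgroup_buf[%tIdX] : memref<32xf32, 3>
+gpu.barrier
+%neighbor = load %workgroup_buf[%next] : memref<32xf32, 3>
+```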
+
+Either none or all work items of a workgroup need to execute this op
+in convergence.
Example:
- gpu.yield %f0, %f1 : f32, f32
+ ```
+ gpu.yield %f0, %f1 : f32, f32
+ ```
}];
}
Results<(outs AnyType)> {
let summary = "Reduce values among workgroup.";
let description = [{
- The "all_reduce" op reduces the value of every invocation across a local
- workgroup. The result is equal for all invocations of a local workgroup.
+ The "all_reduce" op reduces the value of every work item across a local
+ workgroup. The result is equal for all work items of a workgroup.
For example, both
```
"gpu.yield"(%sum) : (f32) -> ()
}) : (f32) -> (f32)
```
- compute the sum of each invocation's %0 value. The first version specifies
+ compute the sum of each work item's %0 value. The first version specifies
the accumulation as an operation, whereas the second version specifies it
as a code region. The accumulation operation supplied via the `op` attribute
must be either `add` or `mul`.
- Either none or all invocations of a local workgroup need to execute this op
+ Either none or all work items of a workgroup need to execute this op
in convergence.
}];
let regions = (region AnyRegion:$body);
let verifier = [{ return ::verifyAllReduce(*this); }];
}
+def gpu_Barrier : GPU_Op<"barrier"> {
+ let summary = "Synchronizes all work items of a workgroup.";
+ let description = [{
+ The "barrier" op synchronizes all work items of a workgroup. It is used
+ to coordinate communication between the work items of the workgroup.
+
+ ```
+ gpu.barrier
+ ```
+ waits until all work items in the workgroup have reached this point
+ and all memory accesses made by these work items prior to the op are
+ visible to all work items in the workgroup. Data hazards between work items
+ accessing the same memory can be avoided by synchronizing work items
+ in between these accesses.
+
+ Either none or all work items of a workgroup need to execute this op
+ in convergence.
+ }];
+ let parser = [{ return success(); }];
+ let printer = [{ p << getOperationName(); }];
+}
+
#endif // GPU_OPS
+set(LLVM_TARGET_DEFINITIONS GPUToNVVM.td)
+mlir_tablegen(GPUToNVVM.cpp.inc -gen-rewriters)
+add_public_tablegen_target(MLIRGPUToNVVMIncGen)
+
add_llvm_library(MLIRGPUtoNVVMTransforms
LowerGpuOpsToNVVMOps.cpp
)
+
+add_dependencies(MLIRGPUtoNVVMTransforms
+ MLIRGPUToNVVMIncGen)
+
target_link_libraries(MLIRGPUtoNVVMTransforms
LLVMSupport
MLIRGPU
--- /dev/null
+//==-- GPUToNVVM.td - GPU Ops to NVVM Patterns ---------------*- tablegen -*==//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// Defines Patterns to lower GPU ops to NVVM.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef MLIR_CONVERSION_GPUTONVVM_TD
+#else
+#define MLIR_CONVERSION_GPUTONVVM_TD
+
+#ifdef GPU_OPS
+#else
+include "mlir/Dialect/GPU/GPUOps.td"
+#endif // GPU_OPS
+
+#ifdef NVVMIR_OPS
+#else
+include "mlir/Dialect/LLVMIR/NVVMOps.td"
+#endif // NVVMIR_OPS
+
+def : Pat<(gpu_Barrier), (NVVM_Barrier0Op)>;
+
+#endif // MLIR_CONVERSION_GPUTONVVM_TD
static constexpr int kWarpSize = 32;
};
+/// Import the GPU Ops to NVVM Patterns.
+#include "GPUToNVVM.cpp.inc"
+
/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
OwningRewritePatternList patterns;
LLVMTypeConverter converter(m.getContext());
populateStdToLLVMConversionPatterns(converter, patterns);
+ populateWithGenerated(&getContext(), &patterns);
patterns.insert<
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
%xor = xor %lhs, %rhs : i32
"gpu.yield"(%xor) : (i32) -> ()
}) : (i32) -> (i32)
+ std.return
+ }
+}
+
+// -----
+module attributes {gpu.kernel_module} {
+ // CHECK-LABEL: func @gpu_sync()
+ func @gpu_sync()
+ attributes { gpu.kernel } {
+ // CHECK: nvvm.barrier0
+ gpu.barrier
std.return
}
}
%one = constant 1.0 : f32
%sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32)
+ "gpu.barrier"() : () -> ()
+
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
return