```mlir {.mlir}
%tIdX = "gpu.thread_id"() {dimension: "x"} : () -> (index)
```
+
+### `gpu.yield`
+
+`gpu.yield` is a special terminator operation for blocks inside regions in
+gpu ops. It returns values to the immediately enclosing gpu op.
+
+Example:
+
+```mlir {.mlir}
+gpu.yield %f0, %f1 : f32, f32
+```
+
+
+### `gpu.all_reduce`
+
+The "all_reduce" op reduces the value of every work item across a local
+workgroup. The result is equal for all work items of a workgroup.
+
+For example, both
+
+```mlir {.mlir}
+%1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
+%2 = "gpu.all_reduce"(%0) ({
+^bb(%lhs : f32, %rhs : f32):
+ %sum = addf %lhs, %rhs : f32
+ "gpu.yield"(%sum) : (f32) -> ()
+}) : (f32) -> (f32)
+```
+compute the sum of each work item's %0 value. The first version specifies
+the accumulation as an operation, whereas the second version specifies it
+as a code region. The accumulation operation supplied via the `op` attribute
+must be either `add` or `mul`.
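+
+The accumulation region is not restricted to these two operations. For
+instance, the tests touched by this change also reduce i32 values with a
+bitwise xor; a sketch of that form (the operand name %i0 is illustrative):
+
+```mlir {.mlir}
+%3 = "gpu.all_reduce"(%i0) ({
+^bb(%lhs : i32, %rhs : i32):
+ %xor = xor %lhs, %rhs : i32
+ "gpu.yield"(%xor) : (i32) -> ()
+}) : (i32) -> (i32)
+```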
+
+Either none or all work items of a workgroup need to execute this op
+in convergence.
+
+### `gpu.barrier`
+
+The "barrier" op synchronizes all work items of a workgroup. It is used
+to coordinate communication between the work items of the workgroup.
+
+```mlir {.mlir}
+gpu.barrier
+```
+waits until all work items in the workgroup have reached this point
+and all memory accesses made by these work items prior to the op are
+visible to all work items in the workgroup. Data hazards between work items
+accessing the same memory can be avoided by synchronizing work items
+in between these accesses.
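+
+A minimal sketch of this pattern, assuming a workgroup-shared buffer
+%workgroup_buf in memory space 3 and illustrative values %val, %tIdX and
+%next, writes before the barrier and reads a neighboring element after it:
+
+```mlir {.mlir}
+store %val, %workgroup_buf[%tIdX] : memref<32xf32, 3>
+gpu.barrier
+%neighbor = load %workgroup_buf[%next] : memref<32xf32, 3>
+```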
+
+Either none or all work items of a workgroup need to execute this op
+in convergence.
Example:
- gpu.yield %f0, %f1 : f32, f32
+ ```
+ gpu.yield %f0, %f1 : f32, f32
+ ```
}];
}
Results<(outs AnyType)> {
let summary = "Reduce values among workgroup.";
let description = [{
- The "all_reduce" op reduces the value of every invocation across a local
- workgroup. The result is equal for all invocations of a local workgroup.
+ The "all_reduce" op reduces the value of every work item across a local
+ workgroup. The result is equal for all work items of a workgroup.
For example, both
```
"gpu.yield"(%sum) : (f32) -> ()
}) : (f32) -> (f32)
```
- compute the sum of each invocation's %0 value. The first version specifies
+ compute the sum of each work item's %0 value. The first version specifies
the accumulation as an operation, whereas the second version specifies it
as a code region. The accumulation operation supplied via the `op` attribute
must be either `add` or `mul`.
- Either none or all invocations of a local workgroup need to execute this op
+ Either none or all work items of a workgroup need to execute this op
in convergence.
}];
let regions = (region AnyRegion:$body);
let verifier = [{ return ::verifyAllReduce(*this); }];
}
+def gpu_Barrier : GPU_Op<"barrier"> {
+ let summary = "Synchronizes all work items of a workgroup.";
+ let description = [{
+ The "barrier" op synchronizes all work items of a workgroup. It is used
+ to coordinate communication between the work items of the workgroup.
+
+ ```
+ gpu.barrier
+ ```
+ waits until all work items in the workgroup have reached this point
+ and all memory accesses made by these work items prior to the op are
+ visible to all work items in the workgroup. Data hazards between work items
+ accessing the same memory can be avoided by synchronizing work items
+ in between these accesses.
+
+ Either none or all work items of a workgroup need to execute this op
+ in convergence.
+ }];
+ let parser = [{ return success(); }];
+ let printer = [{ p << getOperationName(); }];
+}
+
#endif // GPU_OPS
+set(LLVM_TARGET_DEFINITIONS GPUToNVVM.td)
+mlir_tablegen(GPUToNVVM.cpp.inc -gen-rewriters)
+add_public_tablegen_target(MLIRGPUToNVVMIncGen)
+
add_llvm_library(MLIRGPUtoNVVMTransforms
LowerGpuOpsToNVVMOps.cpp
)
+
+add_dependencies(MLIRGPUtoNVVMTransforms
+ MLIRGPUToNVVMIncGen)
+
target_link_libraries(MLIRGPUtoNVVMTransforms
LLVMSupport
MLIRGPU
--- /dev/null
+//==-- GPUToNVVM.td - GPU Ops to NVVM Patterns ---------------*- tablegen -*==//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// Defines Patterns to lower GPU ops to NVVM.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef MLIR_CONVERSION_GPUTONVVM_TD
+#else
+#define MLIR_CONVERSION_GPUTONVVM_TD
+
+#ifdef GPU_OPS
+#else
+include "mlir/Dialect/GPU/GPUOps.td"
+#endif // GPU_OPS
+
+#ifdef NVVMIR_OPS
+#else
+include "mlir/Dialect/LLVMIR/NVVMOps.td"
+#endif // NVVMIR_OPS
+
+def : Pat<(gpu_Barrier), (NVVM_Barrier0Op)>;
+
+#endif // MLIR_CONVERSION_GPUTONVVM_TD
static constexpr int kWarpSize = 32;
};
+/// Import the GPU Ops to NVVM Patterns.
+#include "GPUToNVVM.cpp.inc"
+
/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding NVVM equivalent.
///
OwningRewritePatternList patterns;
LLVMTypeConverter converter(m.getContext());
populateStdToLLVMConversionPatterns(converter, patterns);
+ populateWithGenerated(&getContext(), &patterns);
patterns.insert<
GPUIndexIntrinsicOpLowering<gpu::ThreadId, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
%xor = xor %lhs, %rhs : i32
"gpu.yield"(%xor) : (i32) -> ()
}) : (i32) -> (i32)
+ std.return
+ }
+}
+
+// -----
+module attributes {gpu.kernel_module} {
+ // CHECK-LABEL: func @gpu_sync()
+ func @gpu_sync()
+ attributes { gpu.kernel } {
+ // CHECK: nvvm.barrier0
+ gpu.barrier
std.return
}
}
%one = constant 1.0 : f32
%sum = "gpu.all_reduce"(%one) ({}) {op = "add"} : (f32) -> (f32)
+ "gpu.barrier"() : () -> ()
+
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
%42 = load %arg1[%bIdX] : memref<?xf32, 1>
return