From bc61cc9a2db56eeb5fd299132037757da339aebd Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Fri, 8 Jul 2022 19:11:03 +0000 Subject: [PATCH] [mlir][AMDGPU] Add lds_barrier op The lds_barrier op allows workgroups to wait at a barrier for operations to/from their local data store (LDS) to complete without incurring the performance penalties of a full memory fence. Reviewed By: nirvedhmeshram Differential Revision: https://reviews.llvm.org/D129522 --- mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td | 19 +++++++++++++++++++ mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 21 +++++++++++++++++++++ .../Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 7 +++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 7 +++++++ 4 files changed, 54 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td index 2caee48..ef54628 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td @@ -164,4 +164,23 @@ def AMDGPU_RawBufferAtomicFaddOp : let hasVerifier = 1; } +def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> { + let summary = "Barrier that includes a wait for LDS memory operations."; + let description = [{ + `amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach + the barrier before any of them may proceed past it) and a wait for all + operations that affect the Local Data Store (LDS) issued from that workgroup + to complete before the workgroup may continue. Since the LDS is per-workgroup + memory, this barrier may be used, for example, to ensure all workitems have + written data to LDS before any workitem attempts to read from it. + + Note that `lds_barrier` does **not** force accesses to global memory + to complete before execution continues. Therefore, it should be used when + operations on global memory can be issued far in advance of when their results + are used (for example, by writing them to LDS). 
+ }]; + let assemblyFormat = "attr-dict"; +} + + #endif // AMDGPU diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index c0da60a..1867df2 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -241,6 +241,26 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { } }; +struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> { + using ConvertOpToLLVMPattern<LDSBarrierOp>::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(), + LLVM::AsmDialect::AD_ATT); + const char *asmStr = "s_waitcnt lgkmcnt(0)\ns_barrier"; + const char *constraints = ""; + rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>( + op, + /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(), + /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true, + /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr, + /*operand_attrs=*/ArrayAttr()); + return success(); + } +}; + struct ConvertAMDGPUToROCDLPass : public ConvertAMDGPUToROCDLBase { ConvertAMDGPUToROCDLPass() = default; @@ -269,6 +289,7 @@ struct ConvertAMDGPUToROCDLPass void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, Chipset chipset) { + patterns.add<LDSBarrierOpLowering>(converter); patterns.add< RawBufferOpLowering, RawBufferOpLowering, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 129ebe6..e9a999d 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -101,3 +101,10 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> 
memref<64xf32>, i32 func.return } + +// CHECK-LABEL: func @lds_barrier +func.func @lds_barrier() { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier" + amdgpu.lds_barrier + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index daf6b7a..3fff10c 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -59,3 +59,10 @@ func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 func.return } + +// CHECK-LABEL: func @lds_barrier +func.func @lds_barrier() { + // CHECK: amdgpu.lds_barrier + amdgpu.lds_barrier + func.return +} -- 2.7.4