#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"
+#include <limits>
+
using namespace mlir;
using namespace mlir::amdgpu;
return verifyRawBufferOp(*this);
}
+/// If `v` is a constant 32-bit integer, returns its value; otherwise returns
+/// None.
+static Optional<uint32_t> getConstantUint32(Value v) {
+ APInt cst;
+ if (!v.getType().isInteger(32))
+ return None;
+ if (matchPattern(v, m_ConstantInt(&cst)))
+ return cst.getZExtValue();
+ return None;
+}
+
+/// Returns true if the buffer access described by `op` is provably out of
+/// bounds: bounds checking is enabled, the memref shape is static, every
+/// offset and index is a compile-time constant, and their linearized sum is
+/// at least the number of elements in the buffer.
+template <typename OpType>
+static bool staticallyOutOfBounds(OpType op) {
+ if (!op.getBoundsCheck())
+ return false;
+ MemRefType bufferType = op.getMemref().getType();
+ if (!bufferType.hasStaticShape())
+ return false;
+ int64_t offset;
+ SmallVector<int64_t> strides;
+ if (failed(getStridesAndOffset(bufferType, strides, offset)))
+ return false;
+ int64_t result = offset + op.getIndexOffset().value_or(0);
+ if (op.getSgprOffset()) {
+ Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
+ if (!sgprOffset)
+ return false;
+ result += *sgprOffset;
+ }
+ if (strides.size() != op.getIndices().size())
+ return false;
+ int64_t indexVal = 0;
+ for (auto pair : llvm::zip(strides, op.getIndices())) {
+ int64_t stride = std::get<0>(pair);
+ Value idx = std::get<1>(pair);
+ Optional<uint32_t> idxVal = getConstantUint32(idx);
+ if (!idxVal)
+ return false;
+ indexVal += stride * idxVal.value();
+ }
+ result += indexVal;
+ if (result > std::numeric_limits<uint32_t>::max())
+    // If the linearized offset overflows a uint32_t, we cannot draw any
+    // conclusion about the access, so conservatively keep the op.
+ return false;
+ return result >= bufferType.getNumElements();
+}
+
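+// Worked example (mirroring the `known_oob_load_2d` test in the test file
+// added below): a load from memref<4x4xf32> at constant indices [4, 0] sees
+// strides [4, 1], so the linearized element offset is 0 + 4 * 4 + 1 * 0 = 16.
+// Since 16 >= getNumElements() == 16, the access is statically out of bounds.
+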
+namespace {
+/// Replaces a raw buffer load that is statically known to be out of bounds
+/// with a zero constant of the result type, which is what an out-of-bounds
+/// read returns when bounds checking is enabled.
+struct RemoveStaticallyOobBufferLoads final
+    : public OpRewritePattern<RawBufferLoadOp> {
+ using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(RawBufferLoadOp op,
+ PatternRewriter &rw) const override {
+ if (!staticallyOutOfBounds(op))
+ return failure();
+ Type loadType = op.getResult().getType();
+ rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
+ rw.getZeroAttr(loadType));
+ return success();
+ }
+};
+
+/// Erases raw buffer stores and atomics that are statically known to be out
+/// of bounds; with bounds checking enabled, such writes are no-ops.
+template <typename OpType>
+struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
+ using OpRewritePattern<OpType>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
+ if (!staticallyOutOfBounds(op))
+ return failure();
+
+ rw.eraseOp(op);
+ return success();
+ }
+};
+} // end namespace
+
+void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferLoads>(context);
+}
+
+void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context);
+}
+
+void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
+ RewritePatternSet &results, MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
+}
+
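+// With these hooks in place, the ordinary canonicalizer performs the
+// rewrites; no dedicated pass is needed. A minimal sketch (illustrative,
+// using the standard greedy driver; `ctx` and `module` stand for an
+// MLIRContext* and the op to rewrite) of applying the patterns directly:
+//
+//   RewritePatternSet patterns(ctx);
+//   RawBufferLoadOp::getCanonicalizationPatterns(patterns, ctx);
+//   RawBufferStoreOp::getCanonicalizationPatterns(patterns, ctx);
+//   RawBufferAtomicFaddOp::getCanonicalizationPatterns(patterns, ctx);
+//   (void)applyPatternsAndFoldGreedily(module, std::move(patterns));
+//
+// The tests below exercise them through `mlir-opt -canonicalize` instead.
+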
//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+// RUN: mlir-opt %s -split-input-file -canonicalize | FileCheck %s
+
+// CHECK-LABEL: func @known_oob_load
+func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_2d
+func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c0_i32 = arith.constant 0 : i32
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_2d_on_last
+func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c0_i32 = arith.constant 0 : i32
+ %c16_i32 = arith.constant 16 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_index
+func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c0_i32 = arith.constant 0 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_sgproffset
+func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c2_i32 = arith.constant 2 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unknown_load
+func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unknown_load_sgproffset
+func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @dynamic_shape
+func.func @dynamic_shape(%arg0: memref<?xf32>) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @no_oob_check
+func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @in_bounds_overall
+func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c0_i32 = arith.constant 0 : i32
+ %c15_i32 = arith.constant 15 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @dead_store
+func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
+ // CHECK-NOT: amdgpu.raw_buffer_store
+ %c4_i32 = arith.constant 4 : i32
+ amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @dead_atomic_add
+func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
+ // CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
+ %c4_i32 = arith.constant 4 : i32
+ amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
+ func.return
+}