[mlir][AMDGPU] Remove buffer ops that are statically out of bounds

author Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>

Wed, 9 Nov 2022 22:26:33 +0000 (22:26 +0000)

committer Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>

Mon, 21 Nov 2022 16:47:21 +0000 (16:47 +0000)
author Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
Wed, 9 Nov 2022 22:26:33 +0000 (22:26 +0000)
committer Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
Mon, 21 Nov 2022 16:47:21 +0000 (16:47 +0000)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td

index 91b62bb..f08f9fb 100644 (file)
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -23,6 +23,10 @@ def AMDGPU_Dialect : Dialect {
      that will eventually be executed on AMD hardware.
    }];
  
+
+  let dependentDialects = [
+    "arith::ArithDialect"
+  ];
    let useDefaultAttributePrinterParser = 1;
  }
  
@@ -83,6 +87,7 @@ def AMDGPU_RawBufferLoadOp :
        (`sgprOffset` $sgprOffset^)? `:`
        type($memref) `,` type($indices) `->` type($value)
    }];
+  let hasCanonicalizer = 1;
    let hasVerifier = 1;
  }
  
@@ -124,6 +129,7 @@ def AMDGPU_RawBufferStoreOp :
        (`sgprOffset` $sgprOffset^)? `:`
        type($value) `->` type($memref) `,` type($indices)
    }];
+  let hasCanonicalizer = 1;
    let hasVerifier = 1;
  }
  
@@ -162,6 +168,7 @@ def AMDGPU_RawBufferAtomicFaddOp :
        (`sgprOffset` $sgprOffset^)? `:`
        type($value) `->` type($memref) `,` type($indices)
    }];
+  let hasCanonicalizer = 1;
    let hasVerifier = 1;
  }
  
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp

index 26db766..1e5ba7a 100644 (file)
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -12,14 +12,19 @@
  
  #include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
  
+#include "mlir/Dialect/Arith/IR/Arith.h"
  #include "mlir/IR/Builders.h"
  #include "mlir/IR/BuiltinTypes.h"
  #include "mlir/IR/Diagnostics.h"
  #include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/Matchers.h"
  #include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
  #include "mlir/IR/TypeUtilities.h"
  #include "llvm/ADT/TypeSwitch.h"
  
+#include <limits>
+
  using namespace mlir;
  using namespace mlir::amdgpu;
  
@@ -62,6 +67,96 @@ LogicalResult RawBufferAtomicFaddOp::verify() {
    return verifyRawBufferOp(*this);
  }
  
+/// Returns the value of `v`, zero-extended and narrowed to `uint32_t`, when
+/// `v` has type i32 and matches `m_ConstantInt`; returns `None` otherwise.
+static Optional<uint32_t> getConstantUint32(Value v) {
+  if (!v.getType().isInteger(32))
+    return None;
+  APInt constant;
+  if (!matchPattern(v, m_ConstantInt(&constant)))
+    return None;
+  return constant.getZExtValue();
+}
+
+template <typename OpType>
+static bool staticallyOutOfBounds(OpType op) {
+  if (!op.getBoundsCheck())
+    return false;
+  MemRefType bufferType = op.getMemref().getType();
+  if (!bufferType.hasStaticShape())
+    return false;
+  int64_t offset;
+  SmallVector<int64_t> strides;
+  if (failed(getStridesAndOffset(bufferType, strides, offset)))
+    return false;
+  int64_t result = offset + op.getIndexOffset().value_or(0);
+  if (op.getSgprOffset()) {
+    Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
+    if (!sgprOffset)
+      return false;
+    result += *sgprOffset;
+  }
+  if (strides.size() != op.getIndices().size())
+    return false;
+  int64_t indexVal = 0;
+  for (auto pair : llvm::zip(strides, op.getIndices())) {
+    int64_t stride = std::get<0>(pair);
+    Value idx = std::get<1>(pair);
+    Optional<uint32_t> idxVal = getConstantUint32(idx);
+    if (!idxVal)
+      return false;
+    indexVal += stride * idxVal.value();
+  }
+  result += indexVal;
+  if (result > std::numeric_limits<uint32_t>::max())
+    // Overflow means don't drop
+    return false;
+  return result >= bufferType.getNumElements();
+}
+
+namespace {
+struct RemoveStaticallyOobBufferLoads final
+    : public OpRewritePattern<RawBufferLoadOp> {
+  using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(RawBufferLoadOp op,
+                                PatternRewriter &rw) const override {
+    if (!staticallyOutOfBounds(op))
+      return failure();
+    Type loadType = op.getResult().getType();
+    rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
+                                             rw.getZeroAttr(loadType));
+    return success();
+  }
+};
+
+/// Erases a buffer write of type `OpType` that `staticallyOutOfBounds` proves
+/// out of bounds.
+template <typename OpType>
+struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
+  using OpRewritePattern<OpType>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(OpType op,
+                                PatternRewriter &rewriter) const override {
+    if (staticallyOutOfBounds(op)) {
+      rewriter.eraseOp(op);
+      return success();
+    }
+    return failure();
+  }
+};
+} // end namespace
+
+void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                  MLIRContext *context) {
+  results.add<RemoveStaticallyOobBufferLoads>(context); // Statically OOB loads become zero constants.
+}
+
+void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                   MLIRContext *context) {
+  results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context); // Statically OOB stores are erased.
+}
+
+void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
+    RewritePatternSet &results, MLIRContext *context) {
+  results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context); // Statically OOB atomic fadds are erased.
+}
+
  //===----------------------------------------------------------------------===//
  // MFMAOp
  //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt

index 1b80265..5dde478 100644 (file)
--- a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIRAMDGPUDialect
    MLIRAMDGPUIncGen
  
    LINK_LIBS PUBLIC
+  MLIRArithDialect
    MLIRIR
    MLIRSideEffectInterfaces
    )
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir

new file mode 100644 (file)

index 0000000..d984f8b
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -0,0 +1,132 @@
+// RUN: mlir-opt %s -split-input-file -canonicalize  | FileCheck %s
+
+// CHECK-LABEL: func @known_oob_load
+func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
+  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: return %[[zero]]
+  %c4_i32 = arith.constant 4 : i32 // One past the last valid index (3).
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_2d
+func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
+  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: return %[[zero]]
+  %c0_i32 = arith.constant 0 : i32
+  %c4_i32 = arith.constant 4 : i32 // Row 4 linearizes to 4 * 4 + 0 = 16, past the 16 elements.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_2d_on_last
+func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
+  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: return %[[zero]]
+  %c0_i32 = arith.constant 0 : i32
+  %c16_i32 = arith.constant 16 : i32 // Innermost index 16 alone reaches linear position 16 of 16 elements.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_index
+func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
+  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: return %[[zero]]
+  %c0_i32 = arith.constant 0 : i32 // Index is 0; the indexOffset attribute (4) alone makes the access OOB.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_sgproffset
+func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
+  // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: return %[[zero]]
+  %c2_i32 = arith.constant 2 : i32 // Used as index and as SGPR offset: 2 + 2 = 4, past the 4 elements.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unknown_load
+func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
+  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+  // CHECK: return %[[loaded]]
+  %c4_i32 = arith.constant 4 : i32 // The SGPR offset is constant, but the index (%arg1) is not.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unknown_load_sgproffset
+func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
+  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+  // CHECK: return %[[loaded]]
+  %c4_i32 = arith.constant 4 : i32 // The index is constant, but the SGPR offset (%arg1) is not.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unranked
+func.func @unranked(%arg0: memref<?xf32>) -> f32 {
+  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+  // CHECK: return %[[loaded]]
+  %c4_i32 = arith.constant 4 : i32 // memref<?xf32> is ranked but dynamically sized, so nothing is provable.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @no_oob_check
+func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
+  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+  // CHECK: return %[[loaded]]
+  %c4_i32 = arith.constant 4 : i32 // Past the end, but bounds checking is disabled, so the load stays.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @in_bounds_overall
+func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
+  // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+  // CHECK: return %[[loaded]]
+  %c0_i32 = arith.constant 0 : i32
+  %c15_i32 = arith.constant 15 : i32 // Exceeds the inner extent (4), yet linear position 15 is inside the 16 elements.
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
+  func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @dead_store
+func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
+  // CHECK-NOT: amdgpu.raw_buffer_store
+  %c4_i32 = arith.constant 4 : i32 // One past the last valid index (3), so the store is erased.
+  amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
+  func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @dead_atomic_add
+func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
+  // CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
+  %c4_i32 = arith.constant 4 : i32 // One past the last valid index (3), so the atomic add is erased.
+  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
+  func.return
+}
author	Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
	Wed, 9 Nov 2022 22:26:33 +0000 (22:26 +0000)
committer	Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
	Mon, 21 Nov 2022 16:47:21 +0000 (16:47 +0000)
mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td		patch \| blob \| history
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp		patch \| blob \| history
mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt		patch \| blob \| history
mlir/test/Dialect/AMDGPU/canonicalize.mlir	[new file with mode: 0644]	patch \| blob