From db590549a9905927399c102d15dde763c45b1b4d Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
Date: Wed, 6 Jul 2022 21:38:58 +0000
Subject: [PATCH] [mlir][AMDGPU] Use the correct values for OOB_SELECT on gfx10

Differential Revision: https://reviews.llvm.org/D129320
---
 mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td              | 16 ++++++++++------
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp     |  6 +++---
 mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
index f1e1bb3..2caee48 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -52,14 +52,17 @@ def AMDGPU_RawBufferLoadOp :
     available on AMD GPUs, including extensions in newer GPUs.
 
     The index into the buffer is computed as for `memref.load` with the additon
-    of `indexOffset` and `sgprOffset` (which is added after bounds checks and
-    includes any offset present on the memref type if it's non-zero).
+    of `indexOffset` and `sgprOffset` (which **may or may not** be considered
+    in bounds checks and includes any offset present on the memref type if it's
+    non-zero).
 
     All indices and offsets are in units of the memref's data type and are
     converted to bytes during lowering.
 
     When a load is out of bounds, the instruction returns zero.
-    Vector instructions bounds check each component's address.
+    Partially out-of-bounds loads have chipset-dependent behavior: whether
+    reading 2 elements starting at index 7 of a `memref<8xf32>` returns the last
+    element in the first vector component depends on the architecture.
 
     The memref struct is converted into a buffer resource (a V#) and the arguments
     are translated to intrinsic arguments as follows:
@@ -71,7 +74,7 @@ def AMDGPU_RawBufferLoadOp :
     - The offset enable bit is 1, the index enable bit is 0.
     - The thread ID addition bit is off
     - If `boundsCheck` is false and the target chipset is RDNA, OOB_SELECT is set
-      to 2 to disable bounds checks, otherwise it is 0
+      to 2 to disable bounds checks, otherwise it is 3 on RDNA and 0 elsewhere
     - The cache coherency bits are off
   }];
   let assemblyFormat = [{
@@ -108,8 +111,9 @@ def AMDGPU_RawBufferStoreOp :
     All index components are in terms of the elements of the memref, not bytes,
     and are scaled up appropriately.
 
-    Out of bounds stores are ignored in hardware, including the out of bounds
-    components of vector writes.
+    Out of bounds stores are ignored in hardware.
+    Whether a vector write that includes some in-bounds and some out-of-bounds
+    components is partially completed is chipset-dependent.
 
     See `amdgpu.raw_buffer_load` for a description of how the underlying
     instruction is constructed.
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index f7e99e3..c0da60a 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -166,13 +166,13 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
     // bit 24: Reserved to 1 (RDNA) or 0 (CDNA)
     // bits 25-26: Reserved (0)
     // bit 27: Buffer is non-volatile (CDNA only)
-    // bits 28-29: Out of bounds select (0 = structured, 1 = raw, 2 = none, 3 =
-    // swizzles) RDNA only
+    // bits 28-29: Out of bounds select (0 = structured, 1 = check index, 2 =
+    //  none, 3 = either swizzles or testing against offset field) RDNA only
     // bits 30-31: Type (must be 0)
     uint32_t word3 = (7 << 12) | (4 << 15);
     if (chipset.majorVersion == 10) {
       word3 |= (1 << 24);
-      uint32_t oob = adaptor.getBoundsCheck() ? 1 : 2;
+      uint32_t oob = adaptor.getBoundsCheck() ? 3 : 2;
       word3 |= (oob << 28);
     }
     Value word3Const = createI32Constant(rewriter, loc, word3);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index f3f4a29..129ebe6 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -6,7 +6,7 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
   // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
   // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
-  // RDNA: %[[word3:.*]] = llvm.mlir.constant(285372416 : i32)
+  // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32)
   // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
   // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
   // CHECK: return %[[ret]]
-- 
2.7.4