func @entry() -> i32 {
// Initialize large buffers that can be used for multiple test cases of
// different sizes.
- %b_A = alloc() : memref<128xi64>
- %b_B = alloc() : memref<128xf64>
- %b_C = alloc() : memref<128xi64>
- %b_D = alloc() : memref<128xf64>
+ %b_A = memref.alloc() : memref<128xi64>
+ %b_B = memref.alloc() : memref<128xf64>
+ %b_C = memref.alloc() : memref<128xi64>
+ %b_D = memref.alloc() : memref<128xf64>
- %m_A = memref_cast %b_A : memref<128xi64> to memref<?xi64>
- %m_B = memref_cast %b_B : memref<128xf64> to memref<?xf64>
- %m_C = memref_cast %b_C : memref<128xi64> to memref<?xi64>
- %m_D = memref_cast %b_D : memref<128xf64> to memref<?xf64>
+ %m_A = memref.cast %b_A : memref<128xi64> to memref<?xi64>
+ %m_B = memref.cast %b_B : memref<128xf64> to memref<?xf64>
+ %m_C = memref.cast %b_C : memref<128xi64> to memref<?xi64>
+ %m_D = memref.cast %b_D : memref<128xf64> to memref<?xf64>
// --- Test case 1 ---.
// M and N must each be a multiple of 8 if smaller than 128.
// CHECK: 111
// Release all resources.
- dealloc %b_A : memref<128xi64>
- dealloc %b_B : memref<128xf64>
- dealloc %b_C : memref<128xi64>
- dealloc %b_D : memref<128xf64>
+ memref.dealloc %b_A : memref<128xi64>
+ memref.dealloc %b_B : memref<128xf64>
+ memref.dealloc %b_C : memref<128xi64>
+ memref.dealloc %b_D : memref<128xf64>
%r = constant 0 : i32
return %r : i32
// RUN: | FileCheck %s
func @main() {
- %data = alloc() : memref<2x6xi32>
- %sum = alloc() : memref<2xi32>
+ %data = memref.alloc() : memref<2x6xi32>
+ %sum = memref.alloc() : memref<2xi32>
%cst0 = constant 0 : i32
%cst1 = constant 1 : i32
%cst2 = constant 2 : i32
%c5 = constant 5 : index
%c6 = constant 6 : index
- %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32>
+ %cast_data = memref.cast %data : memref<2x6xi32> to memref<*xi32>
gpu.host_register %cast_data : memref<*xi32>
- %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32>
+ %cast_sum = memref.cast %sum : memref<2xi32> to memref<*xi32>
gpu.host_register %cast_sum : memref<*xi32>
- store %cst0, %data[%c0, %c0] : memref<2x6xi32>
- store %cst1, %data[%c0, %c1] : memref<2x6xi32>
- store %cst2, %data[%c0, %c2] : memref<2x6xi32>
- store %cst4, %data[%c0, %c3] : memref<2x6xi32>
- store %cst8, %data[%c0, %c4] : memref<2x6xi32>
- store %cst16, %data[%c0, %c5] : memref<2x6xi32>
+ memref.store %cst0, %data[%c0, %c0] : memref<2x6xi32>
+ memref.store %cst1, %data[%c0, %c1] : memref<2x6xi32>
+ memref.store %cst2, %data[%c0, %c2] : memref<2x6xi32>
+ memref.store %cst4, %data[%c0, %c3] : memref<2x6xi32>
+ memref.store %cst8, %data[%c0, %c4] : memref<2x6xi32>
+ memref.store %cst16, %data[%c0, %c5] : memref<2x6xi32>
- store %cst2, %data[%c1, %c0] : memref<2x6xi32>
- store %cst3, %data[%c1, %c1] : memref<2x6xi32>
- store %cst6, %data[%c1, %c2] : memref<2x6xi32>
- store %cst7, %data[%c1, %c3] : memref<2x6xi32>
- store %cst10, %data[%c1, %c4] : memref<2x6xi32>
- store %cst11, %data[%c1, %c5] : memref<2x6xi32>
+ memref.store %cst2, %data[%c1, %c0] : memref<2x6xi32>
+ memref.store %cst3, %data[%c1, %c1] : memref<2x6xi32>
+ memref.store %cst6, %data[%c1, %c2] : memref<2x6xi32>
+ memref.store %cst7, %data[%c1, %c3] : memref<2x6xi32>
+ memref.store %cst10, %data[%c1, %c4] : memref<2x6xi32>
+ memref.store %cst11, %data[%c1, %c5] : memref<2x6xi32>
// AND
// Launches a (%c2, %c1, %c1) grid with (%c6, %c1, %c1) threads per block.
// Each thread loads %data[%bx, %tx]; the loaded values are combined by
// gpu.all_reduce with op = "and", and every thread stores the reduced value
// to %sum[%bx], so one result is written per block index.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
- %val = load %data[%bx, %tx] : memref<2x6xi32>
+ %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
%reduced = "gpu.all_reduce"(%val) ({}) { op = "and" } : (i32) -> (i32)
- store %reduced, %sum[%bx] : memref<2xi32>
+ memref.store %reduced, %sum[%bx] : memref<2xi32>
gpu.terminator
}
// RUN: | FileCheck %s
func @main() {
- %data = alloc() : memref<2x6xi32>
- %sum = alloc() : memref<2xi32>
+ %data = memref.alloc() : memref<2x6xi32>
+ %sum = memref.alloc() : memref<2xi32>
%cst0 = constant 0 : i32
%cst1 = constant 1 : i32
%cst2 = constant 2 : i32
%c5 = constant 5 : index
%c6 = constant 6 : index
- %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32>
+ %cast_data = memref.cast %data : memref<2x6xi32> to memref<*xi32>
gpu.host_register %cast_data : memref<*xi32>
- %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32>
+ %cast_sum = memref.cast %sum : memref<2xi32> to memref<*xi32>
gpu.host_register %cast_sum : memref<*xi32>
- store %cst0, %data[%c0, %c0] : memref<2x6xi32>
- store %cst1, %data[%c0, %c1] : memref<2x6xi32>
- store %cst2, %data[%c0, %c2] : memref<2x6xi32>
- store %cst4, %data[%c0, %c3] : memref<2x6xi32>
- store %cst8, %data[%c0, %c4] : memref<2x6xi32>
- store %cst16, %data[%c0, %c5] : memref<2x6xi32>
+ memref.store %cst0, %data[%c0, %c0] : memref<2x6xi32>
+ memref.store %cst1, %data[%c0, %c1] : memref<2x6xi32>
+ memref.store %cst2, %data[%c0, %c2] : memref<2x6xi32>
+ memref.store %cst4, %data[%c0, %c3] : memref<2x6xi32>
+ memref.store %cst8, %data[%c0, %c4] : memref<2x6xi32>
+ memref.store %cst16, %data[%c0, %c5] : memref<2x6xi32>
- store %cst2, %data[%c1, %c0] : memref<2x6xi32>
- store %cst3, %data[%c1, %c1] : memref<2x6xi32>
- store %cst6, %data[%c1, %c2] : memref<2x6xi32>
- store %cst7, %data[%c1, %c3] : memref<2x6xi32>
- store %cst10, %data[%c1, %c4] : memref<2x6xi32>
- store %cst11, %data[%c1, %c5] : memref<2x6xi32>
+ memref.store %cst2, %data[%c1, %c0] : memref<2x6xi32>
+ memref.store %cst3, %data[%c1, %c1] : memref<2x6xi32>
+ memref.store %cst6, %data[%c1, %c2] : memref<2x6xi32>
+ memref.store %cst7, %data[%c1, %c3] : memref<2x6xi32>
+ memref.store %cst10, %data[%c1, %c4] : memref<2x6xi32>
+ memref.store %cst11, %data[%c1, %c5] : memref<2x6xi32>
// MAX
// Launches a (%c2, %c1, %c1) grid with (%c6, %c1, %c1) threads per block.
// Each thread loads %data[%bx, %tx]; the loaded values are combined by
// gpu.all_reduce with op = "max", and every thread stores the reduced value
// to %sum[%bx], so one result is written per block index.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
- %val = load %data[%bx, %tx] : memref<2x6xi32>
+ %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
%reduced = "gpu.all_reduce"(%val) ({}) { op = "max" } : (i32) -> (i32)
- store %reduced, %sum[%bx] : memref<2xi32>
+ memref.store %reduced, %sum[%bx] : memref<2xi32>
gpu.terminator
}
// RUN: | FileCheck %s
func @main() {
- %data = alloc() : memref<2x6xi32>
- %sum = alloc() : memref<2xi32>
+ %data = memref.alloc() : memref<2x6xi32>
+ %sum = memref.alloc() : memref<2xi32>
%cst0 = constant 0 : i32
%cst1 = constant 1 : i32
%cst2 = constant 2 : i32
%c5 = constant 5 : index
%c6 = constant 6 : index
- %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32>
+ %cast_data = memref.cast %data : memref<2x6xi32> to memref<*xi32>
gpu.host_register %cast_data : memref<*xi32>
- %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32>
+ %cast_sum = memref.cast %sum : memref<2xi32> to memref<*xi32>
gpu.host_register %cast_sum : memref<*xi32>
- store %cst0, %data[%c0, %c0] : memref<2x6xi32>
- store %cst1, %data[%c0, %c1] : memref<2x6xi32>
- store %cst2, %data[%c0, %c2] : memref<2x6xi32>
- store %cst4, %data[%c0, %c3] : memref<2x6xi32>
- store %cst8, %data[%c0, %c4] : memref<2x6xi32>
- store %cst16, %data[%c0, %c5] : memref<2x6xi32>
+ memref.store %cst0, %data[%c0, %c0] : memref<2x6xi32>
+ memref.store %cst1, %data[%c0, %c1] : memref<2x6xi32>
+ memref.store %cst2, %data[%c0, %c2] : memref<2x6xi32>
+ memref.store %cst4, %data[%c0, %c3] : memref<2x6xi32>
+ memref.store %cst8, %data[%c0, %c4] : memref<2x6xi32>
+ memref.store %cst16, %data[%c0, %c5] : memref<2x6xi32>
- store %cst2, %data[%c1, %c0] : memref<2x6xi32>
- store %cst3, %data[%c1, %c1] : memref<2x6xi32>
- store %cst6, %data[%c1, %c2] : memref<2x6xi32>
- store %cst7, %data[%c1, %c3] : memref<2x6xi32>
- store %cst10, %data[%c1, %c4] : memref<2x6xi32>
- store %cst11, %data[%c1, %c5] : memref<2x6xi32>
+ memref.store %cst2, %data[%c1, %c0] : memref<2x6xi32>
+ memref.store %cst3, %data[%c1, %c1] : memref<2x6xi32>
+ memref.store %cst6, %data[%c1, %c2] : memref<2x6xi32>
+ memref.store %cst7, %data[%c1, %c3] : memref<2x6xi32>
+ memref.store %cst10, %data[%c1, %c4] : memref<2x6xi32>
+ memref.store %cst11, %data[%c1, %c5] : memref<2x6xi32>
// MIN
// Launches a (%c2, %c1, %c1) grid with (%c6, %c1, %c1) threads per block.
// Each thread loads %data[%bx, %tx]; the loaded values are combined by
// gpu.all_reduce with op = "min", and every thread stores the reduced value
// to %sum[%bx], so one result is written per block index.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
- %val = load %data[%bx, %tx] : memref<2x6xi32>
+ %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
%reduced = "gpu.all_reduce"(%val) ({}) { op = "min" } : (i32) -> (i32)
- store %reduced, %sum[%bx] : memref<2xi32>
+ memref.store %reduced, %sum[%bx] : memref<2xi32>
gpu.terminator
}
// CHECK-COUNT-8: [{{(5356, ){12}5356}}]
func @main() {
- %arg = alloc() : memref<2x4x13xf32>
- %dst = memref_cast %arg : memref<2x4x13xf32> to memref<?x?x?xf32>
+ %arg = memref.alloc() : memref<2x4x13xf32>
+ %dst = memref.cast %arg : memref<2x4x13xf32> to memref<?x?x?xf32>
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
- %sx = dim %dst, %c2 : memref<?x?x?xf32>
- %sy = dim %dst, %c1 : memref<?x?x?xf32>
- %sz = dim %dst, %c0 : memref<?x?x?xf32>
- %cast_dst = memref_cast %dst : memref<?x?x?xf32> to memref<*xf32>
+ %sx = memref.dim %dst, %c2 : memref<?x?x?xf32>
+ %sy = memref.dim %dst, %c1 : memref<?x?x?xf32>
+ %sz = memref.dim %dst, %c0 : memref<?x?x?xf32>
+ %cast_dst = memref.cast %dst : memref<?x?x?xf32> to memref<*xf32>
gpu.host_register %cast_dst : memref<*xf32>
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %sy, %block_z = %sz) {
%t3 = index_cast %idx : index to i32
%val = sitofp %t3 : i32 to f32
%sum = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
- store %sum, %dst[%tz, %ty, %tx] : memref<?x?x?xf32>
+ memref.store %sum, %dst[%tz, %ty, %tx] : memref<?x?x?xf32>
gpu.terminator
}
call @print_memref_f32(%cast_dst) : (memref<*xf32>) -> ()
// RUN: | FileCheck %s
func @main() {
- %data = alloc() : memref<2x6xi32>
- %sum = alloc() : memref<2xi32>
+ %data = memref.alloc() : memref<2x6xi32>
+ %sum = memref.alloc() : memref<2xi32>
%cst0 = constant 0 : i32
%cst1 = constant 1 : i32
%cst2 = constant 2 : i32
%c5 = constant 5 : index
%c6 = constant 6 : index
- %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32>
+ %cast_data = memref.cast %data : memref<2x6xi32> to memref<*xi32>
gpu.host_register %cast_data : memref<*xi32>
- %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32>
+ %cast_sum = memref.cast %sum : memref<2xi32> to memref<*xi32>
gpu.host_register %cast_sum : memref<*xi32>
- store %cst0, %data[%c0, %c0] : memref<2x6xi32>
- store %cst1, %data[%c0, %c1] : memref<2x6xi32>
- store %cst2, %data[%c0, %c2] : memref<2x6xi32>
- store %cst4, %data[%c0, %c3] : memref<2x6xi32>
- store %cst8, %data[%c0, %c4] : memref<2x6xi32>
- store %cst16, %data[%c0, %c5] : memref<2x6xi32>
+ memref.store %cst0, %data[%c0, %c0] : memref<2x6xi32>
+ memref.store %cst1, %data[%c0, %c1] : memref<2x6xi32>
+ memref.store %cst2, %data[%c0, %c2] : memref<2x6xi32>
+ memref.store %cst4, %data[%c0, %c3] : memref<2x6xi32>
+ memref.store %cst8, %data[%c0, %c4] : memref<2x6xi32>
+ memref.store %cst16, %data[%c0, %c5] : memref<2x6xi32>
- store %cst2, %data[%c1, %c0] : memref<2x6xi32>
- store %cst3, %data[%c1, %c1] : memref<2x6xi32>
- store %cst6, %data[%c1, %c2] : memref<2x6xi32>
- store %cst7, %data[%c1, %c3] : memref<2x6xi32>
- store %cst10, %data[%c1, %c4] : memref<2x6xi32>
- store %cst11, %data[%c1, %c5] : memref<2x6xi32>
+ memref.store %cst2, %data[%c1, %c0] : memref<2x6xi32>
+ memref.store %cst3, %data[%c1, %c1] : memref<2x6xi32>
+ memref.store %cst6, %data[%c1, %c2] : memref<2x6xi32>
+ memref.store %cst7, %data[%c1, %c3] : memref<2x6xi32>
+ memref.store %cst10, %data[%c1, %c4] : memref<2x6xi32>
+ memref.store %cst11, %data[%c1, %c5] : memref<2x6xi32>
// OR
// Launches a (%c2, %c1, %c1) grid with (%c6, %c1, %c1) threads per block.
// Each thread loads %data[%bx, %tx]; the loaded values are combined by
// gpu.all_reduce with op = "or", and every thread stores the reduced value
// to %sum[%bx], so one result is written per block index.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
- %val = load %data[%bx, %tx] : memref<2x6xi32>
+ %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
%reduced = "gpu.all_reduce"(%val) ({}) { op = "or" } : (i32) -> (i32)
- store %reduced, %sum[%bx] : memref<2xi32>
+ memref.store %reduced, %sum[%bx] : memref<2xi32>
gpu.terminator
}
// CHECK: [{{(35, ){34}35}}]
func @main() {
- %arg = alloc() : memref<35xf32>
- %dst = memref_cast %arg : memref<35xf32> to memref<?xf32>
+ %arg = memref.alloc() : memref<35xf32>
+ %dst = memref.cast %arg : memref<35xf32> to memref<?xf32>
%one = constant 1 : index
%c0 = constant 0 : index
- %sx = dim %dst, %c0 : memref<?xf32>
- %cast_dst = memref_cast %dst : memref<?xf32> to memref<*xf32>
+ %sx = memref.dim %dst, %c0 : memref<?xf32>
+ %cast_dst = memref.cast %dst : memref<?xf32> to memref<*xf32>
gpu.host_register %cast_dst : memref<*xf32>
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
"gpu.yield"(%xor) : (i32) -> ()
}) : (i32) -> (i32)
%res = sitofp %xor : i32 to f32
- store %res, %dst[%tx] : memref<?xf32>
+ memref.store %res, %dst[%tx] : memref<?xf32>
gpu.terminator
}
call @print_memref_f32(%cast_dst) : (memref<*xf32>) -> ()
// RUN: | FileCheck %s
func @main() {
- %data = alloc() : memref<2x6xi32>
- %sum = alloc() : memref<2xi32>
+ %data = memref.alloc() : memref<2x6xi32>
+ %sum = memref.alloc() : memref<2xi32>
%cst0 = constant 0 : i32
%cst1 = constant 1 : i32
%cst2 = constant 2 : i32
%c5 = constant 5 : index
%c6 = constant 6 : index
- %cast_data = memref_cast %data : memref<2x6xi32> to memref<*xi32>
+ %cast_data = memref.cast %data : memref<2x6xi32> to memref<*xi32>
gpu.host_register %cast_data : memref<*xi32>
- %cast_sum = memref_cast %sum : memref<2xi32> to memref<*xi32>
+ %cast_sum = memref.cast %sum : memref<2xi32> to memref<*xi32>
gpu.host_register %cast_sum : memref<*xi32>
- store %cst0, %data[%c0, %c0] : memref<2x6xi32>
- store %cst1, %data[%c0, %c1] : memref<2x6xi32>
- store %cst2, %data[%c0, %c2] : memref<2x6xi32>
- store %cst4, %data[%c0, %c3] : memref<2x6xi32>
- store %cst8, %data[%c0, %c4] : memref<2x6xi32>
- store %cst16, %data[%c0, %c5] : memref<2x6xi32>
+ memref.store %cst0, %data[%c0, %c0] : memref<2x6xi32>
+ memref.store %cst1, %data[%c0, %c1] : memref<2x6xi32>
+ memref.store %cst2, %data[%c0, %c2] : memref<2x6xi32>
+ memref.store %cst4, %data[%c0, %c3] : memref<2x6xi32>
+ memref.store %cst8, %data[%c0, %c4] : memref<2x6xi32>
+ memref.store %cst16, %data[%c0, %c5] : memref<2x6xi32>
- store %cst2, %data[%c1, %c0] : memref<2x6xi32>
- store %cst3, %data[%c1, %c1] : memref<2x6xi32>
- store %cst6, %data[%c1, %c2] : memref<2x6xi32>
- store %cst7, %data[%c1, %c3] : memref<2x6xi32>
- store %cst10, %data[%c1, %c4] : memref<2x6xi32>
- store %cst11, %data[%c1, %c5] : memref<2x6xi32>
+ memref.store %cst2, %data[%c1, %c0] : memref<2x6xi32>
+ memref.store %cst3, %data[%c1, %c1] : memref<2x6xi32>
+ memref.store %cst6, %data[%c1, %c2] : memref<2x6xi32>
+ memref.store %cst7, %data[%c1, %c3] : memref<2x6xi32>
+ memref.store %cst10, %data[%c1, %c4] : memref<2x6xi32>
+ memref.store %cst11, %data[%c1, %c5] : memref<2x6xi32>
// XOR
// Launches a (%c2, %c1, %c1) grid with (%c6, %c1, %c1) threads per block.
// Each thread loads %data[%bx, %tx]; the loaded values are combined by
// gpu.all_reduce with op = "xor", and every thread stores the reduced value
// to %sum[%bx], so one result is written per block index.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
- %val = load %data[%bx, %tx] : memref<2x6xi32>
+ %val = memref.load %data[%bx, %tx] : memref<2x6xi32>
%reduced = "gpu.all_reduce"(%val) ({}) { op = "xor" } : (i32) -> (i32)
- store %reduced, %sum[%bx] : memref<2xi32>
+ memref.store %reduced, %sum[%bx] : memref<2xi32>
gpu.terminator
}
%count = constant 2 : index
// initialize h0 on host
- %h0 = alloc(%count) : memref<?xi32>
- %h0_unranked = memref_cast %h0 : memref<?xi32> to memref<*xi32>
+ %h0 = memref.alloc(%count) : memref<?xi32>
+ %h0_unranked = memref.cast %h0 : memref<?xi32> to memref<*xi32>
gpu.host_register %h0_unranked : memref<*xi32>
%v0 = constant 42 : i32
- store %v0, %h0[%c0] : memref<?xi32>
- store %v0, %h0[%c1] : memref<?xi32>
+ memref.store %v0, %h0[%c0] : memref<?xi32>
+ memref.store %v0, %h0[%c1] : memref<?xi32>
// copy h0 to b0 on device.
%t0, %f0 = async.execute () -> !async.value<memref<?xi32>> {
) {
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %c1, %block_z = %c1) {
- %v1 = load %b1[%tx] : memref<?xi32>
- %v2 = load %b2[%tx] : memref<?xi32>
+ %v1 = memref.load %b1[%tx] : memref<?xi32>
+ %v2 = memref.load %b2[%tx] : memref<?xi32>
%sum = addi %v1, %v2 : i32
- store %sum, %h0[%tx] : memref<?xi32>
+ memref.store %sum, %h0[%tx] : memref<?xi32>
gpu.terminator
}
async.yield
func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
%cst = constant 1 : index
%c0 = constant 0 : index
- %cst2 = dim %arg1, %c0 : memref<?xf32>
+ %cst2 = memref.dim %arg1, %c0 : memref<?xf32>
// Launches a grid of size %cst (the index constant 1) in every dimension,
// with %cst2 threads along x, where %cst2 is the size of dimension 0 of
// %arg1. Every thread writes the scalar %arg0 to %arg1[%tx], i.e. the
// kernel fills %arg1 with %arg0.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst)
threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) {
- store %arg0, %arg1[%tx] : memref<?xf32>
+ memref.store %arg0, %arg1[%tx] : memref<?xf32>
gpu.terminator
}
return
// CHECK: [1, 1, 1, 1, 1]
func @main() {
- %arg0 = alloc() : memref<5xf32>
+ %arg0 = memref.alloc() : memref<5xf32>
%21 = constant 5 : i32
- %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
- %23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
+ %22 = memref.cast %arg0 : memref<5xf32> to memref<?xf32>
+ %23 = memref.cast %22 : memref<?xf32> to memref<*xf32>
gpu.host_register %23 : memref<*xf32>
call @print_memref_f32(%23) : (memref<*xf32>) -> ()
%24 = constant 1.0 : f32
// RUN: | FileCheck %s
func @main() {
- %data = alloc() : memref<2x6xf32>
- %sum = alloc() : memref<2xf32>
- %mul = alloc() : memref<2xf32>
+ %data = memref.alloc() : memref<2x6xf32>
+ %sum = memref.alloc() : memref<2xf32>
+ %mul = memref.alloc() : memref<2xf32>
%cst0 = constant 0.0 : f32
%cst1 = constant 1.0 : f32
%cst2 = constant 2.0 : f32
%c5 = constant 5 : index
%c6 = constant 6 : index
- %cast_data = memref_cast %data : memref<2x6xf32> to memref<*xf32>
+ %cast_data = memref.cast %data : memref<2x6xf32> to memref<*xf32>
gpu.host_register %cast_data : memref<*xf32>
- %cast_sum = memref_cast %sum : memref<2xf32> to memref<*xf32>
+ %cast_sum = memref.cast %sum : memref<2xf32> to memref<*xf32>
gpu.host_register %cast_sum : memref<*xf32>
- %cast_mul = memref_cast %mul : memref<2xf32> to memref<*xf32>
+ %cast_mul = memref.cast %mul : memref<2xf32> to memref<*xf32>
gpu.host_register %cast_mul : memref<*xf32>
- store %cst0, %data[%c0, %c0] : memref<2x6xf32>
- store %cst1, %data[%c0, %c1] : memref<2x6xf32>
- store %cst2, %data[%c0, %c2] : memref<2x6xf32>
- store %cst4, %data[%c0, %c3] : memref<2x6xf32>
- store %cst8, %data[%c0, %c4] : memref<2x6xf32>
- store %cst16, %data[%c0, %c5] : memref<2x6xf32>
+ memref.store %cst0, %data[%c0, %c0] : memref<2x6xf32>
+ memref.store %cst1, %data[%c0, %c1] : memref<2x6xf32>
+ memref.store %cst2, %data[%c0, %c2] : memref<2x6xf32>
+ memref.store %cst4, %data[%c0, %c3] : memref<2x6xf32>
+ memref.store %cst8, %data[%c0, %c4] : memref<2x6xf32>
+ memref.store %cst16, %data[%c0, %c5] : memref<2x6xf32>
- store %cst2, %data[%c1, %c0] : memref<2x6xf32>
- store %cst3, %data[%c1, %c1] : memref<2x6xf32>
- store %cst6, %data[%c1, %c2] : memref<2x6xf32>
- store %cst7, %data[%c1, %c3] : memref<2x6xf32>
- store %cst10, %data[%c1, %c4] : memref<2x6xf32>
- store %cst11, %data[%c1, %c5] : memref<2x6xf32>
+ memref.store %cst2, %data[%c1, %c0] : memref<2x6xf32>
+ memref.store %cst3, %data[%c1, %c1] : memref<2x6xf32>
+ memref.store %cst6, %data[%c1, %c2] : memref<2x6xf32>
+ memref.store %cst7, %data[%c1, %c3] : memref<2x6xf32>
+ memref.store %cst10, %data[%c1, %c4] : memref<2x6xf32>
+ memref.store %cst11, %data[%c1, %c5] : memref<2x6xf32>
// ADD + MUL
// Launches a (%c2, %c1, %c1) grid with (%c6, %c1, %c1) threads per block.
// Each thread loads %data[%bx, %tx] once and feeds the same value into two
// reductions: gpu.all_reduce with op = "add" is stored to %sum[%bx], and
// gpu.all_reduce with op = "mul" is stored to %mul[%bx].
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
- %val = load %data[%bx, %tx] : memref<2x6xf32>
+ %val = memref.load %data[%bx, %tx] : memref<2x6xf32>
%reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
- store %reduced0, %sum[%bx] : memref<2xf32>
+ memref.store %reduced0, %sum[%bx] : memref<2xf32>
%reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32)
- store %reduced1, %mul[%bx] : memref<2xf32>
+ memref.store %reduced1, %mul[%bx] : memref<2xf32>
gpu.terminator
}
// CHECK: [4, 5, 6, 7, 0, 1, 2, 3, 12, -1, -1, -1, 8]
func @main() {
- %arg = alloc() : memref<13xf32>
- %dst = memref_cast %arg : memref<13xf32> to memref<?xf32>
+ %arg = memref.alloc() : memref<13xf32>
+ %dst = memref.cast %arg : memref<13xf32> to memref<?xf32>
%one = constant 1 : index
%c0 = constant 0 : index
- %sx = dim %dst, %c0 : memref<?xf32>
- %cast_dst = memref_cast %dst : memref<?xf32> to memref<*xf32>
+ %sx = memref.dim %dst, %c0 : memref<?xf32>
+ %cast_dst = memref.cast %dst : memref<?xf32> to memref<*xf32>
gpu.host_register %cast_dst : memref<*xf32>
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
%m1 = constant -1.0 : f32
br ^bb1(%m1 : f32)
^bb1(%value : f32):
- store %value, %dst[%tx] : memref<?xf32>
+ memref.store %value, %dst[%tx] : memref<?xf32>
gpu.terminator
}
call @print_memref_f32(%cast_dst) : (memref<*xf32>) -> ()
// CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
func @main() {
- %arg = alloc() : memref<13xi32>
- %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
+ %arg = memref.alloc() : memref<13xi32>
+ %dst = memref.cast %arg : memref<13xi32> to memref<?xi32>
%one = constant 1 : index
%c0 = constant 0 : index
- %sx = dim %dst, %c0 : memref<?xi32>
- %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
+ %sx = memref.dim %dst, %c0 : memref<?xi32>
+ %cast_dst = memref.cast %dst : memref<?xi32> to memref<*xi32>
gpu.host_register %cast_dst : memref<*xi32>
// First launch: a grid of size %one in every dimension, with %sx threads
// along x (%sx is the size of dimension 0 of %dst). Each thread casts its
// x thread index to i32 and stores it at %dst[%tx].
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
%t0 = index_cast %tx : index to i32
- store %t0, %dst[%tx] : memref<?xi32>
+ memref.store %t0, %dst[%tx] : memref<?xi32>
gpu.terminator
}
// Second launch: same configuration and same body as the first, so it
// rewrites %dst with identical values.
// NOTE(review): presumably this exercises launching more than one kernel
// from the same function - confirm against the test's RUN/CHECK intent.
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %one, %block_z = %one) {
%t0 = index_cast %tx : index to i32
- store %t0, %dst[%tx] : memref<?xi32>
+ memref.store %t0, %dst[%tx] : memref<?xi32>
gpu.terminator
}
call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()