From 914eff5a8ca9a05cf14ca758c282e8ca8987967d Mon Sep 17 00:00:00 2001
From: Aart Bik
Date: Fri, 18 Nov 2022 16:57:02 -0800
Subject: [PATCH] [mlir][sparse][vector] ensure loop peeling to remove vector masks works

Reviewed By: Peiming

Differential Revision: https://reviews.llvm.org/D138343
---
 .../Dialect/SparseTensor/sparse_vector_peeled.mlir | 63 ++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir

diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
new file mode 100644
index 0000000..d379965
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_peeled.mlir
@@ -0,0 +1,63 @@
+// RUN: mlir-opt %s --sparsification -cse -sparse-vectorization="vl=16" -scf-for-loop-peeling -canonicalize -cse | \
+// RUN:   FileCheck %s
+
+#SparseVector = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed" ],
+  pointerBitWidth = 32,
+  indexBitWidth = 32
+}>
+
+#trait_mul_s = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>,  // b
+    affine_map<(i) -> (i)>   // x (out)
+  ],
+  iterator_types = ["parallel"],
+  doc = "x(i) = a(i) * b(i)"
+}
+
+// CHECK-DAG: #[[$map0:.*]] = affine_map<()[s0, s1] -> (s0 + ((-s0 + s1) floordiv 16) * 16)>
+// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
+// CHECK-LABEL: func @mul_s
+// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[c16:.*]] = arith.constant 16 : index
+// CHECK: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
+// CHECK: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
+// CHECK: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
+// CHECK: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
+// CHECK: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
+// CHECK: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
+// CHECK: %[[boundary:.*]] = affine.apply #[[$map0]]()[%[[q]], %[[s]]]
+// CHECK: scf.for %[[i:.*]] = %[[q]] to %[[boundary]] step %[[c16]] {
+// CHECK:   %[[mask:.*]] = vector.constant_mask [16] : vector<16xi1>
+// CHECK:   %[[li:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xi32>, vector<16xi32>
+// CHECK:   %[[zi:.*]] = arith.extui %[[li]] : vector<16xi32> to vector<16xi64>
+// CHECK:   %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
+// CHECK:   %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK:   %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
+// CHECK:   vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
+// CHECK: }
+// CHECK: scf.for %[[i2:.*]] = %[[boundary]] to %[[s]] step %[[c16]] {
+// CHECK:   %[[sub:.*]] = affine.apply #[[$map1]](%[[i2]])[%[[s]]]
+// CHECK:   %[[mask2:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
+// CHECK:   %[[li2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK:   %[[zi2:.*]] = arith.extui %[[li2]] : vector<16xi32> to vector<16xi64>
+// CHECK:   %[[la2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK:   %[[lb2:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK:   %[[m2:.*]] = arith.mulf %[[la2]], %[[lb2]] : vector<16xf32>
+// CHECK:   vector.scatter %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %[[m2]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
+// CHECK: }
+// CHECK: return
+//
+func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>, %argb: tensor<1024xf32>, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
+  %0 = linalg.generic #trait_mul_s
+    ins(%arga, %argb: tensor<1024xf32, #SparseVector>, tensor<1024xf32>)
+    outs(%argx: tensor<1024xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = arith.mulf %a, %b : f32
+        linalg.yield %0 : f32
+  } -> tensor<1024xf32>
+  return %0 : tensor<1024xf32>
+}
-- 
2.7.4
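
A minimal C sketch of the loop structure the CHECK lines above verify (not part of the patch; names and the dense scalar formulation are illustrative only, the sparse gather/scatter is elided). It assumes vector length 16: the boundary corresponds to the #map0 affine map, the main loop runs only full 16-lane iterations under a constant all-true mask, and only the peeled remainder needs a dynamic mask of length hi - i (the #map1 affine map).

    /* Illustrative sketch of loop peeling with vl = 16; not the actual
     * MLIR lowering, just the index arithmetic behind the two loops. */
    #include <stddef.h>

    enum { VL = 16 };

    static void mul_peeled(const float *a, const float *b, float *x,
                           size_t lo, size_t hi) {
      /* boundary = lo + ((hi - lo) / VL) * VL, mirroring #map0()[lo, hi]. */
      size_t boundary = lo + ((hi - lo) / VL) * VL;

      /* Main loop: trip count is a multiple of VL, so no mask is needed
       * (vector.constant_mask [16] in the test). */
      for (size_t i = lo; i < boundary; i += VL)
        for (size_t l = 0; l < VL; ++l)          /* stands in for one vector op */
          x[i + l] = a[i + l] * b[i + l];

      /* Peeled remainder: at most one iteration, restricted to hi - i lanes
       * (vector.create_mask of #map1(i)[hi] in the test). */
      for (size_t i = boundary; i < hi; i += VL) {
        size_t lanes = hi - i;
        for (size_t l = 0; l < lanes; ++l)
          x[i + l] = a[i + l] * b[i + l];
      }
    }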