From: Aart Bik
Date: Tue, 29 Nov 2022 23:02:38 +0000 (-0800)
Subject: [mlir][sparse][vectorization] implement "index" vectorization
X-Git-Tag: upstream/17.0.6~25825
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2fda6207118d1d1c19e3b66f615f332ffc2792d0;p=platform%2Fupstream%2Fllvm.git

[mlir][sparse][vectorization] implement "index" vectorization

This adds the capability to vectorize computations like a[i] = i.
This also generalizes the supported unary and binary ops and adds
a test for each to ensure actual SIMD code can result.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D138956
---

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
index 028a471..7d6ac51 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
@@ -216,7 +216,8 @@ static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
 /// The first call (!codegen) does the analysis. Then, on success, the second
 /// call (codegen) yields the proper vector form in the output parameter
 /// vector 'idxs'. This mechanism ensures that analysis and rewriting code
-/// stay in sync.
+/// stay in sync. Note that the analysis part is simple because the sparse
+/// compiler only generates relatively simple subscript expressions.
 ///
 /// See https://llvm.org/docs/GetElementPtr.html for some background on
 /// the complications described below.
@@ -234,7 +235,7 @@ static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
                                 VL vl, ValueRange subs, bool codegen,
                                 Value vmask, SmallVectorImpl<Value> &idxs) {
   for (auto sub : subs) {
-    // Invariant indices simply pass through.
+    // Invariant/loop indices simply pass through.
     if (sub.dyn_cast<BlockArgument>() ||
         sub.getDefiningOp()->getBlock() != &forOp.getRegion().front()) {
       if (codegen)
@@ -293,6 +294,15 @@ static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
     return true; \
   }
 
+#define TYPEDUNAOP(xxx) \
+  if (auto x = dyn_cast<xxx>(def)) { \
+    if (codegen) { \
+      VectorType vtp = vectorType(vl, x.getType()); \
+      vexp = rewriter.create<xxx>(loc, vtp, vx); \
+    } \
+    return true; \
+  }
+
 #define BINOP(xxx) \
   if (isa<xxx>(def)) { \
     if (codegen) \
@@ -303,27 +313,60 @@ static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
 /// This method is called twice to analyze and rewrite the given expression.
 /// The first call (!codegen) does the analysis. Then, on success, the second
 /// call (codegen) yields the proper vector form in the output parameter 'vexp'.
-/// This mechanism ensures that analysis and rewriting code stay in sync.
+/// This mechanism ensures that analysis and rewriting code stay in sync. Note
+/// that the analysis part is simple because the sparse compiler only generates
+/// relatively simple expressions inside the for-loops.
 static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                           Value exp, bool codegen, Value vmask, Value &vexp) {
-  // A block argument in invariant.
+  Location loc = forOp.getLoc();
+  // Reject unsupported types.
+  if (!VectorType::isValidElementType(exp.getType()))
+    return false;
+  // A block argument is invariant/reduction/index.
   if (auto arg = exp.dyn_cast<BlockArgument>()) {
-    if (codegen)
-      vexp = genVectorInvariantValue(rewriter, vl, exp);
-    return true;
+    if (arg == forOp.getInductionVar()) {
+      // We encountered a single, innermost index inside the computation,
+      // such as a[i] = i, which must convert to [i, i+1, ...].
+      if (codegen) {
+        VectorType vtp = vectorType(vl, arg.getType());
+        Value veci = rewriter.create<vector::BroadcastOp>(loc, vtp, arg);
+        Value incr;
+        if (vl.enableVLAVectorization) {
+          Type stepvty = vectorType(vl, rewriter.getI64Type());
+          Value stepv = rewriter.create<LLVM::StepVectorOp>(loc, stepvty);
+          incr = rewriter.create<arith::IndexCastOp>(loc, vtp, stepv);
+        } else {
+          SmallVector<APInt> integers;
+          for (unsigned i = 0, l = vl.vectorLength; i < l; i++)
+            integers.push_back(APInt(/*width=*/64, i));
+          auto values = DenseElementsAttr::get(vtp, integers);
+          incr = rewriter.create<arith::ConstantOp>(loc, vtp, values);
+        }
+        vexp = rewriter.create<arith::AddIOp>(loc, veci, incr);
+      }
+      return true;
+    } else {
+      // An invariant or reduction. In both cases, we treat this as an
+      // invariant value, and rely on later replacing and folding to
+      // construct a proper reduction chain for the latter case.
+      if (codegen)
+        vexp = genVectorInvariantValue(rewriter, vl, exp);
+      return true;
+    }
   }
-  // Something defined outside the loop-body is invariant as well.
+  // Something defined outside the loop-body is invariant.
   Operation *def = exp.getDefiningOp();
   if (def->getBlock() != &forOp.getRegion().front()) {
     if (codegen)
       vexp = genVectorInvariantValue(rewriter, vl, exp);
     return true;
   }
-  // Inside loop-body unary and binary operations. Note that it would be
-  // nicer if we could somehow test and build the operations in a more
-  // concise manner than just listing them all (although this way we know
-  // for certain that they can vectorize).
-  Location loc = forOp.getLoc();
+  // Proper load operations. These are either values involved in the
+  // actual computation, such as a[i] = b[i] becomes a[lo:hi] = b[lo:hi],
+  // or index values inside the computation that are now fetched from
+  // the sparse storage index arrays, such as a[i] = i becomes
+  // a[lo:hi] = ind[lo:hi], where 'lo' denotes the current index
+  // and 'hi = lo + vl - 1'.
   if (auto load = dyn_cast<memref::LoadOp>(def)) {
     auto subs = load.getIndices();
     SmallVector<Value> idxs;
@@ -332,7 +375,16 @@ static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
       vexp = genVectorLoad(rewriter, loc, vl, load.getMemRef(), idxs, vmask);
       return true;
     }
-  } else if (def->getNumOperands() == 1) {
+    return false;
+  }
+  // Inside loop-body unary and binary operations. Note that it would be
+  // nicer if we could somehow test and build the operations in a more
+  // concise manner than just listing them all (although this way we know
+  // for certain that they can vectorize).
+  //
+  // TODO: avoid visiting CSEs multiple times
+  //
+  if (def->getNumOperands() == 1) {
     Value vx;
     if (vectorizeExpr(rewriter, forOp, vl, def->getOperand(0), codegen, vmask,
                       vx)) {
@@ -346,6 +398,17 @@ static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
       UNAOP(math::SinOp)
       UNAOP(math::TanhOp)
       UNAOP(arith::NegFOp)
+      TYPEDUNAOP(arith::TruncFOp)
+      TYPEDUNAOP(arith::ExtFOp)
+      TYPEDUNAOP(arith::FPToSIOp)
+      TYPEDUNAOP(arith::FPToUIOp)
+      TYPEDUNAOP(arith::SIToFPOp)
+      TYPEDUNAOP(arith::UIToFPOp)
+      TYPEDUNAOP(arith::ExtSIOp)
+      TYPEDUNAOP(arith::ExtUIOp)
+      TYPEDUNAOP(arith::IndexCastOp)
+      TYPEDUNAOP(arith::TruncIOp)
+      TYPEDUNAOP(arith::BitcastOp)
     }
   } else if (def->getNumOperands() == 2) {
     Value vx, vy;
@@ -365,12 +428,14 @@ static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
       BINOP(arith::AndIOp)
       BINOP(arith::OrIOp)
       BINOP(arith::XOrIOp)
+      // TODO: shift by invariant?
     }
   }
   return false;
 }
 
 #undef UNAOP
+#undef TYPEDUNAOP
 #undef BINOP
 
 /// This method is called twice to analyze and rewrite the given for-loop.
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
new file mode 100644
index 0000000..37d5b80
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
@@ -0,0 +1,124 @@
+// RUN: mlir-opt %s -sparsification -cse -sparse-vectorization="vl=8" -cse | \
+// RUN:   FileCheck %s
+
+// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+#SparseVector = #sparse_tensor.encoding<{
+  dimLevelType = ["compressed"]
+}>
+
+#trait_1d = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>   // x (out)
+  ],
+  iterator_types = ["parallel"],
+  doc = "X(i) = a(i) op i"
+}
+
+// CHECK-LABEL: func.func @sparse_index_1d_conj(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>) -> tensor<8xi64> {
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<0> : vector<8xi64>
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xindex>
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.empty() : tensor<8xi64>
+// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xi64>
+// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : memref<8xi64>
+// CHECK: linalg.fill ins(%[[VAL_4]] : i64) outs(%[[VAL_11]] : memref<8xi64>)
+// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_1]] {
+// CHECK: %[[VAL_15:.*]] = affine.min #map1(%[[VAL_13]], %[[VAL_14]]){{\[}}%[[VAL_1]]]
+// CHECK: %[[VAL_16:.*]] = vector.create_mask %[[VAL_15]] : vector<8xi1>
+// CHECK: %[[VAL_17:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_14]]], %[[VAL_16]], %[[VAL_3]] : memref<?xindex>, vector<8xi1>, vector<8xindex> into vector<8xindex>
+// CHECK: %[[VAL_18:.*]] = vector.maskedload %[[VAL_10]]{{\[}}%[[VAL_14]]], %[[VAL_16]], %[[VAL_2]] : memref<?xi64>, vector<8xi1>, vector<8xi64> into vector<8xi64>
+// CHECK: %[[VAL_19:.*]] = arith.index_cast %[[VAL_17]] : vector<8xindex> to vector<8xi64>
+// CHECK: %[[VAL_20:.*]] = arith.muli %[[VAL_18]], %[[VAL_19]] : vector<8xi64>
+// CHECK: vector.scatter %[[VAL_11]]{{\[}}%[[VAL_5]]] {{\[}}%[[VAL_17]]], %[[VAL_16]], %[[VAL_20]] : memref<8xi64>, vector<8xindex>, vector<8xi1>, vector<8xi64>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_21:.*]] = bufferization.to_tensor %[[VAL_11]] : memref<8xi64>
+// CHECK: return %[[VAL_21]] : tensor<8xi64>
+// CHECK: }
+func.func @sparse_index_1d_conj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8xi64> {
+  %init = tensor.empty() : tensor<8xi64>
+  %r = linalg.generic #trait_1d
+      ins(%arga: tensor<8xi64, #SparseVector>)
+      outs(%init: tensor<8xi64>) {
+      ^bb(%a: i64, %x: i64):
+        %i = linalg.index 0 : index
+        %ii = arith.index_cast %i : index to i64
+        %m1 = arith.muli %a, %ii : i64
+        linalg.yield %m1 : i64
+  } -> tensor<8xi64>
+  return %r : tensor<8xi64>
+}
+
+// CHECK-LABEL: func.func @sparse_index_1d_disj(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>) -> tensor<8xi64> {
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.empty() : tensor<8xi64>
+// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xi64>
+// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : memref<8xi64>
+// CHECK: linalg.fill ins(%[[VAL_3]] : i64) outs(%[[VAL_11]] : memref<8xi64>)
+// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index
+// CHECK: scf.condition(%[[VAL_17]]) %[[VAL_15]], %[[VAL_16]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_18:.*]]: index, %[[VAL_19:.*]]: index):
+// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_18]]] : memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = arith.cmpi eq, %[[VAL_20]], %[[VAL_19]] : index
+// CHECK: scf.if %[[VAL_21]] {
+// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref<?xi64>
+// CHECK: %[[VAL_23:.*]] = arith.index_cast %[[VAL_19]] : index to i64
+// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_22]], %[[VAL_23]] : i64
+// CHECK: memref.store %[[VAL_24]], %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<8xi64>
+// CHECK: } else {
+// CHECK: scf.if %[[VAL_6]] {
+// CHECK: %[[VAL_25:.*]] = arith.index_cast %[[VAL_19]] : index to i64
+// CHECK: memref.store %[[VAL_25]], %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<8xi64>
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_18]], %[[VAL_5]] : index
+// CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_21]], %[[VAL_26]], %[[VAL_18]] : index
+// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_19]], %[[VAL_5]] : index
+// CHECK: scf.yield %[[VAL_27]], %[[VAL_28]] : index, index
+// CHECK: } attributes {"Emitted from" = "linalg.generic"}
+// CHECK: scf.for %[[VAL_29:.*]] = %[[VAL_30:.*]]#1 to %[[VAL_1]] step %[[VAL_1]] {
+// CHECK: %[[VAL_31:.*]] = affine.min #map1(%[[VAL_1]], %[[VAL_29]]){{\[}}%[[VAL_1]]]
+// CHECK: %[[VAL_32:.*]] = vector.create_mask %[[VAL_31]] : vector<8xi1>
+// CHECK: %[[VAL_33:.*]] = vector.broadcast %[[VAL_29]] : index to vector<8xindex>
+// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_33]], %[[VAL_2]] : vector<8xindex>
+// CHECK: %[[VAL_35:.*]] = arith.index_cast %[[VAL_34]] : vector<8xindex> to vector<8xi64>
+// CHECK: vector.maskedstore %[[VAL_11]]{{\[}}%[[VAL_29]]], %[[VAL_32]], %[[VAL_35]] : memref<8xi64>, vector<8xi1>, vector<8xi64>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_36:.*]] = bufferization.to_tensor %[[VAL_11]] : memref<8xi64>
+// CHECK: return %[[VAL_36]] : tensor<8xi64>
+// CHECK: }
+func.func @sparse_index_1d_disj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8xi64> {
+  %init = tensor.empty() : tensor<8xi64>
+  %r = linalg.generic #trait_1d
+      ins(%arga: tensor<8xi64, #SparseVector>)
+      outs(%init: tensor<8xi64>) {
+      ^bb(%a: i64, %x: i64):
+        %i = linalg.index 0 : index
+        %ii = arith.index_cast %i : index to i64
+        %m1 = arith.addi %a, %ii : i64
+        linalg.yield %m1 : i64
+  } -> tensor<8xi64>
+  return %r : tensor<8xi64>
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir
new file mode 100644
index 0000000..32900d9
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir
@@ -0,0 +1,77 @@
+// RUN: mlir-opt %s -sparsification -cse -sparse-vectorization="vl=8" -cse | \
+// RUN:   FileCheck %s
+
+#DenseVector = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
+
+#trait = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>,  // b
+    affine_map<(i) -> (i)>   // x (out)
+  ],
+  iterator_types = ["parallel"],
+  doc = "x(i) = a(i) ops b(i)"
+}
+
+// CHECK-LABEL: func.func @vops
+// CHECK-DAG: %[[C1:.*]] = arith.constant dense<2.000000e+00> : vector<8xf32>
+// CHECK-DAG: %[[C2:.*]] = arith.constant dense<1.000000e+00> : vector<8xf32>
+// CHECK-DAG: %[[C3:.*]] = arith.constant dense<255> : vector<8xi64>
+// CHECK: scf.for
+// CHECK: %[[VAL_14:.*]] = vector.load
+// CHECK: %[[VAL_15:.*]] = math.absf %[[VAL_14]] : vector<8xf32>
+// CHECK: %[[VAL_16:.*]] = math.ceil %[[VAL_15]] : vector<8xf32>
+// CHECK: %[[VAL_17:.*]] = math.floor %[[VAL_16]] : vector<8xf32>
+// CHECK: %[[VAL_18:.*]] = math.sqrt %[[VAL_17]] : vector<8xf32>
+// CHECK: %[[VAL_19:.*]] = math.expm1 %[[VAL_18]] : vector<8xf32>
+// CHECK: %[[VAL_20:.*]] = math.sin %[[VAL_19]] : vector<8xf32>
+// CHECK: %[[VAL_21:.*]] = math.tanh %[[VAL_20]] : vector<8xf32>
+// CHECK: %[[VAL_22:.*]] = arith.negf %[[VAL_21]] : vector<8xf32>
+// CHECK: %[[VAL_23:.*]] = vector.load
+// CHECK: %[[VAL_24:.*]] = arith.mulf %[[VAL_22]], %[[VAL_23]] : vector<8xf32>
+// CHECK: %[[VAL_25:.*]] = arith.divf %[[VAL_24]], %[[C1]] : vector<8xf32>
+// CHECK: %[[VAL_26:.*]] = arith.addf %[[VAL_25]], %[[C1]] : vector<8xf32>
+// CHECK: %[[VAL_27:.*]] = arith.subf %[[VAL_26]], %[[C2]] : vector<8xf32>
+// CHECK: %[[VAL_28:.*]] = arith.extf %[[VAL_27]] : vector<8xf32> to vector<8xf64>
+// CHECK: %[[VAL_29:.*]] = arith.bitcast %[[VAL_28]] : vector<8xf64> to vector<8xi64>
+// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_29]], %[[VAL_29]] : vector<8xi64>
+// CHECK: %[[VAL_31:.*]] = arith.andi %[[VAL_30]], %[[C3]] : vector<8xi64>
+// CHECK: %[[VAL_32:.*]] = arith.trunci %[[VAL_31]] : vector<8xi64> to vector<8xi16>
+// CHECK: %[[VAL_33:.*]] = arith.extsi %[[VAL_32]] : vector<8xi16> to vector<8xi32>
+// CHECK: %[[VAL_34:.*]] = arith.uitofp %[[VAL_33]] : vector<8xi32> to vector<8xf32>
+// CHECK: vector.store %[[VAL_34]]
+// CHECK: }
+func.func @vops(%arga: tensor<1024xf32, #DenseVector>,
+                %argb: tensor<1024xf32, #DenseVector>) -> tensor<1024xf32> {
+  %init = bufferization.alloc_tensor() : tensor<1024xf32>
+  %o = arith.constant 1.0 : f32
+  %c = arith.constant 2.0 : f32
+  %i = arith.constant 255 : i64
+  %0 = linalg.generic #trait
+     ins(%arga, %argb: tensor<1024xf32, #DenseVector>, tensor<1024xf32, #DenseVector>)
+     outs(%init: tensor<1024xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = math.absf %a : f32
+        %1 = math.ceil %0 : f32
+        %2 = math.floor %1 : f32
+        %3 = math.sqrt %2 : f32
+        %4 = math.expm1 %3 : f32
+        %5 = math.sin %4 : f32
+        %6 = math.tanh %5 : f32
+        %7 = arith.negf %6 : f32
+        %8 = arith.mulf %7, %b : f32
+        %9 = arith.divf %8, %c : f32
+        %10 = arith.addf %9, %c : f32
+        %11 = arith.subf %10, %o : f32
+        %12 = arith.extf %11 : f32 to f64
+        %13 = arith.bitcast %12 : f64 to i64
+        %14 = arith.addi %13, %13 : i64
+        %15 = arith.andi %14, %i : i64
+        %16 = arith.trunci %15 : i64 to i16
+        %17 = arith.extsi %16 : i16 to i32
+        %18 = arith.uitofp %17 : i32 to f32
+        linalg.yield %18 : f32
+  } -> tensor<1024xf32>
+  return %0 : tensor<1024xf32>
+}
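
For reference, the "[i, i+1, ...]" form that the new induction-variable case in vectorizeExpr produces (and that the sparse_index_1d_disj checks above verify) can be sketched as follows for a fixed vector length of 8. This is a hand-written approximation, not output of the patch; %i, %mem, and %mask are placeholder names.

    // Sketch (assumption): expand the scalar loop index %i into the vector
    // [i, i+1, ..., i+7] by broadcasting %i and adding a constant step vector,
    // mirroring the vector.broadcast / arith.addi sequence checked above.
    %step = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
    %bcst = vector.broadcast %i : index to vector<8xindex>
    %iota = arith.addi %bcst, %step : vector<8xindex>
    // Cast to the element type of the output and store under the loop mask.
    %ivec = arith.index_cast %iota : vector<8xindex> to vector<8xi64>
    vector.maskedstore %mem[%i], %mask, %ivec : memref<8xi64>, vector<8xi1>, vector<8xi64>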