From: Aart Bik
Date: Tue, 29 Nov 2022 23:02:38 +0000 (-0800)
Subject: [mlir][sparse][vectorization] implement "index" vectorization
X-Git-Tag: upstream/17.0.6~25825
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2fda6207118d1d1c19e3b66f615f332ffc2792d0;p=platform%2Fupstream%2Fllvm.git

[mlir][sparse][vectorization] implement "index" vectorization

This adds the capability to vectorize computations like a[i] = i.
This also generalizes the supported unary and binary ops and adds
a test for each to ensure actual SIMD code can result.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D138956
---

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
index 028a471..7d6ac51 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
@@ -216,7 +216,8 @@ static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
 /// The first call (!codegen) does the analysis. Then, on success, the second
 /// call (codegen) yields the proper vector form in the output parameter
 /// vector 'idxs'. This mechanism ensures that analysis and rewriting code
-/// stay in sync.
+/// stay in sync. Note that the analysis part is simple because the sparse
+/// compiler only generates relatively simple subscript expressions.
 ///
 /// See https://llvm.org/docs/GetElementPtr.html for some background on
 /// the complications described below.
@@ -234,7 +235,7 @@ static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
                                 VL vl, ValueRange subs, bool codegen,
                                 Value vmask, SmallVectorImpl<Value> &idxs) {
   for (auto sub : subs) {
-    // Invariant indices simply pass through.
+    // Invariant/loop indices simply pass through.
     if (sub.dyn_cast<BlockArgument>() ||
         sub.getDefiningOp()->getBlock() != &forOp.getRegion().front()) {
       if (codegen)
@@ -293,6 +294,15 @@ static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
     return true; \
   }
 
+#define TYPEDUNAOP(xxx) \
+  if (auto x = dyn_cast<xxx>(def)) { \
+    if (codegen) { \
+      VectorType vtp = vectorType(vl, x.getType()); \
+      vexp = rewriter.create<xxx>(loc, vtp, vx); \
+    } \
+    return true; \
+  }
+
 #define BINOP(xxx) \
   if (isa<xxx>(def)) { \
     if (codegen) \
@@ -303,27 +313,60 @@ static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
 /// This method is called twice to analyze and rewrite the given expression.
 /// The first call (!codegen) does the analysis. Then, on success, the second
 /// call (codegen) yields the proper vector form in the output parameter 'vexp'.
-/// This mechanism ensures that analysis and rewriting code stay in sync.
+/// This mechanism ensures that analysis and rewriting code stay in sync. Note
+/// that the analysis part is simple because the sparse compiler only generates
+/// relatively simple expressions inside the for-loops.
 static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                           Value exp, bool codegen, Value vmask, Value &vexp) {
-  // A block argument in invariant.
+  Location loc = forOp.getLoc();
+  // Reject unsupported types.
+  if (!VectorType::isValidElementType(exp.getType()))
+    return false;
+  // A block argument is invariant/reduction/index.
   if (auto arg = exp.dyn_cast<BlockArgument>()) {
-    if (codegen)
-      vexp = genVectorInvariantValue(rewriter, vl, exp);
-    return true;
+    if (arg == forOp.getInductionVar()) {
+      // We encountered a single, innermost index inside the computation,
+      // such as a[i] = i, which must convert to [i, i+1, ...].
+      if (codegen) {
+        VectorType vtp = vectorType(vl, arg.getType());
+        Value veci = rewriter.create<vector::BroadcastOp>(loc, vtp, arg);
+        Value incr;
+        if (vl.enableVLAVectorization) {
+          Type stepvty = vectorType(vl, rewriter.getI64Type());
+          Value stepv = rewriter.create<LLVM::StepVectorOp>(loc, stepvty);
+          incr = rewriter.create<arith::IndexCastOp>(loc, vtp, stepv);
+        } else {
+          SmallVector<APInt> integers;
+          for (unsigned i = 0, l = vl.vectorLength; i < l; i++)
+            integers.push_back(APInt(/*width=*/64, i));
+          auto values = DenseElementsAttr::get(vtp, integers);
+          incr = rewriter.create<arith::ConstantOp>(loc, vtp, values);
+        }
+        vexp = rewriter.create<arith::AddIOp>(loc, veci, incr);
+      }
+      return true;
+    } else {
+      // An invariant or reduction. In both cases, we treat this as an
+      // invariant value, and rely on later replacing and folding to
+      // construct a proper reduction chain for the latter case.
+      if (codegen)
+        vexp = genVectorInvariantValue(rewriter, vl, exp);
+      return true;
+    }
   }
-  // Something defined outside the loop-body is invariant as well.
+  // Something defined outside the loop-body is invariant.
   Operation *def = exp.getDefiningOp();
   if (def->getBlock() != &forOp.getRegion().front()) {
     if (codegen)
       vexp = genVectorInvariantValue(rewriter, vl, exp);
     return true;
   }
-  // Inside loop-body unary and binary operations. Note that it would be
-  // nicer if we could somehow test and build the operations in a more
-  // concise manner than just listing them all (although this way we know
-  // for certain that they can vectorize).
-  Location loc = forOp.getLoc();
+  // Proper load operations. These are either values involved in the
+  // actual computation, such as a[i] = b[i] becomes a[lo:hi] = b[lo:hi],
+  // or index values inside the computation that are now fetched from
+  // the sparse storage index arrays, such as a[i] = i becomes
+  // a[lo:hi] = ind[lo:hi], where 'lo' denotes the current index
+  // and 'hi = lo + vl - 1'.
   if (auto load = dyn_cast<memref::LoadOp>(def)) {
     auto subs = load.getIndices();
     SmallVector<Value> idxs;
@@ -332,7 +375,16 @@ static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
       vexp = genVectorLoad(rewriter, loc, vl, load.getMemRef(), idxs, vmask);
       return true;
     }
-  } else if (def->getNumOperands() == 1) {
+    return false;
+  }
+  // Inside loop-body unary and binary operations. Note that it would be
+  // nicer if we could somehow test and build the operations in a more
+  // concise manner than just listing them all (although this way we know
+  // for certain that they can vectorize).
+  //
+  // TODO: avoid visiting CSEs multiple times
+  //
+  if (def->getNumOperands() == 1) {
     Value vx;
     if (vectorizeExpr(rewriter, forOp, vl, def->getOperand(0), codegen, vmask,
                       vx)) {
@@ -346,6 +398,17 @@ static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
       UNAOP(math::SinOp)
       UNAOP(math::TanhOp)
       UNAOP(arith::NegFOp)
+      TYPEDUNAOP(arith::TruncFOp)
+      TYPEDUNAOP(arith::ExtFOp)
+      TYPEDUNAOP(arith::FPToSIOp)
+      TYPEDUNAOP(arith::FPToUIOp)
+      TYPEDUNAOP(arith::SIToFPOp)
+      TYPEDUNAOP(arith::UIToFPOp)
+      TYPEDUNAOP(arith::ExtSIOp)
+      TYPEDUNAOP(arith::ExtUIOp)
+      TYPEDUNAOP(arith::IndexCastOp)
+      TYPEDUNAOP(arith::TruncIOp)
+      TYPEDUNAOP(arith::BitcastOp)
     }
   } else if (def->getNumOperands() == 2) {
     Value vx, vy;
@@ -365,12 +428,14 @@ static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
       BINOP(arith::AndIOp)
       BINOP(arith::OrIOp)
       BINOP(arith::XOrIOp)
+      // TODO: shift by invariant?
     }
   }
   return false;
 }
 
 #undef UNAOP
+#undef TYPEDUNAOP
 #undef BINOP
 
 /// This method is called twice to analyze and rewrite the given for-loop.
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
new file mode 100644
index 0000000..37d5b80
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
@@ -0,0 +1,124 @@
+// RUN: mlir-opt %s -sparsification -cse -sparse-vectorization="vl=8" -cse | \
+// RUN:   FileCheck %s
+
+// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+#SparseVector = #sparse_tensor.encoding<{
+  dimLevelType = ["compressed"]
+}>
+
+#trait_1d = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>   // x (out)
+  ],
+  iterator_types = ["parallel"],
+  doc = "X(i) = a(i) op i"
+}
+
+// CHECK-LABEL: func.func @sparse_index_1d_conj(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>) -> tensor<8xi64> {
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<0> : vector<8xi64>
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xindex>
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.empty() : tensor<8xi64>
+// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xi64>
+// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : memref<8xi64>
+// CHECK: linalg.fill ins(%[[VAL_4]] : i64) outs(%[[VAL_11]] : memref<8xi64>)
+// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_1]] {
+// CHECK: %[[VAL_15:.*]] = affine.min #map1(%[[VAL_13]], %[[VAL_14]]){{\[}}%[[VAL_1]]]
+// CHECK: %[[VAL_16:.*]] = vector.create_mask %[[VAL_15]] : vector<8xi1>
+// CHECK: %[[VAL_17:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_14]]], %[[VAL_16]], %[[VAL_3]] : memref<?xindex>, vector<8xi1>, vector<8xindex> into vector<8xindex>
+// CHECK: %[[VAL_18:.*]] = vector.maskedload %[[VAL_10]]{{\[}}%[[VAL_14]]], %[[VAL_16]], %[[VAL_2]] : memref<?xi64>, vector<8xi1>, vector<8xi64> into vector<8xi64>
+// CHECK: %[[VAL_19:.*]] = arith.index_cast %[[VAL_17]] : vector<8xindex> to vector<8xi64>
+// CHECK: %[[VAL_20:.*]] = arith.muli %[[VAL_18]], %[[VAL_19]] : vector<8xi64>
+// CHECK: vector.scatter %[[VAL_11]]{{\[}}%[[VAL_5]]] {{\[}}%[[VAL_17]]], %[[VAL_16]], %[[VAL_20]] : memref<8xi64>, vector<8xindex>, vector<8xi1>, vector<8xi64>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_21:.*]] = bufferization.to_tensor %[[VAL_11]] : memref<8xi64>
+// CHECK: return %[[VAL_21]] : tensor<8xi64>
+// CHECK: }
+func.func @sparse_index_1d_conj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8xi64> {
+  %init = tensor.empty() : tensor<8xi64>
+  %r = linalg.generic #trait_1d
+      ins(%arga: tensor<8xi64, #SparseVector>)
+      outs(%init: tensor<8xi64>) {
+      ^bb(%a: i64, %x: i64):
+        %i = linalg.index 0 : index
+        %ii = arith.index_cast %i : index to i64
+        %m1 = arith.muli %a, %ii : i64
+        linalg.yield %m1 : i64
+  } -> tensor<8xi64>
+  return %r : tensor<8xi64>
+}
+
+// CHECK-LABEL: func.func @sparse_index_1d_disj(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>) -> tensor<8xi64> {
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.empty() : tensor<8xi64>
+// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xi64>
+// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : memref<8xi64>
+// CHECK: linalg.fill ins(%[[VAL_3]] : i64) outs(%[[VAL_11]] : memref<8xi64>)
+// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
+// CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index
+// CHECK: scf.condition(%[[VAL_17]]) %[[VAL_15]], %[[VAL_16]] : index, index
+// CHECK: } do {
+// CHECK: ^bb0(%[[VAL_18:.*]]: index, %[[VAL_19:.*]]: index):
+// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_18]]] : memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = arith.cmpi eq, %[[VAL_20]], %[[VAL_19]] : index
+// CHECK: scf.if %[[VAL_21]] {
+// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref<?xi64>
+// CHECK: %[[VAL_23:.*]] = arith.index_cast %[[VAL_19]] : index to i64
+// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_22]], %[[VAL_23]] : i64
+// CHECK: memref.store %[[VAL_24]], %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<8xi64>
+// CHECK: } else {
+// CHECK: scf.if %[[VAL_6]] {
+// CHECK: %[[VAL_25:.*]] = arith.index_cast %[[VAL_19]] : index to i64
+// CHECK: memref.store %[[VAL_25]], %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<8xi64>
+// CHECK: } else {
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_18]], %[[VAL_5]] : index
+// CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_21]], %[[VAL_26]], %[[VAL_18]] : index
+// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_19]], %[[VAL_5]] : index
+// CHECK: scf.yield %[[VAL_27]], %[[VAL_28]] : index, index
+// CHECK: } attributes {"Emitted from" = "linalg.generic"}
+// CHECK: scf.for %[[VAL_29:.*]] = %[[VAL_30:.*]]#1 to %[[VAL_1]] step %[[VAL_1]] {
+// CHECK: %[[VAL_31:.*]] = affine.min #map1(%[[VAL_1]], %[[VAL_29]]){{\[}}%[[VAL_1]]]
+// CHECK: %[[VAL_32:.*]] = vector.create_mask %[[VAL_31]] : vector<8xi1>
+// CHECK: %[[VAL_33:.*]] = vector.broadcast %[[VAL_29]] : index to vector<8xindex>
+// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_33]], %[[VAL_2]] : vector<8xindex>
+// CHECK: %[[VAL_35:.*]] = arith.index_cast %[[VAL_34]] : vector<8xindex> to vector<8xi64>
+// CHECK: vector.maskedstore %[[VAL_11]]{{\[}}%[[VAL_29]]], %[[VAL_32]], %[[VAL_35]] : memref<8xi64>, vector<8xi1>, vector<8xi64>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_36:.*]] = bufferization.to_tensor %[[VAL_11]] : memref<8xi64>
+// CHECK: return %[[VAL_36]] : tensor<8xi64>
+// CHECK: }
+func.func @sparse_index_1d_disj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8xi64> {
+  %init = tensor.empty() : tensor<8xi64>
+  %r = linalg.generic #trait_1d
+      ins(%arga: tensor<8xi64, #SparseVector>)
+      outs(%init: tensor<8xi64>) {
+      ^bb(%a: i64, %x: i64):
+        %i = linalg.index 0 : index
+        %ii = arith.index_cast %i : index to i64
+        %m1 = arith.addi %a, %ii : i64
+        linalg.yield %m1 : i64
+  } -> tensor<8xi64>
+  return %r : tensor<8xi64>
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir
new file mode 100644
index 0000000..32900d9
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir
@@ -0,0 +1,77 @@
+// RUN: mlir-opt %s -sparsification -cse -sparse-vectorization="vl=8" -cse | \
+// RUN:   FileCheck %s
+
+#DenseVector = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
+
+#trait = {
+  indexing_maps = [
+    affine_map<(i) -> (i)>,  // a
+    affine_map<(i) -> (i)>,  // b
+    affine_map<(i) -> (i)>   // x (out)
+  ],
+  iterator_types = ["parallel"],
+  doc = "x(i) = a(i) ops b(i)"
+}
+
+// CHECK-LABEL: func.func @vops
+// CHECK-DAG: %[[C1:.*]] = arith.constant dense<2.000000e+00> : vector<8xf32>
+// CHECK-DAG: %[[C2:.*]] = arith.constant dense<1.000000e+00> : vector<8xf32>
+// CHECK-DAG: %[[C3:.*]] = arith.constant dense<255> : vector<8xi64>
+// CHECK: scf.for
+// CHECK: %[[VAL_14:.*]] = vector.load
+// CHECK: %[[VAL_15:.*]] = math.absf %[[VAL_14]] : vector<8xf32>
+// CHECK: %[[VAL_16:.*]] = math.ceil %[[VAL_15]] : vector<8xf32>
+// CHECK: %[[VAL_17:.*]] = math.floor %[[VAL_16]] : vector<8xf32>
+// CHECK: %[[VAL_18:.*]] = math.sqrt %[[VAL_17]] : vector<8xf32>
+// CHECK: %[[VAL_19:.*]] = math.expm1 %[[VAL_18]] : vector<8xf32>
+// CHECK: %[[VAL_20:.*]] = math.sin %[[VAL_19]] : vector<8xf32>
+// CHECK: %[[VAL_21:.*]] = math.tanh %[[VAL_20]] : vector<8xf32>
+// CHECK: %[[VAL_22:.*]] = arith.negf %[[VAL_21]] : vector<8xf32>
+// CHECK: %[[VAL_23:.*]] = vector.load
+// CHECK: %[[VAL_24:.*]] = arith.mulf %[[VAL_22]], %[[VAL_23]] : vector<8xf32>
+// CHECK: %[[VAL_25:.*]] = arith.divf %[[VAL_24]], %[[C1]] : vector<8xf32>
+// CHECK: %[[VAL_26:.*]] = arith.addf %[[VAL_25]], %[[C1]] : vector<8xf32>
+// CHECK: %[[VAL_27:.*]] = arith.subf %[[VAL_26]], %[[C2]] : vector<8xf32>
+// CHECK: %[[VAL_28:.*]] = arith.extf %[[VAL_27]] : vector<8xf32> to vector<8xf64>
+// CHECK: %[[VAL_29:.*]] = arith.bitcast %[[VAL_28]] : vector<8xf64> to vector<8xi64>
+// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_29]], %[[VAL_29]] : vector<8xi64>
+// CHECK: %[[VAL_31:.*]] = arith.andi %[[VAL_30]], %[[C3]] : vector<8xi64>
+// CHECK: %[[VAL_32:.*]] = arith.trunci %[[VAL_31]] : vector<8xi64> to vector<8xi16>
+// CHECK: %[[VAL_33:.*]] = arith.extsi %[[VAL_32]] : vector<8xi16> to vector<8xi32>
+// CHECK: %[[VAL_34:.*]] = arith.uitofp %[[VAL_33]] : vector<8xi32> to vector<8xf32>
+// CHECK: vector.store %[[VAL_34]]
+// CHECK: }
+func.func @vops(%arga: tensor<1024xf32, #DenseVector>,
+                %argb: tensor<1024xf32, #DenseVector>) -> tensor<1024xf32> {
+  %init = bufferization.alloc_tensor() : tensor<1024xf32>
+  %o = arith.constant 1.0 : f32
+  %c = arith.constant 2.0 : f32
+  %i = arith.constant 255 : i64
+  %0 = linalg.generic #trait
+     ins(%arga, %argb: tensor<1024xf32, #DenseVector>, tensor<1024xf32, #DenseVector>)
+     outs(%init: tensor<1024xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = math.absf %a : f32
+        %1 = math.ceil %0 : f32
+        %2 = math.floor %1 : f32
+        %3 = math.sqrt %2 : f32
+        %4 = math.expm1 %3 : f32
+        %5 = math.sin %4 : f32
+        %6 = math.tanh %5 : f32
+        %7 = arith.negf %6 : f32
+        %8 = arith.mulf %7, %b : f32
+        %9 = arith.divf %8, %c : f32
+        %10 = arith.addf %9, %c : f32
+        %11 = arith.subf %10, %o : f32
+        %12 = arith.extf %11 : f32 to f64
+        %13 = arith.bitcast %12 : f64 to i64
+        %14 = arith.addi %13, %13 : i64
+        %15 = arith.andi %14, %i : i64
+        %16 = arith.trunci %15 : i64 to i16
+        %17 = arith.extsi %16 : i16 to i32
+        %18 = arith.uitofp %17 : i32 to f32
+        linalg.yield %18 : f32
+  } -> tensor<1024xf32>
+  return %0 : tensor<1024xf32>
+}
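
For reference, the "[i, i+1, ...]" form that the new induction-variable case in vectorizeExpr produces (and that the sparse_index_1d_disj checks above verify) can be sketched as follows for a fixed vector length of 8. This is a hand-written approximation, not output of the patch; %i, %mem, and %mask are placeholder names.

    // Sketch (assumption): expand the scalar loop index %i into the vector
    // [i, i+1, ..., i+7] by broadcasting %i and adding a constant step vector,
    // mirroring the vector.broadcast / arith.addi sequence checked above.
    %step = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
    %bcst = vector.broadcast %i : index to vector<8xindex>
    %iota = arith.addi %bcst, %step : vector<8xindex>
    // Cast to the element type of the output and store under the loop mask.
    %ivec = arith.index_cast %iota : vector<8xindex> to vector<8xi64>
    vector.maskedstore %mem[%i], %mask, %ivec : memref<8xi64>, vector<8xi1>, vector<8xi64>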