AffineForOp forOp,
SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
+/// Returns true if `forOp` doesn't have memory dependences preventing
+/// parallelization. This function doesn't check iter_args and should be used
+/// only as a building block for full parallel-checking functions.
+bool isLoopMemoryParallel(AffineForOp forOp);
+
/// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
/// Operations that are reachable via a search starting from `operands` and
/// ending at those operands that are not the result of an AffineApplyOp.
"Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
"dimensions to match. See defaultPatterns in Vectorize.cpp for "
"a description and examples. This is used for testing purposes",
- "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
+ "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
+ Option<"vectorizeReductions", "vectorize-reductions", "bool",
+ /*default=*/"false",
+ "Vectorize known reductions expressed via iter_args. "
+ "Switched off by default.">
];
}
#ifndef MLIR_DIALECT_AFFINE_UTILS_H
#define MLIR_DIALECT_AFFINE_UTILS_H
+#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/DenseMap.h"
struct LoopReduction;
class Operation;
+using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
+
/// Replaces parallel affine.for op with 1-d affine.parallel op.
/// mlir::isLoopParallel detects the parallel affine.for ops.
/// Parallelizes the specified reductions. Parallelization will fail in presence
// The candidate will be vectorized using the vectorization factor in
// 'vectorSizes' for that dimension.
DenseMap<Operation *, unsigned> loopToVectorDim;
+ // Maps loops that implement vectorizable reductions to the corresponding
+ // reduction descriptors.
+ ReductionLoopMap reductionLoops;
};
/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
/// 'vectorSizes'. By default, each vectorization factor is applied
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
/// be optionally used to provide a different loop vectorization order.
+/// If `reductionLoops` is not empty, the given reduction loops may be
+/// vectorized along the reduction dimension.
+/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
void vectorizeAffineLoops(
Operation *parentOp,
llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
- ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
+ ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
+ const ReductionLoopMap &reductionLoops = ReductionLoopMap());
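
As a usage illustration, here is a minimal sketch, not part of the patch, showing how a caller might collect reduction descriptors with `isLoopParallel` and pass them via `reductionLoops`. The helper name `vectorizeReductionsIn` is hypothetical; it simply mirrors what the vectorization pass does.

```c++
// Sketch only: assumes the declarations above (isLoopParallel,
// vectorizeAffineLoops, ReductionLoopMap) are in scope.
static void vectorizeReductionsIn(FuncOp f, ArrayRef<int64_t> vectorSizes) {
  assert(vectorSizes.size() == 1 && "reductions support only 1-D vectors");
  llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> loops;
  ReductionLoopMap reductionLoops;
  f.walk([&](AffineForOp loop) {
    SmallVector<LoopReduction, 2> reductions;
    if (isLoopParallel(loop, &reductions)) {
      loops.insert(loop);
      // Only loops that actually perform reductions need a map entry.
      if (!reductions.empty())
        reductionLoops[loop] = reductions;
    }
  });
  vectorizeAffineLoops(f, loops, vectorSizes, /*fastestVaryingPattern=*/{},
                       reductionLoops);
}
```
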
/// External utility to vectorize affine loops from a single loop nest using an
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).
/// Ignore integer bitwidth and type mismatches that come from the fact there is
/// no IndexAttr and that IndexType has no bitwidth.
bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);
+
+/// Returns the identity value attribute associated with an AtomicRMWKind op.
+Attribute getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
+ OpBuilder &builder, Location loc);
+
+/// Returns the identity value associated with an AtomicRMWKind op.
+Value getIdentityValue(AtomicRMWKind op, Type resultType, OpBuilder &builder,
+ Location loc);
+
+/// Returns the value obtained by applying the reduction operation kind
+/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
+Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+ Value lhs, Value rhs);
+
} // end namespace mlir
#endif // MLIR_DIALECT_IR_STANDARDOPS_IR_OPS_H
#ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H
#define MLIR_DIALECT_VECTOR_VECTOROPS_H
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinTypes.h"
/// the integer type required for subscripts in the vector dialect.
ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef<int64_t> values);
+/// Returns the value obtained by reducing the vector into a scalar using the
+/// operation kind associated with a binary AtomicRMWKind op.
+Value getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+ Value vector);
+
namespace impl {
/// Build the default minor identity map suitable for a vector transfer. This
/// also handles the case memref<... x vector<...>> -> vector<...> in which the
return nullptr;
if (!forOp.getRegionIterArgs()[pos].hasOneUse())
return nullptr;
+ if (!yielded.hasOneUse())
+ return nullptr;
Optional<AtomicRMWKind> maybeKind =
TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)
return false;
}
+ // Check memory dependences.
+ return isLoopMemoryParallel(forOp);
+}
+
+/// Returns true if `forOp` doesn't have memory dependences preventing
+/// parallelization. This function doesn't check iter_args and should be used
+/// only as a building block for full parallel-checking functions.
+bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
// Collect all load and store ops in loop nest rooted at 'forOp'.
SmallVector<Operation *, 8> loadAndStoreOps;
auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
}
};
-/// Returns the identity value associated with an AtomicRMWKind op.
-static Value getIdentityValue(AtomicRMWKind op, Type resultType,
- OpBuilder &builder, Location loc) {
- switch (op) {
- case AtomicRMWKind::addf:
- return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 0));
- case AtomicRMWKind::addi:
- return builder.create<ConstantOp>(loc,
- builder.getIntegerAttr(resultType, 0));
- case AtomicRMWKind::mulf:
- return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 1));
- case AtomicRMWKind::muli:
- return builder.create<ConstantOp>(loc,
- builder.getIntegerAttr(resultType, 1));
- // TODO: Add remaining reduction operations.
- default:
- (void)emitOptionalError(loc, "Reduction operation type not supported");
- break;
- }
- return nullptr;
-}
-
-/// Return the value obtained by applying the reduction operation kind
-/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
-static Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
- Value lhs, Value rhs) {
- switch (op) {
- case AtomicRMWKind::addf:
- return builder.create<AddFOp>(loc, lhs, rhs);
- case AtomicRMWKind::addi:
- return builder.create<AddIOp>(loc, lhs, rhs);
- case AtomicRMWKind::mulf:
- return builder.create<MulFOp>(loc, lhs, rhs);
- case AtomicRMWKind::muli:
- return builder.create<MulIOp>(loc, lhs, rhs);
- // TODO: Add remaining reduction operations.
- default:
- (void)emitOptionalError(loc, "Reduction operation type not supported");
- break;
- }
- return nullptr;
-}
-
/// Convert an `affine.parallel` (loop nest) operation into a `scf.parallel`
/// operation.
class AffineParallelLowering : public OpRewritePattern<AffineParallelOp> {
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
using namespace mlir;
/// Unsupported cases, extensions, and work in progress (help welcome :-) ):
/// ========================================================================
/// 1. lowering to concrete vector types for various HW;
-/// 2. reduction support;
+/// 2. reduction support for n-D vectorization and non-unit steps;
/// 3. non-effecting padding during vector.transfer_read and filter during
/// vector.transfer_write;
/// 4. misalignment support vector.transfer_read / vector.transfer_write
///
/// Of course, much more intricate n-D imperfectly-nested patterns can be
/// vectorized too and specified in a fully declarative fashion.
+///
+/// Reduction:
+/// ==========
+/// Vectorizing reduction loops along the reduction dimension is supported if:
+/// - the reduction kind is supported,
+/// - the vectorization is 1-D, and
+/// - the step size of the loop is equal to one.
+///
+/// Compared to the non-vector-dimension case, two additional things are done
+/// during vectorization of such loops:
+/// - The resulting vector returned from the loop is reduced to a scalar using
+/// `vector.reduction`.
+/// - In some cases a mask is applied to the vector yielded at the end of the
+/// loop to prevent garbage values from being written to the accumulator.
+///
+/// Reduction vectorization is switched off by default; it can be enabled by
+/// passing a map from loops to reductions to utility functions, or by passing
+/// `vectorize-reductions=true` to the vectorization pass.
+///
+/// Consider the following example:
+/// ```mlir
+/// func @vecred(%in: memref<512xf32>) -> f32 {
+/// %cst = constant 0.000000e+00 : f32
+/// %sum = affine.for %i = 0 to 500 iter_args(%part_sum = %cst) -> (f32) {
+/// %ld = affine.load %in[%i] : memref<512xf32>
+/// %cos = math.cos %ld : f32
+/// %add = addf %part_sum, %cos : f32
+/// affine.yield %add : f32
+/// }
+/// return %sum : f32
+/// }
+/// ```
+///
+/// The -affine-super-vectorize pass with the following arguments:
+/// ```
+/// -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 \
+/// vectorize-reductions=true"
+/// ```
+/// produces the following output:
+/// ```mlir
+/// #map = affine_map<(d0) -> (-d0 + 500)>
+/// func @vecred(%arg0: memref<512xf32>) -> f32 {
+/// %cst = constant 0.000000e+00 : f32
+/// %cst_0 = constant dense<0.000000e+00> : vector<128xf32>
+/// %0 = affine.for %arg1 = 0 to 500 step 128 iter_args(%arg2 = %cst_0)
+/// -> (vector<128xf32>) {
+/// // %2 is the number of iterations left in the original loop.
+/// %2 = affine.apply #map(%arg1)
+/// %3 = vector.create_mask %2 : vector<128xi1>
+/// %cst_1 = constant 0.000000e+00 : f32
+/// %4 = vector.transfer_read %arg0[%arg1], %cst_1 :
+/// memref<512xf32>, vector<128xf32>
+/// %5 = math.cos %4 : vector<128xf32>
+/// %6 = addf %arg2, %5 : vector<128xf32>
+/// // We filter out the effect of last 12 elements using the mask.
+/// %7 = select %3, %6, %arg2 : vector<128xi1>, vector<128xf32>
+/// affine.yield %7 : vector<128xf32>
+/// }
+/// %1 = vector.reduction "add", %0 : vector<128xf32> into f32
+/// return %1 : f32
+/// }
+/// ```
+///
+/// Note that because of loop misalignment we needed to apply a mask to prevent
+/// the last 12 elements from affecting the final result. The mask is full of ones
+/// in every iteration except for the last one, in which it has the form
+/// `11...100...0` with 116 ones and 12 zeros.
#define DEBUG_TYPE "early-vect"
void registerValueScalarReplacement(BlockArgument replaced,
BlockArgument replacement);
+ /// Registers the scalar replacement of a scalar result returned from a
+ /// reduction loop. 'replacement' must be scalar.
+ ///
+ /// This utility is used to register the replacement for scalar results of
+ /// vectorized reduction loops with iter_args.
+ ///
+ /// Example 2:
+ /// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
+ /// * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
+ void registerLoopResultScalarReplacement(Value replaced, Value replacement);
+
/// Returns in 'replacedVals' the scalar replacement for values in
/// 'inputVals'.
void getScalarValueReplacementsFor(ValueRange inputVals,
// Maps input scalar values to their new scalar counterparts in the vector
// loop nest.
BlockAndValueMapping valueScalarReplacement;
+ // Maps results of reduction loops to their new scalar counterparts.
+ DenseMap<Value, Value> loopResultScalarReplacement;
// Maps the newly created vector loops to their vector dimension.
DenseMap<Operation *, unsigned> vecLoopToVecDim;
+ // Maps the new vectorized loops to the corresponding vector masks, if a
+ // mask is required.
+ DenseMap<Operation *, Value> vecLoopToMask;
+
// The strategy drives which loop to vectorize by which amount.
const VectorizationStrategy *strategy;
registerValueScalarReplacementImpl(replaced, replacement);
}
+/// Registers the scalar replacement of a scalar result returned from a
+/// reduction loop. 'replacement' must be scalar.
+///
+/// This utility is used to register the replacement for scalar results of
+/// vectorized reduction loops with iter_args.
+///
+/// Example 2:
+/// * 'replaced': %0 = affine.for %i = 0 to 512 iter_args(%x = ...) -> (f32)
+/// * 'replacement': %1 = vector.reduction "add" %0 : vector<4xf32> into f32
+void VectorizationState::registerLoopResultScalarReplacement(
+ Value replaced, Value replacement) {
+ assert(isa<AffineForOp>(replaced.getDefiningOp()));
+ assert(loopResultScalarReplacement.count(replaced) == 0 &&
+ "already registered");
+ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ will replace a result of the loop "
+ "with scalar: "
+ << replacement);
+ loopResultScalarReplacement[replaced] = replacement;
+}
+
void VectorizationState::registerValueScalarReplacementImpl(Value replaced,
Value replacement) {
assert(!valueScalarReplacement.contains(replaced) &&
return newConstOp;
}
+/// Creates a constant vector filled with the neutral elements of the given
+/// reduction. The scalar type of vector elements will be taken from
+/// `oldOperand`.
+static ConstantOp createInitialVector(AtomicRMWKind reductionKind,
+ Value oldOperand,
+ VectorizationState &state) {
+ Type scalarTy = oldOperand.getType();
+ if (!VectorType::isValidElementType(scalarTy))
+ return nullptr;
+
+ Attribute valueAttr = getIdentityValueAttr(
+ reductionKind, scalarTy, state.builder, oldOperand.getLoc());
+ auto vecTy = getVectorType(scalarTy, state.strategy);
+ auto vecAttr = DenseElementsAttr::get(vecTy, valueAttr);
+ auto newConstOp =
+ state.builder.create<ConstantOp>(oldOperand.getLoc(), vecAttr);
+
+ return newConstOp;
+}
+
+/// Creates a mask used to filter out garbage elements in the last iteration
+/// of unaligned loops. If a mask is not required then `nullptr` is returned.
+/// The mask will be a vector of booleans representing meaningful vector
+/// elements in the current iteration. It is filled with ones for each iteration
+/// except for the last one, where it has the form `11...100...0` with the
+/// number of ones equal to the number of meaningful elements (i.e. the number
+/// of iterations that would be left in the original loop).
+static Value createMask(AffineForOp vecForOp, VectorizationState &state) {
+ assert(state.strategy->vectorSizes.size() == 1 &&
+ "Creating a mask for non-1-D vectors is not supported.");
+ assert(vecForOp.getStep() == state.strategy->vectorSizes[0] &&
+ "Creating a mask for loops with non-unit original step size is not "
+ "supported.");
+
+ // Check if we have already created the mask.
+ if (Value mask = state.vecLoopToMask.lookup(vecForOp))
+ return mask;
+
+ // If the loop has constant bounds and the original number of iterations is
+ // divisible by the vector size, then we don't need a mask.
+ if (vecForOp.hasConstantBounds()) {
+ int64_t originalTripCount =
+ vecForOp.getConstantUpperBound() - vecForOp.getConstantLowerBound();
+ if (originalTripCount % vecForOp.getStep() == 0)
+ return nullptr;
+ }
+
+ OpBuilder::InsertionGuard guard(state.builder);
+ state.builder.setInsertionPointToStart(vecForOp.getBody());
+
+ // We generate the mask using the `vector.create_mask` operation which accepts
+ // the number of meaningful elements (i.e. the length of the prefix of 1s).
+ // To compute the number of meaningful elements we subtract the current value
+ // of the iteration variable from the upper bound of the loop. Example:
+ //
+ // // 500 is the upper bound of the loop
+ // #map = affine_map<(d0) -> (500 - d0)>
+ // %elems_left = affine.apply #map(%iv)
+ // %mask = vector.create_mask %elems_left : vector<128xi1>
+
+ Location loc = vecForOp.getLoc();
+
+ // First we get the upper bound of the loop using `affine.apply` or
+ // `affine.min`.
+ AffineMap ubMap = vecForOp.getUpperBoundMap();
+ Value ub;
+ if (ubMap.getNumResults() == 1)
+ ub = state.builder.create<AffineApplyOp>(loc, vecForOp.getUpperBoundMap(),
+ vecForOp.getUpperBoundOperands());
+ else
+ ub = state.builder.create<AffineMinOp>(loc, vecForOp.getUpperBoundMap(),
+ vecForOp.getUpperBoundOperands());
+ // Then we compute the number of (original) iterations left in the loop.
+ AffineExpr subExpr =
+ state.builder.getAffineDimExpr(0) - state.builder.getAffineDimExpr(1);
+ Value itersLeft =
+ makeComposedAffineApply(state.builder, loc, AffineMap::get(2, 0, subExpr),
+ {ub, vecForOp.getInductionVar()});
+ // If the affine maps were successfully composed then `ub` is unneeded.
+ if (ub.use_empty())
+ ub.getDefiningOp()->erase();
+ // Finally we create the mask.
+ Type maskTy = VectorType::get(state.strategy->vectorSizes,
+ state.builder.getIntegerType(1));
+ Value mask =
+ state.builder.create<vector::CreateMaskOp>(loc, maskTy, itersLeft);
+
+ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a mask:\n"
+ << itersLeft << "\n"
+ << mask << "\n");
+
+ state.vecLoopToMask[vecForOp] = mask;
+ return mask;
+}
+
/// Returns true if the provided value is vector uniform given the vectorization
/// strategy.
// TODO: For now, only values that are invariants to all the loops in the
return transfer;
}
+/// Returns true if `value` is a constant equal to the neutral element of the
+/// given vectorizable reduction.
+static bool isNeutralElementConst(AtomicRMWKind reductionKind, Value value,
+ VectorizationState &state) {
+ Type scalarTy = value.getType();
+ if (!VectorType::isValidElementType(scalarTy))
+ return false;
+ Attribute valueAttr = getIdentityValueAttr(reductionKind, scalarTy,
+ state.builder, value.getLoc());
+ if (auto constOp = dyn_cast_or_null<ConstantOp>(value.getDefiningOp()))
+ return constOp.value() == valueAttr;
+ return false;
+}
+
/// Vectorizes a loop with the vectorization strategy in 'state'. A new loop is
/// created and registered as replacement for the scalar loop. The builder's
/// insertion point is set to the new loop's body so that subsequent vectorized
/// operations are inserted into the new loop. If the loop is a vector
/// dimension, the step of the newly created loop will reflect the vectorization
/// factor used to vectorized that dimension.
-// TODO: Add support for 'iter_args'. Related operands and results will be
-// vectorized at this point.
static Operation *vectorizeAffineForOp(AffineForOp forOp,
VectorizationState &state) {
const VectorizationStrategy &strategy = *state.strategy;
auto loopToVecDimIt = strategy.loopToVectorDim.find(forOp);
bool isLoopVecDim = loopToVecDimIt != strategy.loopToVectorDim.end();
- // We only support 'iter_args' when the loop is not one of the vector
- // dimensions.
- // TODO: Support vector dimension loops. They require special handling:
- // generate horizontal reduction, last-value extraction, etc.
- if (forOp.getNumIterOperands() > 0 && isLoopVecDim)
+ // TODO: Vectorization of reduction loops is not supported for non-unit steps.
+ if (isLoopVecDim && forOp.getNumIterOperands() > 0 && forOp.getStep() != 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "\n[early-vect]+++++ unsupported step size for reduction loop: "
+ << forOp.getStep() << "\n");
return nullptr;
+ }
// If we are vectorizing a vector dimension, compute a new step for the new
// vectorized loop using the vectorization factor for the vector dimension.
newStep = forOp.getStep();
}
+ // Get information about reduction kinds.
+ ArrayRef<LoopReduction> reductions;
+ if (isLoopVecDim && forOp.getNumIterOperands() > 0) {
+ auto it = strategy.reductionLoops.find(forOp);
+ assert(it != strategy.reductionLoops.end() &&
+ "Reduction descriptors not found when vectorizing a reduction loop");
+ reductions = it->second;
+ assert(reductions.size() == forOp.getNumIterOperands() &&
+ "The size of reductions array must match the number of iter_args");
+ }
+
// Vectorize 'iter_args'.
SmallVector<Value, 8> vecIterOperands;
- for (auto operand : forOp.getIterOperands())
- vecIterOperands.push_back(vectorizeOperand(operand, state));
+ if (!isLoopVecDim) {
+ for (auto operand : forOp.getIterOperands())
+ vecIterOperands.push_back(vectorizeOperand(operand, state));
+ } else {
+ // For reduction loops we need to pass a vector of neutral elements as the
+ // initial value of the accumulator. We will add the original initial value
+ // later.
+ for (auto redAndOperand : llvm::zip(reductions, forOp.getIterOperands())) {
+ vecIterOperands.push_back(createInitialVector(
+ std::get<0>(redAndOperand).kind, std::get<1>(redAndOperand), state));
+ }
+ }
auto vecForOp = state.builder.create<AffineForOp>(
forOp.getLoc(), forOp.getLowerBoundOperands(), forOp.getLowerBoundMap(),
// Register loop-related replacements:
// 1) The new vectorized loop is registered as vector replacement of the
// scalar loop.
- // TODO: Support reductions along the vector dimension.
// 2) The new iv of the vectorized loop is registered as scalar replacement
// since a scalar copy of the iv will prevail in the vectorized loop.
// TODO: A vector replacement will also be added in the future when
// vectorization of linear ops is supported.
// 3) The new 'iter_args' region arguments are registered as vector
// replacements since they have been vectorized.
+ // 4) If the loop performs a reduction along the vector dimension, a
+ // `vector.reduction` or similar op is inserted for each resulting value
+ // of the loop and its scalar value replaces the corresponding scalar
+ // result of the loop.
state.registerOpVectorReplacement(forOp, vecForOp);
state.registerValueScalarReplacement(forOp.getInductionVar(),
vecForOp.getInductionVar());
state.registerBlockArgVectorReplacement(std::get<0>(iterTuple),
std::get<1>(iterTuple));
+ if (isLoopVecDim) {
+ for (unsigned i = 0; i < vecForOp.getNumIterOperands(); ++i) {
+ // First, we reduce the vector returned from the loop into a scalar.
+ Value reducedRes =
+ getVectorReductionOp(reductions[i].kind, state.builder,
+ vecForOp.getLoc(), vecForOp.getResult(i));
+ LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ creating a vector reduction: "
+ << reducedRes);
+ // Then we combine it with the original (scalar) initial value unless it
+ // is equal to the neutral element of the reduction.
+ Value origInit = forOp.getOperand(forOp.getNumControlOperands() + i);
+ Value finalRes = reducedRes;
+ if (!isNeutralElementConst(reductions[i].kind, origInit, state))
+ finalRes = getReductionOp(reductions[i].kind, state.builder,
+ reducedRes.getLoc(), reducedRes, origInit);
+ state.registerLoopResultScalarReplacement(forOp.getResult(i), finalRes);
+ }
+ }
+
if (isLoopVecDim)
state.vecLoopToVecDim[vecForOp] = loopToVecDimIt->second;
// Change insertion point so that upcoming vectorized instructions are
// inserted into the vectorized loop's body.
state.builder.setInsertionPointToStart(vecForOp.getBody());
+
+ // If this is a reduction loop then we may need to create a mask to filter out
+ // garbage in the last iteration.
+ if (isLoopVecDim && forOp.getNumIterOperands() > 0)
+ createMask(vecForOp, state);
+
return vecForOp;
}
/// Vectorizes a yield operation by widening its types. The builder's insertion
/// point is set after the vectorized parent op to continue vectorizing the
-/// operations after the parent op.
+/// operations after the parent op. When vectorizing a reduction loop a mask may
+/// be used to prevent adding garbage values to the accumulator.
static Operation *vectorizeAffineYieldOp(AffineYieldOp yieldOp,
VectorizationState &state) {
Operation *newYieldOp = widenOp(yieldOp, state);
Operation *newParentOp = state.builder.getInsertionBlock()->getParentOp();
+
+ // If there is a mask for this loop then we must prevent garbage values from
+ // being added to the accumulator by inserting `select` operations, for
+ // example:
+ //
+ // %res = addf %acc, %val : vector<128xf32>
+ // %res_masked = select %mask, %res, %acc : vector<128xi1>, vector<128xf32>
+ // affine.yield %res_masked : vector<128xf32>
+ //
+ if (Value mask = state.vecLoopToMask.lookup(newParentOp)) {
+ state.builder.setInsertionPoint(newYieldOp);
+ for (unsigned i = 0; i < newYieldOp->getNumOperands(); ++i) {
+ Value result = newYieldOp->getOperand(i);
+ Value iterArg = cast<AffineForOp>(newParentOp).getRegionIterArgs()[i];
+ Value maskedResult = state.builder.create<SelectOp>(result.getLoc(), mask,
+ result, iterArg);
+ LLVM_DEBUG(
+ dbgs() << "\n[early-vect]+++++ masking a yielded vector value: "
+ << maskedResult);
+ newYieldOp->setOperand(i, maskedResult);
+ }
+ }
+
state.builder.setInsertionPointAfter(newParentOp);
return newYieldOp;
}
auto opVecResult = rootLoop.walk<WalkOrder::PreOrder>([&](Operation *op) {
LLVM_DEBUG(dbgs() << "[early-vect]+++++ Vectorizing: " << *op);
Operation *vectorOp = vectorizeOneOperation(op, state);
- if (!vectorOp)
+ if (!vectorOp) {
+ LLVM_DEBUG(
+ dbgs() << "[early-vect]+++++ failed vectorizing the operation: "
+ << *op << "\n");
return WalkResult::interrupt();
+ }
return WalkResult::advance();
});
return failure();
}
+ // Replace results of reduction loops with the scalar values computed using
+ // `vector.reduction` or similar ops.
+ for (auto resPair : state.loopResultScalarReplacement)
+ resPair.first.replaceAllUsesWith(resPair.second);
+
assert(state.opVectorReplacement.count(rootLoop) == 1 &&
"Expected vector replacement for loop nest");
LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
/// vectorization factors in 'vectorSizes'. By default, each vectorization
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
-/// vectorization order.
+/// vectorization order. `reductionLoops` can be provided to specify loops which
+/// can be vectorized along the reduction dimension.
static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
ArrayRef<int64_t> vectorSizes,
- ArrayRef<int64_t> fastestVaryingPattern) {
+ ArrayRef<int64_t> fastestVaryingPattern,
+ const ReductionLoopMap &reductionLoops) {
+ assert((reductionLoops.empty() || vectorSizes.size() == 1) &&
+ "Vectorizing reductions is supported only for 1-D vectors");
+
// Compute 1-D, 2-D or 3-D loop pattern to be matched on the target loops.
Optional<NestedPattern> pattern =
makePattern(loops, vectorSizes.size(), fastestVaryingPattern);
VectorizationStrategy strategy;
// TODO: depending on profitability, elect to reduce the vector size.
strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
+ strategy.reductionLoops = reductionLoops;
if (failed(analyzeProfitability(match.getMatchedChildren(), 1,
patternDepth, &strategy))) {
continue;
return signalPassFailure();
}
+ if (vectorizeReductions && vectorSizes.size() != 1) {
+ f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
+ return signalPassFailure();
+ }
+
DenseSet<Operation *> parallelLoops;
- f.walk([¶llelLoops](AffineForOp loop) {
- if (isLoopParallel(loop))
- parallelLoops.insert(loop);
- });
+ ReductionLoopMap reductionLoops;
+
+ // If 'vectorize-reductions=true' is provided, we also populate the
+ // `reductionLoops` map.
+ if (vectorizeReductions) {
+ f.walk([¶llelLoops, &reductionLoops](AffineForOp loop) {
+ SmallVector<LoopReduction, 2> reductions;
+ if (isLoopParallel(loop, &reductions)) {
+ parallelLoops.insert(loop);
+ // If it's not a reduction loop, adding it to the map is not necessary.
+ if (!reductions.empty())
+ reductionLoops[loop] = reductions;
+ }
+ });
+ } else {
+ f.walk([¶llelLoops](AffineForOp loop) {
+ if (isLoopParallel(loop))
+ parallelLoops.insert(loop);
+ });
+ }
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
- vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
+ vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
+ reductionLoops);
}
/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// factor is applied inner-to-outer to the loops of each loop nest.
/// 'fastestVaryingPattern' can be optionally used to provide a different loop
/// vectorization order.
+/// If `reductionLoops` is not empty, the given reduction loops may be
+/// vectorized along the reduction dimension.
+/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
ArrayRef<int64_t> vectorSizes,
- ArrayRef<int64_t> fastestVaryingPattern) {
+ ArrayRef<int64_t> fastestVaryingPattern,
+ const ReductionLoopMap &reductionLoops) {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
- vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
+ vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern,
+ reductionLoops);
}
/// External utility to vectorize affine loops from a single loop nest using an
return success();
}
+/// Returns the identity value attribute associated with an AtomicRMWKind op.
+Attribute mlir::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
+ OpBuilder &builder, Location loc) {
+ switch (kind) {
+ case AtomicRMWKind::addf:
+ case AtomicRMWKind::addi:
+ return builder.getZeroAttr(resultType);
+ case AtomicRMWKind::muli:
+ return builder.getIntegerAttr(resultType, 1);
+ case AtomicRMWKind::mulf:
+ return builder.getFloatAttr(resultType, 1);
+ // TODO: Add remaining reduction operations.
+ default:
+ (void)emitOptionalError(loc, "Reduction operation type not supported");
+ break;
+ }
+ return nullptr;
+}
+
+/// Returns the identity value associated with an AtomicRMWKind op.
+Value mlir::getIdentityValue(AtomicRMWKind op, Type resultType,
+ OpBuilder &builder, Location loc) {
+ Attribute attr = getIdentityValueAttr(op, resultType, builder, loc);
+ return builder.create<ConstantOp>(loc, attr);
+}
+
+/// Returns the value obtained by applying the reduction operation kind
+/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
+Value mlir::getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
+ Value lhs, Value rhs) {
+ switch (op) {
+ case AtomicRMWKind::addf:
+ return builder.create<AddFOp>(loc, lhs, rhs);
+ case AtomicRMWKind::addi:
+ return builder.create<AddIOp>(loc, lhs, rhs);
+ case AtomicRMWKind::mulf:
+ return builder.create<MulFOp>(loc, lhs, rhs);
+ case AtomicRMWKind::muli:
+ return builder.create<MulIOp>(loc, lhs, rhs);
+ // TODO: Add remaining reduction operations.
+ default:
+ (void)emitOptionalError(loc, "Reduction operation type not supported");
+ break;
+ }
+ return nullptr;
+}
+
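
For illustration, a brief hedged sketch of how these helpers compose; it is not part of the patch, and `builder`, `loc`, and `newValue` are assumed to exist in a caller.

```c++
// Seed an f32 accumulator with the identity of the reduction
// (%acc = constant 0.0 : f32), then fold a value into it with addf.
Value acc = getIdentityValue(AtomicRMWKind::addf, builder.getF32Type(),
                             builder, loc);
acc = getReductionOp(AtomicRMWKind::addf, builder, loc, acc, newValue);
```
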
//===----------------------------------------------------------------------===//
// GenericAtomicRMWOp
//===----------------------------------------------------------------------===//
p << " : " << op.vector().getType() << " into " << op.dest().getType();
}
+Value mlir::vector::getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder,
+ Location loc, Value vector) {
+ Type scalarType = vector.getType().cast<ShapedType>().getElementType();
+ switch (op) {
+ case AtomicRMWKind::addf:
+ case AtomicRMWKind::addi:
+ return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+ builder.getStringAttr("add"),
+ vector, ValueRange{});
+ case AtomicRMWKind::mulf:
+ case AtomicRMWKind::muli:
+ return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+ builder.getStringAttr("mul"),
+ vector, ValueRange{});
+ // TODO: Add remaining reduction operations.
+ default:
+ (void)emitOptionalError(loc, "Reduction operation type not supported");
+ break;
+ }
+ return nullptr;
+}
+
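
A short usage sketch (hypothetical caller; `builder`, `loc`, and a value `vec` of type `vector<128xf32>` are assumed); the created op has the same form as the `vector.reduction` checked in the tests below.

```c++
// Produces: %r = vector.reduction "add", %vec : vector<128xf32> into f32
Value scalarSum =
    vector::getVectorReductionOp(AtomicRMWKind::addf, builder, loc, vec);
```
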
//===----------------------------------------------------------------------===//
// ContractionOp
//===----------------------------------------------------------------------===//
// CHECK: }
// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>
// CHECK: }
+
+// -----
+
+// The inner reduction loop '%j' is not vectorized if we do not request
+// reduction vectorization.
+
+func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vec_vecdim_reduction_rejected
+// CHECK-NOT: vector
--- /dev/null
+// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0 vectorize-reductions=true" -split-input-file | FileCheck %s
+
+// The inner reduction loop '%j' is vectorized.
+
+func @vecdim_reduction(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: affine.yield %[[add]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+// -----
+
+// The inner reduction loop '%j' is vectorized. (The order of addf's operands is
+// different than in the previous test case).
+
+func @vecdim_reduction_comm(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %ld, %red_iter : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_comm
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[ld]], %[[red_iter]] : vector<128xf32>
+// CHECK: affine.yield %[[add]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+// -----
+
+// The inner reduction loop '%j' is vectorized. Transforming the input before
+// performing the accumulation doesn't cause any problem.
+
+func @vecdim_reduction_expsin(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %sin = math.sin %ld : f32
+ %exp = math.exp %sin : f32
+ %add = addf %red_iter, %exp : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_expsin
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[sin:.*]] = math.sin %[[ld]]
+// CHECK: %[[exp:.*]] = math.exp %[[sin]]
+// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[exp]] : vector<128xf32>
+// CHECK: affine.yield %[[add]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+// -----
+
+// Two reductions at the same time. The inner reduction loop '%j' is vectorized.
+
+func @two_vecdim_reductions(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ // Note that we pass the same constant '1.0' as initial values for both
+ // reductions.
+ %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %part_sum, %ld : f32
+ %mul = mulf %part_prod, %ld : f32
+ affine.yield %add, %mul : f32, f32
+ }
+ affine.store %sum, %out_sum[%i] : memref<256xf32>
+ affine.store %prod, %out_prod[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @two_vecdim_reductions
+// CHECK: %[[cst:.*]] = constant 1.000000e+00 : f32
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[vone:.*]] = constant dense<1.000000e+00> : vector<128xf32>
+// CHECK: %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xf32>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[part_sum]], %[[ld]] : vector<128xf32>
+// CHECK: %[[mul:.*]] = mulf %[[part_prod]], %[[ld]] : vector<128xf32>
+// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xf32>
+// CHECK: }
+// CHECK: %[[nonfinal_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xf32> into f32
+// Note that to compute the final sum we need to add the original initial value
+// (%cst) since it is not zero.
+// CHECK: %[[final_sum:.*]] = addf %[[nonfinal_sum]], %[[cst]] : f32
+// For the final product we don't need to do this additional step because the
+// initial value equals 1 (the neutral element for multiplication).
+// CHECK: %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+// -----
+
+// The integer case.
+
+func @two_vecdim_reductions_int(%in: memref<256x512xi64>, %out_sum: memref<256xi64>, %out_prod: memref<256xi64>) {
+ %cst0 = constant 0 : i64
+ %cst1 = constant 1 : i64
+ affine.for %i = 0 to 256 {
+ %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst0, %part_prod = %cst1) -> (i64, i64) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xi64>
+ %add = addi %part_sum, %ld : i64
+ %mul = muli %part_prod, %ld : i64
+ affine.yield %add, %mul : i64, i64
+ }
+ affine.store %sum, %out_sum[%i] : memref<256xi64>
+ affine.store %prod, %out_prod[%i] : memref<256xi64>
+ }
+ return
+}
+
+// CHECK-LABEL: @two_vecdim_reductions_int
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0> : vector<128xi64>
+// CHECK: %[[vone:.*]] = constant dense<1> : vector<128xi64>
+// CHECK: %[[vred:.*]]:2 = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[part_sum:.*]] = %[[vzero]], %[[part_prod:.*]] = %[[vone]]) -> (vector<128xi64>, vector<128xi64>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xi64>, vector<128xi64>
+// CHECK: %[[add:.*]] = addi %[[part_sum]], %[[ld]] : vector<128xi64>
+// CHECK: %[[mul:.*]] = muli %[[part_prod]], %[[ld]] : vector<128xi64>
+// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xi64>, vector<128xi64>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]]#0 : vector<128xi64> into i64
+// CHECK: %[[final_prod:.*]] = vector.reduction "mul", %[[vred:.*]]#1 : vector<128xi64> into i64
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xi64>
+// CHECK: affine.store %[[final_prod]], %{{.*}} : memref<256xi64>
+// CHECK: }
+
+// -----
+
+// The outer reduction loop '%j' is vectorized.
+
+func @vecdim_reduction_nested(%in: memref<256x512xf32>, %out: memref<1xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ %outer_red = affine.for %j = 0 to 512 iter_args(%outer_iter = %cst) -> (f32) {
+ %inner_red = affine.for %i = 0 to 256 iter_args(%inner_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %inner_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ %outer_add = addf %outer_iter, %inner_red : f32
+ affine.yield %outer_add : f32
+ }
+ affine.store %outer_red, %out[0] : memref<1xf32>
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_nested
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[outer_red:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[outer_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[inner_red:.*]] = affine.for %{{.*}} = 0 to 256 iter_args(%[[inner_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[inner_iter]], %[[ld]] : vector<128xf32>
+// CHECK: affine.yield %[[add]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[outer_add:.*]] = addf %[[outer_iter]], %[[inner_red]] : vector<128xf32>
+// CHECK: affine.yield %[[outer_add]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[outer_red:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<1xf32>
+
+// -----
+
+// The inner reduction loop '%j' computes partial sums as a side effect and
+// is not vectorized.
+
+func @vecdim_partial_sums_1_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %part_sum, %ld : f32
+ %mul = mulf %part_prod, %ld : f32
+ affine.store %add, %out_partsum[%i, %j] : memref<256x512xf32>
+ affine.yield %add, %mul : f32, f32
+ }
+ affine.store %sum, %out_sum[%i] : memref<256xf32>
+ affine.store %prod, %out_prod[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_partial_sums_1_rejected
+// CHECK-NOT: vector
+
+// -----
+
+// The inner reduction loop '%j' computes partial sums as a side effect and
+// is not vectorized.
+
+func @vecdim_partial_sums_2_rejected(%in: memref<256x512xf32>, %out_sum: memref<256xf32>, %out_prod: memref<256xf32>, %out_partsum: memref<256x512xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %sum, %prod = affine.for %j = 0 to 512 iter_args(%part_sum = %cst, %part_prod = %cst) -> (f32, f32) {
+ affine.store %part_sum, %out_partsum[%i, %j] : memref<256x512xf32>
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %part_sum, %ld : f32
+ %mul = mulf %part_prod, %ld : f32
+ affine.yield %add, %mul : f32, f32
+ }
+ affine.store %sum, %out_sum[%i] : memref<256xf32>
+ affine.store %prod, %out_prod[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_partial_sums_2_rejected
+// CHECK-NOT: vector
+
+// -----
+
+// The reduction loop '%j' performs an unknown reduction operation and is
+// not vectorized.
+
+func @vecdim_unknown_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %add = addf %red_iter, %red_iter : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[0] : memref<256xf32>
+ return
+}
+
+// CHECK-LABEL: @vecdim_unknown_reduction_rejected
+// CHECK-NOT: vector
+
+// -----
+
+// The reduction loop '%j' doesn't perform any reduction operation, so it is
+// not recognized as a standard reduction and is not vectorized.
+
+func @vecdim_none_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 1.000000e+00 : f32
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ affine.yield %red_iter : f32
+ }
+ affine.store %final_red, %out[0] : memref<256xf32>
+ return
+}
+
+// CHECK-LABEL: @vecdim_none_reduction_rejected
+// CHECK-NOT: vector
+
+// -----
+
+// The number of iterations is not divisible by the vector size, so a mask has
+// to be applied to the last update of the accumulator.
+
+func @vecdim_reduction_masked(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 500 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK: #[[$map0:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
+// CHECK-LABEL: @vecdim_reduction_masked
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[elems_left:.*]] = affine.apply #[[$map0]](%[[iv]])
+// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+// -----
+
+// The number of iterations is not known, so a mask has to be applied.
+
+func @vecdim_reduction_masked_unknown_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %bnd: index) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to %bnd iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK: #[[$map1:.*]] = affine_map<([[d0:.*]]){{\[}}[[s0:.*]]{{\]}} -> (-[[d0]] + [[s0]])>
+// CHECK-LABEL: @vecdim_reduction_masked_unknown_ub
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vzero:.*]] = constant dense<0.000000e+00> : vector<128xf32>
+// CHECK: %[[vred:.*]] = affine.for %[[iv:.*]] = 0 to %[[bnd:.*]] step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
+// CHECK: %[[elems_left:.*]] = affine.apply #[[$map1]](%[[iv]])[%[[bnd]]]
+// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_sum:.*]] = vector.reduction "add", %[[vred:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_sum]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+// -----
+
+// The lower bound is nonzero, but the number of iterations is divisible by the
+// vector size, so masking is not needed.
+
+func @vecdim_reduction_nonzero_lb(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 127 to 511 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_nonzero_lb
+// CHECK: %{{.*}} = affine.for %{{.*}} = 127 to 511 step 128 iter_args({{.*}}) -> (vector<128xf32>) {
+// CHECK-NOT: vector.create_mask
+
+// -----
+
+// The lower bound is unknown, so we need to create a mask.
+
+func @vecdim_reduction_masked_unknown_lb(%in: memref<256x512xf32>, %out: memref<256xf32>, %lb: index) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = %lb to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK: #[[$map2:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 512)>
+// CHECK-LABEL: @vecdim_reduction_masked_unknown_lb
+// CHECK: %{{.*}} = affine.for %[[iv:.*]] = %[[lb:.*]] to 512 step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
+// CHECK: %[[elems_left:.*]] = affine.apply #[[$map2]](%[[iv]])
+// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
+
+// -----
+
+// The upper bound is a minimum expression.
+
+func @vecdim_reduction_complex_ub(%in: memref<256x512xf32>, %out: memref<256xf32>, %M: index, %N: index) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to min affine_map<(d0, d1) -> (d0, d1*2)>(%M, %N) iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %add = addf %red_iter, %ld : f32
+ affine.yield %add : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK: #[[$map3:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]], [[d1]] * 2)>
+// CHECK: #[[$map3_sub:.*]] = affine_map<([[d0:.*]], [[d1:.*]]) -> ([[d0]] - [[d1]])>
+// CHECK-LABEL: @vecdim_reduction_complex_ub
+// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to min #[[$map3]](%[[M:.*]], %[[N:.*]]) step 128 iter_args(%[[red_iter:.*]] = {{.*}}) -> (vector<128xf32>) {
+// CHECK: %[[ub:.*]] = affine.min #[[$map3]](%[[M]], %[[N]])
+// CHECK: %[[elems_left:.*]] = affine.apply #[[$map3_sub]](%[[ub]], %[[iv]])
+// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[red_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK: affine.yield %[[new_acc]] : vector<128xf32>
+
+// -----
+
+// The same mask is applied to both reductions.
+
+func @vecdim_two_reductions_masked(%in: memref<256x512xf32>, %out: memref<512xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %final_sum, %final_expsum = affine.for %j = 0 to 500 iter_args(%sum_iter = %cst, %expsum_iter = %cst) -> (f32, f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %exp = math.exp %ld : f32
+ %add = addf %sum_iter, %ld : f32
+ %eadd = addf %expsum_iter, %exp : f32
+ affine.yield %add, %eadd : f32, f32
+ }
+ affine.store %final_sum, %out[2*%i] : memref<512xf32>
+ affine.store %final_expsum, %out[2*%i + 1] : memref<512xf32>
+ }
+ return
+}
+
+// CHECK: #[[$map4:.*]] = affine_map<([[d0:.*]]) -> (-[[d0]] + 500)>
+// CHECK-LABEL: @vecdim_two_reductions_masked
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %{{.*}} = affine.for %[[iv:.*]] = 0 to 500 step 128 iter_args(%[[sum_iter:.*]] = {{.*}}, %[[esum_iter:.*]] = {{.*}}) -> (vector<128xf32>, vector<128xf32>) {
+// CHECK: %[[elems_left:.*]] = affine.apply #[[$map4]](%[[iv]])
+// CHECK: %[[mask:.*]] = vector.create_mask %[[elems_left]] : vector<128xi1>
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[exp:.*]] = math.exp %[[ld]] : vector<128xf32>
+// CHECK: %[[add:.*]] = addf %[[sum_iter]], %[[ld]] : vector<128xf32>
+// CHECK: %[[eadd:.*]] = addf %[[esum_iter]], %[[exp]] : vector<128xf32>
+// CHECK: %[[new_acc:.*]] = select %[[mask]], %[[add]], %[[sum_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK: %[[new_eacc:.*]] = select %[[mask]], %[[eadd]], %[[esum_iter]] : vector<128xi1>, vector<128xf32>
+// CHECK: affine.yield %[[new_acc]], %[[new_eacc]] : vector<128xf32>
+// CHECK: }
--- /dev/null
+// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0 vectorize-reductions=true" -verify-diagnostics
+
+// TODO: Vectorization of reduction loops along the reduction dimension is not
+// supported for higher-rank vectors yet, so we are just checking that an
+// error message is produced.
+
+// expected-error@+1 {{Vectorizing reductions is supported only for 1-D vectors}}
+func @vecdim_reduction_2d(%in: memref<256x512x1024xf32>, %out: memref<256xf32>) {
+ %cst = constant 0.000000e+00 : f32
+ affine.for %i = 0 to 256 {
+ %sum_j = affine.for %j = 0 to 512 iter_args(%red_iter_j = %cst) -> (f32) {
+ %sum_k = affine.for %k = 0 to 1024 iter_args(%red_iter_k = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j, %k] : memref<256x512x1024xf32>
+ %add = addf %red_iter_k, %ld : f32
+ affine.yield %add : f32
+ }
+ %add = addf %red_iter_j, %sum_k : f32
+ affine.yield %add : f32
+ }
+ affine.store %sum_j, %out[%i] : memref<256xf32>
+ }
+ return
+}
+