From: Rob Suderman
Date: Wed, 5 May 2021 20:10:49 +0000 (-0700)
Subject: [mlir][tosa] Add tosa.depthwise lowering to existing linalg.depthwise_conv
X-Git-Tag: llvmorg-14-init~7512
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7abb56c78ba7bb9e2a91f61a65bb8feb69a92865;p=platform%2Fupstream%2Fllvm.git

[mlir][tosa] Add tosa.depthwise lowering to existing linalg.depthwise_conv

Implements support for undilated depthwise convolution using the existing
depthwise convolution operation. Once convolutions migrate to yaml-defined
versions we can rewrite for a cleaner implementation.

Reviewed By: mravishankar

Differential Revision: https://reviews.llvm.org/D101579
---

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index fb34ff5..4bf2dc7 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -59,6 +59,37 @@ static mlir::SelectOp clampHelper(Location loc, Value arg, mlir::ConstantOp min,
   return rewriter.create<mlir::SelectOp>(loc, largerThanMax, max, minOrArg);
 }
 
+static mlir::Value applyPad(Location loc, Value input, ArrayRef<int64_t> pad,
+                            Attribute padAttr, OpBuilder &rewriter) {
+  // Input should be padded if necessary.
+  if (llvm::all_of(pad, [](int64_t p) { return p == 0; }))
+    return input;
+
+  ShapedType inputTy = input.getType().cast<ShapedType>();
+  Type inputETy = inputTy.getElementType();
+  auto inputShape = inputTy.getShape();
+
+  assert((inputShape.size() * 2) == pad.size());
+
+  SmallVector<int64_t> paddedShape;
+  SmallVector<OpFoldResult> lowIndices;
+  SmallVector<OpFoldResult> highIndices;
+  for (int i = 0, s = inputShape.size(); i < s; i++) {
+    auto lowPad = pad[i * 2];
+    auto highPad = pad[i * 2 + 1];
+    paddedShape.push_back(inputShape[i] + highPad + lowPad);
+    lowIndices.push_back(rewriter.getIndexAttr(lowPad));
+    highIndices.push_back(rewriter.getIndexAttr(highPad));
+  }
+
+  Value padValue = rewriter.create<ConstantOp>(loc, padAttr);
+
+  return linalg::PadTensorOp::createPadScalarOp(
+             RankedTensorType::get(paddedShape, inputETy), input, padValue,
+             lowIndices, highIndices, loc, rewriter)
+      .result();
+}
+
 static Value
 createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args,
                                             ArrayRef<Type> resultTypes,
@@ -757,6 +788,138 @@ static LogicalResult reduceMatchAndRewriteHelper(Operation *op, uint64_t axis,
   return success();
 }
 
+static LogicalResult
+convolutionMatchAndRewriterHelper(Operation *op,
+                                  ConversionPatternRewriter &rewriter) {
+  Location loc = op->getLoc();
+  Value input = op->getOperand(0);
+  Value weight = op->getOperand(1);
+  Value bias = op->getOperand(2);
+
+  ShapedType inputTy = input.getType().cast<ShapedType>();
+  ShapedType weightTy = weight.getType().cast<ShapedType>();
+  ShapedType biasTy = bias.getType().cast<ShapedType>();
+  ShapedType resultTy = op->getResult(0).getType().cast<ShapedType>();
+
+  Type inputETy = inputTy.getElementType();
+  Type weightETy = weightTy.getElementType();
+  Type biasETy = biasTy.getElementType();
+  Type resultETy = resultTy.getElementType();
+
+  auto padAttr = op->getAttr("pad").cast<ArrayAttr>();
+  auto strideTosaAttr = op->getAttr("stride").cast<ArrayAttr>();
+  auto dilationTosaAttr = op->getAttr("dilation").cast<ArrayAttr>();
+
+  if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() ||
+      !biasTy.hasStaticShape() || !resultTy.hasStaticShape())
+    return rewriter.notifyMatchFailure(op,
+                                       "tosa.conv ops require static shapes");
+
+  auto weightShape = weightTy.getShape();
+  auto resultShape = resultTy.getShape();
+
+  // TODO(suderman): Support other types.
+  if (!inputETy.isF32() || !weightETy.isF32() || !biasETy.isF32() ||
+      !resultETy.isF32())
+    return failure();
+
+  // Apply padding as necessary.
+  Attribute zeroAttr = rewriter.getZeroAttr(inputETy);
+  llvm::SmallVector<int64_t> pad;
+  pad.resize(2, 0);
+  getValuesFromIntArrayAttribute(padAttr, pad);
+  pad.resize(pad.size() + 2, 0);
+
+  input = applyPad(loc, input, pad, zeroAttr, rewriter);
+
+  // We need to transpose the Conv2DOp kernel to line up the last input/output
+  // kernels.
+  // TODO(suderman): Eventually we will support specifying the filter channel
+  // ordering then we can avoid transposing the kernel.
+  if (isa<tosa::Conv2DOp>(op)) {
+    int32_t weightRank = weightTy.getRank();
+    SmallVector<int64_t> permutation, transposeWeightShape;
+    permutation.resize(weightRank, 0);
+    transposeWeightShape.resize(weightRank, 0);
+    for (int i = 0; i < weightRank; i++) {
+      permutation[i] = (i + 1) % weightRank;
+      transposeWeightShape[i] = weightShape[permutation[i]];
+    }
+
+    Value permutationValue = rewriter.create<ConstantOp>(
+        loc, DenseIntElementsAttr::get(
+                 RankedTensorType::get({weightRank}, rewriter.getI64Type()),
+                 permutation));
+    Type newWeightTy = RankedTensorType::get(transposeWeightShape, biasETy);
+
+    weight = rewriter.create<tosa::TransposeOp>(loc, newWeightTy, weight,
+                                                permutationValue);
+  }
+
+  // Broadcast the initial value to the output tensor before convolving.
+  SmallVector<AffineMap> indexingMaps;
+  indexingMaps.push_back(AffineMap::get(
+      /*dimCount=*/resultTy.getRank(), /*symbolCount=*/0,
+      {rewriter.getAffineDimExpr(3)}, rewriter.getContext()));
+  indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank()));
+
+  Value initTensor = rewriter.create<linalg::InitTensorOp>(
+      loc, resultTy.getShape(), resultTy.getElementType());
+
+  Value biasBroadcast =
+      rewriter
+          .create<linalg::GenericOp>(
+              loc, resultTy, bias, initTensor, indexingMaps,
+              getNParallelLoopsAttrs(resultTy.getRank()),
+              [&](OpBuilder &nestedBuilder, Location nestedLoc,
+                  ValueRange args) {
+                nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]);
+              })
+          .getResult(0);
+
+  // Extract the attributes for convolution.
+  llvm::SmallVector<int64_t> stride, dilation;
+  getValuesFromIntArrayAttribute(strideTosaAttr, stride);
+  getValuesFromIntArrayAttribute(dilationTosaAttr, dilation);
+
+  // Create the convolution op.
+  auto strideAttr = DenseIntElementsAttr::get(
+      RankedTensorType::get({2}, rewriter.getI64Type()), stride);
+  auto dilationAttr = DenseIntElementsAttr::get(
+      RankedTensorType::get({2}, rewriter.getI64Type()), dilation);
+
+  if (isa<tosa::Conv2DOp>(op)) {
+    rewriter.replaceOpWithNewOp<linalg::ConvInputNHWCFilterHWCFOp>(
+        op, resultTy, ValueRange{input, weight}, ValueRange{biasBroadcast},
+        dilationAttr, strideAttr);
+    return success();
+  }
+
+  if (isa<tosa::DepthwiseConv2DOp>(op)) {
+    if (llvm::any_of(dilation, [](int64_t d) { return d > 1; }))
+      return failure();
+
+    ShapedType linalgConvTy =
+        RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2],
+                               weightShape[2], weightShape[3]},
+                              resultETy);
+
+    Value biasReshape =
+        rewriter.create(loc, linalgConvTy, biasBroadcast);
+    Value conv = rewriter
+                     .create<linalg::DepthwiseConvInputNHWCFilterHWCFOp>(
+                         loc, linalgConvTy, ValueRange{input, weight},
+                         ValueRange{biasReshape}, strideAttr)
+                     .getResult(0);
+
+    Value reshape = rewriter.create(loc, resultTy, conv);
+    rewriter.replaceOp(op, reshape);
+    return success();
+  }
+
+  return failure();
+}
+
 namespace {
 
 template <typename SrcOp>
@@ -770,6 +933,17 @@ public:
   }
 };
 
+template <typename T>
+class ConvConverter : public OpConversionPattern<T> {
+public:
+  using OpConversionPattern<T>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(T op, ArrayRef<Value> args,
+                  ConversionPatternRewriter &rewriter) const final {
+    return convolutionMatchAndRewriterHelper(op, rewriter);
+  }
+};
+
 class MatMulConverter : public OpConversionPattern<tosa::MatMulOp> {
 public:
   using OpConversionPattern<tosa::MatMulOp>::OpConversionPattern;
@@ -782,8 +956,8 @@ public:
     auto outputTy = op.getType().cast<ShapedType>();
     auto outputElementTy = outputTy.getElementType();
 
-    auto zero_attr = rewriter.getZeroAttr(outputElementTy);
-    Value zero = rewriter.create<ConstantOp>(loc, zero_attr);
+    auto zeroAttr = rewriter.getZeroAttr(outputElementTy);
+    Value zero = rewriter.create<ConstantOp>(loc, zeroAttr);
     auto initTensor = rewriter.create<linalg::InitTensorOp>(
         loc, outputTy.getShape(), outputTy.getElementType());
     Value zeroTensor =
@@ -862,108 +1036,6 @@ public:
   }
 };
 
-class Conv2DConverter : public OpConversionPattern<tosa::Conv2DOp> {
-public:
-  using OpConversionPattern<tosa::Conv2DOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(tosa::Conv2DOp op, ArrayRef<Value> args,
-                  ConversionPatternRewriter &rewriter) const final {
-    Location loc = op.getLoc();
-    Value input = op.input();
-    Value weight = op.weight();
-    Value bias = op.bias();
-
-    ShapedType inputTy = input.getType().cast<ShapedType>();
-    ShapedType weightTy = weight.getType().cast<ShapedType>();
-    ShapedType biasTy = bias.getType().cast<ShapedType>();
-    ShapedType resultTy = op.getType().cast<ShapedType>();
-
-    Type inputETy = inputTy.getElementType();
-    Type weightETy = weightTy.getElementType();
-    Type biasETy = biasTy.getElementType();
-    Type resultETy = resultTy.getElementType();
-
-    if (!inputTy.hasStaticShape() || !weightTy.hasStaticShape() ||
-        !biasTy.hasStaticShape() || !resultTy.hasStaticShape())
-      return rewriter.notifyMatchFailure(op,
-                                         "tosa.conv2d requires static shapes");
-
-    auto inputShape = inputTy.getShape();
-    auto weightShape = weightTy.getShape();
-
-    // TODO(suderman): Support other types.
-    if (!inputETy.isF32() || !weightETy.isF32() || !biasETy.isF32() ||
-        !resultETy.isF32())
-      return failure();
-
-    // Broadcast the initial value to the output tensor before convolving.
-    SmallVector<AffineMap> indexingMaps;
-    indexingMaps.push_back(AffineMap::get(/*dimCount=*/4, /*symbolCount=*/0,
-                                          {rewriter.getAffineDimExpr(3)},
-                                          rewriter.getContext()));
-    indexingMaps.push_back(rewriter.getMultiDimIdentityMap(resultTy.getRank()));
-
-    Value initTensor = rewriter.create<linalg::InitTensorOp>(
-        loc, resultTy.getShape(), resultTy.getElementType());
-    Value biasBroadcast =
-        rewriter
-            .create<linalg::GenericOp>(
-                loc, resultTy, bias, initTensor, indexingMaps,
-                getNParallelLoopsAttrs(resultTy.getRank()),
-                [&](OpBuilder &nestedBuilder, Location nestedLoc,
-                    ValueRange args) {
-                  nestedBuilder.create<linalg::YieldOp>(nestedLoc, args[0]);
-                })
-            .getResult(0);
-
-    // Transpose weights tensor to be in dim order: spatial dims,
-    // input channels, and output channels.
-    SmallVector<int64_t> permutation{1, 2, 3, 0};
-    auto permutationAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4}, rewriter.getI64Type()), permutation);
-    Value permutationValue = rewriter.create<ConstantOp>(loc, permutationAttr);
-
-    SmallVector<int64_t> newKernelShape{weightShape[1], weightShape[2],
-                                        weightShape[3], weightShape[0]};
-    Type newKernelTy = RankedTensorType::get(newKernelShape, biasETy);
-
-    Value transposedKernel = rewriter.create<tosa::TransposeOp>(
-        loc, newKernelTy, weight, permutationValue);
-
-    // Extract the attributes for convolution.
-    llvm::SmallVector<int64_t> stride, dilation, pad;
-    getValuesFromIntArrayAttribute(op.stride(), stride);
-    getValuesFromIntArrayAttribute(op.dilation(), dilation);
-    getValuesFromIntArrayAttribute(op.pad(), pad);
-
-    // Input should be padded if necessary.
-    if (llvm::any_of(pad, [](int64_t p) { return p; })) {
-      llvm::SmallVector<int64_t> newPad{0, 0, pad[0], pad[1],
-                                        pad[2], pad[3], 0, 0};
-      auto padAttr = DenseIntElementsAttr::get(
-          RankedTensorType::get({4, 2}, rewriter.getI64Type()), newPad);
-      Value padValue = rewriter.create<ConstantOp>(loc, padAttr);
-
-      SmallVector<int64_t> paddedShape{
-          inputShape[0], inputShape[1] + pad[0] + pad[1],
-          inputShape[2] + pad[2] + pad[3], inputShape[3]};
-      Type paddedTy = RankedTensorType::get(paddedShape, inputETy);
-      input = rewriter.create<tosa::PadOp>(loc, paddedTy, input, padValue);
-    }
-
-    auto strideAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({2}, rewriter.getI64Type()), stride);
-    auto dilationAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({2}, rewriter.getI64Type()), dilation);
-
-    auto convOp = rewriter.create<linalg::ConvInputNHWCFilterHWCFOp>(
-        loc, resultTy, ValueRange{input, transposedKernel},
-        ValueRange{biasBroadcast}, dilationAttr, strideAttr);
-
-    rewriter.replaceOp(op, convOp.getResult(0));
-    return success();
-  }
-};
 
 class ReshapeConverter : public OpConversionPattern<tosa::ReshapeOp> {
 public:
@@ -2102,7 +2174,6 @@ public:
 
     ShapedType resultTy = op.getType().template cast<ShapedType>();
     Type outElementTy = inputTy.getElementType();
-    int64_t rank = inputTy.getRank();
 
     if (!inputTy.hasStaticShape())
       return failure();
@@ -2127,43 +2198,24 @@ public:
       return rewriter.notifyMatchFailure(
          op, "Unsupported initial value for tosa.maxpool_2d op");
 
+    // Apply padding as necessary.
+    llvm::SmallVector<int64_t> pad;
+    pad.resize(2, 0);
+    getValuesFromIntArrayAttribute(op.pad(), pad);
+    pad.resize(pad.size() + 2, 0);
+    input = applyPad(loc, input, pad, initialAttr, rewriter);
+
     Value initialValue = rewriter.create<ConstantOp>(loc, initialAttr);
-    SmallVector<int64_t> kernel, stride, pad;
+    SmallVector<int64_t> kernel, stride;
     getValuesFromIntArrayAttribute(op.kernel(), kernel);
     getValuesFromIntArrayAttribute(op.stride(), stride);
-    getValuesFromIntArrayAttribute(op.pad(), pad);
 
     Attribute strideAttr = rewriter.getI64VectorAttr(stride);
     Attribute dilationAttr = rewriter.getI64VectorAttr({1, 1});
 
     int64_t kernelSize = kernel[0] * kernel[1];
 
-    // If non-zero padding we need to pad the input
-    if (llvm::any_of(pad, [](int64_t v) { return v != 0; })) {
-      SmallVector<int64_t> paddedShape;
-      for (int64_t i = 0; i < rank; i++)
-        paddedShape.push_back(inputTy.getDimSize(i));
-
-      paddedShape[1] += pad[0] + pad[1];
-      paddedShape[2] += pad[2] + pad[3];
-
-      OpFoldResult zeroIndex = rewriter.getIndexAttr(0);
-      OpFoldResult heightLowPadIndex = rewriter.getIndexAttr(pad[0]);
-      OpFoldResult heightHighPadIndex = rewriter.getIndexAttr(pad[1]);
-      OpFoldResult widthLowPadIndex = rewriter.getIndexAttr(pad[2]);
-      OpFoldResult widthHighPadIndex = rewriter.getIndexAttr(pad[3]);
-
-      SmallVector<OpFoldResult> lowIndices = {zeroIndex, heightLowPadIndex,
-                                              widthLowPadIndex, zeroIndex};
-      SmallVector<OpFoldResult> highIndices = {zeroIndex, heightHighPadIndex,
-                                               widthHighPadIndex, zeroIndex};
-
-      input = linalg::PadTensorOp::createPadScalarOp(
-                  RankedTensorType::get(paddedShape, inElementTy), input,
-                  initialValue, lowIndices, highIndices, loc, rewriter)
-                  .result();
-    }
-
+    // Create the linalg op that performs pooling.
     Value initTensor = rewriter.create<linalg::InitTensorOp>(
         loc, resultTy.getShape(), resultTy.getElementType());
 
@@ -2277,7 +2329,8 @@ void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns(
       ReduceConverter,
      ArgMaxConverter,
      ConcatConverter,
-      Conv2DConverter,
+      ConvConverter<tosa::Conv2DOp>,
+      ConvConverter<tosa::DepthwiseConv2DOp>,
      GatherConverter,
      PadConverter,
      ReshapeConverter,
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index d52c632..dbd4f90 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -1007,8 +1007,9 @@ func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () {
   // CHECK-DAG: [[CONST:%.+]] = constant -3.40282347E+38 : f32
   // CHECK-DAG: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0]
   // CHECK-DAG: linalg.yield [[CONST]]
+  // CHECK-DAG: [[INITVAL:%.+]] = constant -3.40282347E+38 : f32
   // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62]
-  // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INIT]], [[CONST]])
+  // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INIT]], [[INITVAL]])
   // CHECK-DAG: [[KERNEL:%.+]] = linalg.init_tensor [3, 3]
   // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>)
   %0 = "tosa.max_pool2d"(%arg0) {pad = [0, 0, 0, 1], kernel = [3, 3], stride = [1, 1]} : (tensor<1x6x34x62xf32>) -> (tensor<1x4x33x62xf32>)
@@ -1056,18 +1057,18 @@ func @avg_pool(%arg0: tensor<1x6x34x62xf32>) -> () {
 
 // -----
 
-// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
+// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>
 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>
+// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 
 func @conv2d_f32(%input: tensor<1x49x42x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () {
-  // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 45, 40, 28] : tensor<1x45x40x28xf32>
-  // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>)
+  // CHECK: %[[INIT:.+]] = linalg.init_tensor [3, 3, 28, 28]
+  // CHECK: %[[KERNEL:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<28x3x3x28xf32>) outs(%[[INIT]] : tensor<3x3x28x28xf32>)
   // CHECK: ^bb0(%arg3: f32, %arg4: f32):
   // CHECK: linalg.yield %arg3 : f32
-  // CHECK: %[[INITKERNEL:.+]] = linalg.init_tensor [3, 3, 28, 28]
-  // CHECK: %[[TRANSPOSEKERNEL:.+]] = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<28x3x3x28xf32>) outs(%[[INITKERNEL]] : tensor<3x3x28x28xf32>)
-  // CHECK: linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSEKERNEL]] : tensor<1x49x42x28xf32>, tensor<3x3x28x28xf32>) outs(%[[BROADCAST]] : tensor<1x45x40x28xf32>)
+  // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 45, 40, 28]
+  // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>)
+  // CHECK: linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[KERNEL]] : tensor<1x49x42x28xf32>, tensor<3x3x28x28xf32>) outs(%[[BROADCAST]] : tensor<1x45x40x28xf32>)
   %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [0, 0, 0, 0], stride = [1, 1], dilation = [2, 1]} : (tensor<1x49x42x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>)
   return
 }
@@ -1081,6 +1082,26 @@ func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x
 
 // -----
 
+// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
+// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+
+// CHECK-LABEL: @depthwise_conv
+func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () {
+  // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 5, 5, 33]
+  // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<33xf32>) outs([[INIT]] : tensor<1x5x5x33xf32>) {
+  // CHECK: ^bb0(%arg3: f32, %arg4: f32): // no predecessors
+  // CHECK: linalg.yield %arg3 : f32
+  // CHECK: } -> tensor<1x5x5x33xf32>
+  // CHECK: [[DBIAS:%.+]] = linalg.tensor_reshape [[BIAS]] {{\[}}[0], [1], [2], [3, 4]]
+  // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_input_nhwc_filter_hwcf {strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[DBIAS]] : tensor<1x5x5x3x11xf32>)
+  // CHECK: linalg.tensor_reshape %3 {{\[}}[0], [1], [2], [3, 4]]
+  %2 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) { pad = [0, 0, 0, 0], stride = [1, 1], dilation = [1, 1] } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> (tensor<1x5x5x33xf32>)
+  return
+}
+
+
+// -----
+
 // CHECK-LABEL: @resize_nearest
 func @resize_nearest(%input: tensor<1x2x2x1xf32>) -> () {
   // CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 4, 4, 1]