From 71c6339af0c0f50df754c2196a2de83e1211c42b Mon Sep 17 00:00:00 2001 From: zihaomu Date: Fri, 23 Dec 2022 16:50:28 +0800 Subject: [PATCH] remove old convolution branch, and optimize conv3d and conv1d. --- modules/dnn/src/layers/convolution_layer.cpp | 841 +-------------------- .../fast_convolution/depthwise_convolution.cpp | 623 +++++++-------- .../fast_convolution/fast_convolution.avx2.cpp | 329 ++------ .../layers/fast_convolution/fast_convolution.cpp | 753 ++++++++++++++---- .../layers/fast_convolution/fast_convolution.hpp | 60 +- .../fast_convolution/fast_convolution.simd.hpp | 280 ++++++- .../layers/fast_convolution/winograd_3x3s1_f63.cpp | 4 +- modules/dnn/src/layers/layers_common.simd.hpp | 806 -------------------- 8 files changed, 1271 insertions(+), 2425 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index cc39593..0bdb0df 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -259,7 +259,7 @@ public: std::vector reluslope; Ptr activ; - Ptr fastConv2dImpl; + Ptr fastConvImpl; #ifdef HAVE_OPENCL Ptr > convolutionOp; @@ -967,808 +967,6 @@ public: } #endif // HAVE_WEBNN - class ParallelConv : public cv::ParallelLoopBody - { - public: - enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 }; - - const Mat* input_; - const Mat* weights_; - Mat* output_; - int outShape[4]; // used only for conv2d - std::vector kernel_size, pads_begin, pads_end, strides, dilations; - int ngroups_, nstripes_; - std::vector ofstab_; - const std::vector* biasvec_; - const std::vector* reluslope_; - const ActivationLayer* activ_; - bool is1x1_; - bool useAVX; - bool useAVX2; - bool useAVX512; - bool useRVV; - bool useLASX; - int blk_size_cn; - - ParallelConv() - : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0), - biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) - , useLASX(false), blk_size_cn(0) - {} - - static void run( const Mat& input, Mat& output, const Mat& weights, - const std::vector& biasvec, - const std::vector& reluslope, - const std::vector& kernel_size, const std::vector& strides, - const std::vector& pads_begin, const std::vector& pads_end, - const std::vector& dilations, - const ActivationLayer* activ, int ngroups, int nstripes ) - { - size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(), - 1, std::multiplies()); - bool isConv1D = input.dims == 3; - bool isConv2D = input.dims == 4; - bool isConv3D = input.dims == 5; - CV_CheckEQ(static_cast(kernel_size.size()), input.dims - 2, ""); - CV_Assert_N(input.dims == output.dims, - input.size[0] == output.size[0], - weights.rows == output.size[1], - weights.cols == (input.size[1]/ngroups)*karea, - input.type() == output.type(), - input.type() == weights.type(), - input.type() == CV_32FC1, - input.isContinuous(), - output.isContinuous(), - biasvec.size() == (size_t)output.size[1]+2); - CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, ""); - CV_CheckType(weights.type(), CV_32FC1, ""); - ParallelConv p; - - p.input_ = &input; - p.weights_ = &weights; - p.output_ = &output; - int max_ind = isConv1D? 
3: 4; - for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i]; - p.outShape[1] /= ngroups; - - p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations; - p.pads_begin = pads_begin; p.pads_end = pads_end; - - p.ngroups_ = ngroups; - p.nstripes_ = nstripes; - - int inpCnAll = input.size[1]; - int depth = (input.dims == 5) ? input.size[2] : 1; - int width = input.size[input.dims - 1]; - int height = isConv1D? 1 : input.size[input.dims - 2]; - int inpCn = inpCnAll / ngroups; - - p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 && - pads_begin[0] == 0 && pads_begin[1] == 0) || - (isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1); - - p.useAVX = checkHardwareSupport(CPU_AVX) && isConv2D; - p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D; - p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D; - p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D; - p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D; - - int kernel_d = isConv3D? kernel_size[0] : 1; - int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; - int kernel_w = kernel_size.back(); - - int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h)); - int ncn = 16; - while (ncn*2 < blk_size_cn0 && ncn < inpCn) - ncn *= 2; - ncn = std::min(ncn, inpCn); - p.blk_size_cn = ncn; - - int dil_d = isConv3D? dilations[0] : 1; - int dil_h = isConv1D? 1 : dilations[dilations.size() - 2]; - int dil_w = dilations.back(); - - p.ofstab_.resize(karea * ncn); - int* ofstab = &p.ofstab_[0]; - - if (isConv1D) - { - for( int k = 0; k < ncn; k++ ) - for( int k_c = 0; k_c < kernel_w; k_c++ ) - ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w; - } - else if (isConv2D) - { - for( int k = 0; k < ncn; k++ ) - for( int k_r = 0; k_r < kernel_h; k_r++ ) - for( int k_c = 0; k_c < kernel_w; k_c++ ) - ofstab[(k*kernel_h + k_r)*kernel_w + k_c] = - (k*height + k_r*dil_h)*width + k_c*dil_w; - } - else - { - for( int k = 0; k < ncn; k++ ) - for (int k_d = 0; k_d < kernel_d; k_d++) - for( int k_r = 0; k_r < kernel_h; k_r++ ) - for( int k_c = 0; k_c < kernel_w; k_c++ ) - ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] = - (k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w; - } - - p.biasvec_ = &biasvec; - p.reluslope_ = &reluslope; - p.activ_ = p.reluslope_->empty() ? activ : 0; - - parallel_for_(Range(0, nstripes), p, nstripes); - } - - virtual void operator ()(const Range &r0) const CV_OVERRIDE - { - const int valign = ConvolutionLayerImpl::VEC_ALIGN; - int ngroups = ngroups_, batchSize = input_->size[0]*ngroups; - bool isConv1D = input_->dims == 3; - bool isConv2D = input_->dims == 4; - bool isConv3D = input_->dims == 5; - - int outW = output_->size[output_->dims - 1]; - int outH = isConv1D? 1 : output_->size[output_->dims - 2]; - int outCn = output_->size[1]/ngroups; - - int depth = isConv3D? input_->size[2] : 1; - int height = isConv1D? 1 : input_->size[input_->dims - 2]; - int width = input_->size[input_->dims - 1]; - int inpCn = input_->size[1]/ngroups; - - const int nstripes = nstripes_; - - int kernel_d = isConv3D? kernel_size[0] : 1; - int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2]; - int kernel_w = kernel_size.back(); - int karea = kernel_w*kernel_h*kernel_d; - - int pad_d = isConv3D? pads_begin[0] : 0; - int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2]; - int pad_l = pads_begin.back(); - - int stride_d = isConv3D? strides[0] : 0; - int stride_h = isConv1D? 
0 : strides[strides.size() - 2]; - int stride_w = strides.back(); - - int dilation_d = isConv3D? dilations[0] : 1; - int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2]; - int dilation_w = dilations.back(); - - int i, j, k, d; - int inpPlaneSize = (int)input_->total(2); - int outPlaneSize = (int)output_->total(2); - bool is1x1 = is1x1_; - - int stripesPerSample; - int stripeSize; - Range r = r0; - bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && - outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 && - width >= 16 + dilation_w*(kernel_w - 1); - // for now only 3x3 depth-wise convolutions are supported - depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 && - // computing at most 1 pixel from each side can involve padding - max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t && - pad_l <= 1 && pad_t <= 1; - - if( !depthWiseConvolution && nstripes >= batchSize*2 ) - { - stripesPerSample = nstripes/batchSize; - stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign); - stripeSize = std::min(stripeSize, outPlaneSize); - } - else - { - stripesPerSample = 1; - int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1); - r.start *= samplesPerStripe; - r.end *= samplesPerStripe; - stripeSize = outPlaneSize; - } - - const float* data_inp0_ = input_->ptr(); - const int* ofstab = &ofstab_[0]; - const float* wptr_orig_ = weights_->ptr(); - size_t wstep = weights_->step1(); - const float* biasptr_ = &biasvec_->at(0); - const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0); - float* data_out0_ = output_->ptr(); - AutoBuffer rowbuf0_; - float* rowbuf0 = 0; - bool use_rowbuf = !depthWiseConvolution; - int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize); - - // im2row buffer is not used for depth-wise convolution - if(use_rowbuf) - { - size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size); - //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize); - rowbuf0_.allocate(rowbufsz + valign); - rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float))); - // we clear the buffer once; ultimately, it lets us to avoid - // tail processing after running the unrolled/vectorized loop. - // the main idea is to make sure that the tail (a.k.a. padding) of each row - // (i.e. the elements with indices between vsz=karea*ncn and vsz_a) - // does not contain NaNs or Infs. Because the padding in the weights - // matrix is explicitly initialized with 0's, we handle all other - // cases nicely, i.e. we can skip expliciting re-initialization - // of the padding - we just retain elements from the previous iteration - // of the loop over channels (cn0). 
- memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) ); - } - - for( int stripe = r.start; stripe < r.end; stripe++ ) - { - int subsampleIdx = stripe/stripesPerSample; - if( subsampleIdx >= batchSize ) - break; - int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize); - int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize); - const float* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn; - float* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn; - int startOutCn = (subsampleIdx % ngroups)*outCn; - const float* wptr_orig = wptr_orig_ + wstep*startOutCn; - const float* biasptr = biasptr_ + startOutCn; - - for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn ) - { - int cn1 = std::min(cn0 + blk_size_cn, inpCn); - int ncn = cn1 - cn0, vsz = karea*ncn; - int vsz_a = (int)alignSize(vsz, valign); - const float* wptr = wptr_orig + cn0*karea; - // we apply [Channels][P]ReLU (if any) during the final pass only. - const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0; - - for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size ) - { - int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd); - int bsz = ofs1 - ofs0; - - int out_d = ofs0 / (outH * outW); - int out_i = (ofs0 - out_d * outH * outW) / outW; - int out_j = ofs0 % outW; - - if (depthWiseConvolution) - { - CV_Assert(out_i == 0 && out_j == 0); - int in_d = out_d * stride_d - pad_d; - const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width; - float* outptr_ = data_out0 + ofs0; - - #if CV_TRY_AVX2 - if(useAVX2) - opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w, - stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, - biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); - else - #endif - #if CV_TRY_AVX - if(useAVX) - opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w, - stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, - biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); - else - #endif - #if CV_TRY_RVV - if(useRVV) - opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w, - stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, - biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); - else - #endif - #if CV_TRY_LASX - if(useLASX) - opt_LASX::fastDepthwiseConv(wptr, kernel_h, kernel_w, - stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, - biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW); - else - #endif - { - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? 
out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - #if CV_SIMD - // maybe with AVX or AVX512 strided depthwise convolution - // can be accelerated with vector code, but with 4xfloat vectors - // it's hardly the case - if( stride_w == 1 ) - { - const int VECSZ = v_float32::nlanes; - const int out_delta = VECSZ/stride_w; - v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02), - vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12), - vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22); - v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff); - for( ; out_j < outW1; out_j += out_delta ) - { - if (out_j + out_delta > outW1) - { - if (out_j <= pad_l) - break; - out_j = outW1 - out_delta; - } - int in_j = out_j * stride_w - pad_l; - v_float32 v00 = vx_load(imgptr0 + in_j), - v01 = vx_load(imgptr0 + in_j + dilation_w), - v02 = vx_load(imgptr0 + in_j + dilation_w*2), - v10 = vx_load(imgptr1 + in_j), - v11 = vx_load(imgptr1 + in_j + dilation_w), - v12 = vx_load(imgptr1 + in_j + dilation_w*2), - v20 = vx_load(imgptr2 + in_j), - v21 = vx_load(imgptr2 + in_j + dilation_w), - v22 = vx_load(imgptr2 + in_j + dilation_w*2); - - v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 + - v10*vw10 + v11*vw11 + v12*vw12 + - v20*vw20 + v21*vw21 + v22*vw22 + vbias; - if (relu) - vout = v_select(vout > z, vout, vout*vrc); - v_store(outptr + out_j, vout); - } - } - #endif - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } - } - continue; - } - - // do im2row for a part of input tensor - float* rowbuf = rowbuf0; - - if (isConv1D) - { - for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) - { - int delta = std::min(ofs1 - ofs, outW - out_j); - int out_j1 = out_j + delta; - - int in_j = out_j * stride_w - pad_l; - const float* imgptr = data_inp0 + cn0*width + in_j; - ofs += delta; - - // do im2row for a part of input tensor - if( is1x1 ) - { - for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w ) - { - for( k = 0; k < vsz; k++ ) - rowbuf[k] = imgptr[k*inpPlaneSize]; - } - } - else - { - for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) - { - // this condition should be true for most of the tensor elements, i.e. - // most of the time the kernel aperture is inside the tensor X-Y plane. 
- if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w ) - { - for( k = 0; k < vsz; k++ ) - { - int k1 = ofstab[k]; - float v0 = imgptr[k1]; - float v1 = imgptr[k1 + stride_w]; - rowbuf[k] = v0; - rowbuf[k+vsz_a] = v1; - } - out_j++; - rowbuf += vsz_a; - imgptr += stride_w; - in_j += stride_w; - } - else - { - int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); - int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); - - // here some non-continuous sub-row of the row will not be - // filled from the tensor; we need to make sure that the uncovered - // elements are explicitly set to 0's. the easiest way is to - // set all the elements to 0's before the loop. - memset(rowbuf, 0, vsz*sizeof(rowbuf[0])); - for( k = 0; k < ncn; k++ ) - { - for( i = i0; i < i1; i++ ) - { - int imgofs = k*width + i*dilation_w; - rowbuf[k*kernel_w + i] = imgptr[imgofs]; - } - } - } - } - } - } - } - else if (isConv2D) - { - if( is1x1 && stride_w == 1 && stride_h == 1 ) - { - const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j; - for( int j = 0; j < bsz; j++, rowbuf += vsz_a ) - { - if( j + 4 <= bsz ) - { - k = 0; - #if CV_SIMD128 - for( ; k <= vsz - 4; k += 4 ) - { - const float* inp = imgptr + j + k*inpPlaneSize; - v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize); - v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3); - v_float32x4 r0, r1, r2, r3; - v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3); - v_store(rowbuf + k, r0); - v_store(rowbuf + k + vsz_a, r1); - v_store(rowbuf + k + vsz_a*2, r2); - v_store(rowbuf + k + vsz_a*3, r3); - } - #endif - for( ; k < vsz; k++ ) - { - const float* inp = imgptr + j + k*inpPlaneSize; - float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3]; - rowbuf[k] = v0; - rowbuf[k + vsz_a] = v1; - rowbuf[k + vsz_a*2] = v2; - rowbuf[k + vsz_a*3] = v3; - } - j += 3; - rowbuf += vsz_a*3; - } - else - { - for( k = 0; k < vsz; k++ ) - { - rowbuf[k] = imgptr[j + k*inpPlaneSize]; - } - } - } - } - else - for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i ) - { - int delta = std::min(ofs1 - ofs, outW - out_j); - int out_j1 = out_j + delta; - - int in_i = out_i * stride_h - pad_t; - int in_j = out_j * stride_w - pad_l; - const float* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j; - ofs += delta; - - // do im2row for a part of input tensor - if( is1x1 ) - { - for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w ) - { - for( k = 0; k < vsz; k++ ) - rowbuf[k] = imgptr[k*inpPlaneSize]; - } - } - else - { - bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h; - int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h); - int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h); - - for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) - { - // this condition should be true for most of the tensor elements, i.e. - // most of the time the kernel aperture is inside the tensor X-Y plane. 
- if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w ) - { - for( k = 0; k < vsz; k++ ) - { - int k1 = ofstab[k]; - float v0 = imgptr[k1]; - float v1 = imgptr[k1 + stride_w]; - rowbuf[k] = v0; - rowbuf[k+vsz_a] = v1; - } - out_j++; - rowbuf += vsz_a; - imgptr += stride_w; - in_j += stride_w; - } - else - { - int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); - int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); - - // here some non-continuous sub-row of the row will not be - // filled from the tensor; we need to make sure that the uncovered - // elements are explicitly set to 0's. the easiest way is to - // set all the elements to 0's before the loop. - memset(rowbuf, 0, vsz*sizeof(rowbuf[0])); - for( k = 0; k < ncn; k++ ) - { - for( i = i0; i < i1; i++ ) - { - for( j = j0; j < j1; j++ ) - { - int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w; - rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs]; - } - } - } - } - } - } - } - } - else - { - for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 ) - { - int delta = std::min(ofs1 - ofs, outW - out_j); - int out_j1 = out_j + delta; - - int in_d = out_d * stride_d - pad_d; - int in_i = out_i * stride_h - pad_t; - int in_j = out_j * stride_w - pad_l; - const float* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j; - ofs += delta; - - int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); - int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d); - - int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h); - int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h); - - for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w ) - { - int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w); - int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w); - - // here some non-continuous sub-row of the row will not be - // filled from the tensor; we need to make sure that the uncovered - // elements are explicitly set to 0's. the easiest way is to - // set all the elements to 0's before the loop. 
- memset(rowbuf, 0, vsz*sizeof(rowbuf[0])); - for( k = 0; k < ncn; k++ ) - { - for ( d = d0; d < d1; d++) - { - for( i = i0; i < i1; i++ ) - { - for( j = j0; j < j1; j++ ) - { - int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w; - rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs]; - } - } - } - } - } - } - } - // now compute dot product of the weights - // and im2row-transformed part of the tensor - #if CV_TRY_AVX512_SKX - /* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */ - if(useAVX512) - opt_AVX512_SKX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, - outShape, bsz, vsz, vsz_a, relu, cn0 == 0); - else - #endif - #if CV_TRY_AVX2 - if(useAVX2) - opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, - outShape, bsz, vsz, vsz_a, relu, cn0 == 0); - else - #endif - #if CV_TRY_AVX - if(useAVX) - opt_AVX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, - outShape, bsz, vsz, vsz_a, relu, cn0 == 0); - else - #endif - #if CV_TRY_RVV - if(useRVV) - opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, - outShape, bsz, vsz, vsz_a, relu, cn0 == 0); - else - #endif - #if CV_TRY_LASX - if(useLASX) - opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, - outShape, bsz, vsz, vsz_a, relu, cn0 == 0); - else - #endif - for( int i = 0; i < outCn; i += 2 ) - { - const float* wptr0 = wptr + i*wstep; - const float* wptr1 = wptr0 + wstep; - float* outptr0 = data_out0 + ofs0 + i*outPlaneSize; - float* outptr1 = outptr0 + outPlaneSize; - float bias0 = biasptr[i], bias1 = biasptr[i+1]; - float r0 = 1.f, r1 = 1.f; - - if( i+1 >= outCn ) - { - wptr1 = wptr0; - outptr1 = outptr0; - bias1 = bias0; - } - - if( relu ) - { - r0 = relu[i]; r1 = relu[i+1]; - if( i+1 >= outCn ) - r1 = r0; - } - - int j = 0; - #if CV_SIMD128 - v_float32x4 vr0 = v_setall_f32(r0), vr1 = v_setall_f32(r1), z = v_setzero_f32(); - - for( ; j <= bsz - 4; j += 4 ) - { - const float* rptr = rowbuf0 + j*vsz_a; - v_float32x4 s0, s1; - - if( cn0 == 0 ) - { - s0 = v_setall_f32(bias0); - s1 = v_setall_f32(bias1); - } - else - { - s0 = v_load(outptr0 + j); - s1 = v_load(outptr1 + j); - } - - v_float32x4 vs00 = v_setzero_f32(), vs01 = v_setzero_f32(), - vs02 = v_setzero_f32(), vs03 = v_setzero_f32(), - vs10 = v_setzero_f32(), vs11 = v_setzero_f32(), - vs12 = v_setzero_f32(), vs13 = v_setzero_f32(); - for( k = 0; k < vsz; k += 4, rptr += 4 ) - { - v_float32x4 w0 = v_load_aligned(wptr0 + k); - v_float32x4 w1 = v_load_aligned(wptr1 + k); - v_float32x4 r0 = v_load_aligned(rptr); - v_float32x4 r1 = v_load_aligned(rptr + vsz_a); - v_float32x4 r2 = v_load_aligned(rptr + vsz_a*2); - v_float32x4 r3 = v_load_aligned(rptr + vsz_a*3); - - vs00 = v_fma(w0, r0, vs00); - vs01 = v_fma(w0, r1, vs01); - vs02 = v_fma(w0, r2, vs02); - vs03 = v_fma(w0, r3, vs03); - - vs10 = v_fma(w1, r0, vs10); - vs11 = v_fma(w1, r1, vs11); - vs12 = v_fma(w1, r2, vs12); - vs13 = v_fma(w1, r3, vs13); - } - s0 += v_reduce_sum4(vs00, vs01, vs02, vs03); - s1 += v_reduce_sum4(vs10, vs11, vs12, vs13); - if( relu ) - { - s0 = v_select(s0 > z, s0, s0*vr0); - s1 = v_select(s1 > z, s1, s1*vr1); - } - - v_store(outptr0 + j, s0); - v_store(outptr1 + j, s1); - } - #endif - for( ; j < bsz; j++ ) - { - const float* rptr = rowbuf0 + j*vsz_a; - float s00, s10; - - if( cn0 == 0 ) - { - s00 = bias0; - s10 = bias1; - } - else - { - s00 = outptr0[j]; - s10 = outptr1[j]; - } - - for( k = 0; k < vsz; k++ ) - { - float r0 = 
rptr[k]; - s00 += wptr0[k]*r0; - s10 += wptr1[k]*r0; - } - if( relu ) - { - s00 = s00 > 0.f ? s00 : s00*r0; - s10 = s10 > 0.f ? s10 : s10*r1; - } - - outptr0[j] = s00; - outptr1[j] = s10; - } - } - } - } - - if( activ_ ) - activ_->forwardSlice(data_out0 + stripeStart, data_out0 + stripeStart, - (int)(stripeEnd - stripeStart), - outPlaneSize, startOutCn, startOutCn + outCn); - } - } - }; - #ifdef HAVE_OPENCL bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { @@ -2096,40 +1294,27 @@ public: #endif { int nstripes = std::max(getNumThreads(), 1); + int conv_dim = CONV_2D; + if (inputs[0].dims == 3) + conv_dim = CONV_1D; + if (inputs[0].dims == 5) + conv_dim = CONV_3D; // Initialization of FastCovn2d, pack weight. - if ((!fastConv2dImpl || variableWeight) && inputs[0].dims == 4) + if (!fastConvImpl || variableWeight) { int K = outputs[0].size[1]; int C = inputs[0].size[1]; - int Hk = kernel_size[kernel_size.size() - 2]; - int Wk = kernel_size.back(); - - CV_Assert(outputs[0].size[1] % ngroups == 0); - int stride_h = strides[strides.size() - 2]; - int stride_w = strides.back(); - int dilation_h = dilations[dilations.size() - 2]; - int dilation_w = dilations.back(); + // Winograd only works when input h and w >= 12. + bool canUseWinograd = useWinograd && conv_dim == CONV_2D && inputs[0].size[2] >= 12 && inputs[0].size[3] >= 12; - // Winograd only works well on input h and w >12. - bool canUseWinograd = useWinograd && inputs[0].size[2] >= 12 && inputs[0].size[3] >= 12; - - fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h, dilation_w, - dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0], canUseWinograd); - } - - if (fastConv2dImpl) - { - runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ, fusedAdd); - return; + CV_Assert(outputs[0].size[1] % ngroups == 0); + fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides, + dilations, pads_begin, pads_end, conv_dim, canUseWinograd); } - //TODO: Add support of Conv1D and Conv3D to fastConv, and remove the old Conv branch. - // Use only for Conv1D and Conv3D. 
- CV_Assert(!fusedAdd); - ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope, - kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes); + runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd); } } diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp index a527523..0c471e8 100644 --- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp +++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp @@ -11,381 +11,404 @@ #include "../../precomp.hpp" #include "fast_convolution.hpp" +#include "../layers_common.hpp" namespace cv { namespace dnn { -static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, - float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, - int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, - int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3) +static void depthWiseBlockConv2D(const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW) { -#if CV_SIMD128 - const int VEC_NLANES = 4; - v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval); + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; - v_float32x4 w0 = v_setall_f32( - 0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0; - if (useSIMD) + for (int out_i = 0; out_i < outH; out_i++) { - vbias = v_setall_f32(biasval); - if (is3x3) + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) { - w0 = v_setall_f32(weights[0]); - w1 = v_setall_f32(weights[1]); - w2 = v_setall_f32(weights[2]); - w3 = v_setall_f32(weights[3]); - w4 = v_setall_f32(weights[4]); - w5 = v_setall_f32(weights[5]); - w6 = v_setall_f32(weights[6]); - w7 = v_setall_f32(weights[7]); - w8 = v_setall_f32(weights[8]); + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; } - } -#endif - int dy0 = 1; - for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0) - { -#if CV_SIMD128 - dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 - ? 3 : 1; -#endif - int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0; - int yi_ = y0 * stride_y - pad_top; - for (;;) + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) { - float s_0, s_1, s_2; - if (dy0 == 3) + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + +#if CV_SIMD128 + const int VEC_NLANES = 4; + v_float32x4 vw00 = v_setall_f32(w00); + v_float32x4 vw01 = v_setall_f32(w01); + v_float32x4 vw02 = v_setall_f32(w02); + v_float32x4 vw10 = v_setall_f32(w10); + v_float32x4 vw11 = v_setall_f32(w11); + v_float32x4 vw12 = v_setall_f32(w12); + v_float32x4 vw20 = v_setall_f32(w20); + v_float32x4 vw21 = v_setall_f32(w21); + v_float32x4 vw22 = v_setall_f32(w22); + v_float32x4 z = v_setzero_f32(); + v_float32x4 vbias = v_setall_f32(bias); + v_float32x4 vrc = v_setall_f32(relu_coeff); + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + if( stride_w == 1 ) { - for (; x0 < x1; x0++) + for( ; out_j < outW1; out_j += VEC_NLANES ) { - int xi_ = x0 * stride_x - pad_left; - s_0 = s_1 = s_2 = biasval; - for (int k = 0; k < ksize; k++) + if (out_j + VEC_NLANES > outW1) { - int dy = yxtab[k * 2]; - int yi = yi_ + dy; - int xi = xi_ + yxtab[k * 2 + 1]; - float w = weights[k]; - - if ((unsigned) xi < (unsigned) Wi) - { - s_0 += inptr[yi * Wi + xi] * w; - s_1 += inptr[(yi + 1) * Wi + xi] * w; - s_2 += inptr[(yi + 2) * Wi + xi] * w; - } - } - s_0 = std::min(std::max(s_0, minval), maxval); - s_1 = std::min(std::max(s_1, minval), maxval); - s_2 = std::min(std::max(s_2, minval), maxval); - outptr[x0] = s_0; - outptr[x0 + W0] = s_1; - outptr[x0 + W0 * 2] = s_2; - } - } - else - { - for (; x0 < x1; x0++) - { - int xi_ = x0 * stride_x - pad_left; - s_0 = biasval; - for (int k = 0; k < ksize; k++) { - int dy = yxtab[k * 2]; - int yi = yi_ + dy; - int xi = xi_ + yxtab[k * 2 + 1]; - float w = weights[k]; - if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi)) - s_0 += inptr[yi * Wi + xi] * w; + if (out_j <= pad_l || outW1 - VEC_NLANES < 0) + break; + out_j = outW1 - VEC_NLANES; } - s_0 = std::min(std::max(s_0, minval), maxval); - outptr[x0] = s_0; + int in_j = out_j * stride_w - pad_l; + v_float32x4 v00 = v_load(imgptr0 + in_j), + v01 = v_load(imgptr0 + in_j + dilation_w), + v02 = v_load(imgptr0 + in_j + dilation_w*2), + v10 = v_load(imgptr1 + in_j), + v11 = v_load(imgptr1 + in_j + dilation_w), + v12 = v_load(imgptr1 + in_j + dilation_w*2), + v20 = v_load(imgptr2 + in_j), + v21 = v_load(imgptr2 + in_j + dilation_w), + v22 = v_load(imgptr2 + in_j + dilation_w*2); + + v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + + v10*vw10 + v11*vw11 + v12*vw12 + + v20*vw20 + v21*vw21 + v22*vw22 + vbias; + if (relu) + vout = v_select(vout > z, vout, vout*vrc); + v_store(outptr + out_j, vout); } } - if (x0 == W0) - break; - x1 = inner_xright; -#if CV_SIMD128 - if (useSIMD) + else // (stride_w == 2 && dilation_w == 1) { - if (is3x3) - { - if (dy0 == 3) - { - for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - - v_float32x4 s0, s1, s2; - v_float32x4 x00 = v_load(inptr_xi); - v_float32x4 x01 = v_load(inptr_xi + 1); - v_float32x4 x02 = v_load(inptr_xi + 2); - - v_float32x4 x10 = v_load(inptr_xi + Wi); - v_float32x4 x11 = v_load(inptr_xi + Wi + 1); - v_float32x4 x12 = v_load(inptr_xi + Wi + 2); - - v_float32x4 x20 = v_load(inptr_xi + Wi * 2); - v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1); - v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2); - - v_float32x4 x30 = v_load(inptr_xi + Wi * 3); - v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1); - v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2); - - v_float32x4 x40 = v_load(inptr_xi + Wi * 4); - v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1); - v_float32x4 x42 = 
v_load(inptr_xi + Wi * 4 + 2); - - s0 = v_fma(x00, w0, vbias); - s1 = v_fma(x10, w0, vbias); - s2 = v_fma(x20, w0, vbias); - - s0 = v_fma(x01, w1, s0); - s1 = v_fma(x11, w1, s1); - s2 = v_fma(x21, w1, s2); - - s0 = v_fma(x02, w2, s0); - s1 = v_fma(x12, w2, s1); - s2 = v_fma(x22, w2, s2); - - s0 = v_fma(x10, w3, s0); - s1 = v_fma(x20, w3, s1); - s2 = v_fma(x30, w3, s2); - - s0 = v_fma(x11, w4, s0); - s1 = v_fma(x21, w4, s1); - s2 = v_fma(x31, w4, s2); - - s0 = v_fma(x12, w5, s0); - s1 = v_fma(x22, w5, s1); - s2 = v_fma(x32, w5, s2); - - s0 = v_fma(x20, w6, s0); - s1 = v_fma(x30, w6, s1); - s2 = v_fma(x40, w6, s2); - - s0 = v_fma(x21, w7, s0); - s1 = v_fma(x31, w7, s1); - s2 = v_fma(x41, w7, s2); - - s0 = v_fma(x22, w8, s0); - s1 = v_fma(x32, w8, s1); - s2 = v_fma(x42, w8, s2); - - if (ifMinMaxAct) - { - s0 = v_min(v_max(s0, vminval), vmaxval); - s1 = v_min(v_max(s1, vminval), vmaxval); - s2 = v_min(v_max(s2, vminval), vmaxval); - } - - v_store(outptr + x0, s0); - v_store(outptr + W0 + x0, s1); - v_store(outptr + W0 * 2 + x0, s2); - } - } - else - { - for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias); - v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1; - v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2; - - s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0); - s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1); - s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2); - - s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0); - s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1); - s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2); - - s0 = s0 + s1 + s2; - if (ifMinMaxAct) - s0 = v_min(v_max(s0, vminval), vmaxval); - v_store(outptr + x0, s0); - } - } - } - else + for( ; out_j < outW1; out_j += VEC_NLANES ) { - for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) + if (out_j + VEC_NLANES > outW1 && out_j > pad_l) { - int xi_ = x0 * stride_x - pad_left, k = 0; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - v_float32x4 s0 = vbias; - for (; k <= ksize - 4; k += 4) - { - v_float32x4 v0 = v_load(inptr_xi + ofstab[k]); - v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]); - v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]); - v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]); - - v_float32x4 ww0 = v_setall_f32(weights[k]); - v_float32x4 ww1 = v_setall_f32(weights[k+1]); - v_float32x4 ww2 = v_setall_f32(weights[k+2]); - v_float32x4 ww3 = v_setall_f32(weights[k+3]); - - s0 = v_fma(v0, ww0, s0); - s0 = v_fma(v1, ww1, s0); - s0 = v_fma(v2, ww2, s0); - s0 = v_fma(v3, ww3, s0); - } - for (; k < ksize; k++) - s0 = v_fma(v_load(inptr_xi + ofstab[k]), - v_setall_f32(weights[k]), s0); - if (ifMinMaxAct) - s0 = v_min(v_max(s0, vminval), vmaxval); - v_store(outptr + x0, s0); + if (outW1 - VEC_NLANES < 0) + break; + out_j = outW1 - VEC_NLANES; } + + int in_j = out_j * stride_w - pad_l; + + v_float32x4 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + v_load_deinterleave(imgptr0 + in_j, v00, v01); + v_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + v_load_deinterleave(imgptr1 + in_j, v10, v11); + v_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + v_load_deinterleave(imgptr2 + in_j, v20, v21); + v_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + + v10 * vw10 + v11 * vw11 + v12 * vw12 + + v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias; + + if (relu) + vout = v_select(vout > z, vout, vout*vrc); + 
v_store(outptr + out_j, vout); } } + } #endif - if (dy0 == 3) + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) { - for (; x0 < x1; x0++) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + W0 * yi_ + xi_; - s_0 = s_1 = s_2 = biasval; - for (int k = 0; k < ksize; k++) - { - int inp_ofs = ofstab[k]; - float w = weights[k]; - s_0 += inptr_xi[inp_ofs] * w; - s_1 += inptr_xi[inp_ofs + Wi] * w; - s_2 += inptr_xi[inp_ofs + Wi * 2] * w; - } - if (ifMinMaxAct) - { - s_0 = std::min(std::max(s_0, minval), maxval); - s_1 = std::min(std::max(s_1, minval), maxval); - s_2 = std::min(std::max(s_2, minval), maxval); - } + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +static void depthWiseBlockConv1D(const float* wptr, + int kernel_w, int stride_w, int dilation_w, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, int width, + float* outptr_, + int out_d, int outW) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2]; + int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + int out_j = 0; + const float* imgptr0 = inptr_; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float* outptr = outptr_; + + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + bias; + + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } - outptr[x0] = s_0; - outptr[x0 + W0] = s_1; - outptr[x0 + W0 * 2] = s_2; +#if CV_SIMD128 + const int VEC_NLANES = 4; + v_float32x4 vw00 = v_setall_f32(w00); + v_float32x4 vw01 = v_setall_f32(w01); + v_float32x4 vw02 = v_setall_f32(w02); + v_float32x4 z = v_setzero_f32(); + v_float32x4 vbias = v_setall_f32(bias); + v_float32x4 vrc = v_setall_f32(relu_coeff); + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + if( stride_w == 1 ) + { + for( ; out_j < outW1; out_j += VEC_NLANES ) + { + if (out_j + VEC_NLANES > outW1) + { + if (out_j <= pad_l || outW1 - VEC_NLANES < 0) + break; + out_j = outW1 - VEC_NLANES; } + int in_j = out_j * stride_w - pad_l; + v_float32x4 v00 = v_load(imgptr0 + in_j), + v01 = v_load(imgptr0 + in_j + dilation_w), + v02 = v_load(imgptr0 + in_j + dilation_w*2); + + v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias; + if (relu) + vout = v_select(vout > z, vout, vout*vrc); + v_store(outptr + out_j, vout); } - else + } + else // (stride_w == 2 && dilation_w == 1) + { + for( ; out_j < outW1; out_j += VEC_NLANES ) { - for (; x0 < x1; x0++) + if (out_j + VEC_NLANES > outW1) { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - s_0 = biasval; - for (int k = 0; k < ksize; k++) - { - s_0 += inptr_xi[ofstab[k]] * weights[k]; - } - - if (ifMinMaxAct) - s_0 = std::min(std::max(s_0, minval), maxval); - outptr[x0] = s_0; + if (out_j <= pad_l || outW1 - VEC_NLANES < 0) + break; + out_j = outW1 - VEC_NLANES; } + int in_j = out_j * stride_w - pad_l; + + v_float32x4 v00, v01, v02, unused; + v_load_deinterleave(imgptr0 + in_j, v00, v01); + v_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + + v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias; + + if (relu) + vout = v_select(vout > z, vout, vout*vrc); + v_store(outptr + out_j, vout); } - x1 = W0; } } +#endif + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } } -void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) { +void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, ActivationLayer* activ_, + const std::vector& reluslope) +{ Mat input = _input.getMat(); Mat output = _output.getMat(); MatShape inputShape = shape(input); MatShape outputShape = shape(output); - CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); - int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] + CV_Assert(inputShape.size() == 3 || inputShape.size() == 4); + CV_Assert(inputShape.size() == outputShape.size()); + + int conv_dim = conv->conv_dim; + CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) && + "DNN: Currently we do not support depth-wise for Convolution 3D!"); + + ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr; + int N = inputShape[0], C = inputShape[1]; + + int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; + int Wi = inputShape[inputShape.size() - 1]; + int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; - int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups; + + int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2]; + int W0 = outputShape[outputShape.size() - 1]; + int ngroups = conv->ngroups; const size_t inp_planesize = (size_t) Hi * Wi; const size_t out_planesize = (size_t) H0 * W0; CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); - int stride_y = conv->stride_y, stride_x = conv->stride_x; - int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x; + int stride_h = conv->stride_h, stride_w = conv->stride_w; + int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; int pad_left = conv->pad_left, pad_right = conv->pad_right; - int VEC_NLANES = 4; -#if CV_TRY_AVX2 - if (conv->useAVX2) - VEC_NLANES = 8; -#endif - int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES; + int ksize = Hk * Wk; + + const int VEC_NLANES = 32; + int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES; const float *inp = input.ptr(); float *out = output.ptr(); - std::vector ofstab_(3 * padded_ksize, 0); +#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV + // TODO: remove the following limitation, need change code in layers_common.simd.hpp. + bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1); +#endif + std::vector ofstab_(3 * ksize, 0); int *ofstab = ofstab_.data(); - int *yxtab = ofstab + padded_ksize; + int *yxtab = ofstab + ksize; - for (int k = 0; k < padded_ksize; k++) + for (int k = 0; k < ksize; k++) { int y = k < ksize ? k / Wk : 0; int x = k < ksize ? 
k % Wk : 0; - int dy = y * dilation_y, dx = x * dilation_x; + int dy = y * dilation_h, dx = x * dilation_w; yxtab[k * 2] = dy; yxtab[k * 2 + 1] = dx; ofstab[k] = dy * Wi + dx; } const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data(); - int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3; - int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4; - + const float* relu = reluslope.data(); CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); - inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x; - inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi; - inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y; - inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi; - - if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom) + parallel_for_(Range(0, N * C), [&](const Range &r0) { + for (int nc = r0.start; nc < r0.end; nc++) { - inner_xleft = W0; - inner_ytop = H0; - } - - inner_ybottom = inner_ybottom < H0 ? inner_ybottom : H0; + int c = nc % C; + const float *inptr0 = inp + inp_planesize * nc; + float *outptr0 = out + out_planesize * nc; - bool useSIMD = stride_x == 1 && inner_xleft < W0; - bool is3x3 = Hk == 3 && Wk == 3; + const float *weights = weights0 + c * padded_ksize; - parallel_for_(Range(0, N * C), [&](const Range &r0) { - for (int nc = r0.start; nc < r0.end; nc++) + if (conv_dim == CONV_2D) { - int c = nc % C; - const float *inptr = inp + inp_planesize * nc; - float *outptr0 = out + out_planesize * nc; - - float biasval = bias[c]; - const float *weights = weights0 + c * padded_ksize; - #if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize, - pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop, - inner_ybottom, ifMinMaxAct, useSIMD, is3x3); + if(canRunOpt && conv->useAVX2) + opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); else #endif - depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize, - pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop, - inner_ybottom, ifMinMaxAct, useSIMD, is3x3); - - if (activ) - activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); +#if CV_TRY_AVX + if(canRunOpt && conv->useAVX) + opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif +#if CV_TRY_RVV + if(canRunOpt && conv->useRVV) + opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif + depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); } - }); + else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D. 
+ { + depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0); + } + + if (activ) + activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); + }}); } -}} // namespace cv::dnn \ No newline at end of file +}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp index aa10d40..0d3c144 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp @@ -6,9 +6,52 @@ #include "fast_convolution.hpp" namespace cv { +namespace dnn { namespace opt_AVX2 { #if CV_TRY_AVX2 +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct) +{ +#if CONV_NR == 24 + __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0; + + for (int p = 0; p < np; p++, a++, b += CONV_NR) + { + __m256 a0 = _mm256_set1_ps(a[0]); + __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16); + + c0 = _mm256_fmadd_ps(b0, a0, c0); + c1 = _mm256_fmadd_ps(b1, a0, c1); + c2 = _mm256_fmadd_ps(b2, a0, c2); + } + + if (init_c) + { + c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0); + c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1); + c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2); + } + + if (ifMinMaxAct) + { + __m256 vmax = _mm256_set1_ps(maxval); + __m256 vmin = _mm256_set1_ps(minval); + + c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax); + c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax); + c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax); + } + + _mm256_storeu_ps(c, c0); + _mm256_storeu_ps(c + 8, c1); + _mm256_storeu_ps(c + 16, c2); + _mm256_zeroupper(); +#else +#error "unsupported CONV_NR in convBlockMR1." +#endif +} + void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c) { #if CONV_MR == 4 && CONV_NR == 24 @@ -73,291 +116,6 @@ void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, b #endif } -void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, - float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, - int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, - int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3) -{ - const int VEC_NLANES = 8; - __m256 vminval = _mm256_set1_ps(minval); - __m256 vmaxval = _mm256_set1_ps(maxval); - - __m256 w0 = _mm256_setzero_ps(), - w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0; - - if (useSIMD) - { - vbias = _mm256_set1_ps(biasval); - if (is3x3) - { - w0 = _mm256_set1_ps(weights[0]); - w1 = _mm256_set1_ps(weights[1]); - w2 = _mm256_set1_ps(weights[2]); - w3 = _mm256_set1_ps(weights[3]); - w4 = _mm256_set1_ps(weights[4]); - w5 = _mm256_set1_ps(weights[5]); - w6 = _mm256_set1_ps(weights[6]); - w7 = _mm256_set1_ps(weights[7]); - w8 = _mm256_set1_ps(weights[8]); - } - } - - int dy0 = 1; - for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0) - { - dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 - ? 3 : 1; - - int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? 
inner_xleft : W0; - int yi_ = y0 * stride_y - pad_top; - - for (;;) - { - float s_0, s_1, s_2; - if (dy0 == 3) - { - for (; x0 < x1; x0++) - { - int xi_ = x0 * stride_x - pad_left; - s_0 = s_1 = s_2 = biasval; - for (int k = 0; k < ksize; k++) - { - int dy = yxtab[k * 2]; - int yi = yi_ + dy; - int xi = xi_ + yxtab[k * 2 + 1]; - float w = weights[k]; - - if ((unsigned) xi < (unsigned) Wi) - { - s_0 += inptr[yi * Wi + xi] * w; - s_1 += inptr[(yi + 1) * Wi + xi] * w; - s_2 += inptr[(yi + 2) * Wi + xi] * w; - } - } - if (ifMinMaxAct) - { - s_0 = std::min(std::max(s_0, minval), maxval); - s_1 = std::min(std::max(s_1, minval), maxval); - s_2 = std::min(std::max(s_2, minval), maxval); - } - - outptr[x0] = s_0; - outptr[x0 + W0] = s_1; - outptr[x0 + W0 * 2] = s_2; - } - } - else - { - for (; x0 < x1; x0++) - { - int xi_ = x0 * stride_x - pad_left; - s_0 = biasval; - for (int k = 0; k < ksize; k++) { - int dy = yxtab[k * 2]; - int yi = yi_ + dy; - int xi = xi_ + yxtab[k * 2 + 1]; - float w = weights[k]; - if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi)) - s_0 += inptr[yi * Wi + xi] * w; - } - if (ifMinMaxAct) - s_0 = std::min(std::max(s_0, minval), maxval); - outptr[x0] = s_0; - } - } - if (x0 == W0) - break; - x1 = inner_xright; - - if (useSIMD) - { - if (is3x3) - { - if (dy0 == 3) - { - for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - - __m256 s0, s1, s2; - __m256 x00 = _mm256_loadu_ps(inptr_xi); - __m256 x01 = _mm256_loadu_ps(inptr_xi + 1); - __m256 x02 = _mm256_loadu_ps(inptr_xi + 2); - - __m256 x10 = _mm256_loadu_ps(inptr_xi + Wi); - __m256 x11 = _mm256_loadu_ps(inptr_xi + Wi + 1); - __m256 x12 = _mm256_loadu_ps(inptr_xi + Wi + 2); - - __m256 x20 = _mm256_loadu_ps(inptr_xi + Wi * 2); - __m256 x21 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 1); - __m256 x22 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 2); - - __m256 x30 = _mm256_loadu_ps(inptr_xi + Wi * 3); - __m256 x31 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 1); - __m256 x32 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 2); - - __m256 x40 = _mm256_loadu_ps(inptr_xi + Wi * 4); - __m256 x41 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 1); - __m256 x42 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 2); - - s0 = _mm256_fmadd_ps(x00, w0, vbias); - s1 = _mm256_fmadd_ps(x10, w0, vbias); - s2 = _mm256_fmadd_ps(x20, w0, vbias); - - s0 = _mm256_fmadd_ps(x01, w1, s0); - s1 = _mm256_fmadd_ps(x11, w1, s1); - s2 = _mm256_fmadd_ps(x21, w1, s2); - - s0 = _mm256_fmadd_ps(x02, w2, s0); - s1 = _mm256_fmadd_ps(x12, w2, s1); - s2 = _mm256_fmadd_ps(x22, w2, s2); - - s0 = _mm256_fmadd_ps(x10, w3, s0); - s1 = _mm256_fmadd_ps(x20, w3, s1); - s2 = _mm256_fmadd_ps(x30, w3, s2); - - s0 = _mm256_fmadd_ps(x11, w4, s0); - s1 = _mm256_fmadd_ps(x21, w4, s1); - s2 = _mm256_fmadd_ps(x31, w4, s2); - - s0 = _mm256_fmadd_ps(x12, w5, s0); - s1 = _mm256_fmadd_ps(x22, w5, s1); - s2 = _mm256_fmadd_ps(x32, w5, s2); - - s0 = _mm256_fmadd_ps(x20, w6, s0); - s1 = _mm256_fmadd_ps(x30, w6, s1); - s2 = _mm256_fmadd_ps(x40, w6, s2); - - s0 = _mm256_fmadd_ps(x21, w7, s0); - s1 = _mm256_fmadd_ps(x31, w7, s1); - s2 = _mm256_fmadd_ps(x41, w7, s2); - - s0 = _mm256_fmadd_ps(x22, w8, s0); - s1 = _mm256_fmadd_ps(x32, w8, s1); - s2 = _mm256_fmadd_ps(x42, w8, s2); - - if (ifMinMaxAct) - { - s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval); - s1 = _mm256_min_ps(_mm256_max_ps(s1, vminval), vmaxval); - s2 = _mm256_min_ps(_mm256_max_ps(s2, vminval), vmaxval); - } - - _mm256_storeu_ps(outptr + x0, s0); - 
_mm256_storeu_ps(outptr + W0 + x0, s1); - _mm256_storeu_ps(outptr + W0 * 2 + x0, s2); - } - } - else - { - for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - __m256 s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[0]), w0, vbias); - __m256 s1 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[1]), w1); - __m256 s2 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[2]), w2); - - s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[3]), w3, s0); - s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[4]), w4, s1); - s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[5]), w5, s2); - - s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[6]), w6, s0); - s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[7]), w7, s1); - s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[8]), w8, s2); - - s0 = _mm256_add_ps(_mm256_add_ps(s0, s1), s2); - - if (ifMinMaxAct) - s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval); - _mm256_storeu_ps(outptr + x0, s0); - } - } - } - else - { - for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES) - { - int xi_ = x0 * stride_x - pad_left, k = 0; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - __m256 s0 = vbias; - for (; k <= ksize - 4; k += 4) - { - __m256 v0 = _mm256_loadu_ps(inptr_xi + ofstab[k]); - __m256 v1 = _mm256_loadu_ps(inptr_xi + ofstab[k + 1]); - __m256 v2 = _mm256_loadu_ps(inptr_xi + ofstab[k + 2]); - __m256 v3 = _mm256_loadu_ps(inptr_xi + ofstab[k + 3]); - - __m256 ww0 = _mm256_set1_ps(weights[k]); - __m256 ww1 = _mm256_set1_ps(weights[k+1]); - __m256 ww2 = _mm256_set1_ps(weights[k+2]); - __m256 ww3 = _mm256_set1_ps(weights[k+3]); - - s0 = _mm256_fmadd_ps(v0, ww0, s0); - s0 = _mm256_fmadd_ps(v1, ww1, s0); - s0 = _mm256_fmadd_ps(v2, ww2, s0); - s0 = _mm256_fmadd_ps(v3, ww3, s0); - } - for (; k < ksize; k++) - s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[k]), - _mm256_set1_ps(weights[k]), s0); - - if (ifMinMaxAct) - s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval); - _mm256_storeu_ps(outptr + x0, s0); - } - } - } - - if (dy0 == 3) - { - for (; x0 < x1; x0++) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + W0 * yi_ + xi_; - s_0 = s_1 = s_2 = biasval; - for (int k = 0; k < ksize; k++) { - int inp_ofs = ofstab[k]; - float w = weights[k]; - s_0 += inptr_xi[inp_ofs] * w; - s_1 += inptr_xi[inp_ofs + Wi] * w; - s_2 += inptr_xi[inp_ofs + Wi * 2] * w; - } - if (ifMinMaxAct) - { - s_0 = std::min(std::max(s_0, minval), maxval); - s_1 = std::min(std::max(s_1, minval), maxval); - s_2 = std::min(std::max(s_2, minval), maxval); - } - - outptr[x0] = s_0; - outptr[x0 + W0] = s_1; - outptr[x0 + W0 * 2] = s_2; - } - } - else - { - for (; x0 < x1; x0++) - { - int xi_ = x0 * stride_x - pad_left; - const float *inptr_xi = inptr + Wi * yi_ + xi_; - s_0 = biasval; - for (int k = 0; k < ksize; k++) - { - s_0 += inptr_xi[ofstab[k]] * weights[k]; - } - if (ifMinMaxAct) - s_0 = std::min(std::max(s_0, minval), maxval); - outptr[x0] = s_0; - } - } - x1 = W0; - } - } - _mm256_zeroupper(); -} - void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock) { @@ -737,4 +495,5 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, #endif } // namespace opt_AVX2 +} // namespace dnn } // namespace cv \ No newline at end of file diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp index fba57e7..02d7121 
100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp @@ -15,47 +15,94 @@ namespace cv { namespace dnn { enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. -Ptr initFastConv2d( +Ptr initFastConv( + InputArray _weightsMat, + float* srcBias, int ngroups, - int K, int C, int Hk, int Wk, - int stride_x, int stride_y, - int dilation_x, int dilation_y, + int K, int C, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& dilations, const std::vector& pads_begin, const std::vector& pads_end, - InputArray _weightsMat, - float* srcBias, + int conv_dim, bool useWinograd) { - Ptr conv = makePtr(); + Ptr conv = makePtr(); CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0); - CV_Assert(Hk > 0 && Wk > 0); - CV_Assert(stride_y > 0 && stride_x > 0); - CV_Assert(dilation_y > 0 && dilation_x > 0); - conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk; // [K, iC, kH, kW] - conv->stride_y = stride_y; - conv->stride_x = stride_x; - conv->dilation_y = dilation_y; - conv->dilation_x = dilation_x; + // Weight shape, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for Conv2D, [K, C, Wk] for Conv1D. + int Dk = conv_dim == CONV_3D ? (int)kernel_size[0] : 1; + int Hk = conv_dim == CONV_1D ? 1 : (int)kernel_size[kernel_size.size() - 2]; + int Wk = (int)kernel_size.back(); + int karea = Wk*Hk*Dk; + + conv->pad_front = conv_dim == CONV_3D ? (int)pads_begin[0] : 0; + conv->pad_top = conv_dim == CONV_1D ? 0 : (int)pads_begin[pads_begin.size() - 2]; + conv->pad_left = (int)pads_begin.back(); + + conv->pad_behind = conv_dim == CONV_3D ? (int)pads_end[0] : 0; + conv->pad_bottom = conv_dim == CONV_1D ? 0 : (int)pads_end[pads_end.size() - 2]; + conv->pad_right = (int)pads_end.back(); + int stride_d = conv_dim == CONV_3D ? (int)strides[0] : 0; + int stride_h = conv_dim == CONV_1D ? 0 : (int)strides[strides.size() - 2]; + int stride_w = (int)strides.back(); + + int dilation_d = conv_dim == CONV_3D ? (int)dilations[0] : 1; + int dilation_h = conv_dim == CONV_1D ? 1 : (int)dilations[dilations.size() - 2]; + int dilation_w = (int)dilations.back(); + + CV_Assert(Dk > 0 && Hk > 0 && Wk > 0); + CV_Assert(stride_d >= 0 && stride_h >= 0 && stride_w > 0); + CV_Assert(dilation_d > 0 && dilation_h > 0 && dilation_w > 0); + + conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk, conv->Dk = Dk; + + conv->stride_d = stride_d; + conv->stride_h = stride_h; + conv->stride_w = stride_w; + + conv->dilation_d = dilation_d; + conv->dilation_h = dilation_h; + conv->dilation_w = dilation_w; + conv->conv_dim = conv_dim; conv->ngroups = ngroups; - conv->pad_top = pads_begin[0]; - conv->pad_bottom = pads_end[0]; - conv->pad_left = pads_begin[1]; - conv->pad_right = pads_end[1]; - conv->conv_type = - (ngroups > 1 && ngroups == K && ngroups == C) ? _FX_CONV_TYPE_DEPTHWISE : - useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 && - dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 : - _FX_CONV_TYPE_GENERIC; - - int VEC_NLANES = 4; -#if CV_TRY_AVX2 - if (!conv->useAVX2 && conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // convert Winograd to generic conv. + + bool ifRunDepthWise = ngroups > 1 && ngroups == K && ngroups == C; + bool ifRunDepthWiseRemain = false; // It's for big padding or big kernel or Conv3D depth-wise convolution. 
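// --- Editor's sketch (not part of the patch): the geometry vectors above are indexed from the
// back, so one code path serves Conv1D/Conv2D/Conv3D. The helper below only illustrates that
// convention; its name and the std::vector<size_t> element type are assumptions based on the
// surrounding code.
struct KernelGeom { int Dk, Hk, Wk; };
static inline KernelGeom unpackKernelSize(const std::vector<size_t>& ks, int conv_dim)
{
    KernelGeom g;
    g.Dk = conv_dim == CONV_3D ? (int)ks[0] : 1;              // depth exists only for Conv3D
    g.Hk = conv_dim == CONV_1D ? 1 : (int)ks[ks.size() - 2];  // height is absent for Conv1D
    g.Wk = (int)ks.back();                                    // width is always the last entry
    return g;
}
// e.g. Conv1D {3} -> (1,1,3); Conv2D {3,3} -> (1,3,3); Conv3D {2,3,3} -> (2,3,3);
// strides, dilations, pads_begin and pads_end follow the same back-indexed layout.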
+
+    if (ifRunDepthWise)
+    {
+        if (conv_dim == CONV_1D)
+        {
+            ifRunDepthWise &= Hk == 1 && Wk == 3 && (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+                    && max(stride_w, dilation_w) >= conv->pad_left && conv->pad_left <= 1;
+        }
+        else if (conv_dim == CONV_2D)
+        {
+            ifRunDepthWise &= Hk == 3 && Wk == 3 && ((stride_w == 1) || (stride_w == 2 && dilation_w == 1)) &&
+                    max(stride_w, dilation_w) >= conv->pad_left && max(stride_h, dilation_h) >= conv->pad_top
+                    && conv->pad_left <= 1 && conv->pad_top <= 1;
+        }
+
+        if (!ifRunDepthWise || conv_dim == CONV_3D)
+        {
+            ifRunDepthWise = false;
+            ifRunDepthWiseRemain = true;
+        }
+    }
+
+    conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? _FX_CONV_TYPE_DEPTHWISE :
+            useWinograd && (conv_dim == CONV_2D && (conv->useSIMD128 || conv->useAVX2 || conv->useNEON) &&
+            Hk == 3 && Wk == 3 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1) ?
+            _FX_CONV_TYPE_WINOGRAD3X3 :
+            (ifRunDepthWiseRemain ? _FX_CONV_TYPE_DEPTHWISE_REMAIN : _FX_CONV_TYPE_GENERIC);
+
+#if !(CV_NEON || CV_SIMD128 || CV_TRY_AVX2)
+    if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // Disable Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available.
         conv->conv_type = _FX_CONV_TYPE_GENERIC;
-    if (conv->useAVX2)
-        VEC_NLANES = 8;
 #endif
 
     Mat weightsMat = _weightsMat.getMat();
@@ -63,14 +110,16 @@ Ptr initFastConv2d(
     const size_t wstep = weightsMat.step1();
     float *srcWeights = (float *)weightsMat.data;
 
-    if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE)
+    if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE || conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN)
     {
+        // Handle Conv1D, Conv2D and Conv3D depth-wise convolutions.
         // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout,
         // but add some padding to make the weights array layout more SIMD-friendly
-        int ksize = Hk*Wk;
+        int ksize = karea;
+
+        // TODO: simplify the following code with std::copy.
+        // this code pads the per-channel kernel footprint to a multiple of the vector size.
-        int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
+        int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN;
         int nweights = C*padded_ksize;
         conv->weightsBuf.reserve(nweights + VEC_ALIGN);
         conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
@@ -163,12 +212,12 @@ Ptr initFastConv2d(
     else if (conv->conv_type == _FX_CONV_TYPE_GENERIC)
     {
         // The weights are packed as
-        // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk) x CONV_MR tensor
+        // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor (see the index sketch below)
         int Kg = K/ngroups, Cg = max(C/ngroups, 1);
         int numStripsMR = (Kg + CONV_MR - 1) / CONV_MR;
         int Kg_aligned = numStripsMR * CONV_MR;
-        int HkWkCg = Hk*Wk*Cg;
-        size_t nweights = ngroups*Kg_aligned*HkWkCg;
+        int DkHkWkCg = Dk*Hk*Wk*Cg;
+        size_t nweights = ngroups*Kg_aligned*DkHkWkCg;
         conv->weightsBuf.reserve(nweights + VEC_ALIGN);
         conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
         float* weightsBufPtr = conv->weightsBufPtr;
@@ -184,14 +233,14 @@ Ptr initFastConv2d(
             int startK = si * CONV_MR;
             CV_Assert(startK < Kg_aligned);
 
-            float* packed_wptr = weightsBufPtr + HkWkCg * (startK + g * Kg_aligned);
+            float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned);
             int dk = Kg - startK < CONV_MR ? Kg - startK : CONV_MR; // check if we need zero padding.
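// --- Editor's sketch (not part of the patch): index arithmetic for the packed weight layout
// described above. For group g, output channel k (relative to its group), input channel c and
// flattened spatial position hwd (< Dk*Hk*Wk), the packed element sits at the offset computed
// below; the helper name is hypothetical.
static inline size_t packedWeightOffset(int g, int k, int c, int hwd,
                                        int Kg_aligned, int DkHkWkCg, int Cg)
{
    int strip = (k / CONV_MR) * CONV_MR;   // start of the CONV_MR-wide strip of output channels
    int lane  = k % CONV_MR;               // position of k inside that strip
    return (size_t)(g * Kg_aligned + strip) * DkHkWkCg
           + ((size_t)hwd * Cg + c) * CONV_MR + lane;
}
// This mirrors the packing loop below: packed_wptr starts at DkHkWkCg*(startK + g*Kg_aligned)
// and advances by CONV_MR for every (hwd, c) pair, zero-padding the lanes beyond dk.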
int k_idx = g*Kg + startK; - for(int yx = 0; yx < Hk*Wk; yx++) { + for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) { for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR) { - const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk + yx; + const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; int k = 0; for(; k < dk; k++, wptr += wstep) packed_wptr[k] = *wptr; @@ -207,7 +256,7 @@ Ptr initFastConv2d( // store bias; append some zero's to make sure that // we can always read MR elements starting from any valid index { - int k = 0, nbias = K + 32; + int k = 0, nbias = K + VEC_ALIGN; conv->biasBuf.reserve(nbias); float* biasBufPtr = conv->biasBuf.data(); for(; k < K; k++) @@ -218,23 +267,121 @@ Ptr initFastConv2d( return conv; } -void runFastConv2d(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, - const Ptr& actLayer, bool fusedAdd) +static inline void packData8(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, + const int stride_w, const int ksize) +{ + float* inpbufC = inpbuf + s0; + float* inptrInC = inptrIn; + + if (stride_w == 1) + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + 1]; + float v2 = inptrInC[k1 + 2]; + float v3 = inptrInC[k1 + 3]; + float v4 = inptrInC[k1 + 4]; + float v5 = inptrInC[k1 + 5]; + float v6 = inptrInC[k1 + 6]; + float v7 = inptrInC[k1 + 7]; + + inpbufC[k*CONV_NR] = v0; + inpbufC[k*CONV_NR+1] = v1; + inpbufC[k*CONV_NR+2] = v2; + inpbufC[k*CONV_NR+3] = v3; + inpbufC[k*CONV_NR+4] = v4; + inpbufC[k*CONV_NR+5] = v5; + inpbufC[k*CONV_NR+6] = v6; + inpbufC[k*CONV_NR+7] = v7; + } + else + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + float v2 = inptrInC[k1 + 2*stride_w]; + float v3 = inptrInC[k1 + 3*stride_w]; + float v4 = inptrInC[k1 + 4*stride_w]; + float v5 = inptrInC[k1 + 5*stride_w]; + float v6 = inptrInC[k1 + 6*stride_w]; + float v7 = inptrInC[k1 + 7*stride_w]; + + inpbufC[k*CONV_NR] = v0; + inpbufC[k*CONV_NR+1] = v1; + inpbufC[k*CONV_NR+2] = v2; + inpbufC[k*CONV_NR+3] = v3; + inpbufC[k*CONV_NR+4] = v4; + inpbufC[k*CONV_NR+5] = v5; + inpbufC[k*CONV_NR+6] = v6; + inpbufC[k*CONV_NR+7] = v7; + } + x0+=7; + s0+=7; + inptrIn += 7*stride_w; + in_w += 7*stride_w; +} + +static inline void packData2(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, + const int stride_w, const int ksize) +{ + float* inpbufC = inpbuf + s0; + float* inptrInC = inptrIn; + + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + inpbufC[k*CONV_NR] = v0; + inpbufC[k*CONV_NR+1] = v1; + } + + x0++; + s0++; + inptrIn += stride_w; + in_w += stride_w; +} + +void runFastConv(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, + const Ptr& actLayer, const std::vector& reluslope, bool fusedAdd) { Mat input = _input.getMat(); Mat output = _output.getMat(); + int conv_dim = conv->conv_dim; + + CV_Assert_N(input.dims == output.dims, + input.size[0] == output.size[0], + conv->C == input.size[1], + conv->K == output.size[1], + input.type() == output.type(), + input.isContinuous(), + output.isContinuous()); Mat fusedAddMat; if (fusedAdd) + { + CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!"); fusedAddMat = _output.getMat(); + } + + if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE) + { + // Depthwise-Convolution layer should not be followed by Add layer. 
+ CV_Assert(fusedAddMat.empty() && (conv_dim == CONV_1D || conv_dim == CONV_2D)); + return runDepthwise(input, output, conv,actLayer.get(), reluslope); + } MatShape inputShape = shape(input); MatShape outputShape = shape(output); - CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); - ActivationLayer* activ = 0; + CV_Assert(inputShape.size() == outputShape.size()); + + ActivationLayer* activ = nullptr; float minval = -FLT_MAX, maxval = FLT_MAX; bool ifMinMaxAct = false; + if (actLayer) { Ptr activ_relu = actLayer.dynamicCast(); @@ -267,47 +414,97 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr else activ = nullptr; - if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE) + if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd { - CV_Assert(fusedAddMat.empty()); // Depthwise-Convolution layer should not be followed by Add layer. - return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct); - } - else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd - { - CV_Assert(conv->weightsWinoBufPtr); + CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D); if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) return; } - int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] - int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; - int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups; + int N = inputShape[0], C = inputShape[1]; + + // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D. + int Di = conv_dim == CONV_3D ? inputShape[2] : 1; + int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; + int Wi = inputShape[inputShape.size() - 1]; + + int ngroups = conv->ngroups; + int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk; + + int D0 = conv_dim == CONV_3D ? outputShape[2] : 1; + int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; + int W0 = outputShape[outputShape.size() - 1]; + int Cg = C/ngroups, Kg = K/ngroups; - const size_t inp_planesize = (size_t)Hi*Wi; - const size_t out_planesize = (size_t)H0*W0; + const size_t inp_planesize = (size_t)Di*Hi*Wi; + const size_t out_planesize = (size_t)D0*H0*W0; + int pad_front = conv->pad_front; int pad_top = conv->pad_top; int pad_left = conv->pad_left; - int stride_y = conv->stride_y, stride_x = conv->stride_x; - int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x; + int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w; + int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; - int ksize = Hk * Wk; - bool fast_1x1 = stride_x == 1 && stride_y == 1 && ksize == 1; - int HkWkCg = Hk*Wk*Cg; + int ksize = Dk*Hk*Wk; + bool fast_1x1 = stride_d == 1 && stride_w == 1 && stride_h == 1 && ksize == 1; + int DkHkWkCg = Dk*Hk*Wk*Cg; - int MAX_STRIPES = 2; // (56 + CONV_NR - 1)/CONV_NR; + std::vector ofstab_(Hk*Wk*Dk*4, 0); + int* ofstab = ofstab_.data(); + int* dhwTab = ofstab + Hk*Wk*Dk; + int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; + + if (conv_dim == CONV_1D) + { + for( int w = 0; w < Wk; w++) + { + int dw = w*dilation_w; + dhwTab[w*3+2] = dw; + ofstab[w] = dw; + } + } + else if (conv_dim == CONV_2D) + { + for (int h = 0; h < Hk; h++) + for( int w = 0; w < Wk; w++) + { + int k = h*Wk + w; + int dh = h*dilation_h, dw = w*dilation_w; + dhwTab[k*3+1] = dh; + dhwTab[k*3+2] = dw; + ofstab[k] = dh*Wi + dw; + } + } + else + { + for (int d = 0; d < Dk; d++) + for (int h = 0; h < Hk; h++) + { + for (int w = 0; w < Wk; w++) + { + int k = d*Hk*Wk + h*Wk + w; + int dd = d*dilation_d, dh = h*dilation_h, dw = w*dilation_w; + dhwTab[k*3] = dd; + dhwTab[k*3+1] = dh; + dhwTab[k*3+2] = dw; + ofstab[k] = dd*Hi*Wi + dh*Wi + dw; + } + } + } + + int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR; // Friendly to L1 cache - const int K_BLOCK_SIZE = 32; + const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; const int C_BLOCK_SIZE = 256; int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR, Kg_aligned = Kg_nblocks * CONV_MR; - int stripes_per_sample = (out_planesize + CONV_NR - 1) / CONV_NR; + int stripes_per_sample = ((int)out_planesize + CONV_NR - 1) / CONV_NR; - if (stripes_per_sample < ntasks * 4) + if (stripes_per_sample < ntasks * 4 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN) { MAX_STRIPES = 1; stripes_per_sample = 1; @@ -327,20 +524,6 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr inpbuf_all_.allocate(totalbufsize + VEC_ALIGN); float* inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN*sizeof(inpbuf_all_[0]))); - std::vector ofstab_(Hk*Wk*3, 0); - int* ofstab = ofstab_.data(); - int* yxtab = ofstab + Hk*Wk; - - for (int y = 0; y < Hk; y++) - for( int x = 0; x < Wk; x++) - { - int k = y*Wk + x; - int dy = y*dilation_y, dx = x*dilation_x; - yxtab[k*2] = dy; - yxtab[k*2+1] = dx; - ofstab[k] = dy*Wi + dx; - } - float* inp = input.ptr(); float* out = output.ptr(); float* fusedAddPtr0 = fusedAddMat.empty() ? 
0 : fusedAddMat.ptr(); @@ -356,47 +539,47 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr for (int subtask = ngs0; subtask < ngs1; ) { int ng = subtask / Kstripes; - int kyx0 = subtask - ng * Kstripes; - int kyx1 = kyx0 + (ngs1 - subtask); + int kzyx0 = subtask - ng * Kstripes; + int kzyx1 = kzyx0 + (ngs1 - subtask); int n = ng / ngroups, g = ng % ngroups; // ng - n * ngroups; size_t inp_plane_ofs = (size_t)(n * ngroups + g) * Cg * inp_planesize; - kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes; - subtask += kyx1 - kyx0; + kzyx1 = kzyx1 <= Kstripes ? kzyx1 : Kstripes; + subtask += kzyx1 - kzyx0; int k0, k1; - int yx0, yx_limit, yx_block_limit = 0; + int zyx0, zyx_limit, zyx_block_limit = 0; - if (stripes_per_sample == 1) + if (stripes_per_sample == 1 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN) { - k0 = kyx0 * CONV_MR; - k1 = kyx1 * CONV_MR; + k0 = kzyx0 * CONV_MR; + k1 = kzyx1 * CONV_MR; k1 = k1 <= Kg ? k1 : Kg; - yx0 = 0; - yx_limit = out_planesize; + zyx0 = 0; + zyx_limit = (int)out_planesize; } else { k0 = 0; k1 = Kg; - yx0 = kyx0 * CONV_NR; - yx_limit = kyx1 * CONV_NR; - yx_limit = yx_limit < out_planesize ? yx_limit : out_planesize; + zyx0 = kzyx0 * CONV_NR; + zyx_limit = kzyx1 * CONV_NR; + zyx_limit = zyx_limit < out_planesize ? zyx_limit : (int)out_planesize; } - for (; yx0 < yx_limit; yx0 = yx_block_limit) + for (; zyx0 < zyx_limit; zyx0 = zyx_block_limit) { // step 1. extract part of input tensor and represent it in zigzag form - yx_block_limit = yx0 + CONV_NR * MAX_STRIPES; - yx_block_limit = yx_block_limit < yx_limit ? yx_block_limit : yx_limit; + zyx_block_limit = zyx0 + CONV_NR * MAX_STRIPES; + zyx_block_limit = zyx_block_limit < zyx_limit ? zyx_block_limit : zyx_limit; - int nstripes = (yx_block_limit - yx0 + CONV_NR - 1) / CONV_NR; - int yx0_saved = yx0; + int nstripes = (zyx_block_limit - zyx0 + CONV_NR - 1) / CONV_NR; + int zyx0_saved = zyx0; CV_Assert(nstripes <= MAX_STRIPES); - for (int stripe = 0; yx0 < yx_block_limit; stripe++, yx0 += CONV_NR) + for (int stripe = 0; zyx0 < zyx_block_limit; stripe++, zyx0 += CONV_NR) { - float* inpbuf = inpbuf_task + stripe * stripesize; - float* inptr = inp + inp_plane_ofs; + float *inpbuf = inpbuf_task + stripe * stripesize; + float *inptr = inp + inp_plane_ofs; /* 1. pack the data. Copy the HkxWk CONV_NR-wide slices from @@ -404,20 +587,20 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr */ if (fast_1x1) { - int slice_len = yx_block_limit - yx0; + int slice_len = zyx_block_limit - zyx0; bool partial = slice_len < CONV_NR; // Superfast branch for 1x1 convolutions with sy=sx=1. // in this case each feature plane can be safely treated // as 1D array, and we just extract next portion // of CONV_NR elements from each feature plane and // put it together. - inptr += yx0; + inptr += zyx0; if (!partial) { // Make special branch where memcpy() is called with a constant buffer size. // Compilers will likely unroll this loop properly. 
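// --- Editor's note: for this 1x1/stride-1 path the packed tile is simply a channel-major copy
// of CONV_NR consecutive output positions. A scalar equivalent of the memcpy loop below
// (illustrative only; the helper name is hypothetical):
static inline void packFast1x1Ref(const float* inptr, float* inpbuf,
                                  size_t inp_planesize, int Cg, int len /* <= CONV_NR */)
{
    for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR)
        for (int j = 0; j < len; j++)
            inpbuf[j] = inptr[j];   // one CONV_NR-wide row of the B-operand tile used by convBlock()
}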
for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR) - memcpy(inpbuf, inptr, CONV_NR*sizeof(inpbuf[0])); + memcpy(inpbuf, inptr, CONV_NR * sizeof(inpbuf[0])); } else { @@ -428,25 +611,198 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr } } } + else if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + { + CV_Assert(Cg == 1); + const int HW0 = H0 * W0; + const int HWi = Hi * Wi; + int slice_len = std::min(zyx_block_limit - zyx0, CONV_NR); + + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. + memset(inpbuf, 0, stripesize*sizeof(inpbuf[0])); + + int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0; + int y0 = yx0 / W0, x0 = yx0 - y0 * W0; + + if (conv_dim == CONV_1D) + { + for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; + + int in_w = x0 * stride_w - pad_left; + float* inptrIn = inptr + in_w; + + int s0 = slice_i; + + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); + } + else if (x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + + float* inpbufC = inpbuf + s0; + float* inptrInC = inptrIn; + for (int w = w0; w < w1; w++) + { + int imgofs = w*dilation_w; + inpbufC[w*CONV_NR] = inptrInC[imgofs]; + } + } + } + slice_i += delta; + } + } + else if (conv_dim == CONV_2D) + { + for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; + + int in_h = y0 * stride_h - pad_top; + int in_w = x0 * stride_w - pad_left; + + float* inptrIn = inptr + in_h*Wi + in_w; + + bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; + int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); + int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); + + int s0 = slice_i; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (ok_i && x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); + } + else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + + float* inpbufC = inpbuf + s0; + float* inptrInC = inptrIn; + + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = h*(dilation_h*Wi) + w*dilation_w; + inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + slice_i += delta; + } + } + else if (conv_dim == CONV_3D) + { + for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; + + int in_d = z0 * stride_d - pad_front; + int in_h = y0 * stride_h - pad_top; + int in_w = x0 * stride_w - pad_left; 
+ + float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w; + + int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); + int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d); + + bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; + int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); + int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); + + int s0 = slice_i; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (ok_i && x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); + } + else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + + float* inpbufC = inpbuf + s0; + float* inptrInC = inptrIn; + + for ( int d = d0; d < d1; d++) + { + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; + inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + } + slice_i += delta; + } + } + } else { + const int HW0 = H0 * W0; + const int HWi = Hi * Wi; + int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0; int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0; for (int k = 0; k < ksize; k++) { - int dy = yxtab[k * 2], dx = yxtab[k * 2 + 1]; - int i = 0, y0 = y0_, x0 = x0_; + int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2]; + int i = 0, z0 = z0_, y0 = y0_, x0 = x0_; for (; i < CONV_NR;) { float *inpbuf_ki = inpbuf + k * CONV_NR * Cg + i; - int yi = y0 * stride_y + dy - pad_top; - int xi = x0 * stride_x + dx - pad_left; + int zi = z0 * stride_d + dz - pad_front; + int yi = y0 * stride_h + dy - pad_top; + int xi = x0 * stride_w + dx - pad_left; - if ((unsigned) yi < (unsigned) Hi && (unsigned) xi < (unsigned) Wi) + if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi && + (unsigned) xi < (unsigned) Wi) { - const float *inptr_ki = inptr + yi * Wi + xi; - if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_x * 8 <= Wi) + const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi; + if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi) { - if (stride_x == 1) + if (stride_w == 1) { for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) { @@ -454,49 +810,79 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr float t2 = inptr_ki[2], t3 = inptr_ki[3]; float t4 = inptr_ki[4], t5 = inptr_ki[5]; float t6 = inptr_ki[6], t7 = inptr_ki[7]; - inpbuf_ki[0] = t0; inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; inpbuf_ki[7] = t7; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + else if (stride_w == 2) + { + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[2]; + float t2 = inptr_ki[4], t3 = inptr_ki[6]; + float t4 = inptr_ki[8], t5 = inptr_ki[10]; + float t6 = inptr_ki[12], t7 = inptr_ki[14]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; } } else { for (int c = 0; 
c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
                                {
-                                    float t0 = inptr_ki[0], t1 = inptr_ki[stride_x];
-                                    float t2 = inptr_ki[stride_x*2], t3 = inptr_ki[stride_x*3];
-                                    float t4 = inptr_ki[stride_x*4], t5 = inptr_ki[stride_x*5];
-                                    float t6 = inptr_ki[stride_x*6], t7 = inptr_ki[stride_x*7];
-                                    inpbuf_ki[0] = t0; inpbuf_ki[1] = t1;
-                                    inpbuf_ki[2] = t2; inpbuf_ki[3] = t3;
-                                    inpbuf_ki[4] = t4; inpbuf_ki[5] = t5;
-                                    inpbuf_ki[6] = t6; inpbuf_ki[7] = t7;
+                                    float t0 = inptr_ki[0], t1 = inptr_ki[stride_w];
+                                    float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3];
+                                    float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5];
+                                    float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7];
+                                    inpbuf_ki[0] = t0;
+                                    inpbuf_ki[1] = t1;
+                                    inpbuf_ki[2] = t2;
+                                    inpbuf_ki[3] = t3;
+                                    inpbuf_ki[4] = t4;
+                                    inpbuf_ki[5] = t5;
+                                    inpbuf_ki[6] = t6;
+                                    inpbuf_ki[7] = t7;
                                 }
                             }
                             i += 8; x0 += 8;
                         }
-                        else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_x * 4 <= Wi)
+                        else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi)
                         {
-                            if (stride_x == 1)
+                            if (stride_w == 1)
                             {
                                 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
                                 {
                                     float t0 = inptr_ki[0], t1 = inptr_ki[1];
                                     float t2 = inptr_ki[2], t3 = inptr_ki[3];
-                                    inpbuf_ki[0] = t0; inpbuf_ki[1] = t1;
-                                    inpbuf_ki[2] = t2; inpbuf_ki[3] = t3;
+                                    inpbuf_ki[0] = t0;
+                                    inpbuf_ki[1] = t1;
+                                    inpbuf_ki[2] = t2;
+                                    inpbuf_ki[3] = t3;
                                 }
                             }
                             else
                             {
                                 for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize)
                                 {
-                                    float t0 = inptr_ki[0], t1 = inptr_ki[stride_x];
-                                    float t2 = inptr_ki[stride_x*2], t3 = inptr_ki[stride_x*3];
-                                    inpbuf_ki[0] = t0; inpbuf_ki[1] = t1;
-                                    inpbuf_ki[2] = t2; inpbuf_ki[3] = t3;
+                                    float t0 = inptr_ki[0], t1 = inptr_ki[stride_w];
+                                    float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3];
+                                    inpbuf_ki[0] = t0;
+                                    inpbuf_ki[1] = t1;
+                                    inpbuf_ki[2] = t2;
+                                    inpbuf_ki[3] = t3;
                                 }
                             }
                             i += 4;
@@ -517,64 +903,117 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr
                             i++; x0++;
                         }
+                        int mask = x0 >= W0; y0 += mask; x0 &= mask - 1;
+
+                        mask = y0 >= H0;
+                        z0 += mask;
+                        y0 &= mask - 1;
                     }
                 }
            }
        }
-            yx0 = yx0_saved;
-            float* weights = conv->weightsBufPtr + g * Kg_aligned * HkWkCg;
-            const float* biasptr = conv->biasBuf.data() + Kg * g;
+            zyx0 = zyx0_saved;
+
+            // special branch for depth-wise convolution implemented using generic convolution.
+            // In this case, CONV_MR is 1, and CONV_NR is unchanged.
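// --- Editor's sketch (not part of the patch): scalar semantics of the MR==1 micro-kernel used
// by the branch below. Here np equals DkHkWkCg, a points to the padded per-channel weights, b to
// the CONV_NR-strided input tile, and the init_c flag of convBlockMR1 plays the role of fusedAdd.
static void convBlockMR1Ref(int np, const float* a, const float* b, float* c, float bias,
                            bool fusedAdd, float minval, float maxval, bool ifMinMaxAct, int outLen)
{
    for (int j = 0; j < outLen; j++)
    {
        float s = bias;
        for (int p = 0; p < np; p++)
            s += a[p] * b[p*CONV_NR + j];
        if (fusedAdd)
            s += c[j];                                // accumulate onto the previously written output
        if (ifMinMaxAct)
            s = std::min(std::max(s, minval), maxval);
        c[j] = s;
    }
}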
+ if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + { + size_t outofs = (n * ngroups + g) * out_planesize + zyx0; + float *cptr0 = cbuf_task; + float *weights = conv->weightsBufPtr + g * padded_ksize; + int out_width = zyx_block_limit - zyx0; + float *outptr = out + outofs; + const float biasVal = *(conv->biasBuf.data() + g); + for (int stripe = 0; stripe < nstripes; stripe++) + { + const float *inptr = inpbuf_task + stripe * stripesize; + const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); + bool ifBuffer = outLen < CONV_NR; + float *cptr = outptr + stripe * CONV_NR; + if (ifBuffer) + { + memcpy(cptr0, cptr, outLen * sizeof(cptr[0])); + cptr = cptr0; + } +#if CV_TRY_AVX2 + if (conv->useAVX2 && outLen > CONV_NR/3) + opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct); + else +#endif + convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen); + + if (ifBuffer) + { + memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(cptr[0])); + } + } + if (activ) + activ->forwardSlice(outptr, outptr, out_width, out_planesize, g, g + 1); + continue; + } + + float *weights = conv->weightsBufPtr + g * Kg_aligned * DkHkWkCg; + const float *biasptr = conv->biasBuf.data() + Kg * g; int ldc = nstripes * CONV_NR; - // 2. do convolution, compute Kg x (yx_block_limit - yx0) part of the output tensor + // 2. do convolution, compute Kg x (zyx_block_limit - zyx0) part of the output tensor + int out_width = zyx_block_limit - zyx0; for (int k0_block = k0; k0_block < k1; k0_block += K_BLOCK_SIZE) { int k1_block = k0_block + K_BLOCK_SIZE < k1 ? k0_block + K_BLOCK_SIZE : k1; - for (int c0 = 0; c0 < HkWkCg; c0 += C_BLOCK_SIZE) + for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE) { - int c1 = c0 + C_BLOCK_SIZE < HkWkCg ? c0 + C_BLOCK_SIZE : HkWkCg; + int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg; for (int stripe = 0; stripe < nstripes; stripe++) { - float* wptr = weights + k0_block*HkWkCg + c0*CONV_MR; - const float* inptr = inpbuf_task + stripe*stripesize + c0 * CONV_NR; - float* cptr = cbuf_task + stripe * CONV_NR; + const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); + +#if CV_TRY_AVX2 || CV_TRY_NEON + // The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4. + bool runOpt = outLen > std::min(8, CONV_NR/3); +#endif + float *wptr = weights + k0_block * DkHkWkCg + c0 * CONV_MR; + const float *inptr = inpbuf_task + stripe * stripesize + c0 * CONV_NR; + float *cptr = cbuf_task + stripe * CONV_NR; for (int k = k0_block; k < k1_block; k += CONV_MR, - wptr += HkWkCg * CONV_MR, cptr += CONV_MR * ldc) + wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc) { #if CV_TRY_AVX2 - if (conv->useAVX2) + if (conv->useAVX2 && runOpt) opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); else #endif #if CV_TRY_NEON - if (conv->useNEON) + if (conv->useNEON && runOpt) opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); else #endif - convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); + // The possible outLen range is 24 or 8~1. + convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen); } } } - size_t outofs = ((n*ngroups + g) * Kg + k0_block) * out_planesize + yx0; - int out_width = yx_block_limit - yx0; - const float* cptr = cbuf_task; + size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0; + const float *cptr = cbuf_task; - float* outptr = out + outofs; - const float* pbptr = fusedAddPtr0 ? 
fusedAddPtr0 + outofs : 0; + float *outptr = out + outofs; + const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0; for (int k = k0_block; k < k1_block; k++, cptr += ldc, outptr += out_planesize, - pbptr += (pbptr ? out_planesize : 0)) - { + pbptr += (pbptr ? out_planesize : 0)) { float biasval = biasptr[k]; int j = 0; #if CV_SIMD128 - v_float32x4 vbias = v_setall_f32(biasval), vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + v_float32x4 vbias = v_setall_f32(biasval); + v_float32x4 vmax = v_setall_f32(maxval); + v_float32x4 vmin = v_setall_f32(minval); + if (pbptr) { for (; j + 7 < out_width; j += 8) @@ -614,19 +1053,15 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr } #endif if (pbptr) { - for (; j < out_width; j++) - { + for (; j < out_width; j++) { float v = cptr[j] + biasval; v += pbptr[j]; if (ifMinMaxAct) v = std::min(std::max(v, minval), maxval); outptr[j] = v; } - } - else - { - for (; j < out_width; j++) - { + } else { + for (; j < out_width; j++) { float v = cptr[j] + biasval; if (ifMinMaxAct) diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp index 01f5ede..895ad56 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp @@ -42,19 +42,20 @@ enum { _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. }; -enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 }; +enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2, _FX_CONV_TYPE_DEPTHWISE_REMAIN=3 }; +enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 }; #endif namespace cv { namespace dnn { -struct FastConv2d +struct FastConv { int ngroups; - int K, C, Hk, Wk; - int stride_y, stride_x; - int dilation_y, dilation_x; - int pad_top, pad_bottom, pad_left, pad_right; + int K, C, Hk, Wk, Dk; + int stride_h, stride_w, stride_d; + int dilation_h, dilation_w, dilation_d; + int pad_top, pad_bottom, pad_left, pad_right, pad_front, pad_behind; std::vector weightsBuf; // For generic Conv 2D float* weightsBufPtr; @@ -62,57 +63,55 @@ struct FastConv2d float* weightsWinoBufPtr; std::vector biasBuf; int conv_type; + int conv_dim; // Flag for conv1d, conv2d, or conv3d. #if CV_SIMD128 bool useSIMD128 = true; #else bool useSIMD128 = false; #endif -#if CV_TRY_AVX2 - bool useAVX2 = checkHardwareSupport(CPU_AVX2); -#else - bool useAVX2 = false; -#endif - #if CV_NEON bool useNEON = checkHardwareSupport(CPU_NEON); #else bool useNEON = false; #endif + + bool useAVX = checkHardwareSupport(CPU_AVX); + bool useAVX2 = checkHardwareSupport(CPU_AVX2); + bool useRVV = checkHardwareSupport(CPU_RVV); }; -// return a FastConv2d instance. -Ptr initFastConv2d( +// return a FastConv instance. +Ptr initFastConv( + InputArray weightsMat, + float* srcBias, int ngroups, - int K, int C, int Hk, int Wk, - int stride_x, int stride_y, - int dilation_x, int dilation_y, + int K, int C, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& dilations, const std::vector& pads_begin, const std::vector& pads_end, - InputArray weightsMat, - float* srcBias, bool useWinograd); + int conv_dim, + bool useWinograd); // It contains different computing branches, like winograd, 1x1 conv. 
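// Editor's usage sketch for the initFastConv/runFastConv pair declared here (illustrative
// values only; the std::vector element types, the variable names and the cv::getNumThreads()
// call are assumptions, error handling omitted):
//
//     std::vector<size_t> ksize = {3, 3}, strides = {1, 1}, dilations = {1, 1};
//     std::vector<size_t> pads_begin = {1, 1}, pads_end = {1, 1};
//     Ptr<FastConv> conv = initFastConv(weightsMat, biasPtr, /* ngroups */ 1, /* K */ 64, /* C */ 32,
//                                       ksize, strides, dilations, pads_begin, pads_end,
//                                       CONV_2D, /* useWinograd */ true);
//     runFastConv(inputBlob, outputBlob, conv, getNumThreads(),
//                 activLayer, reluslope, /* fusedAdd */ false);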
-void runFastConv2d(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, - const Ptr& actLayer, bool fusedAdd); +void runFastConv(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, + const Ptr& actLayer, const std::vector& reluslope, bool fusedAdd); -void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, float minval, float maxval, - ActivationLayer* activ, bool ifMinMaxAct); +void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, ActivationLayer* activ, + const std::vector& reluslope); -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct); -} // namespace dnn - namespace opt_AVX2 { #if CV_TRY_AVX2 void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c); -void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, - float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, - int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, - int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3); +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval, + const float maxval, bool ifMinMaxAct); void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock); void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg); @@ -122,6 +121,7 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, in #endif } // namespace opt_AVX2 +} // namespace dnn } // namespace cv #endif //OPENCV_FAST_CONVOLUTION_HPP diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp index 7325cc3..e146c09 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp @@ -11,9 +11,132 @@ namespace cv { namespace dnn { -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c) +static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen) +{ + std::vector cbuffer(outLen, 0); + float* cbuf = cbuffer.data(); + for( int p = 0; p < np; p++ ) + { + float ai = a[p]; + for( int j = 0; j < outLen; j++ ) + cbuf[j] += b[CONV_NR*p + j] * ai; + } + + if (init_c) + { + for(int j = 0; j < outLen; j++) + { + c[j] += cbuf[j] + bias; + if (ifMinMaxAct) + c[j] = std::min(std::max(c[j], minval), maxval); + } + } + else + { + for(int j = 0; j < outLen; j++) + { + c[j] = cbuf[j] + bias; + if (ifMinMaxAct) + c[j] = std::min(std::max(c[j], minval), maxval); + } + } +} + +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen) +{ +#if CV_SIMD128 + // The outLen represents the valid output value in CONV_NR length. + // When outLen is very small, we use the no-SIMD branch. 
+ const int CONV_NRby3 = CONV_NR/3; + if (outLen > CONV_NRby3) + { + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; // CONV_NR == 12 +#if CONV_NR == 28 || CONV_NR == 24 + v_float32x4 c3 = c0, c4 = c0, c5 = c0; +#endif +#if CONV_NR == 28 + v_float32x4 c6 = c0; +#endif + for (int p = 0; p < np; p++, a++, b += CONV_NR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); +#if CONV_NR == 28 || CONV_NR == 24 + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); +#endif +#if CONV_NR == 28 + v_float32x4 b6 = v_load(b + 24); +#endif + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); +#if CONV_NR == 28 || CONV_NR == 24 + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); +#endif +#if CONV_NR == 28 + c6 = v_fma(b6, a0, c6); +#endif + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); +#if CONV_NR == 28 || CONV_NR == 24 + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); +#endif +#if CONV_NR == 28 + c6 += v_load(c + 24); +#endif + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); +#if CONV_NR == 28 || CONV_NR == 24 + c3 = v_min(v_max(c3, vmin), vmax); + c4 = v_min(v_max(c4, vmin), vmax); + c5 = v_min(v_max(c5, vmin), vmax); +#endif +#if CONV_NR == 28 + c6 = v_min(v_max(c6, vmin), vmax); +#endif + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); +#if CONV_NR == 28 || CONV_NR == 24 + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); +#endif +#if CONV_NR == 28 + v_store(c + 24, c6); +#endif + } + else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen); +#else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen); +#endif +} + +#if CV_SIMD128 +#if CONV_MR == 4 && CONV_NR == 24 +static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c) { -#if CV_SIMD128 && CONV_MR == 4 && CONV_NR == 24 v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; @@ -115,29 +238,156 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i v_store(c + ldc * 3 + 12, c21); v_store(c + ldc * 3 + 16, c22); v_store(c + ldc * 3 + 20, c23); -#else - float cbuf[CONV_MR * CONV_NR]; - memset(cbuf, 0, sizeof(cbuf)); +} +#endif + +static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c) +{ + CV_Assert(CONV_NR >= 4); + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; + v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0; + + for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 a1 = v_setall_f32(a[1]); + v_float32x4 a2 = v_setall_f32(a[2]); + v_float32x4 a3 = v_setall_f32(a[3]); + + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + + c2 = v_fma(b0, a1, c2); + c3 = v_fma(b1, a1, c3); + + c4 = v_fma(b0, a2, c4); + c5 = v_fma(b1, a2, c5); + + c6 = v_fma(b0, a3, c6); + c7 = v_fma(b1, a3, c7); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + + c2 += v_load(c + 
ldc); + c3 += v_load(c + ldc + 4); + + c4 += v_load(c + ldc*2); + c5 += v_load(c + ldc*2 + 4); + + c6 += v_load(c + ldc*3); + c7 += v_load(c + ldc*3 + 4); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + ldc, c2); + v_store(c + ldc + 4, c3); + v_store(c + ldc * 2, c4); + v_store(c + ldc * 2 + 4, c5); + v_store(c + ldc * 3, c6); + v_store(c + ldc * 3 + 4, c7); +} + +static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c) +{ + CV_Assert(CONV_NR >= 4); + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; + + for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 a1 = v_setall_f32(a[1]); + v_float32x4 a2 = v_setall_f32(a[2]); + v_float32x4 a3 = v_setall_f32(a[3]); + + v_float32x4 b0 = v_load(b); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b0, a1, c1); + c2 = v_fma(b0, a2, c2); + c3 = v_fma(b0, a3, c3); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + ldc); + c2 += v_load(c + ldc*2); + c3 += v_load(c + ldc*3); + } + + v_store(c, c0); + v_store(c + ldc, c1); + v_store(c + ldc * 2, c2); + v_store(c + ldc * 3, c3); +} +#endif + +static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen) +{ + std::vector cbuffer(CONV_MR * outLen, 0); + float* cbuf = cbuffer.data(); for( int p = 0; p < np; p++ ) { for( int i = 0; i < CONV_MR; i++ ) { float ai = a[CONV_MR*p + i]; - for( int j = 0; j < CONV_NR; j++ ) - cbuf[i * CONV_NR+j] += b[CONV_NR*p + j] * ai; + for( int j = 0; j < outLen; j++ ) + cbuf[i * outLen+j] += b[CONV_NR*p + j] * ai; } } - if (!init_c) { - for(int i = 0; i < CONV_MR; i++) { - for(int j = 0; j < CONV_NR; j++) - c[i*ldc + j] += cbuf[i*CONV_NR + j]; + + if (!init_c) + { + for(int i = 0; i < CONV_MR; i++) + { + for(int j = 0; j < outLen; j++) + c[i*ldc + j] += cbuf[i*outLen + j]; } - } else { - for(int i = 0; i < CONV_MR; i++) { - for(int j = 0; j < CONV_NR; j++) - c[i*ldc + j] = cbuf[i*CONV_NR + j]; + } + else + { + for(int i = 0; i < CONV_MR; i++) + { + for(int j = 0; j < outLen; j++) + c[i*ldc + j] = cbuf[i*outLen + j]; } } +} + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen) +{ + // The possible outLen range is [24, 8~1]. 
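    // Editor's note: with CONV_MR == 4 and CONV_NR == 24 the dispatch below effectively selects
    //   outLen in (8, 24] -> convBlock4x24 (full 4 x 24 register tile)
    //   outLen in (4, 8]  -> convBlock4x8
    //   outLen in (1, 4]  -> convBlock4x4
    //   outLen == 1, or no SIMD support -> convBlockNoSIMD
    // Every branch computes the same 4 x outLen tile; a scalar reference (sketch only) is:
    //   for (int i = 0; i < CONV_MR; i++)
    //       for (int j = 0; j < outLen; j++) {
    //           float s = init_c ? 0.f : c[i*ldc + j];
    //           for (int p = 0; p < np; p++)
    //               s += a[p*CONV_MR + i] * b[p*CONV_NR + j];
    //           c[i*ldc + j] = s;
    //       }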
+#if CV_SIMD128 +#if CONV_MR == 4 && CONV_NR == 24 + const int CONV_NRby3 = CONV_NR/3; + if (outLen > CONV_NRby3) + { + convBlock4x24(np, a, b, c, ldc, init_c); + return; + } +#endif + + if (outLen <= 8 && outLen > 4) + { + convBlock4x8(np, a, b, c, ldc, init_c); + return; + } + + if (outLen <= 4 && outLen > 1) + { + convBlock4x4(np, a, b, c, ldc, init_c); + return; + } + convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen); +#else + convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen); #endif } } // namespace dnn diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp index 10b55f3..e3b8088 100644 --- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp +++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp @@ -920,7 +920,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, #endif } -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) { Mat input = _input.getMat(); @@ -1144,7 +1144,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu #else -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) { return 0; diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index f706abf..eb17356 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -46,10 +46,6 @@ namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void fastConv( const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput ); void fastDepthwiseConv( const float* weights, int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -74,305 +70,6 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr, #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -enum { FASCONV_BASE_VECSZ = 4 }; - -void fastConv( const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput ) -{ - CV_Assert(isAligned<32>(weights)); - - int outCn = outShape[1]; - size_t outPlaneSize = outShape[2]*outShape[3]; - float r0 = 1.f, r1 = 1.f, r2 = 1.f; - __m128 vr0 = _mm_set1_ps(1.f), vr1 = vr0, vr2 = vr0, z = _mm_setzero_ps(); - int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; - int rsz = blockSize % FASCONV_BASE_VECSZ; - for( int i = 0; i < rsz; i++ ) - maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; - __m128 mask = _mm_loadu_ps((const float*)maskbuf); - - // now compute dot product of the weights - // and im2row-transformed part of the tensor - for( int i = 0; i < outCn; i += 3 ) - { - const float* wptr0 = weights + i*wstep; - const float* wptr1 = wptr0 + wstep; - const float* wptr2 = wptr1 + wstep; - float* outptr0 = output + i*outPlaneSize; - float* outptr1 = outptr0 + outPlaneSize; - float* outptr2 = outptr1 + outPlaneSize; - 
float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; - - if( i+2 >= outCn ) - { - wptr2 = wptr1; - outptr2 = outptr1; - bias2 = bias1; - if( i+1 >= outCn ) - { - wptr2 = wptr1 = wptr0; - outptr2 = outptr1 = outptr0; - bias2 = bias1 = bias0; - } - } - - if( relu ) - { - r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2]; - if( i+2 >= outCn ) - { - r2 = r1; - if( i+1 >= outCn ) - r2 = r1 = r0; - } - vr0 = _mm_set1_ps(r0); - vr1 = _mm_set1_ps(r1); - vr2 = _mm_set1_ps(r2); - } - - int j = 0; - for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) - { - bool tail = false; - if (j + FASCONV_BASE_VECSZ > blockSize) - { - if (j == 0) - break; - j = blockSize - FASCONV_BASE_VECSZ; - tail = true; - } - int k = 0; - const float* rptr = rowbuf + j*vecsize_aligned; - - __m256 vs00 = _mm256_setzero_ps(), vs01 = _mm256_setzero_ps(), - vs02 = _mm256_setzero_ps(), vs03 = _mm256_setzero_ps(), - vs10 = _mm256_setzero_ps(), vs11 = _mm256_setzero_ps(), - vs12 = _mm256_setzero_ps(), vs13 = _mm256_setzero_ps(), - vs20 = _mm256_setzero_ps(), vs21 = _mm256_setzero_ps(), - vs22 = _mm256_setzero_ps(), vs23 = _mm256_setzero_ps(); - -#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling - if (vecsize >= 32) - { - __m512 vs00_5 = _mm512_setzero_ps(), vs01_5 = _mm512_setzero_ps(), - vs02_5 = _mm512_setzero_ps(), vs03_5 = _mm512_setzero_ps(), - vs10_5 = _mm512_setzero_ps(), vs11_5 = _mm512_setzero_ps(), - vs12_5 = _mm512_setzero_ps(), vs13_5 = _mm512_setzero_ps(), - vs20_5 = _mm512_setzero_ps(), vs21_5 = _mm512_setzero_ps(), - vs22_5 = _mm512_setzero_ps(), vs23_5 = _mm512_setzero_ps(); - - for (; k <= vecsize - 16; k += 16, rptr += 16) - { - __m512 w0 = _mm512_loadu_ps(wptr0 + k); - __m512 w1 = _mm512_loadu_ps(wptr1 + k); - __m512 w2 = _mm512_loadu_ps(wptr2 + k); - __m512 r0 = _mm512_loadu_ps(rptr); - - vs00_5 = _mm512_fmadd_ps(w0, r0, vs00_5); - vs10_5 = _mm512_fmadd_ps(w1, r0, vs10_5); - vs20_5 = _mm512_fmadd_ps(w2, r0, vs20_5); - - r0 = _mm512_loadu_ps(rptr + vecsize_aligned); - vs01_5 = _mm512_fmadd_ps(w0, r0, vs01_5); - vs11_5 = _mm512_fmadd_ps(w1, r0, vs11_5); - vs21_5 = _mm512_fmadd_ps(w2, r0, vs21_5); - - r0 = _mm512_loadu_ps(rptr + vecsize_aligned*2); - vs02_5 = _mm512_fmadd_ps(w0, r0, vs02_5); - vs12_5 = _mm512_fmadd_ps(w1, r0, vs12_5); - vs22_5 = _mm512_fmadd_ps(w2, r0, vs22_5); - - r0 = _mm512_loadu_ps(rptr + vecsize_aligned*3); - vs03_5 = _mm512_fmadd_ps(w0, r0, vs03_5); - vs13_5 = _mm512_fmadd_ps(w1, r0, vs13_5); - vs23_5 = _mm512_fmadd_ps(w2, r0, vs23_5); - } - /* - * now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish - * the tail of the vector - */ - vs00 = _mm256_add_ps( _mm512_extractf32x8_ps(vs00_5, 0), _mm512_extractf32x8_ps(vs00_5, 1)); - vs10 = _mm256_add_ps( _mm512_extractf32x8_ps(vs10_5, 0), _mm512_extractf32x8_ps(vs10_5, 1)); - vs20 = _mm256_add_ps( _mm512_extractf32x8_ps(vs20_5, 0), _mm512_extractf32x8_ps(vs20_5, 1)); - - vs01 = _mm256_add_ps( _mm512_extractf32x8_ps(vs01_5, 0), _mm512_extractf32x8_ps(vs01_5, 1)); - vs11 = _mm256_add_ps( _mm512_extractf32x8_ps(vs11_5, 0), _mm512_extractf32x8_ps(vs11_5, 1)); - vs21 = _mm256_add_ps( _mm512_extractf32x8_ps(vs21_5, 0), _mm512_extractf32x8_ps(vs21_5, 1)); - - vs02 = _mm256_add_ps( _mm512_extractf32x8_ps(vs02_5, 0), _mm512_extractf32x8_ps(vs02_5, 1)); - vs12 = _mm256_add_ps( _mm512_extractf32x8_ps(vs12_5, 0), _mm512_extractf32x8_ps(vs12_5, 1)); - vs22 = _mm256_add_ps( _mm512_extractf32x8_ps(vs22_5, 0), _mm512_extractf32x8_ps(vs22_5, 1)); - - vs03 = _mm256_add_ps( _mm512_extractf32x8_ps(vs03_5, 0), 
_mm512_extractf32x8_ps(vs03_5, 1)); - vs13 = _mm256_add_ps( _mm512_extractf32x8_ps(vs13_5, 0), _mm512_extractf32x8_ps(vs13_5, 1)); - vs23 = _mm256_add_ps( _mm512_extractf32x8_ps(vs23_5, 0), _mm512_extractf32x8_ps(vs23_5, 1)); - } -#endif - - for (; k < vecsize; k += 8, rptr += 8 ) - { - __m256 w0 = _mm256_load_ps(wptr0 + k); - __m256 w1 = _mm256_load_ps(wptr1 + k); - __m256 w2 = _mm256_load_ps(wptr2 + k); - __m256 r0 = _mm256_load_ps(rptr); - - vs00 = _mm256_fmadd_ps(w0, r0, vs00); - vs10 = _mm256_fmadd_ps(w1, r0, vs10); - vs20 = _mm256_fmadd_ps(w2, r0, vs20); - - r0 = _mm256_load_ps(rptr + vecsize_aligned); - vs01 = _mm256_fmadd_ps(w0, r0, vs01); - vs11 = _mm256_fmadd_ps(w1, r0, vs11); - vs21 = _mm256_fmadd_ps(w2, r0, vs21); - - r0 = _mm256_load_ps(rptr + vecsize_aligned*2); - vs02 = _mm256_fmadd_ps(w0, r0, vs02); - vs12 = _mm256_fmadd_ps(w1, r0, vs12); - vs22 = _mm256_fmadd_ps(w2, r0, vs22); - - r0 = _mm256_load_ps(rptr + vecsize_aligned*3); - vs03 = _mm256_fmadd_ps(w0, r0, vs03); - vs13 = _mm256_fmadd_ps(w1, r0, vs13); - vs23 = _mm256_fmadd_ps(w2, r0, vs23); - } - - __m256 t0 = _mm256_hadd_ps(_mm256_hadd_ps(vs00, vs01), _mm256_hadd_ps(vs02, vs03)); - __m256 t1 = _mm256_hadd_ps(_mm256_hadd_ps(vs10, vs11), _mm256_hadd_ps(vs12, vs13)); - __m256 t2 = _mm256_hadd_ps(_mm256_hadd_ps(vs20, vs21), _mm256_hadd_ps(vs22, vs23)); - - t0 = _mm256_add_ps(t0, _mm256_permute2f128_ps(t0, t0, 1)); - t1 = _mm256_add_ps(t1, _mm256_permute2f128_ps(t1, t1, 1)); - t2 = _mm256_add_ps(t2, _mm256_permute2f128_ps(t2, t2, 1)); - - __m128 s0, s1, s2; - - if( initOutput ) - { - s0 = _mm_set1_ps(bias0); - s1 = _mm_set1_ps(bias1); - s2 = _mm_set1_ps(bias2); - } - else - { - s0 = _mm_loadu_ps(outptr0 + j); - s1 = _mm_loadu_ps(outptr1 + j); - s2 = _mm_loadu_ps(outptr2 + j); - } - - s0 = _mm_add_ps(s0, _mm256_castps256_ps128(t0)); - s1 = _mm_add_ps(s1, _mm256_castps256_ps128(t1)); - s2 = _mm_add_ps(s2, _mm256_castps256_ps128(t2)); - - if( relu ) - { - __m128 m0 = _mm_cmp_ps(s0, z, _CMP_GT_OS); - __m128 m1 = _mm_cmp_ps(s1, z, _CMP_GT_OS); - __m128 m2 = _mm_cmp_ps(s2, z, _CMP_GT_OS); - s0 = _mm_blendv_ps(_mm_mul_ps(s0, vr0), s0, m0); - s1 = _mm_blendv_ps(_mm_mul_ps(s1, vr1), s1, m1); - s2 = _mm_blendv_ps(_mm_mul_ps(s2, vr2), s2, m2); - } - - if( tail ) - { - s0 = _mm_blendv_ps(_mm_loadu_ps(outptr0 + j), s0, mask); - s1 = _mm_blendv_ps(_mm_loadu_ps(outptr1 + j), s1, mask); - s2 = _mm_blendv_ps(_mm_loadu_ps(outptr2 + j), s2, mask); - } - - _mm_storeu_ps(outptr0 + j, s0); - _mm_storeu_ps(outptr1 + j, s1); - _mm_storeu_ps(outptr2 + j, s2); - } - - for( ; j <= blockSize - 2; j += 2 ) - { - const float* rptr0 = rowbuf + j*vecsize_aligned; - const float* rptr1 = rowbuf + (j+1)*vecsize_aligned; - float s00, s01, s10, s11, s20, s21; - - if( initOutput ) - { - s00 = s01 = bias0; - s10 = s11 = bias1; - s20 = s21 = bias2; - } - else - { - s00 = outptr0[j]; s01 = outptr0[j+1]; - s10 = outptr1[j]; s11 = outptr1[j+1]; - s20 = outptr2[j]; s21 = outptr2[j+1]; - } - - for( int k = 0; k < vecsize; k++ ) - { - float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; - float r = rptr0[k]; - s00 += w0*r; s10 += w1*r; s20 += w2*r; - r = rptr1[k]; - s01 += w0*r; s11 += w1*r; s21 += w2*r; - } - - if( relu ) - { - s00 = s00 > 0.f ? s00 : s00*r0; - s01 = s01 > 0.f ? s01 : s01*r0; - s10 = s10 > 0.f ? s10 : s10*r1; - s11 = s11 > 0.f ? s11 : s11*r1; - s20 = s20 > 0.f ? s20 : s20*r2; - s21 = s21 > 0.f ? 
s21 : s21*r2; - } - - outptr0[j] = s00; - outptr0[j+1] = s01; - outptr1[j] = s10; - outptr1[j+1] = s11; - outptr2[j] = s20; - outptr2[j+1] = s21; - } - - for( ; j < blockSize; j++ ) - { - const float* rptr0 = rowbuf + j*vecsize_aligned; - float s00, s10, s20; - - if( initOutput ) - { - s00 = bias0; - s10 = bias1; - s20 = bias2; - } - else - { - s00 = outptr0[j]; - s10 = outptr1[j]; - s20 = outptr2[j]; - } - - for( int k = 0; k < vecsize; k++ ) - { - float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; - float r = rptr0[k]; - s00 += w0*r; s10 += w1*r; s20 += w2*r; - } - - if( relu ) - { - s00 = s00 > 0.f ? s00 : s00*r0; - s10 = s10 > 0.f ? s10 : s10*r1; - s20 = s20 > 0.f ? s20 : s20*r2; - } - - outptr0[j] = s00; - outptr1[j] = s10; - outptr2[j] = s20; - } - } - _mm256_zeroupper(); -} - static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b) { __m256 t0 = _mm256_loadu_ps(ptr); @@ -957,198 +654,6 @@ void fastGEMM1T( const float* vec, const float* weights, } } -enum { FASCONV_BASE_VECSZ = 8 }; -void fastConv( const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput ) -{ - const int vlm1 = vsetvlmax_e32m1(); - int outCn = outShape[1]; - size_t outPlaneSize = outShape[2]*outShape[3]; - // now compute dot product of the weights - // and im2row-transformed part of the tensor - for( int i = 0; i < outCn; i += 3 ) - { - int unroll_tail = FASCONV_BASE_VECSZ; - const float* wptr0 = weights + i*wstep; - const float* wptr1 = wptr0 + wstep; - const float* wptr2 = wptr1 + wstep; - float* outptr0 = output + i*outPlaneSize; - float* outptr1 = outptr0 + outPlaneSize; - float* outptr2 = outptr1 + outPlaneSize; - float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; - - if( i+2 >= outCn ) - { - wptr2 = wptr1; - outptr2 = outptr1; - bias2 = bias1; - if( i+1 >= outCn ) - { - wptr2 = wptr1 = wptr0; - outptr2 = outptr1 = outptr0; - bias2 = bias1 = bias0; - } - } - - int j = 0; - for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) - { - const float* rptr = rowbuf + j*vecsize_aligned; - const float *rptr1 = rptr + vecsize_aligned*1, - *rptr2 = rptr + vecsize_aligned*2, - *rptr3 = rptr + vecsize_aligned*3, - *rptr4 = rptr + vecsize_aligned*4, - *rptr5 = rptr + vecsize_aligned*5, - *rptr6 = rptr + vecsize_aligned*6, - *rptr7 = rptr + vecsize_aligned*7; - if (j + FASCONV_BASE_VECSZ > blockSize) - { - unroll_tail = blockSize - j; - rptr1 = rptr + vecsize_aligned*std::min(1, unroll_tail-1), - rptr2 = rptr + vecsize_aligned*std::min(2, unroll_tail-1), - rptr3 = rptr + vecsize_aligned*std::min(3, unroll_tail-1), - rptr4 = rptr + vecsize_aligned*std::min(4, unroll_tail-1), - rptr5 = rptr + vecsize_aligned*std::min(5, unroll_tail-1), - rptr6 = rptr + vecsize_aligned*std::min(6, unroll_tail-1), - rptr7 = rptr + vecsize_aligned*std::min(7, unroll_tail-1); - } - - int vl, avl = vecsize; - vfloat32m1_t - vs00 = vfmv_v_f_f32m1(0, vlm1), vs10 = vfmv_v_f_f32m1(0, vlm1), vs20 = vfmv_v_f_f32m1(0, vlm1), - vs01 = vfmv_v_f_f32m1(0, vlm1), vs11 = vfmv_v_f_f32m1(0, vlm1), vs21 = vfmv_v_f_f32m1(0, vlm1), - vs02 = vfmv_v_f_f32m1(0, vlm1), vs12 = vfmv_v_f_f32m1(0, vlm1), vs22 = vfmv_v_f_f32m1(0, vlm1), - vs03 = vfmv_v_f_f32m1(0, vlm1), vs13 = vfmv_v_f_f32m1(0, vlm1), vs23 = vfmv_v_f_f32m1(0, vlm1), - vs04 = vfmv_v_f_f32m1(0, vlm1), vs14 = vfmv_v_f_f32m1(0, vlm1), vs24 = vfmv_v_f_f32m1(0, vlm1), - vs05 = vfmv_v_f_f32m1(0, vlm1), vs15 = vfmv_v_f_f32m1(0, vlm1), vs25 = 
vfmv_v_f_f32m1(0, vlm1), - vs06 = vfmv_v_f_f32m1(0, vlm1), vs16 = vfmv_v_f_f32m1(0, vlm1), vs26 = vfmv_v_f_f32m1(0, vlm1), - vs07 = vfmv_v_f_f32m1(0, vlm1), vs17 = vfmv_v_f_f32m1(0, vlm1), vs27 = vfmv_v_f_f32m1(0, vlm1); - - for (int k = 0; k < vecsize; k += vl, avl -= vl) - { - vl = vsetvl_e32m1(avl); - vfloat32m1_t w0 = vle32_v_f32m1(wptr0 + k, vl); - vfloat32m1_t w1 = vle32_v_f32m1(wptr1 + k, vl); - vfloat32m1_t w2 = vle32_v_f32m1(wptr2 + k, vl); - vfloat32m1_t r0 = vle32_v_f32m1(rptr, vl); - - vs00 = vfmacc_vv_f32m1(vs00, w0, r0, vl); - vs10 = vfmacc_vv_f32m1(vs10, w1, r0, vl); - vs20 = vfmacc_vv_f32m1(vs20, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr1, vl); - vs01 = vfmacc_vv_f32m1(vs01, w0, r0, vl); - vs11 = vfmacc_vv_f32m1(vs11, w1, r0, vl); - vs21 = vfmacc_vv_f32m1(vs21, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr2, vl); - vs02 = vfmacc_vv_f32m1(vs02, w0, r0, vl); - vs12 = vfmacc_vv_f32m1(vs12, w1, r0, vl); - vs22 = vfmacc_vv_f32m1(vs22, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr3, vl); - vs03 = vfmacc_vv_f32m1(vs03, w0, r0, vl); - vs13 = vfmacc_vv_f32m1(vs13, w1, r0, vl); - vs23 = vfmacc_vv_f32m1(vs23, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr4, vl); - vs04 = vfmacc_vv_f32m1(vs04, w0, r0, vl); - vs14 = vfmacc_vv_f32m1(vs14, w1, r0, vl); - vs24 = vfmacc_vv_f32m1(vs24, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr5, vl); - vs05 = vfmacc_vv_f32m1(vs05, w0, r0, vl); - vs15 = vfmacc_vv_f32m1(vs15, w1, r0, vl); - vs25 = vfmacc_vv_f32m1(vs25, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr6, vl); - vs06 = vfmacc_vv_f32m1(vs06, w0, r0, vl); - vs16 = vfmacc_vv_f32m1(vs16, w1, r0, vl); - vs26 = vfmacc_vv_f32m1(vs26, w2, r0, vl); - - r0 = vle32_v_f32m1(rptr7, vl); - vs07 = vfmacc_vv_f32m1(vs07, w0, r0, vl); - vs17 = vfmacc_vv_f32m1(vs17, w1, r0, vl); - vs27 = vfmacc_vv_f32m1(vs27, w2, r0, vl); - - rptr += vl; rptr1 += vl; rptr2 += vl; rptr3 += vl; - rptr4 += vl; rptr5 += vl; rptr6 += vl; rptr7 += vl; - } - - // compute sum of each vs - vfloat32m1_t zero = vfmv_v_f_f32m1(0, vlm1); - // unroll_tail(vl) is required here to be at least FASCONV_BASE_VECSZ, aka 8. 
- float sum0[FASCONV_BASE_VECSZ], sum1[FASCONV_BASE_VECSZ], sum2[FASCONV_BASE_VECSZ]; - sum0[0] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs00, zero, vlm1)); - sum0[1] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs01, zero, vlm1)); - sum0[2] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs02, zero, vlm1)); - sum0[3] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs03, zero, vlm1)); - sum0[4] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs04, zero, vlm1)); - sum0[5] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs05, zero, vlm1)); - sum0[6] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs06, zero, vlm1)); - sum0[7] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs07, zero, vlm1)); - sum1[0] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs10, zero, vlm1)); - sum1[1] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs11, zero, vlm1)); - sum1[2] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs12, zero, vlm1)); - sum1[3] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs13, zero, vlm1)); - sum1[4] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs14, zero, vlm1)); - sum1[5] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs15, zero, vlm1)); - sum1[6] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs16, zero, vlm1)); - sum1[7] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs17, zero, vlm1)); - sum2[0] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs20, zero, vlm1)); - sum2[1] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs21, zero, vlm1)); - sum2[2] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs22, zero, vlm1)); - sum2[3] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs23, zero, vlm1)); - sum2[4] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs24, zero, vlm1)); - sum2[5] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs25, zero, vlm1)); - sum2[6] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs26, zero, vlm1)); - sum2[7] = vfmv_f_s_f32m1_f32(vfredosum_vs_f32m1_f32m1(zero, vs27, zero, vlm1)); - - // if VLEN = 128, so LMUL = 2 for unroll_tail(vl) = 8. - // otherwise, VLEN >=256, we only use fist 8 element of the vReg. - vfloat32m2_t s0, s1, s2; - if( initOutput ) - { - s0 = vfmv_v_f_f32m2(bias0, unroll_tail); - s1 = vfmv_v_f_f32m2(bias1, unroll_tail); - s2 = vfmv_v_f_f32m2(bias2, unroll_tail); - } - else - { - s0 = vle32_v_f32m2(outptr0 + j, unroll_tail); - s1 = vle32_v_f32m2(outptr1 + j, unroll_tail); - s2 = vle32_v_f32m2(outptr2 + j, unroll_tail); - } - s0 = vfadd_vv_f32m2(vle32_v_f32m2(sum0, unroll_tail), s0, unroll_tail); - s1 = vfadd_vv_f32m2(vle32_v_f32m2(sum1, unroll_tail), s1, unroll_tail); - s2 = vfadd_vv_f32m2(vle32_v_f32m2(sum2, unroll_tail), s2, unroll_tail); - - if( relu ) - { - float r0 = relu[i], r1 = relu[i+1], r2 = relu[i+2]; - if( i+2 >= outCn ) - { - r2 = r1; - if( i+1 >= outCn ) - r2 = r1 = r0; - } - vbool16_t m0 = vmfgt_vf_f32m2_b16(s0, 0, unroll_tail); - vbool16_t m1 = vmfgt_vf_f32m2_b16(s1, 0, unroll_tail); - vbool16_t m2 = vmfgt_vf_f32m2_b16(s2, 0, unroll_tail); - s0 = vmerge_vvm_f32m2(m0, vfmul_vf_f32m2(s0, r0, unroll_tail), s0, unroll_tail); - s1 = vmerge_vvm_f32m2(m1, vfmul_vf_f32m2(s1, r1, unroll_tail), s1, unroll_tail); - s2 = vmerge_vvm_f32m2(m2, vfmul_vf_f32m2(s2, r2, unroll_tail), s2, unroll_tail); - } - - vse32_v_f32m2(outptr0 + j, s0, unroll_tail); - vse32_v_f32m2(outptr1 + j, s1, unroll_tail); - vse32_v_f32m2(outptr2 + j, s2, unroll_tail); - } - } -} - /* Example for load_deinterleave: input: ptr[16] = {1,2,3, ... 
,14,15,16} @@ -1345,317 +850,6 @@ void fastDepthwiseConv( const float* wptr, #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX -enum { FASCONV_BASE_VECSZ = 4 }; - -void fastConv( const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput ) -{ - int outCn = outShape[1]; - size_t outPlaneSize = outShape[2]*outShape[3]; - float r0 = 1.f, r1 = 1.f, r2 = 1.f; - __m256 t1 = _v256_setall_ps(1.f), t2 = _v256_setall_ps(0.f); - __m128 vr0 = *(__m128*)&t1, vr1 = vr0, vr2 = vr0, z = *(__m128*)&t2; - int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0}; - int rsz = blockSize % FASCONV_BASE_VECSZ; - for( int i = 0; i < rsz; i++ ) - maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1; - __m128i mask = __lsx_vld((const float*)maskbuf, 0); - - // now compute dot product of the weights - // and im2row-transformed part of the tensor - for( int i = 0; i < outCn; i += 3 ) - { - const float* wptr0 = weights + i*wstep; - const float* wptr1 = wptr0 + wstep; - const float* wptr2 = wptr1 + wstep; - float* outptr0 = output + i*outPlaneSize; - float* outptr1 = outptr0 + outPlaneSize; - float* outptr2 = outptr1 + outPlaneSize; - float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; - - if( i+2 >= outCn ) - { - wptr2 = wptr1; - outptr2 = outptr1; - bias2 = bias1; - if( i+1 >= outCn ) - { - wptr2 = wptr1 = wptr0; - outptr2 = outptr1 = outptr0; - bias2 = bias1 = bias0; - } - } - - if( relu ) - { - r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2]; - if( i+2 >= outCn ) - { - r2 = r1; - if( i+1 >= outCn ) - r2 = r1 = r0; - } - vr0 = _v256_extract_low(_v256_setall_ps(r0)); - vr1 = _v256_extract_low(_v256_setall_ps(r1)); - vr2 = _v256_extract_low(_v256_setall_ps(r2)); - } - - int j = 0; - for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) - { - bool tail = false; - if (j + FASCONV_BASE_VECSZ > blockSize) - { - if (j == 0) - break; - j = blockSize - FASCONV_BASE_VECSZ; - tail = true; - } - int k = 0; - const float* rptr = rowbuf + j*vecsize_aligned; - - __m256i tmp; - __m256 vs00 = (__m256)__lasx_xvxor_v(tmp, tmp), vs01 = (__m256)__lasx_xvxor_v(tmp, tmp), - vs02 = (__m256)__lasx_xvxor_v(tmp, tmp), vs03 = (__m256)__lasx_xvxor_v(tmp, tmp), - vs10 = (__m256)__lasx_xvxor_v(tmp, tmp), vs11 = (__m256)__lasx_xvxor_v(tmp, tmp), - vs12 = (__m256)__lasx_xvxor_v(tmp, tmp), vs13 = (__m256)__lasx_xvxor_v(tmp, tmp), - vs20 = (__m256)__lasx_xvxor_v(tmp, tmp), vs21 = (__m256)__lasx_xvxor_v(tmp, tmp), - vs22 = (__m256)__lasx_xvxor_v(tmp, tmp), vs23 = (__m256)__lasx_xvxor_v(tmp, tmp); - - for (; k < vecsize; k += 8, rptr += 8 ) - { - __m256 w0 = (__m256)__lasx_xvld(wptr0 + k, 0); - __m256 w1 = (__m256)__lasx_xvld(wptr1 + k, 0); - __m256 w2 = (__m256)__lasx_xvld(wptr2 + k, 0); - __m256 r0 = (__m256)__lasx_xvld(rptr, 0); - - vs00 = __lasx_xvfmadd_s(w0, r0, vs00); - vs10 = __lasx_xvfmadd_s(w1, r0, vs10); - vs20 = __lasx_xvfmadd_s(w2, r0, vs20); - - r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned, 0); - vs01 = __lasx_xvfmadd_s(w0, r0, vs01); - vs11 = __lasx_xvfmadd_s(w1, r0, vs11); - vs21 = __lasx_xvfmadd_s(w2, r0, vs21); - - r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*2, 0); - vs02 = __lasx_xvfmadd_s(w0, r0, vs02); - vs12 = __lasx_xvfmadd_s(w1, r0, vs12); - vs22 = __lasx_xvfmadd_s(w2, r0, vs22); - - r0 = (__m256)__lasx_xvld(rptr + vecsize_aligned*3, 0); - vs03 = __lasx_xvfmadd_s(w0, r0, vs03); - vs13 = __lasx_xvfmadd_s(w1, r0, vs13); - vs23 = __lasx_xvfmadd_s(w2, r0, vs23); - } - - /*t0*/ - 
__m256 vs00_perm = (__m256)__lasx_xvpermi_d(vs00, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs00_add_2w = __lasx_xvfadd_s(vs00, vs00_perm); - __m256 tmp00_srl = (__m256)__lasx_xvsrli_d(vs00_add_2w, 32); - __m256 vs00_add_4w = __lasx_xvfadd_s(vs00_add_2w, tmp00_srl); - - __m256 vs01_perm = (__m256)__lasx_xvpermi_d(vs01, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs01_add_2w = __lasx_xvfadd_s(vs01, vs01_perm); - __m256 tmp01_srl = (__m256)__lasx_xvsrli_d(vs01_add_2w, 32); - __m256 vs01_add_4w = __lasx_xvfadd_s(vs01_add_2w, tmp01_srl); - - __m256 vs02_perm = (__m256)__lasx_xvpermi_d(vs02, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs02_add_2w = __lasx_xvfadd_s(vs02, vs02_perm); - __m256 tmp02_srl = (__m256)__lasx_xvsrli_d(vs02_add_2w, 32); - __m256 vs02_add_4w = __lasx_xvfadd_s(vs02_add_2w, tmp02_srl); - - __m256 vs03_perm = (__m256)__lasx_xvpermi_d(vs03, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs03_add_2w = __lasx_xvfadd_s(vs03, vs03_perm); - __m256 tmp03_srl = (__m256)__lasx_xvsrli_d(vs03_add_2w, 32); - __m256 vs03_add_4w = __lasx_xvfadd_s(vs03_add_2w, tmp03_srl); - - __m256i vs01_vs00 = __lasx_xvpackev_w((__m256i)vs01_add_4w, (__m256i)vs00_add_4w); - __m256i vs03_vs02 = __lasx_xvpackev_w((__m256i)vs03_add_4w, (__m256i)vs02_add_4w); - __m256 t0 = (__m256)__lasx_xvpackev_d(vs03_vs02, vs01_vs00); - - /*t1*/ - __m256 vs10_perm = (__m256)__lasx_xvpermi_d(vs10, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs10_add_2w = __lasx_xvfadd_s(vs10, vs10_perm); - __m256 tmp10_srl = (__m256)__lasx_xvsrli_d(vs10_add_2w, 32); - __m256 vs10_add_4w = __lasx_xvfadd_s(vs10_add_2w, tmp10_srl); - - __m256 vs11_perm = (__m256)__lasx_xvpermi_d(vs11, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs11_add_2w = __lasx_xvfadd_s(vs11, vs11_perm); - __m256 tmp11_srl = (__m256)__lasx_xvsrli_d(vs11_add_2w, 32); - __m256 vs11_add_4w = __lasx_xvfadd_s(vs11_add_2w, tmp11_srl); - - __m256 vs12_perm = (__m256)__lasx_xvpermi_d(vs12, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs12_add_2w = __lasx_xvfadd_s(vs12, vs12_perm); - __m256 tmp12_srl = (__m256)__lasx_xvsrli_d(vs12_add_2w, 32); - __m256 vs12_add_4w = __lasx_xvfadd_s(vs12_add_2w, tmp12_srl); - - __m256 vs13_perm = (__m256)__lasx_xvpermi_d(vs13, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs13_add_2w = __lasx_xvfadd_s(vs13, vs13_perm); - __m256 tmp13_srl = (__m256)__lasx_xvsrli_d(vs13_add_2w, 32); - __m256 vs13_add_4w = __lasx_xvfadd_s(vs13_add_2w, tmp13_srl); - - __m256i vs11_vs10 = __lasx_xvpackev_w((__m256i)vs11_add_4w, (__m256i)vs10_add_4w); - __m256i vs13_vs12 = __lasx_xvpackev_w((__m256i)vs13_add_4w, (__m256i)vs12_add_4w); - __m256 t1 = (__m256)__lasx_xvpackev_d(vs13_vs12, vs11_vs10); - - /*t2*/ - __m256 vs20_perm = (__m256)__lasx_xvpermi_d(vs20, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs20_add_2w = __lasx_xvfadd_s(vs20, vs20_perm); - __m256 tmp20_srl = (__m256)__lasx_xvsrli_d(vs20_add_2w, 32); - __m256 vs20_add_4w = __lasx_xvfadd_s(vs20_add_2w, tmp20_srl); - - __m256 vs21_perm = (__m256)__lasx_xvpermi_d(vs21, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs21_add_2w = __lasx_xvfadd_s(vs21, vs21_perm); - __m256 tmp21_srl = (__m256)__lasx_xvsrli_d(vs21_add_2w, 32); - __m256 vs21_add_4w = __lasx_xvfadd_s(vs21_add_2w, tmp21_srl); - - __m256 vs22_perm = (__m256)__lasx_xvpermi_d(vs22, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs22_add_2w = __lasx_xvfadd_s(vs22, vs22_perm); - __m256 tmp22_srl = (__m256)__lasx_xvsrli_d(vs22_add_2w, 32); - __m256 vs22_add_4w = __lasx_xvfadd_s(vs22_add_2w, tmp22_srl); - - __m256 vs23_perm = (__m256)__lasx_xvpermi_d(vs23, (2<<6) + (3<<4) + (0<<2) + 1); - __m256 vs23_add_2w = 
__lasx_xvfadd_s(vs23, vs23_perm); - __m256 tmp23_srl = (__m256)__lasx_xvsrli_d(vs23_add_2w, 32); - __m256 vs23_add_4w = __lasx_xvfadd_s(vs23_add_2w, tmp23_srl); - - __m256i vs21_vs20 = __lasx_xvpackev_w((__m256i)vs21_add_4w, (__m256i)vs20_add_4w); - __m256i vs23_vs22 = __lasx_xvpackev_w((__m256i)vs23_add_4w, (__m256i)vs22_add_4w); - __m256 t2 = (__m256)__lasx_xvpackev_d(vs23_vs22, vs21_vs20); - - t0 = __lasx_xvfadd_s(t0, (__m256)__lasx_xvpermi_q(t0, t0, 1)); - t1 = __lasx_xvfadd_s(t1, (__m256)__lasx_xvpermi_q(t1, t1, 1)); - t2 = __lasx_xvfadd_s(t2, (__m256)__lasx_xvpermi_q(t2, t2, 1)); - - __m128 s0, s1, s2; - - if( initOutput ) - { - s0 = _v256_extract_low(_v256_setall_ps(bias0)); - s1 = _v256_extract_low(_v256_setall_ps(bias1)); - s2 = _v256_extract_low(_v256_setall_ps(bias2)); - } - else - { - s0 = (__m128)__lsx_vld(outptr0 + j, 0); - s1 = (__m128)__lsx_vld(outptr1 + j, 0); - s2 = (__m128)__lsx_vld(outptr2 + j, 0); - } - - s0 = __lsx_vfadd_s(s0, *(__m128*)&t0); - s1 = __lsx_vfadd_s(s1, *(__m128*)&t1); - s2 = __lsx_vfadd_s(s2, *(__m128*)&t2); - - if( relu ) - { - __m128i m0 = __lsx_vfcmp_clt_s(z, s0); - __m128i m1 = __lsx_vfcmp_clt_s(z, s1); - __m128i m2 = __lsx_vfcmp_clt_s(z, s2); - s0 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s0, vr0), (__m128i)s0, m0); - s1 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s1, vr1), (__m128i)s1, m1); - s2 = (__m128)__lsx_vbitsel_v((__m128i)__lsx_vfmul_s(s2, vr2), (__m128i)s2, m2); - } - - if( tail ) - { - s0 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr0 + j, 0), (__m128i)s0, mask); - s1 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr1 + j, 0), (__m128i)s1, mask); - s2 = (__m128)__lsx_vbitsel_v(__lsx_vld(outptr2 + j, 0), (__m128i)s2, mask); - } - - __lsx_vst(s0, outptr0 + j, 0); - __lsx_vst(s1, outptr1 + j, 0); - __lsx_vst(s2, outptr2 + j, 0); - } - - for( ; j <= blockSize - 2; j += 2 ) - { - const float* rptr0 = rowbuf + j*vecsize_aligned; - const float* rptr1 = rowbuf + (j+1)*vecsize_aligned; - float s00, s01, s10, s11, s20, s21; - - if( initOutput ) - { - s00 = s01 = bias0; - s10 = s11 = bias1; - s20 = s21 = bias2; - } - else - { - s00 = outptr0[j]; s01 = outptr0[j+1]; - s10 = outptr1[j]; s11 = outptr1[j+1]; - s20 = outptr2[j]; s21 = outptr2[j+1]; - } - - for( int k = 0; k < vecsize; k++ ) - { - float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; - float r = rptr0[k]; - s00 += w0*r; s10 += w1*r; s20 += w2*r; - r = rptr1[k]; - s01 += w0*r; s11 += w1*r; s21 += w2*r; - } - - if( relu ) - { - s00 = s00 > 0.f ? s00 : s00*r0; - s01 = s01 > 0.f ? s01 : s01*r0; - s10 = s10 > 0.f ? s10 : s10*r1; - s11 = s11 > 0.f ? s11 : s11*r1; - s20 = s20 > 0.f ? s20 : s20*r2; - s21 = s21 > 0.f ? s21 : s21*r2; - } - - outptr0[j] = s00; - outptr0[j+1] = s01; - outptr1[j] = s10; - outptr1[j+1] = s11; - outptr2[j] = s20; - outptr2[j+1] = s21; - } - - for( ; j < blockSize; j++ ) - { - const float* rptr0 = rowbuf + j*vecsize_aligned; - float s00, s10, s20; - - if( initOutput ) - { - s00 = bias0; - s10 = bias1; - s20 = bias2; - } - else - { - s00 = outptr0[j]; - s10 = outptr1[j]; - s20 = outptr2[j]; - } - - for( int k = 0; k < vecsize; k++ ) - { - float w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k]; - float r = rptr0[k]; - s00 += w0*r; s10 += w1*r; s20 += w2*r; - } - - if( relu ) - { - s00 = s00 > 0.f ? s00 : s00*r0; - s10 = s10 > 0.f ? s10 : s10*r1; - s20 = s20 > 0.f ? 
s20 : s20*r2; - } - - outptr0[j] = s00; - outptr1[j] = s10; - outptr2[j] = s20; - } - } -} - static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) { __m256 t0 = (__m256)__lasx_xvld(ptr, 0); -- 2.7.4
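Reviewer note, for reference only (not part of the patch): the three fastConv branches removed above (AVX2, RVV intrinsics, LASX) all vectorize the same micro-kernel. For each group of up to three output channels they take dot products of the weight rows against im2row columns from rowbuf, seed the accumulator with the bias on the first pass (initOutput) or with the partial sums already in output on later passes, and finish with an optional leaky ReLU using a per-channel negative slope from relu. Below is a minimal scalar sketch of that contract, derived from the scalar tail loops of the deleted kernels; fastConvScalarRef is a hypothetical name, the parameter list mirrors the removed functions.

// Illustrative scalar reference of the removed fastConv micro-kernel
// (assumption: semantics taken from the scalar tail loops of the deleted
//  AVX2/LASX branches; this is a sketch, not code from the patch).
#include <cstddef>
#include <algorithm>

static void fastConvScalarRef(const float* weights, size_t wstep, const float* bias,
                              const float* rowbuf, float* output, const int* outShape,
                              int blockSize, int vecsize, int vecsize_aligned,
                              const float* relu, bool initOutput)
{
    int outCn = outShape[1];
    size_t outPlaneSize = (size_t)outShape[2] * outShape[3];

    for (int i = 0; i < outCn; i += 3)
    {
        // Process up to 3 output channels per step. The SIMD kernels instead
        // duplicate the last weight row / output pointer / bias / slope
        // (wptr2 = wptr1 = wptr0, ...) so that all three lanes stay busy.
        int cn = std::min(3, outCn - i);

        for (int j = 0; j < blockSize; j++)
        {
            // One im2row column; vecsize_aligned is its stride in rowbuf,
            // vecsize the actual dot-product length.
            const float* r = rowbuf + (size_t)j * vecsize_aligned;
            for (int c = 0; c < cn; c++)
            {
                const float* w    = weights + (size_t)(i + c) * wstep;        // weight row
                float*       outp = output  + (size_t)(i + c) * outPlaneSize; // output plane

                float s = initOutput ? bias[i + c] : outp[j];
                for (int k = 0; k < vecsize; k++)
                    s += w[k] * r[k];

                if (relu)  // leaky ReLU with per-channel negative slope
                    s = s > 0.f ? s : s * relu[i + c];

                outp[j] = s;
            }
        }
    }
}

Handling three output channels per iteration is what lets the vector branches reuse a single loaded im2row column (r0) against three weight rows, which is why each removed kernel carries vs0x/vs1x/vs2x accumulator triples and the tail-duplication of wptr/outptr/bias when outCn is not a multiple of 3.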