From: Li Peng <peng.li@intel.com> Date: Thu, 12 Jul 2018 07:16:32 +0000 (+0800) Subject: enable concat layer fuse for OCL target X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1^2~600^2~8^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f0cadaa6e3e1ae67f0e8f7e9c49413ca1714cfbf;p=platform%2Fupstream%2Fopencv.git enable concat layer fuse for OCL target Signed-off-by: Li Peng <peng.li@intel.com> --- diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 011631f..db90593 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -1492,7 +1492,8 @@ struct Net::Impl // TODO: OpenCL target support more fusion styles. if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) && (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" && - ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) ) + ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" && + ld.layerInstance->type != "Concat")) ) continue; Ptr<Layer>& currLayer = ld.layerInstance; @@ -1701,6 +1702,31 @@ struct Net::Impl ld.outputBlobs.size() == 1 ) { Mat& output = ld.outputBlobs[0]; + UMat umat_output; + if (!ld.outputBlobsWrappers.empty() && + (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))) + { + size_t i, ninputs = ld.inputBlobsId.size(); + bool conv_layer = true; + for( i = 0; i < ninputs; i++ ) + { + LayerPin pin = ld.inputBlobsId[i]; + LayerData* inp_i_data = &layers[pin.lid]; + while(inp_i_data->skip && + inp_i_data->inputBlobsId.size() == 1 && + inp_i_data->consumers.size() == 1) + { + pin = inp_i_data->inputBlobsId[0]; + inp_i_data = &layers[pin.lid]; + } + conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution"); + } + if (!conv_layer) + continue; + std::vector<UMat> umat_outputBlobs; + umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers); + umat_output = umat_outputBlobs[0]; + } // TODO: in general, this optimization can always be done, but //
many layers currently check that the input/output blobs are @@ -1737,6 +1763,14 @@ struct Net::Impl // Allocate new memory to prevent collisions during memory // reusing (see https://github.com/opencv/opencv/pull/10456). output = output.clone(); + if (preferableBackend == DNN_BACKEND_OPENCV && + IS_DNN_OPENCL_TARGET(preferableTarget)) + { + std::vector<UMat> umats(1); + umat_output = umat_output.clone(); + umats[0] = umat_output; + OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats); + } Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() }; int ofs = 0; for( i = 0; i < ninputs; i++ ) @@ -1753,6 +1787,12 @@ struct Net::Impl CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size); Mat* oldPtr = &curr_output; curr_output = output_slice; + if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)) + { + std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size()); + umats[pin.oid] = umat_output(chrange); + OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats); + } // Layers that refer old input Mat will refer to the // new data but the same Mat object. CV_Assert(curr_output.data == output_slice.data, oldPtr == &curr_output); diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index a446f3b..c889c7d 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -821,7 +821,7 @@ void OCL4DNNConvSpatial<Dtype>::CreateSubBuffer(const UMat& buffer, UMat& sub_bu cl_int err; size_t element_size = (use_half_) ? sizeof(short) : sizeof(float); - region.origin = offset * element_size; + region.origin = offset * element_size + buffer.offset; region.size = size * element_size; sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ), write_only ?
CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY, @@ -853,6 +853,7 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top, return false; int32_t bias_offset; + int32_t element_size = use_half_ ? sizeof(short) : sizeof(float); if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { if (!swizzleWeight(weight, config->workItem_output[2], false)) @@ -931,10 +932,12 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top, return false; kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); } else { kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); } kernel.set(argIdx++, (uint16_t)width_); @@ -1024,10 +1027,12 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top, return false; kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); + kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); } else { kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); } kernel.set(argIdx++, (uint16_t)width_); @@ -1079,6 +1084,7 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top, if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)output_w_); @@ -1126,6 +1132,7 @@ bool OCL4DNNConvSpatial<Dtype>::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (void *)NULL); kernel.set(argIdx++, bias_offset); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size)); kernel.set(argIdx++, output_image_offset); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl
b/modules/dnn/src/opencl/conv_layer_spatial.cl index 2cc161d..adeb385 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -136,7 +136,8 @@ __kernel void ConvolveBasic( int kernel_offset, __global Dtype* bias, const int bias_offset, - __global Dtype* convolved_image, + __global Dtype* convolved_image_base, + const int convolved_image_base_offset, const int convolved_image_offset, const ushort input_width, const ushort input_height, @@ -146,6 +147,7 @@ __kernel void ConvolveBasic( const ushort pad_h ) { + __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset; const int outputX = get_global_id(0); const int outputY = get_global_id(1); const int kernelNum = get_global_id(2) * ZPAR; @@ -220,12 +222,14 @@ convolve_simd( __global Dtype* inputs, __global Dtype* weights, BIAS_KERNEL_ARG - __global Dtype* outputs, + __global Dtype* outputs_base, + const int outputs_offset, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { + __global Dtype* outputs = outputs_base + outputs_offset; unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth @@ -395,7 +399,8 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. const __global Dtype *src0, \ const __global Dtype *src1, \ BIAS_KERNEL_ARG \ - __global Dtype *dst, \ + __global Dtype *dst_base, \ + const int dst_offset, \ const ushort input_width, \ const ushort input_height, \ const ushort output_width, \ @@ -425,6 +430,7 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. 
__attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -813,6 +819,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1374,6 +1381,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1559,6 +1567,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + __global Dtype *dst = dst_base + dst_offset; const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1770,12 +1779,13 @@ __kernel void DWCONV( __global Dtype* image_data, __global Dtype* kernel_data, BIAS_KERNEL_ARG - __global Dtype* convolved_image, + __global Dtype* convolved_image_base, + const int convolved_image_offset, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { - + __global Dtype* convolved_image = convolved_image_base + convolved_image_offset; const int outputX = get_global_id(0); const int outputY = get_global_id(1); const int outputZ = get_global_id(2);