From 2e9e71ab9e2d15d3907e5ed3dbdb985fd97ddb8c Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Tue, 29 May 2018 19:18:10 +0900 Subject: [PATCH] make ocl4dnn available to run on other platform than Intel GPU --- modules/dnn/src/layers/convolution_layer.cpp | 3 +- modules/dnn/src/layers/elementwise_layers.cpp | 3 +- modules/dnn/src/layers/mvn_layer.cpp | 47 +++++++++++++++++++++------ modules/dnn/src/layers/pooling_layer.cpp | 3 +- modules/dnn/src/opencl/mvn.cl | 15 ++++++--- modules/dnn/test/test_tf_importer.cpp | 4 ++- 6 files changed, 53 insertions(+), 22 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 96a9d5b..edadcd9 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -966,8 +966,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && - OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index a24b913..f57ef01 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -176,8 +176,7 @@ public: { CV_TRACE_FUNCTION(); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget) && - OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(this->preferableTarget), func.applyOCL(inputs_arr, outputs_arr, internals_arr)) Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 647308a..9e4f0ac 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -73,7 +73,7 @@ public: virtual bool tryFuse(Ptr& top) CV_OVERRIDE { - if (preferableTarget == DNN_TARGET_OPENCL && !fuse_batch_norm) + if (!fuse_batch_norm) { top->getScaleShift(scale, shift); fuse_batch_norm = !scale.empty() || !shift.empty(); @@ -252,8 +252,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && - OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); @@ -274,25 +273,53 @@ public: for( i = 0; i < splitDim; i++ ) newRows *= inpBlob.size[i]; - if (inpBlob.total() == newRows) + Mat inpMat = inpBlob.reshape(1, newRows); + Mat outMat = outBlob.reshape(1, newRows); + + if ( inpBlob.total() == newRows ) { // MVN is applied to single values at an every row. - outBlob.setTo(0); + if (shift.empty()) + { + outBlob.setTo(0); + } + else + { + for ( i = 0; i < newRows; i++ ) + { + outMat.row(i).setTo(((float*)shift.data)[i]); + } + } return; } - Mat inpMat = inpBlob.reshape(1, newRows); - Mat outMat = outBlob.reshape(1, newRows); - Scalar mean, dev; for ( i = 0; i < newRows; i++) { Mat inpRow = inpMat.row(i); Mat outRow = outMat.row(i); - + float weight = 1.f; + float bias = 0.f; + if (fuse_batch_norm) + { + weight = i < scale.cols ? ((float*)scale.data)[i] : weight; + bias = i < shift.cols ? ((float*)shift.data)[i] : bias; + } cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray()); double alpha = (normVariance) ? 1/(eps + dev[0]) : 1; - inpRow.convertTo(outRow, outRow.type(), alpha, -mean[0] * alpha); + double normalizationScale = 1.0; + double normalizationShift = 0.0; + if (fuse_batch_norm) + { + normalizationScale = alpha * weight; + normalizationShift = -mean[0] * normalizationScale + bias; + } + else + { + normalizationScale = alpha; + normalizationShift = -mean[0] * alpha; + } + inpRow.convertTo(outRow, outRow.type(), normalizationScale, normalizationShift); } } } diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 2bcce1d..548cb8a 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -191,8 +191,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && - OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl index 49a8ebb..ffc81a8 100644 --- a/modules/dnn/src/opencl/mvn.cl +++ b/modules/dnn/src/opencl/mvn.cl @@ -89,7 +89,8 @@ __kernel void CALC_MEAN(__global const Dtype* src, Dtype mean_val = mean[x]; vec_type src_vec = load(src, index); - vec_type dst_vec = native_powr(src_vec - (vec_type)mean_val, 2); + vec_type dst_vec = src_vec - (vec_type)mean_val; + dst_vec = dst_vec * dst_vec; store(dst_vec, dst, index); } @@ -197,10 +198,14 @@ __kernel void MEAN_FUSE(__global const T * A, const T4 a2 = vload4(i, src0_read + 2 * A_col_size); const T4 a3 = vload4(i, src0_read + 3 * A_col_size); - dot0 = native_powr(convert_float4(a0) - (Dtype4)sum.x, 2); - dot1 = native_powr(convert_float4(a1) - (Dtype4)sum.y, 2); - dot2 = native_powr(convert_float4(a2) - (Dtype4)sum.z, 2); - dot3 = native_powr(convert_float4(a3) - (Dtype4)sum.w, 2); + dot0 = convert_float4(a0) - (Dtype4)sum.x; + dot1 = convert_float4(a1) - (Dtype4)sum.y; + dot2 = convert_float4(a2) - (Dtype4)sum.z; + dot3 = convert_float4(a3) - (Dtype4)sum.w; + dot0 = dot0 * dot0; + dot1 = dot1 * dot1; + dot2 = dot2 * dot2; + dot3 = dot3 * dot3; vstore4(dot0, i, dst0_read); vstore4(dot1, i, dst0_read + A_col_size); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 84d224e..b3f4f4a 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -160,10 +160,12 @@ TEST_P(Test_TensorFlow_layers, batch_norm) TEST_P(Test_TensorFlow_layers, pooling) { int targetId = GetParam(); + cv::ocl::Device d = cv::ocl::Device::getDefault(); + bool loosenFlag = targetId == DNN_TARGET_OPENCL && d.isIntel() && d.type() == cv::ocl::Device::TYPE_CPU; runTensorFlowNet("max_pool_even", targetId); runTensorFlowNet("max_pool_odd_valid", targetId); runTensorFlowNet("ave_pool_same", targetId); - runTensorFlowNet("max_pool_odd_same", targetId); + runTensorFlowNet("max_pool_odd_same", targetId, false, loosenFlag ? 3e-5 : 1e-5, loosenFlag ? 3e-4 : 1e-4); runTensorFlowNet("reduce_mean", targetId); // an average pooling over all spatial dimensions. } -- 2.7.4