From dd3f517fe916cbdcb37ced5b55a6726f44edbc0e Mon Sep 17 00:00:00 2001 From: Yashas Date: Sun, 8 Dec 2019 21:03:30 +0530 Subject: [PATCH] optimize region kernels --- modules/dnn/src/cuda/region.cu | 230 +++++++++++-------------- modules/dnn/src/cuda4dnn/kernels/region.hpp | 15 +- modules/dnn/src/cuda4dnn/primitives/region.hpp | 18 +- 3 files changed, 117 insertions(+), 146 deletions(-) diff --git a/modules/dnn/src/cuda/region.cu b/modules/dnn/src/cuda/region.cu index 158deb9..00c1cbe 100644 --- a/modules/dnn/src/cuda/region.cu +++ b/modules/dnn/src/cuda/region.cu @@ -24,176 +24,154 @@ using namespace cv::dnn::cuda4dnn::csl::device; namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { namespace raw { - template - __global__ void sigmoid_strided(Span output, View input, size_type n, size_type stride, size_type offset) { - /* - the input is divided into equal blocks strided by `stride` - * - we must apply sigmoid to a continuous range of `n` values starting from `offset` in every block - */ - for (auto i : grid_stride_range(n * output.size() / stride)) { - auto block_idx = i / n; - auto index = block_idx * stride + offset + (i % n); - - using device::sigmoid; - output[index] = sigmoid(input[index]); - } - } template - __global__ void softmax_strided(Span output, View input, size_type n, size_type stride, size_type offset_) { - for (auto idx : grid_stride_range(output.size() / stride)) { - index_type offset = idx * stride + offset_; - - auto largest = numeric_limits::lowest(); - for (int i = 0; i < n; i++) { - using device::max; - largest = max(largest, output[offset + i]); - } - - auto sum = T(0); - for (int i = 0; i < n; i++) { - using device::exp; - auto temp = exp(output[offset + i] - largest); - sum += temp; - output[offset + i] = temp; - } - - for (int i = 0; i < n; i++) { - output[offset + i] /= sum; - } - } - } - - template - __global__ void region_finalize(Span output, View input, View bias, - T object_prob_cutoff, T class_prob_cutoff, - size_type height_norm, size_type width_norm, + __global__ void region_box( + Span output, View input, View bias, + size_type boxes_per_cell, size_type box_size, size_type rows, size_type cols, - size_type boxes_per_cell, - size_type box_size, - size_type classes) + size_type height_norm, size_type width_norm, + T object_prob_cutoff) { + using vector2_type = get_vector_type_t; + auto bias_vPtr = vector2_type::get_pointer(bias.data()); + for (auto box_index : grid_stride_range(output.size() / box_size)) { - auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */ - auto box_offset = box_index * box_size; + const auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */ + const auto box_offset = box_index * box_size; - auto batch_inner_size = rows * cols * boxes_per_cell; - auto row_inner_size = cols * boxes_per_cell; - auto col_inner_size = boxes_per_cell; + const auto batch_inner_size = rows * cols * boxes_per_cell; + const auto row_inner_size = cols * boxes_per_cell; + const auto col_inner_size = boxes_per_cell; - auto y = (box_index % batch_inner_size) / row_inner_size; - auto x = (box_index % row_inner_size) / col_inner_size; + const auto y = (box_index % batch_inner_size) / row_inner_size; + const auto x = (box_index % row_inner_size) / col_inner_size; using device::sigmoid; - using device::exp; output[box_offset + 0] = (T(x) + sigmoid(input[box_offset + 0])) / T(cols); output[box_offset + 1] = (T(y) + sigmoid(input[box_offset + 1])) / T(rows); - output[box_offset + 2] = exp(input[box_offset + 2]) * bias[2 * box_of_the_cell + 0] / T(width_norm); - output[box_offset + 3] = exp(input[box_offset + 3]) * bias[2 * box_of_the_cell + 1] / T(height_norm); + + vector2_type bias_xy; + v_load(bias_xy, bias_vPtr[box_of_the_cell]); + + using device::exp; + output[box_offset + 2] = exp(input[box_offset + 2]) * bias_xy.data[0] / T(width_norm); + output[box_offset + 3] = exp(input[box_offset + 3]) * bias_xy.data[1] / T(height_norm); /* squash objectness score into a probability */ using device::sigmoid; - T objectness_prob = sigmoid(output[box_offset + 4]); - output[box_offset + 4] = objectness_prob; + T objectness_prob = sigmoid(input[box_offset + 4]); /* ignore prediction if the objectness probability is less than the cutoff */ if (objectness_prob < object_prob_cutoff) objectness_prob = 0; - /* the class probabilities we have currently are conditional class probabilities + output[box_offset + 4] = objectness_prob; + } + } + + template + __global__ void region_sigmoid_class_score(Span output, View input, T class_prob_cutoff, size_type box_size) + { + for (auto idx : grid_stride_range(output.size())) { + const index_type box_no = idx / box_size; + const index_type start_of_box = box_no * box_size; + const index_type box_offset = idx % box_size; + + if (box_offset < 5) { + /* continue as we have already processed these in region_box */ + continue; + } + + auto objectness_prob = output[start_of_box + 4]; + + /* the class probabilities we currently have are conditional class probabilities * given the object * * to obtain the actual class probability, we multiply the conditional probability * with the object probability */ - const index_type class_begin = box_offset + 5; /* 4 box coordinates, 1 obj prob, class probs... */ - const index_type class_end = class_begin + classes; - index_type offset = class_begin; + auto actual_class_prob = objectness_prob * sigmoid(input[idx]); + if (actual_class_prob <= class_prob_cutoff) + actual_class_prob = T(0); + output[idx] = actual_class_prob; + } + } - using vector_type = get_vector_type_t; + template + __global__ void region_softmax_class_score(Span output, View input, T class_prob_cutoff, size_type box_size) { + for (auto box_no : grid_stride_range(output.size() / box_size)) { + const index_type start_of_box = box_no * box_size; + const index_type start_idx = start_of_box + 5; + const index_type end_idx = start_of_box + box_size; - /* process each class independently until the offset is aligned to an n-element boundary */ - while (offset % vector_type::size() != 0 && offset < class_end) { - T actual_class_prob = objectness_prob * output[offset]; - if (actual_class_prob <= class_prob_cutoff) - actual_class_prob = T(0); - output[offset] = actual_class_prob; - offset++; + auto largest = numeric_limits::lowest(); + for (int idx = start_idx; idx < end_idx; idx++) { + using device::max; + largest = max(largest, input[idx]); } - auto output_vPtr = vector_type::get_pointer(output.data() + offset); - auto input_vPtr = vector_type::get_pointer(input.data() + offset); - for (int i = 0; (offset + vector_type::size()) < class_end; i++) { - vector_type vec; - v_load(vec, output_vPtr[i]); - for (int j = 0; j < vector_type::size(); j++) { - T actual_class_prob = objectness_prob * vec.data[j]; - if (actual_class_prob <= class_prob_cutoff) - actual_class_prob = T(0); - vec.data[j] = actual_class_prob; - } - v_store(output_vPtr[i], vec); - offset += vector_type::size(); + auto sum = T(0); + for (int idx = start_idx; idx < end_idx; idx++) { + using device::exp; + auto temp = exp(input[idx] - largest); + sum += temp; + output[idx] = temp; } - /* process the remaining classes */ - while (offset < class_end) { - T actual_class_prob = objectness_prob * output[offset]; + for (int idx = start_idx; idx < end_idx; idx++) { + auto softmax_score = output[idx] / sum; + + /* the class probabilities we currently have are conditional class probabilities + * given the object + * + * to obtain the actual class probability, we multiply the conditional probability + * with the object probability + */ + auto objectness_prob = output[start_of_box + 4]; + auto actual_class_prob = objectness_prob * softmax_score; if (actual_class_prob <= class_prob_cutoff) actual_class_prob = T(0); - output[offset] = actual_class_prob; - offset++; + output[idx] = actual_class_prob; } } } } template - void sigmoid_strided(const Stream& stream, Span output, View input, std::size_t n, std::size_t stride, std::size_t offset) { - CV_Assert(output.size() % stride == 0); - - auto kernel = raw::sigmoid_strided; - auto policy = make_policy(kernel, n * output.size() / stride, 0, stream); - launch_kernel(kernel, policy, output, input, n, stride, offset); - } - - template void sigmoid_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t); - template void sigmoid_strided(const Stream&, Span, View, std::size_t, std::size_t, std::size_t); - - template - void softmax_strided(const Stream& stream, Span output, View input, std::size_t n, std::size_t stride, std::size_t offset) { - CV_Assert(output.size() % stride == 0); - - auto kernel = raw::softmax_strided; - auto policy = make_policy(kernel, output.size() / stride, 0, stream); - launch_kernel(kernel, policy, output, input, n, stride, offset); - } - - template void softmax_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t); - template void softmax_strided(const Stream&, Span, View, std::size_t, std::size_t, std::size_t); - - template - void region_finalize(const Stream& stream, Span output, View input, View bias, + void region(const Stream& stream, Span output, View input, View bias, T object_prob_cutoff, T class_prob_cutoff, - std::size_t height_norm, std::size_t width_norm, + std::size_t boxes_per_cell, std::size_t box_size, std::size_t rows, std::size_t cols, - std::size_t boxes_per_cell, - std::size_t box_size, - std::size_t classes) + std::size_t height_norm, std::size_t width_norm, + bool if_true_sigmoid_else_softmax /* true = sigmoid, false = softmax */) { + CV_Assert(output.size() == input.size()); CV_Assert(output.size() % box_size == 0); - - auto kernel = raw::region_finalize; - auto policy = make_policy(kernel, output.size() / box_size, 0, stream); - launch_kernel(kernel, policy, output, input, bias, - object_prob_cutoff, class_prob_cutoff, - height_norm, width_norm, - rows, cols, boxes_per_cell, box_size, classes); + CV_Assert(is_fully_aligned(bias, 2)); + + auto box_kernel = raw::region_box; + auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream); + launch_kernel(box_kernel, box_policy, + output, input, bias, boxes_per_cell, box_size, + rows, cols, height_norm, width_norm, + object_prob_cutoff); + + if (if_true_sigmoid_else_softmax) { + auto kernel_score = raw::region_sigmoid_class_score; + auto policy_score = make_policy(kernel_score, output.size(), 0, stream); + launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size); + } else { + auto kernel_score = raw::region_softmax_class_score; + auto policy_score = make_policy(kernel_score, output.size(), 0, stream); + launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size); + } } - template void region_finalize(const Stream&, Span<__half>, View<__half>, View<__half>, - __half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t); + template void region(const Stream&, Span<__half>, View<__half>, View<__half>, + __half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool); - template void region_finalize(const Stream&, Span, View, View, - float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t); + template void region(const Stream&, Span, View, View, + float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda4dnn/kernels/region.hpp b/modules/dnn/src/cuda4dnn/kernels/region.hpp index 0e12ad3..1d0072d 100644 --- a/modules/dnn/src/cuda4dnn/kernels/region.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/region.hpp @@ -13,19 +13,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template - void sigmoid_strided(const csl::Stream& stream, csl::Span output, csl::View input, std::size_t n, std::size_t stride, std::size_t offset); - - template - void softmax_strided(const csl::Stream& stream, csl::Span output, csl::View input, std::size_t n, std::size_t stride, std::size_t offset); - - template - void region_finalize(const csl::Stream& stream, csl::Span output, csl::View input, csl::View bias, + void region(const csl::Stream& stream, csl::Span output, csl::View input, csl::View bias, T object_prob_cutoff, T class_prob_cutoff, - std::size_t height_norm, std::size_t width_norm, + std::size_t boxes_per_cell, std::size_t box_size, std::size_t rows, std::size_t cols, - std::size_t boxes_per_cell, - std::size_t box_size, - std::size_t classes); + std::size_t height_norm, std::size_t width_norm, + bool if_true_sigmoid_else_softmax); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda4dnn/primitives/region.hpp b/modules/dnn/src/cuda4dnn/primitives/region.hpp index 775dd0f..595d3e2 100644 --- a/modules/dnn/src/cuda4dnn/primitives/region.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/region.hpp @@ -102,21 +102,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { auto output_wrapper = outputs[0].dynamicCast(); auto output = output_wrapper->getSpan(); - csl::memcpy(output.get(), input.get(), output.size(), stream); - auto rows = input.get_axis_size(1); auto cols = input.get_axis_size(2); auto cell_box_size = classes + 4 + 1; /* we squash class scores into probabilities using softmax or sigmoid */ - if (squash_type == SquashMethod::SOFTMAX) - kernels::softmax_strided(stream, output, input, classes, cell_box_size, 5); - else if (squash_type == SquashMethod::SIGMOID) - kernels::sigmoid_strided(stream, output, input, classes, cell_box_size, 5); - - kernels::region_finalize(stream, output, input, biasTensor, object_prob_cutoff, class_prob_cutoff, - height_norm, width_norm, rows, cols, boxes_per_cell, cell_box_size, classes); + bool if_true_sigmoid_else_softmax = (squash_type == SquashMethod::SIGMOID); + + kernels::region(stream, output, input, biasTensor, + object_prob_cutoff, class_prob_cutoff, + boxes_per_cell, cell_box_size, + rows, cols, + height_norm, width_norm, + if_true_sigmoid_else_softmax + ); if (nms_iou_threshold > 0) { auto output_mat = output_wrapper->getMutableHostMat(); -- 2.7.4