*/
#include "detection_output_inst.h"
-#include "kernel.h"
-#include "kd_selector.h"
-#include "network_impl.h"
-#include "implementation_map.h"
-#include "math_utils.h"
-
-#include <algorithm>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-#include <xmmintrin.h>
+#include "primitive_gpu_base.h"
+#include "error_handler.h"
+#include "kernel_selector_helper.h"
+#include "detection_output/detection_output_kernel_base.h"
+#include "detection_output/detection_output_kernel_selector.h"
#ifdef FIX_OPENMP_RELEASE_ISSUE
#ifdef OPENMP_FOUND
namespace cldnn { namespace gpu {
-namespace {
- struct bounding_box
- {
- float xmin;
- float ymin;
- float xmax;
- float ymax;
-
- bounding_box() : xmin(0), ymin(0), xmax(0), ymax(0) {}
-
- bounding_box(const float xmin, const float ymin, const float xmax, const float ymax) :
- xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {}
-
- // Computes the area of a bounding box.
- float area() const
- {
- return (xmax - xmin) * (ymax - ymin);
- }
- };
-}
-
-struct detection_output_gpu : typed_primitive_impl<detection_output>
+struct detection_output_gpu : typed_primitive_gpu_impl<detection_output>
{
- const detection_output_node& outer;
-
- detection_output_gpu(const detection_output_node& outer)
- : outer(outer)
- {}
+ using parent = typed_primitive_gpu_impl<detection_output>;
+ using parent::parent;
- static void decode_bounding_box(
- const bounding_box& prior_bbox, const std::array<float, PRIOR_BOX_SIZE>& prior_variance,
- const prior_box_code_type code_type, const bool variance_encoded_in_target,
- const bounding_box& bbox, bounding_box* decoded_bbox,
- const bool prior_is_normalized, const size_t image_width, const size_t image_height, const bool clip)
+private:
+ static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams, const detection_output_node& arg)
{
- float prior_bbox_xmin = prior_bbox.xmin;
- float prior_bbox_ymin = prior_bbox.ymin;
- float prior_bbox_xmax = prior_bbox.xmax;
- float prior_bbox_ymax = prior_bbox.ymax;
-
- float bbox_xmin = bbox.xmin;
- float bbox_ymin = bbox.ymin;
- float bbox_xmax = bbox.xmax;
- float bbox_ymax = bbox.ymax;
-
- if (!prior_is_normalized) {
- prior_bbox_xmin /= image_width;
- prior_bbox_ymin /= image_height;
- prior_bbox_xmax /= image_width;
- prior_bbox_ymax /= image_height;
- }
-
- switch (code_type)
- {
- case prior_box_code_type::corner:
- {
- if (variance_encoded_in_target)
- {
- // variance is encoded in target, we simply need to add the offset predictions.
- decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin;
- decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin;
- decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax;
- decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax;
- }
- else
- {
- // variance is encoded in bbox, we need to scale the offset accordingly.
- decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin;
- decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin;
- decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax;
- decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax;
- }
- break;
- }
- case prior_box_code_type::center_size:
- {
- const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
- assert(prior_width > 0);
- const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
- assert(prior_height > 0);
- const float prior_center_x = (prior_bbox_xmin + prior_bbox_xmax) / 2.f;
- const float prior_center_y = (prior_bbox_ymin + prior_bbox_ymax) / 2.f;
- float decode_bbox_center_x, decode_bbox_center_y;
- float decode_bbox_width, decode_bbox_height;
- if (variance_encoded_in_target)
- {
- // variance is encoded in target, we simply need to restore the offset predictions.
- decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
- decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
- decode_bbox_width = (exp(bbox_xmax) * prior_width);
- decode_bbox_height = (exp(bbox_ymax) * prior_height);
- }
- else
- {
- // variance is encoded in bbox, we need to scale the offset accordingly.
- decode_bbox_center_x = prior_variance[0] * bbox_xmin * prior_width + prior_center_x;
- decode_bbox_center_y = prior_variance[1] * bbox_ymin * prior_height + prior_center_y;
- decode_bbox_width = (exp(prior_variance[2] * bbox_xmax) * prior_width);
- decode_bbox_height = (exp(prior_variance[3] * bbox_ymax) * prior_height);
- }
- decoded_bbox->xmin = decode_bbox_center_x - decode_bbox_width / 2.0f;
- decoded_bbox->ymin = decode_bbox_center_y - decode_bbox_height / 2.0f;
- decoded_bbox->xmax = decode_bbox_center_x + decode_bbox_width / 2.0f;
- decoded_bbox->ymax = decode_bbox_center_y + decode_bbox_height / 2.0f;
- break;
- }
- case prior_box_code_type::corner_size:
- {
- const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
- assert(prior_width > 0);
- const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
- assert(prior_height > 0);
- if (variance_encoded_in_target)
- {
- // variance is encoded in target, we simply need to add the offset predictions.
- decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin * prior_width;
- decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin * prior_height;
- decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax * prior_width;
- decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax * prior_height;
- }
- else
- {
- // variance is encoded in bbox, we need to scale the offset accordingly.
- decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin * prior_width;
- decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin * prior_height;
- decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax * prior_width;
- decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax * prior_height;
- }
- break;
- }
- default:
- {
- assert(0);
- }
- }
-
- if (clip)
- {
- decoded_bbox->xmin = std::max(0.0f, std::min(1.0f, decoded_bbox->xmin));
- decoded_bbox->ymin = std::max(0.0f, std::min(1.0f, decoded_bbox->ymin));
- decoded_bbox->xmax = std::max(0.0f, std::min(1.0f, decoded_bbox->xmax));
- decoded_bbox->ymax = std::max(0.0f, std::min(1.0f, decoded_bbox->ymax));
- }
+ auto primitive = arg.get_primitive();
+ detectOutParams.keep_top_k = primitive->keep_top_k;
+ detectOutParams.num_classes = primitive->num_classes;
+ detectOutParams.top_k = primitive->top_k;
+ detectOutParams.background_label_id = primitive->background_label_id;
+ detectOutParams.code_type = (int32_t)primitive->code_type;
+ detectOutParams.share_location = primitive->share_location;
+ detectOutParams.variance_encoded_in_target = primitive->variance_encoded_in_target;
+ detectOutParams.nms_threshold = primitive->nms_threshold;
+ detectOutParams.eta = primitive->eta;
+ detectOutParams.confidence_threshold = primitive->confidence_threshold;
+ detectOutParams.prior_coordinates_offset = primitive->prior_coordinates_offset;
+ detectOutParams.prior_info_size = primitive->prior_info_size;
+ detectOutParams.prior_is_normalized = primitive->prior_is_normalized;
+ detectOutParams.input_width = primitive->input_width;
+ detectOutParams.input_heigh = primitive->input_height;
+ detectOutParams.conf_size_x = arg.confidence().get_output_layout().get_buffer_size().spatial[0];
+ detectOutParams.conf_size_y = arg.confidence().get_output_layout().get_buffer_size().spatial[1];
+ detectOutParams.conf_padding_x = arg.confidence().get_output_layout().data_padding.lower_size().spatial[0];
+ detectOutParams.conf_padding_y = arg.confidence().get_output_layout().data_padding.lower_size().spatial[1];
}
- static void apply_nms(const std::vector<bounding_box>& bboxes,
- std::vector<std::pair<float,int>>& scores,
- const float nms_threshold, const float eta, const int top_k)
- {
- // Sort the scores in descending order and keep top_k scores if needed.
- if ((top_k != -1) && ((int)scores.size() > top_k))
- {
- std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return (p1.first > p2.first) || (p1.first == p2.first && p1.second < p2.second); });
- scores.resize(top_k);
- }
- else
- {
- std::stable_sort(scores.begin(), scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return p1.first > p2.first; });
- }
-
- // NMS
- float adaptive_threshold = nms_threshold;
- int post_nms_count = 0;
- for (auto score_index : scores)
- {
- const int idx = score_index.second;
- bounding_box box1(bboxes[idx]);
- bool keep = true;
- for (int i = 0; i < post_nms_count; ++i)
- {
- if (!keep)
- {
- break;
- }
- bounding_box box2(bboxes[scores[i].second]);
- bool intersecting = (box1.xmin < box2.xmax) & (box2.xmin < box1.xmax) & (box1.ymin < box2.ymax) & (box2.ymin < box1.ymax);
- float overlap = 0.0f;
- if (intersecting)
- {
- const float intersect_width = std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin);
- const float intersect_height = std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin);
- const float intersect_size = intersect_width * intersect_height;
- overlap = intersect_size / (box1.area() + box2.area() - intersect_size);
- }
- keep = (overlap <= adaptive_threshold);
- }
- if (keep)
- {
- scores[post_nms_count] = score_index;
- ++post_nms_count;
- }
- if (keep && eta < 1 && adaptive_threshold > 0.5)
- {
- adaptive_threshold *= eta;
- }
- }
- scores.resize(post_nms_count); // scores holds only the items that were kept after the NMS.
- }
+public:
- template<typename dtype>
- void generate_detections(const detection_output_inst& instance, const int num_of_images, const std::vector<std::vector<std::vector<bounding_box>>>& all_bboxes, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences)
+ static primitive_impl* create(const detection_output_node& arg)
{
- mem_lock<dtype> lock{ instance.output_memory() };
- auto out_ptr = lock.begin();
-
- const auto& args = instance.argument;
- std::vector<std::vector<std::vector<std::pair<float,int>>>> final_detections; // Per image -> For each label: Pair (score, prior index)
- for (int image = 0; image < num_of_images; ++image)
- {
- const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
- std::vector<std::vector<std::pair<float,int>>>& conf_per_image = confidences[image];
- int num_det = 0;
-#ifdef FIX_OPENMP_RELEASE_ISSUE
-#ifdef OPENMP_FOUND
- int num_available_threads = omp_get_max_threads();
- //half available threads usage shows the best perf results for both SKL (4c8t) and APL (4c4t) for this part of detection output
- int num_threads_to_use = (omp_in_parallel() == 0) ? num_available_threads/2 : 1;
- #pragma omp parallel for num_threads(num_threads_to_use) reduction(+:num_det)
-#endif
-#endif
- for (int cls = 0; cls < (int)args.num_classes; ++cls)
- {
- if ((int)cls == args.background_label_id)
- {
- conf_per_image[cls].clear();
- continue; // Skip background class.
- }
- std::vector<std::pair<float,int>>& scores = conf_per_image[cls];
- const int label = args.share_location ? 0 : cls;
- apply_nms(bboxes_per_image[label], scores, args.nms_threshold, args.eta, args.top_k);
- num_det += (int)scores.size();
- }
- if (num_det > args.keep_top_k)
- {
- std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
- score_index_pairs.reserve(num_det);
- for (int label = 0; label < (int)args.num_classes; ++label)
- {
- std::vector<std::pair<float, int>>& scores = confidences[image][label];
- for (std::pair<float, int> score_index : scores)
- {
- score_index_pairs.emplace_back(score_index.first, std::make_pair(label, score_index.second));
- }
- }
-
- // Keep top k results per image.
- auto sort_function = [](const std::pair<float, std::pair<int, int>>& p1, const std::pair<float, std::pair<int, int>>& p2) { return p1.first > p2.first; };
- if ((int)score_index_pairs.size() > args.keep_top_k)
- {
- std::partial_sort(score_index_pairs.begin(), score_index_pairs.begin() + args.keep_top_k, score_index_pairs.end(), sort_function);
- score_index_pairs.resize(args.keep_top_k);
- }
- else
- {
- std::sort(score_index_pairs.begin(), score_index_pairs.end(), sort_function);
- }
-
- // Store the new indices.
- std::vector<std::vector<std::pair<float,int>>> new_indices(args.num_classes);
- for (int j = 0; j < (int)score_index_pairs.size(); ++j)
- {
- int label = score_index_pairs[j].second.first;
- int idx = score_index_pairs[j].second.second;
- new_indices[label].emplace_back(score_index_pairs[j].first, idx);
- }
- final_detections.emplace_back(new_indices);
- }
- else
- {
- final_detections.emplace_back(confidences[image]);
- }
- }
-
- int count = 0;
- for (int image = 0; image < num_of_images; ++image)
- {
- const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
- auto& final_detections_per_image = final_detections[image];
- for (int label = 0; label < (int)final_detections_per_image.size(); ++label)
- {
- int loc_label = args.share_location ? 0 : label;
- const std::vector<bounding_box>& bboxes = bboxes_per_image[loc_label];
- const std::vector<std::pair<float,int>>& label_detections = final_detections_per_image[label];
- for (std::pair<float,int> score_prior : label_detections)
- {
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)(float)image;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = args.decrease_label_id ? ((dtype)((float)label - 1.0f))
- : (dtype)(float)label;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)score_prior.first;
- const bounding_box& bbox = bboxes[score_prior.second];
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)bbox.xmin;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)bbox.ymin;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)bbox.xmax;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)bbox.ymax;
- ++count;
- }
- }
- }
-
- //In case number of detections is smaller than keep_top_k fill the rest of the buffer with invalid image id (-1).
- while (count < num_of_images*args.keep_top_k)
+ if (!arg.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled())
{
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)-1.f;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = (dtype)0.f;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)0.f;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)0.f;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)0.f;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)0.f;
- out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)0.f;
- ++count;
+ return runDetectOutCpu(arg);
}
- }
- // Compute the linear index taking the padding into account.
- static inline int get_linear_feature_index(const int batch_id, const int feature_id, const int input_buffer_size_f, const int input_buffer_size_y,
- const int input_buffer_size_x, const int input_padding_lower_y, const int input_padding_lower_x)
- {
- // This helper function assumes input layout with x_size = 1 and y_size = 1;
- // Location and confidence inputs should be tensors with size {b,f,1,1}.
- // This is validated in detection output primitive instance creation.
+ auto detect_out_params = get_default_params<kernel_selector::detection_output_params>(arg);
+ auto detect_out_optional_params = get_default_optional_params<kernel_selector::detection_output_optional_params>(arg.get_program());
- int input_idx = (batch_id * input_buffer_size_f + feature_id) * input_buffer_size_y * input_buffer_size_x;
- input_idx += input_padding_lower_y * input_buffer_size_x + input_padding_lower_x;
+ setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);
- return input_idx;
- }
+ auto& kernel_selector = kernel_selector::detection_output_kernel_selector::Instance();
+ auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);
- template<typename dtype>
- void extract_locations_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>>& locations, const int num_of_priors, const int num_loc_classes)
- {
- const bool share_location = instance.argument.share_location;
- auto& input_location = instance.location_memory();
- const int num_of_images = (int)locations.size();
+ CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments");
- mem_lock<dtype> lock{ input_location };
- auto location_data = lock.begin();
+ auto detect_out = new detection_output_gpu(arg, best_kernels[0]);
- assert(num_of_priors * num_loc_classes * PRIOR_BOX_SIZE == input_location.get_layout().size.feature[0]);
-
- const auto& input_buffer_size = input_location.get_layout().get_buffer_size();
- const int input_buffer_size_x = input_buffer_size.spatial[0];
- const int input_buffer_size_y = input_buffer_size.spatial[1];
- const int input_buffer_size_f = input_buffer_size.feature[0];
- const auto& input_padding = input_location.get_layout().data_padding;
- const int input_padding_lower_x = input_padding.lower_size().spatial[0];
- const int input_padding_lower_y = input_padding.lower_size().spatial[1];
-
- for (int image = 0; image < num_of_images; ++image)
- {
- std::vector<std::vector<bounding_box>>& label_to_bbox = locations[image];
- label_to_bbox.resize(num_loc_classes);
- for (int cls = 0; cls < num_loc_classes; ++cls)
- {
- int label = share_location ? 0 : cls;
- auto & bboxes = label_to_bbox[label];
- bboxes.resize(num_of_priors);
-
- for (int prior = 0; prior < num_of_priors; ++prior)
- {
- int idx = prior * num_loc_classes * PRIOR_BOX_SIZE;
- bboxes[prior].xmin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE, input_buffer_size_f, input_buffer_size_y,
- input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
- bboxes[prior].ymin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 1, input_buffer_size_f, input_buffer_size_y,
- input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
- bboxes[prior].xmax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 2, input_buffer_size_f, input_buffer_size_y,
- input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
- bboxes[prior].ymax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 3, input_buffer_size_f, input_buffer_size_y,
- input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
- }
- }
- }
+ return detect_out;
}
+};
- template<typename dtype>
- void extract_prior_boxes_and_variances(const detection_output_inst& instance, const bool variance_encoded_in_target,
- const int32_t prior_info_size, const int32_t prior_coordinates_offset,
- std::vector<bounding_box>& prior_bboxes, std::vector<std::array<float, PRIOR_BOX_SIZE>>& prior_variances)
- {
- auto& input_prior_box = instance.prior_box_memory();
- const int num_of_priors = (int)prior_bboxes.size();
+primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel)
+{
+ return new detection_output_gpu(arg, kernel);
+}
- mem_lock<dtype> lock{ input_prior_box };
- auto prior_box_data = lock.begin();
+/************************ Detection Output keep_top_k part ************************/
- for (int prior = 0; prior < num_of_priors; ++prior)
- {
- int idx = prior * prior_info_size + prior_coordinates_offset;
- prior_bboxes[prior] = bounding_box((float)(prior_box_data[idx]), (float)(prior_box_data[idx + 1]), (float)(prior_box_data[idx + 2]), (float)(prior_box_data[idx + 3]));
- idx += num_of_priors * prior_info_size;
- for (int j = 0; j < PRIOR_BOX_SIZE; ++j)
- {
- prior_variances[prior][j] = variance_encoded_in_target ? 0.0f : (float)(prior_box_data[idx + j]);
- }
- }
- }
+struct detection_output_sort_gpu : typed_primitive_gpu_impl<detection_output_sort>
+{
+ using parent = typed_primitive_gpu_impl<detection_output_sort>;
+ using parent::parent;
- template<typename dtype>
- void extract_confidences_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences, const int num_of_priors)
+private:
+ static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams, const detection_output_sort_node& arg)
{
- const int num_classes = instance.argument.num_classes;
-
- const int num_of_images = (int)confidences.size();
- auto& input_confidence = instance.confidence_memory();
- const float confidence_threshold = instance.argument.confidence_threshold;
-
- mem_lock<dtype> lock{ &input_confidence };
- auto confidence_data = lock.begin();
-
- assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]);
-
- const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size();
- const int input_buffer_size_x = input_buffer_size.spatial[0];
- const int input_buffer_size_y = input_buffer_size.spatial[1];
- const int input_buffer_size_f = input_buffer_size.feature[0];
- const auto& input_padding = input_confidence.get_layout().data_padding;
- const int input_padding_lower_x = input_padding.lower_size().spatial[0];
- const int input_padding_lower_y = input_padding.lower_size().spatial[1];
- const int stride = input_buffer_size_y * input_buffer_size_x;
-
- for (int image = 0; image < num_of_images; ++image)
+ if (arg.get_dependency(0).is_type<detection_output>())
{
- std::vector<std::vector<std::pair<float,int>>>& label_to_scores = confidences[image];
- label_to_scores.resize(num_classes);
- int idx = get_linear_feature_index(image, 0, input_buffer_size_f, input_buffer_size_y,
- input_buffer_size_x, input_padding_lower_y, input_padding_lower_x);
-
- if (stride == 1 && std::is_same<dtype, float>::value)
- {
- float const* confidence_ptr_float = (float const*)(&(*confidence_data));
- confidence_ptr_float += idx;
- __m128 threshold = _mm_load_ps1(&confidence_threshold);
- for (int prior = 0; prior < num_of_priors; ++prior)
- {
- int cls = 0;
- for (; cls + 3 < num_classes; cls += 4)
- {
- __m128 scores = _mm_loadu_ps(confidence_ptr_float);
- confidence_ptr_float += 4;
- __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold));
- if (_mm_testz_si128(mask128, mask128))
- {
- continue;
- }
- int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128));
- if (mask & 1)
- {
- label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior);
- }
- if (mask & 2)
- {
- int score = _mm_extract_ps(scores, 1);
- float s = reinterpret_cast<float&>(score);
- label_to_scores[cls + 1].emplace_back(s, prior);
- }
- if (mask & 4)
- {
- int score = _mm_extract_ps(scores, 2);
- float s = reinterpret_cast<float&>(score);
- label_to_scores[cls + 2].emplace_back(s, prior);
- }
- if (mask & 8)
- {
- int score = _mm_extract_ps(scores, 3);
- float s = reinterpret_cast<float&>(score);
- label_to_scores[cls + 3].emplace_back(s, prior);
- }
- }
- for (; cls < num_classes; ++cls)
- {
- float score = *confidence_ptr_float;
- if (score > confidence_threshold)
- {
- label_to_scores[cls].emplace_back(score, prior);
- }
- ++confidence_ptr_float;
- }
- }
- }
- else
- {
- for (int prior = 0; prior < num_of_priors; ++prior)
- {
- for (int cls = 0; cls < num_classes; ++cls)
- {
- float score = (float)confidence_data[idx];
- if (score > confidence_threshold)
- {
- label_to_scores[cls].emplace_back(score, prior);
- }
- idx += stride;
- }
- }
- }
+ auto primitive = arg.get_dependency(0).as<detection_output>().get_primitive();
+ detectOutParams.keep_top_k = primitive->keep_top_k;
+ detectOutParams.num_classes = primitive->num_classes;
+ detectOutParams.num_images = arg.get_dependency(0).as<detection_output>().location().get_output_layout().size.batch[0];
+ detectOutParams.top_k = primitive->top_k;
+ detectOutParams.share_location = primitive->share_location;
+ detectOutParams.background_label_id = primitive->background_label_id;
}
- }
-
- template<typename dtype>
- void prepare_data(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>> &bboxes, std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences)
- {
- assert(bboxes.size() == confidences.size());
-
- const auto& args = instance.argument;
-
- const int num_of_images = (int)bboxes.size();
- const int num_of_priors = instance.prior_box_memory().get_layout().size.spatial[1] / args.prior_info_size;
- const int num_loc_classes = args.share_location ? 1 : args.num_classes;
-
- // Extract locations per image.
- std::vector<std::vector<std::vector<bounding_box>>> locations(num_of_images); // Per image : label -> bounding boxes.
- extract_locations_per_image<dtype>(instance, locations, num_of_priors, num_loc_classes);
-
- // Extract prior boxes - same within a batch.
- std::vector<bounding_box> prior_bboxes(num_of_priors); // Prior-Boxes (identical for all images since we assume all images in a batch are of same dimension).
- std::vector<std::array<float, PRIOR_BOX_SIZE>> prior_variances(num_of_priors); // Variances per prior-box (identical for all images since we assume all images in a batch are of same dimension).
- extract_prior_boxes_and_variances<dtype>(instance, args.variance_encoded_in_target,
- args.prior_info_size, args.prior_coordinates_offset,
- prior_bboxes, prior_variances);
-
- // Create the decoded bounding boxes according to locations predictions and prior-boxes.
- for (int image = 0; image < num_of_images; ++image)
+ else
{
- std::vector<std::vector<bounding_box>>& bboxes_per_image = bboxes[image];
- bboxes_per_image.resize(num_loc_classes);
- locations[image].resize(num_loc_classes);
- for (int cls = 0; cls < num_loc_classes; ++cls)
- {
- const int label = args.share_location ? 0 : cls;
- if (!args.share_location && label == args.background_label_id)
- {
- continue; // Skip background class.
- }
- const std::vector<bounding_box>& label_loc_preds = locations[image][label];
- int label_loc_preds_size = (int)label_loc_preds.size();
- assert((int)prior_bboxes.size() == label_loc_preds_size);
-
- bboxes_per_image[label].clear();
-
- for (int i = 0; i < label_loc_preds_size; ++i)
- {
- bounding_box decoded_bbox;
- decode_bounding_box(prior_bboxes[i], prior_variances[i], args.code_type, args.variance_encoded_in_target, label_loc_preds[i], &decoded_bbox,
- args.prior_is_normalized, args.input_width, args.input_height, args.clip);
- bboxes_per_image[label].emplace_back(decoded_bbox);
- }
- }
+ auto primitive = arg.get_primitive();
+ detectOutParams.keep_top_k = primitive->keep_top_k;
+ detectOutParams.num_classes = primitive->num_classes;
+ detectOutParams.num_images = primitive->num_images;
+ detectOutParams.top_k = primitive->top_k;
+ detectOutParams.share_location = primitive->share_location;
+ detectOutParams.background_label_id = primitive->background_label_id;
}
-
- // Extract confidences per image.
- extract_confidences_per_image<dtype>(instance, confidences, num_of_priors);
}
- event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, detection_output_inst& instance) override
+public:
+ static primitive_impl* create(const detection_output_sort_node& arg)
{
- for (auto& a : events)
- {
- a->wait();
- }
-
- auto ev = instance.get_network().get_engine().create_user_event(false);
-
- const int num_of_images = instance.location_memory().get_layout().size.batch[0]; //batch size
+ auto detect_out_params = get_default_params<kernel_selector::detection_output_params>(arg);
+ auto detect_out_optional_params = get_default_optional_params<kernel_selector::detection_output_optional_params>(arg.get_program());
- std::vector<std::vector<std::vector<bounding_box>>> bboxes(num_of_images); // Per image : label -> decoded bounding boxes.
- std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(num_of_images); // Per image : class -> confidences per bounding box.
+ setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);
- if (instance.location_memory().get_layout().data_type == data_types::f32)
- {
- prepare_data<data_type_to_type<data_types::f32>::type>(instance, bboxes, confidences);
-
- generate_detections<data_type_to_type<data_types::f32>::type>(instance, num_of_images, bboxes, confidences);
- }
- else
- {
- prepare_data<data_type_to_type<data_types::f16>::type>(instance, bboxes, confidences);
+ auto& kernel_selector = kernel_selector::detection_output_sort_kernel_selector::Instance();
+ auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);
- generate_detections<data_type_to_type<data_types::f16>::type>(instance, num_of_images, bboxes, confidences);
- }
+ CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments");
- dynamic_cast<cldnn::user_event*>(ev.get())->set(); // set as complete
- // TODO: consider refactoring create_user_event() to return cldnn::user_event*
- return ev;
- }
+ auto detect_out = new detection_output_sort_gpu(arg, best_kernels[0]);
- static primitive_impl* create(const detection_output_node& arg)
- {
- return new detection_output_gpu(arg);
+ return detect_out;
}
};
+primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel)
+{
+ return new detection_output_sort_gpu(arg, kernel);
+}
+
namespace {
struct attach {
- attach()
- {
+ attach() {
implementation_map<detection_output>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), detection_output_gpu::create);
implementation_map<detection_output>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), detection_output_gpu::create);
+ implementation_map<detection_output_sort>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), detection_output_sort_gpu::create);
+ implementation_map<detection_output_sort>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), detection_output_sort_gpu::create);
}
-
~attach() {}
};
attach attach_impl;
}
+
}}