2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "detection_output_inst.h"
19 #include "network_impl.h"
20 #include "implementation_map.h"
21 #include "math_utils.h"
26 #include <type_traits>
27 #include <xmmintrin.h>
29 #ifdef FIX_OPENMP_RELEASE_ISSUE
35 namespace cldnn { namespace gpu {
// Axis-aligned box stored as its two corners: (xmin, ymin) and (xmax, ymax).
struct bounding_box
{
    float xmin;
    float ymin;
    float xmax;
    float ymax;

    // Creates a box with every coordinate set to zero.
    bounding_box() : xmin(0), ymin(0), xmax(0), ymax(0) {}

    // Creates a box from explicit corner coordinates.
    bounding_box(const float xmin, const float ymin, const float xmax, const float ymax) :
        xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {}

    // Computes the area of a bounding box.
    // No ordering of the corners is enforced, so the result is negative when exactly
    // one of the extents is inverted.
    float area() const
    {
        return (xmax - xmin) * (ymax - ymin);
    }
};
58 /************************ Detection Output CPU ************************/
59 struct detection_output_cpu : typed_primitive_impl<detection_output>
61 const detection_output_node& outer;
63 detection_output_cpu(const detection_output_node& outer)
67 static void decode_bounding_box(
68 const bounding_box& prior_bbox, const std::array<float, PRIOR_BOX_SIZE>& prior_variance,
69 const prior_box_code_type code_type, const bool variance_encoded_in_target,
70 const bounding_box& bbox, bounding_box* decoded_bbox,
71 const bool prior_is_normalized, const size_t image_width, const size_t image_height, const bool clip_before_nms)
73 float prior_bbox_xmin = prior_bbox.xmin;
74 float prior_bbox_ymin = prior_bbox.ymin;
75 float prior_bbox_xmax = prior_bbox.xmax;
76 float prior_bbox_ymax = prior_bbox.ymax;
78 float bbox_xmin = bbox.xmin;
79 float bbox_ymin = bbox.ymin;
80 float bbox_xmax = bbox.xmax;
81 float bbox_ymax = bbox.ymax;
83 if (!prior_is_normalized) {
84 prior_bbox_xmin /= image_width;
85 prior_bbox_ymin /= image_height;
86 prior_bbox_xmax /= image_width;
87 prior_bbox_ymax /= image_height;
92 case prior_box_code_type::corner:
94 if (variance_encoded_in_target)
96 // variance is encoded in target, we simply need to add the offset predictions.
97 decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin;
98 decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin;
99 decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax;
100 decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax;
104 // variance is encoded in bbox, we need to scale the offset accordingly.
105 decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin;
106 decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin;
107 decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax;
108 decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax;
112 case prior_box_code_type::center_size:
114 const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
115 assert(prior_width > 0);
116 const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
117 assert(prior_height > 0);
118 const float prior_center_x = (prior_bbox_xmin + prior_bbox_xmax) / 2.f;
119 const float prior_center_y = (prior_bbox_ymin + prior_bbox_ymax) / 2.f;
120 float decode_bbox_center_x, decode_bbox_center_y;
121 float decode_bbox_width, decode_bbox_height;
122 if (variance_encoded_in_target)
124 // variance is encoded in target, we simply need to restore the offset predictions.
125 decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
126 decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
127 decode_bbox_width = (exp(bbox_xmax) * prior_width);
128 decode_bbox_height = (exp(bbox_ymax) * prior_height);
132 // variance is encoded in bbox, we need to scale the offset accordingly.
133 decode_bbox_center_x = prior_variance[0] * bbox_xmin * prior_width + prior_center_x;
134 decode_bbox_center_y = prior_variance[1] * bbox_ymin * prior_height + prior_center_y;
135 decode_bbox_width = (exp(prior_variance[2] * bbox_xmax) * prior_width);
136 decode_bbox_height = (exp(prior_variance[3] * bbox_ymax) * prior_height);
138 decoded_bbox->xmin = decode_bbox_center_x - decode_bbox_width / 2.0f;
139 decoded_bbox->ymin = decode_bbox_center_y - decode_bbox_height / 2.0f;
140 decoded_bbox->xmax = decode_bbox_center_x + decode_bbox_width / 2.0f;
141 decoded_bbox->ymax = decode_bbox_center_y + decode_bbox_height / 2.0f;
144 case prior_box_code_type::corner_size:
146 const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
147 assert(prior_width > 0);
148 const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
149 assert(prior_height > 0);
150 if (variance_encoded_in_target)
152 // variance is encoded in target, we simply need to add the offset predictions.
153 decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin * prior_width;
154 decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin * prior_height;
155 decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax * prior_width;
156 decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax * prior_height;
160 // variance is encoded in bbox, we need to scale the offset accordingly.
161 decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin * prior_width;
162 decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin * prior_height;
163 decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax * prior_width;
164 decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax * prior_height;
176 decoded_bbox->xmin = std::max(0.0f, std::min(1.0f, decoded_bbox->xmin));
177 decoded_bbox->ymin = std::max(0.0f, std::min(1.0f, decoded_bbox->ymin));
178 decoded_bbox->xmax = std::max(0.0f, std::min(1.0f, decoded_bbox->xmax));
179 decoded_bbox->ymax = std::max(0.0f, std::min(1.0f, decoded_bbox->ymax));
183 static void apply_nms(const std::vector<bounding_box>& bboxes,
184 std::vector<std::pair<float,int>>& scores,
185 const float nms_threshold, const float eta, const int top_k)
187 // Sort the scores in descending order and keep top_k scores if needed.
188 if ((top_k != -1) && ((int)scores.size() > top_k))
190 std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return (p1.first > p2.first) || (p1.first == p2.first && p1.second < p2.second); });
191 scores.resize(top_k);
195 std::stable_sort(scores.begin(), scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return p1.first > p2.first; });
199 float adaptive_threshold = nms_threshold;
200 int post_nms_count = 0;
202 for (auto score_index : scores)
204 const int idx = score_index.second;
205 bounding_box box1(bboxes[idx]);
207 for (int i = 0; i < post_nms_count; ++i)
213 bounding_box box2(bboxes[scores[i].second]);
214 bool intersecting = (box1.xmin < box2.xmax) & (box2.xmin < box1.xmax) & (box1.ymin < box2.ymax) & (box2.ymin < box1.ymax);
215 float overlap = 0.0f;
218 const float intersect_width = std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin);
219 const float intersect_height = std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin);
220 const float intersect_size = intersect_width * intersect_height;
221 overlap = intersect_size / (box1.area() + box2.area() - intersect_size);
223 keep = (overlap <= adaptive_threshold);
227 scores[post_nms_count] = score_index;
230 if (keep && eta < 1 && adaptive_threshold > 0.5)
232 adaptive_threshold *= eta;
235 scores.resize(post_nms_count); // scores holds only the items that were kept after the NMS.
238 template<typename dtype>
239 void generate_detections(const detection_output_inst& instance, const int num_of_images, const std::vector<std::vector<std::vector<bounding_box>>>& all_bboxes, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences)
241 mem_lock<dtype> lock{ instance.output_memory() };
242 auto out_ptr = lock.begin();
244 const auto& args = instance.argument;
245 std::vector<std::vector<std::vector<std::pair<float,int>>>> final_detections; // Per image -> For each label: Pair (score, prior index)
246 for (int image = 0; image < num_of_images; ++image)
248 const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
249 std::vector<std::vector<std::pair<float,int>>>& conf_per_image = confidences[image];
251 #ifdef FIX_OPENMP_RELEASE_ISSUE
253 int num_available_threads = omp_get_max_threads();
254 //half available threads usage shows the best perf results for both SKL (4c8t) and APL (4c4t) for this part of detection output
255 int num_threads_to_use = (omp_in_parallel() == 0) ? num_available_threads/2 : 1;
256 #pragma omp parallel for num_threads(num_threads_to_use) reduction(+:num_det)
259 for (int cls = 0; cls < (int)args.num_classes; ++cls)
261 if ((int)cls == args.background_label_id)
263 conf_per_image[cls].clear();
264 continue; // Skip background class.
266 std::vector<std::pair<float,int>>& scores = conf_per_image[cls];
267 const int label = args.share_location ? 0 : cls;
268 apply_nms(bboxes_per_image[label], scores, args.nms_threshold, args.eta, args.top_k);
269 num_det += (int)scores.size();
271 if (num_det > args.keep_top_k)
273 std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
274 score_index_pairs.reserve(num_det);
275 for (int label = 0; label < (int)args.num_classes; ++label)
277 std::vector<std::pair<float, int>>& scores = confidences[image][label];
278 for (std::pair<float, int> score_index : scores)
280 score_index_pairs.emplace_back(score_index.first, std::make_pair(label, score_index.second));
284 // Keep top k results per image.
285 auto sort_function = [](const std::pair<float, std::pair<int, int>>& p1, const std::pair<float, std::pair<int, int>>& p2) { return p1.first > p2.first; };
286 if ((int)score_index_pairs.size() > args.keep_top_k)
288 std::partial_sort(score_index_pairs.begin(), score_index_pairs.begin() + args.keep_top_k, score_index_pairs.end(), sort_function);
289 score_index_pairs.resize(args.keep_top_k);
293 std::sort(score_index_pairs.begin(), score_index_pairs.end(), sort_function);
296 // Store the new indices.
297 std::vector<std::vector<std::pair<float,int>>> new_indices(args.num_classes);
298 for (int j = 0; j < (int)score_index_pairs.size(); ++j)
300 int label = score_index_pairs[j].second.first;
301 int idx = score_index_pairs[j].second.second;
302 new_indices[label].emplace_back(score_index_pairs[j].first, idx);
304 final_detections.emplace_back(new_indices);
308 final_detections.emplace_back(confidences[image]);
313 for (int image = 0; image < num_of_images; ++image)
315 const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
316 auto& final_detections_per_image = final_detections[image];
317 for (int label = 0; label < (int)final_detections_per_image.size(); ++label)
319 int loc_label = args.share_location ? 0 : label;
320 const std::vector<bounding_box>& bboxes = bboxes_per_image[loc_label];
321 const std::vector<std::pair<float,int>>& label_detections = final_detections_per_image[label];
322 for (std::pair<float,int> score_prior : label_detections)
324 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)(float)image;
325 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = args.decrease_label_id ? ((dtype)((float)label - 1.0f))
326 : (dtype)(float)label;
327 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)score_prior.first;
328 const bounding_box& bbox = bboxes[score_prior.second];
329 float xmin = bbox.xmin;
330 float ymin = bbox.ymin;
331 float xmax = bbox.xmax;
332 float ymax = bbox.ymax;
334 if (args.clip_after_nms)
336 xmin = std::max(0.0f, std::min(1.0f, xmin));
337 ymin = std::max(0.0f, std::min(1.0f, ymin));
338 xmax = std::max(0.0f, std::min(1.0f, xmax));
339 ymax = std::max(0.0f, std::min(1.0f, ymax));
342 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)xmin;
343 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)ymin;
344 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)xmax;
345 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)ymax;
351 //In case number of detections is smaller than keep_top_k fill the rest of the buffer with invalid image id (-1).
352 while (count < num_of_images*args.keep_top_k)
354 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)-1.f;
355 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = (dtype)0.f;
356 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)0.f;
357 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)0.f;
358 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)0.f;
359 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)0.f;
360 out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)0.f;
// Compute the linear index taking the padding into account.
//
// batch_id / feature_id   - logical coordinates of the element.
// input_buffer_size_*     - padded buffer extents along f, y and x.
// input_padding_lower_*   - lower padding along y and x.
// Returns the offset of element (batch_id, feature_id, 0, 0) inside the padded buffer.
static inline int get_linear_feature_index(const int batch_id, const int feature_id, const int input_buffer_size_f, const int input_buffer_size_y,
    const int input_buffer_size_x, const int input_padding_lower_y, const int input_padding_lower_x)
{
    // This helper function assumes input layout with x_size = 1 and y_size = 1;
    // Location and confidence inputs should be tensors with size {b,f,1,1}.
    // This is validated in detection output primitive instance creation.

    // Start of the (batch, feature) plane ...
    const int plane_start = (batch_id * input_buffer_size_f + feature_id) * input_buffer_size_y * input_buffer_size_x;
    // ... plus the offset of its first non-padding element.
    const int padding_offset = input_padding_lower_y * input_buffer_size_x + input_padding_lower_x;

    return plane_start + padding_offset;
}
379 template<typename dtype>
380 void extract_locations_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>>& locations, const int num_of_priors, const int num_loc_classes)
382 const bool share_location = instance.argument.share_location;
383 auto& input_location = instance.location_memory();
384 const int num_of_images = (int)locations.size();
386 mem_lock<dtype> lock{ input_location };
387 auto location_data = lock.begin();
389 assert(num_of_priors * num_loc_classes * PRIOR_BOX_SIZE == input_location.get_layout().size.feature[0]);
391 const auto& input_buffer_size = input_location.get_layout().get_buffer_size();
392 const int input_buffer_size_x = input_buffer_size.spatial[0];
393 const int input_buffer_size_y = input_buffer_size.spatial[1];
394 const int input_buffer_size_f = input_buffer_size.feature[0];
395 const auto& input_padding = input_location.get_layout().data_padding;
396 const int input_padding_lower_x = input_padding.lower_size().spatial[0];
397 const int input_padding_lower_y = input_padding.lower_size().spatial[1];
399 for (int image = 0; image < num_of_images; ++image)
401 std::vector<std::vector<bounding_box>>& label_to_bbox = locations[image];
402 label_to_bbox.resize(num_loc_classes);
403 for (int cls = 0; cls < num_loc_classes; ++cls)
405 int label = share_location ? 0 : cls;
406 auto & bboxes = label_to_bbox[label];
407 bboxes.resize(num_of_priors);
409 for (int prior = 0; prior < num_of_priors; ++prior)
411 int idx = prior * num_loc_classes * PRIOR_BOX_SIZE;
412 bboxes[prior].xmin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE, input_buffer_size_f, input_buffer_size_y,
413 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
414 bboxes[prior].ymin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 1, input_buffer_size_f, input_buffer_size_y,
415 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
416 bboxes[prior].xmax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 2, input_buffer_size_f, input_buffer_size_y,
417 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
418 bboxes[prior].ymax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 3, input_buffer_size_f, input_buffer_size_y,
419 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
425 template<typename dtype>
426 void extract_prior_boxes_and_variances(const detection_output_inst& instance, const bool variance_encoded_in_target,
427 const int32_t prior_info_size, const int32_t prior_coordinates_offset, const int32_t images_count,
428 std::vector<bounding_box>& prior_bboxes, std::vector<std::array<float, PRIOR_BOX_SIZE>>& prior_variances)
430 auto& input_prior_box = instance.prior_box_memory();
431 const int num_of_priors = (int)prior_bboxes.size() / images_count;
433 mem_lock<dtype> lock{ input_prior_box };
434 for (int i = 0; i < images_count; i++)
436 auto prior_box_data = lock.begin() + i*num_of_priors*prior_info_size * (variance_encoded_in_target ? 1 : 2);
438 for (int prior = 0; prior < num_of_priors; ++prior)
440 int idx = prior * prior_info_size + prior_coordinates_offset;
441 prior_bboxes[i*num_of_priors + prior] = bounding_box((float)(prior_box_data[idx]), (float)(prior_box_data[idx + 1]), (float)(prior_box_data[idx + 2]), (float)(prior_box_data[idx + 3]));
442 idx += num_of_priors * prior_info_size;
443 for (int j = 0; j < PRIOR_BOX_SIZE; ++j)
445 prior_variances[i*num_of_priors + prior][j] = variance_encoded_in_target ? 0.0f : (float)(prior_box_data[idx + j]);
452 template<typename dtype>
453 void extract_confidences_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences, const int num_of_priors)
455 const int num_classes = instance.argument.num_classes;
457 const int num_of_images = (int)confidences.size();
458 auto& input_confidence = instance.confidence_memory();
459 const float confidence_threshold = instance.argument.confidence_threshold;
461 mem_lock<dtype> lock{ &input_confidence };
462 auto confidence_data = lock.begin();
464 assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]);
466 const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size();
467 const int input_buffer_size_x = input_buffer_size.spatial[0];
468 const int input_buffer_size_y = input_buffer_size.spatial[1];
469 const int input_buffer_size_f = input_buffer_size.feature[0];
470 const auto& input_padding = input_confidence.get_layout().data_padding;
471 const int input_padding_lower_x = input_padding.lower_size().spatial[0];
472 const int input_padding_lower_y = input_padding.lower_size().spatial[1];
473 const int stride = input_buffer_size_y * input_buffer_size_x;
475 for (int image = 0; image < num_of_images; ++image)
477 std::vector<std::vector<std::pair<float,int>>>& label_to_scores = confidences[image];
478 label_to_scores.resize(num_classes);
479 int idx = get_linear_feature_index(image, 0, input_buffer_size_f, input_buffer_size_y,
480 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x);
482 if (stride == 1 && std::is_same<dtype, float>::value)
484 float const* confidence_ptr_float = (float const*)(&(*confidence_data));
485 confidence_ptr_float += idx;
486 __m128 threshold = _mm_load_ps1(&confidence_threshold);
487 for (int prior = 0; prior < num_of_priors; ++prior)
490 for (; cls + 3 < num_classes; cls += 4)
492 __m128 scores = _mm_loadu_ps(confidence_ptr_float);
493 confidence_ptr_float += 4;
494 __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold));
495 if (_mm_testz_si128(mask128, mask128))
499 int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128));
502 label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior);
506 int score = _mm_extract_ps(scores, 1);
507 float s = reinterpret_cast<float&>(score);
508 label_to_scores[cls + 1].emplace_back(s, prior);
512 int score = _mm_extract_ps(scores, 2);
513 float s = reinterpret_cast<float&>(score);
514 label_to_scores[cls + 2].emplace_back(s, prior);
518 int score = _mm_extract_ps(scores, 3);
519 float s = reinterpret_cast<float&>(score);
520 label_to_scores[cls + 3].emplace_back(s, prior);
523 for (; cls < num_classes; ++cls)
525 float score = *confidence_ptr_float;
526 if (score > confidence_threshold)
528 label_to_scores[cls].emplace_back(score, prior);
530 ++confidence_ptr_float;
536 for (int prior = 0; prior < num_of_priors; ++prior)
538 for (int cls = 0; cls < num_classes; ++cls)
540 float score = (float)confidence_data[idx];
541 if (score > confidence_threshold)
543 label_to_scores[cls].emplace_back(score, prior);
552 template<typename dtype>
553 void prepare_data(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>> &bboxes, std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences)
555 assert(bboxes.size() == confidences.size());
557 const auto& args = instance.argument;
559 const int num_of_images = (int)bboxes.size();
560 const int num_of_priors = instance.prior_box_memory().get_layout().size.spatial[1] / args.prior_info_size;
561 const int num_loc_classes = args.share_location ? 1 : args.num_classes;
563 // Extract locations per image.
564 std::vector<std::vector<std::vector<bounding_box>>> locations(num_of_images); // Per image : label -> bounding boxes.
565 extract_locations_per_image<dtype>(instance, locations, num_of_priors, num_loc_classes);
567 int32_t batches_in_prior_boxes = instance.prior_box_memory().get_layout().size.batch[0];
568 std::vector<bounding_box> prior_bboxes(batches_in_prior_boxes*num_of_priors); // Prior-Boxes (identical for all images since we assume all images in a batch are of same dimension).
569 std::vector<std::array<float, PRIOR_BOX_SIZE>> prior_variances(batches_in_prior_boxes*num_of_priors); // Variances per prior-box (identical for all images since we assume all images in a batch are of same dimension).
570 extract_prior_boxes_and_variances<dtype>(instance, args.variance_encoded_in_target,
571 args.prior_info_size, args.prior_coordinates_offset, batches_in_prior_boxes,
572 prior_bboxes, prior_variances);
574 // Create the decoded bounding boxes according to locations predictions and prior-boxes.
575 for (int image = 0; image < num_of_images; ++image)
577 std::vector<std::vector<bounding_box>>& bboxes_per_image = bboxes[image];
578 bboxes_per_image.resize(num_loc_classes);
579 locations[image].resize(num_loc_classes);
580 for (int cls = 0; cls < num_loc_classes; ++cls)
582 const int label = args.share_location ? 0 : cls;
583 if (!args.share_location && label == args.background_label_id)
585 continue; // Skip background class.
587 const std::vector<bounding_box>& label_loc_preds = locations[image][label];
588 int label_loc_preds_size = (int)label_loc_preds.size();
590 bboxes_per_image[label].clear();
592 for (int i = 0; i < label_loc_preds_size; ++i)
594 bounding_box decoded_bbox;
595 int32_t pb_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i;
596 int32_t var_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i;
597 decode_bounding_box(prior_bboxes[pb_offset], prior_variances[var_offset],
598 args.code_type, args.variance_encoded_in_target, label_loc_preds[i], &decoded_bbox,
599 args.prior_is_normalized, args.input_width, args.input_height, args.clip_before_nms);
600 bboxes_per_image[label].emplace_back(decoded_bbox);
605 // Extract confidences per image.
606 extract_confidences_per_image<dtype>(instance, confidences, num_of_priors);
609 event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, detection_output_inst& instance) override
611 for (auto& a : events)
616 auto ev = instance.get_network().get_engine().create_user_event(false);
618 const int num_of_images = instance.location_memory().get_layout().size.batch[0]; //batch size
620 std::vector<std::vector<std::vector<bounding_box>>> bboxes(num_of_images); // Per image : label -> decoded bounding boxes.
621 std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(num_of_images); // Per image : class -> confidences per bounding box.
623 if (instance.location_memory().get_layout().data_type == data_types::f32)
625 prepare_data<data_type_to_type<data_types::f32>::type>(instance, bboxes, confidences);
627 generate_detections<data_type_to_type<data_types::f32>::type>(instance, num_of_images, bboxes, confidences);
631 prepare_data<data_type_to_type<data_types::f16>::type>(instance, bboxes, confidences);
633 generate_detections<data_type_to_type<data_types::f16>::type>(instance, num_of_images, bboxes, confidences);
636 dynamic_cast<cldnn::user_event*>(ev.get())->set(); // set as complete
637 // TODO: consider refactoring create_user_event() to return cldnn::user_event*
641 static primitive_impl* create(const detection_output_node& arg)
643 return new detection_output_cpu(arg);
647 primitive_impl* runDetectOutCpu(const detection_output_node& arg)
649 return new detection_output_cpu(arg);