Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / detection_output_cpu.cpp
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
#include "detection_output_inst.h"
#include "kernel.h"
#include "network_impl.h"
#include "implementation_map.h"
#include "math_utils.h"

#include <algorithm>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include <xmmintrin.h>

#ifdef FIX_OPENMP_RELEASE_ISSUE
#ifdef OPENMP_FOUND
#include <omp.h>
#endif
#endif
34
35 namespace cldnn { namespace gpu {
36
namespace {
    // Axis-aligned rectangle described by its two corner points,
    // (xmin, ymin) and (xmax, ymax). A default-constructed box is all zeros.
    struct bounding_box
    {
        float xmin = 0.0f;
        float ymin = 0.0f;
        float xmax = 0.0f;
        float ymax = 0.0f;

        bounding_box() = default;

        bounding_box(const float x_min, const float y_min, const float x_max, const float y_max)
            : xmin(x_min), ymin(y_min), xmax(x_max), ymax(y_max)
        {}

        // Width times height. No ordering of the corners is enforced, so the
        // result is negative when exactly one of the extents is inverted.
        float area() const
        {
            const float width = xmax - xmin;
            const float height = ymax - ymin;
            return width * height;
        }
    };
}
57
58 /************************ Detection Output CPU ************************/
59 struct detection_output_cpu : typed_primitive_impl<detection_output>
60 {
61     const detection_output_node& outer;
62
63     detection_output_cpu(const detection_output_node& outer)
64         : outer(outer)
65     {}
66
67     static void decode_bounding_box(
68         const bounding_box& prior_bbox, const std::array<float, PRIOR_BOX_SIZE>& prior_variance,
69         const prior_box_code_type code_type, const bool variance_encoded_in_target,
70         const bounding_box& bbox, bounding_box* decoded_bbox,
71         const bool prior_is_normalized, const size_t image_width, const size_t image_height, const bool clip_before_nms)
72     {
73         float prior_bbox_xmin = prior_bbox.xmin;
74         float prior_bbox_ymin = prior_bbox.ymin;
75         float prior_bbox_xmax = prior_bbox.xmax;
76         float prior_bbox_ymax = prior_bbox.ymax;
77
78         float bbox_xmin = bbox.xmin;
79         float bbox_ymin = bbox.ymin;
80         float bbox_xmax = bbox.xmax;
81         float bbox_ymax = bbox.ymax;
82
83         if (!prior_is_normalized) {
84             prior_bbox_xmin /= image_width;
85             prior_bbox_ymin /= image_height;
86             prior_bbox_xmax /= image_width;
87             prior_bbox_ymax /= image_height;
88         }
89
90         switch (code_type)
91         {
92             case prior_box_code_type::corner:
93             {
94                 if (variance_encoded_in_target)
95                 {
96                     // variance is encoded in target, we simply need to add the offset predictions.
97                     decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin;
98                     decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin;
99                     decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax;
100                     decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax;
101                 }
102                 else
103                 {
104                     // variance is encoded in bbox, we need to scale the offset accordingly.
105                     decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin;
106                     decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin;
107                     decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax;
108                     decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax;
109                 }
110                 break;
111             }
112             case prior_box_code_type::center_size:
113             {
114                 const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
115                 assert(prior_width > 0);
116                 const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
117                 assert(prior_height > 0);
118                 const float prior_center_x = (prior_bbox_xmin + prior_bbox_xmax) / 2.f;
119                 const float prior_center_y = (prior_bbox_ymin + prior_bbox_ymax) / 2.f;
120                 float decode_bbox_center_x, decode_bbox_center_y;
121                 float decode_bbox_width, decode_bbox_height;
122                 if (variance_encoded_in_target)
123                 {
124                     // variance is encoded in target, we simply need to restore the offset predictions.
125                     decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
126                     decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
127                     decode_bbox_width = (exp(bbox_xmax) * prior_width);
128                     decode_bbox_height = (exp(bbox_ymax) * prior_height);
129                 }
130                 else
131                 {
132                     // variance is encoded in bbox, we need to scale the offset accordingly.
133                     decode_bbox_center_x = prior_variance[0] * bbox_xmin * prior_width + prior_center_x;
134                     decode_bbox_center_y = prior_variance[1] * bbox_ymin * prior_height + prior_center_y;
135                     decode_bbox_width = (exp(prior_variance[2] * bbox_xmax) * prior_width);
136                     decode_bbox_height = (exp(prior_variance[3] * bbox_ymax) * prior_height);
137                 }
138                 decoded_bbox->xmin = decode_bbox_center_x - decode_bbox_width  / 2.0f;
139                 decoded_bbox->ymin = decode_bbox_center_y - decode_bbox_height / 2.0f;
140                 decoded_bbox->xmax = decode_bbox_center_x + decode_bbox_width  / 2.0f;
141                 decoded_bbox->ymax = decode_bbox_center_y + decode_bbox_height / 2.0f;
142                 break;
143             }
144             case prior_box_code_type::corner_size:
145             {
146                 const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
147                 assert(prior_width > 0);
148                 const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
149                 assert(prior_height > 0);
150                 if (variance_encoded_in_target)
151                 {
152                     // variance is encoded in target, we simply need to add the offset predictions.
153                     decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin * prior_width;
154                     decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin * prior_height;
155                     decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax * prior_width;
156                     decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax * prior_height;
157                 }
158                 else
159                 {
160                     // variance is encoded in bbox, we need to scale the offset accordingly.
161                     decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin * prior_width;
162                     decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin * prior_height;
163                     decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax * prior_width;
164                     decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax * prior_height;
165                 }
166                 break;
167             }
168             default:
169             {
170                 assert(0);
171             }
172         }
173
174         if (clip_before_nms)
175         {
176             decoded_bbox->xmin = std::max(0.0f, std::min(1.0f, decoded_bbox->xmin));
177             decoded_bbox->ymin = std::max(0.0f, std::min(1.0f, decoded_bbox->ymin));
178             decoded_bbox->xmax = std::max(0.0f, std::min(1.0f, decoded_bbox->xmax));
179             decoded_bbox->ymax = std::max(0.0f, std::min(1.0f, decoded_bbox->ymax));
180         }
181     }
182
183     static void apply_nms(const std::vector<bounding_box>& bboxes,
184         std::vector<std::pair<float,int>>& scores,
185         const float nms_threshold, const float eta, const int top_k)
186     {
187         // Sort the scores in descending order and keep top_k scores if needed.
188         if ((top_k != -1) && ((int)scores.size() > top_k))
189         {
190             std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return (p1.first > p2.first) || (p1.first == p2.first && p1.second < p2.second); });
191             scores.resize(top_k);
192         }
193         else
194         {
195             std::stable_sort(scores.begin(), scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return p1.first > p2.first; });
196         }
197
198         // NMS
199         float adaptive_threshold = nms_threshold;
200         int post_nms_count = 0;
201
202         for (auto score_index : scores)
203         {
204             const int idx = score_index.second;
205             bounding_box box1(bboxes[idx]);
206             bool keep = true;
207             for (int i = 0; i < post_nms_count; ++i)
208             {
209                 if (!keep)
210                 {
211                     break;
212                 }
213                 bounding_box box2(bboxes[scores[i].second]);
214                 bool intersecting = (box1.xmin < box2.xmax) & (box2.xmin < box1.xmax) & (box1.ymin < box2.ymax) & (box2.ymin < box1.ymax);
215                 float overlap = 0.0f;
216                 if (intersecting)
217                 {
218                     const float intersect_width = std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin);
219                     const float intersect_height = std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin);
220                     const float intersect_size = intersect_width * intersect_height;
221                     overlap = intersect_size / (box1.area() + box2.area() - intersect_size);
222                 }
223                 keep = (overlap <= adaptive_threshold);
224             }
225             if (keep)
226             {
227                 scores[post_nms_count] = score_index;
228                 ++post_nms_count;
229             }
230             if (keep && eta < 1 && adaptive_threshold > 0.5)
231             {
232                 adaptive_threshold *= eta;
233             }
234         }
235         scores.resize(post_nms_count); // scores holds only the items that were kept after the NMS.
236     }
237
238     template<typename dtype>
239     void generate_detections(const detection_output_inst& instance, const int num_of_images, const std::vector<std::vector<std::vector<bounding_box>>>& all_bboxes, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences)
240     {
241         mem_lock<dtype> lock{ instance.output_memory() };
242         auto out_ptr = lock.begin();
243
244         const auto& args = instance.argument;
245         std::vector<std::vector<std::vector<std::pair<float,int>>>> final_detections; // Per image -> For each label: Pair (score, prior index)
246         for (int image = 0; image < num_of_images; ++image)
247         {
248             const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
249             std::vector<std::vector<std::pair<float,int>>>& conf_per_image = confidences[image];
250             int num_det = 0;
251 #ifdef FIX_OPENMP_RELEASE_ISSUE
252 #ifdef OPENMP_FOUND
253             int num_available_threads = omp_get_max_threads();
254             //half available threads usage shows the best perf results for both SKL (4c8t) and APL (4c4t) for this part of detection output
255             int num_threads_to_use = (omp_in_parallel() == 0) ? num_available_threads/2 : 1;
256             #pragma omp parallel for num_threads(num_threads_to_use) reduction(+:num_det)
257 #endif
258 #endif
259             for (int cls = 0; cls < (int)args.num_classes; ++cls)
260             {
261                 if ((int)cls == args.background_label_id)
262                 {
263                     conf_per_image[cls].clear();
264                     continue; // Skip background class.
265                 }
266                 std::vector<std::pair<float,int>>& scores = conf_per_image[cls];
267                 const int label = args.share_location ? 0 : cls;
268                 apply_nms(bboxes_per_image[label], scores, args.nms_threshold, args.eta, args.top_k);
269                 num_det += (int)scores.size();
270             }
271             if (num_det > args.keep_top_k)
272             {
273                 std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
274                 score_index_pairs.reserve(num_det);
275                 for (int label = 0; label < (int)args.num_classes; ++label)
276                 {
277                     std::vector<std::pair<float, int>>& scores = confidences[image][label];
278                     for (std::pair<float, int> score_index : scores)
279                     {
280                         score_index_pairs.emplace_back(score_index.first, std::make_pair(label, score_index.second));
281                     }
282                 }
283
284                 // Keep top k results per image.
285                 auto sort_function = [](const std::pair<float, std::pair<int, int>>& p1, const std::pair<float, std::pair<int, int>>& p2) { return p1.first > p2.first; };
286                 if ((int)score_index_pairs.size() > args.keep_top_k)
287                 {
288                     std::partial_sort(score_index_pairs.begin(), score_index_pairs.begin() + args.keep_top_k, score_index_pairs.end(), sort_function);
289                     score_index_pairs.resize(args.keep_top_k);
290                 }
291                 else
292                 {
293                     std::sort(score_index_pairs.begin(), score_index_pairs.end(), sort_function);
294                 }
295
296                 // Store the new indices.
297                 std::vector<std::vector<std::pair<float,int>>> new_indices(args.num_classes);
298                 for (int j = 0; j < (int)score_index_pairs.size(); ++j)
299                 {
300                     int label = score_index_pairs[j].second.first;
301                     int idx = score_index_pairs[j].second.second;
302                     new_indices[label].emplace_back(score_index_pairs[j].first, idx);
303                 }
304                 final_detections.emplace_back(new_indices);
305             }
306             else
307             {
308                 final_detections.emplace_back(confidences[image]);
309             }
310         }
311
312         int count = 0;
313         for (int image = 0; image < num_of_images; ++image)
314         {
315             const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
316             auto& final_detections_per_image = final_detections[image];
317             for (int label = 0; label < (int)final_detections_per_image.size(); ++label)
318             {
319                 int loc_label = args.share_location ? 0 : label;
320                 const std::vector<bounding_box>& bboxes = bboxes_per_image[loc_label];
321                 const std::vector<std::pair<float,int>>& label_detections = final_detections_per_image[label];
322                 for (std::pair<float,int> score_prior : label_detections)
323                 {
324                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)(float)image;
325                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = args.decrease_label_id ? ((dtype)((float)label - 1.0f))
326                                                                                             : (dtype)(float)label;
327                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)score_prior.first;
328                     const bounding_box& bbox = bboxes[score_prior.second];
329                     float xmin = bbox.xmin;
330                     float ymin = bbox.ymin;
331                     float xmax = bbox.xmax;
332                     float ymax = bbox.ymax;
333
334                     if (args.clip_after_nms)
335                     {
336                         xmin = std::max(0.0f, std::min(1.0f, xmin));
337                         ymin = std::max(0.0f, std::min(1.0f, ymin));
338                         xmax = std::max(0.0f, std::min(1.0f, xmax));
339                         ymax = std::max(0.0f, std::min(1.0f, ymax));
340                     }
341
342                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)xmin;
343                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)ymin;
344                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)xmax;
345                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)ymax;
346                     ++count;
347                 }
348             }
349         }
350
351         //In case number of detections is smaller than keep_top_k fill the rest of the buffer with invalid image id (-1).
352         while (count < num_of_images*args.keep_top_k)
353         {
354             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)-1.f;
355             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = (dtype)0.f;
356             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)0.f;
357             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)0.f;
358             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)0.f;
359             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)0.f;
360             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)0.f;
361             ++count;
362         }
363     }
364
365     // Compute the linear index taking the padding into account.
366     static inline int get_linear_feature_index(const int batch_id, const int feature_id, const int input_buffer_size_f, const int input_buffer_size_y,
367         const int input_buffer_size_x, const int input_padding_lower_y, const int input_padding_lower_x)
368     {
369         // This helper function assumes input layout with x_size = 1 and y_size = 1;
370         // Location and confidence inputs should be tensors with size {b,f,1,1}.
371         // This is validated in detection output primitive instance creation.
372
373         int input_idx = (batch_id * input_buffer_size_f + feature_id) * input_buffer_size_y * input_buffer_size_x;
374         input_idx += input_padding_lower_y * input_buffer_size_x + input_padding_lower_x;
375
376         return input_idx;
377     }
378
379     template<typename dtype>
380     void extract_locations_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>>& locations, const int num_of_priors, const int num_loc_classes)
381     {
382         const bool share_location = instance.argument.share_location;
383         auto& input_location = instance.location_memory();
384         const int num_of_images = (int)locations.size();
385
386         mem_lock<dtype> lock{ input_location };
387         auto location_data = lock.begin();
388
389         assert(num_of_priors * num_loc_classes * PRIOR_BOX_SIZE == input_location.get_layout().size.feature[0]);
390
391         const auto& input_buffer_size = input_location.get_layout().get_buffer_size();
392         const int input_buffer_size_x = input_buffer_size.spatial[0];
393         const int input_buffer_size_y = input_buffer_size.spatial[1];
394         const int input_buffer_size_f = input_buffer_size.feature[0];
395         const auto& input_padding = input_location.get_layout().data_padding;
396         const int input_padding_lower_x = input_padding.lower_size().spatial[0];
397         const int input_padding_lower_y = input_padding.lower_size().spatial[1];
398
399         for (int image = 0; image < num_of_images; ++image)
400         {
401             std::vector<std::vector<bounding_box>>& label_to_bbox = locations[image];
402             label_to_bbox.resize(num_loc_classes);
403             for (int cls = 0; cls < num_loc_classes; ++cls)
404             {
405                 int label = share_location ? 0 : cls;
406                 auto & bboxes = label_to_bbox[label];
407                 bboxes.resize(num_of_priors);
408
409                 for (int prior = 0; prior < num_of_priors; ++prior)
410                 {
411                     int idx = prior * num_loc_classes * PRIOR_BOX_SIZE;
412                     bboxes[prior].xmin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE, input_buffer_size_f, input_buffer_size_y,
413                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
414                     bboxes[prior].ymin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 1, input_buffer_size_f, input_buffer_size_y,
415                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
416                     bboxes[prior].xmax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 2, input_buffer_size_f, input_buffer_size_y,
417                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
418                     bboxes[prior].ymax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 3, input_buffer_size_f, input_buffer_size_y,
419                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
420                 }
421             }
422         }
423     }
424
425     template<typename dtype>
426     void extract_prior_boxes_and_variances(const detection_output_inst& instance, const bool variance_encoded_in_target,
427                                            const int32_t prior_info_size, const int32_t prior_coordinates_offset, const int32_t images_count,
428                                            std::vector<bounding_box>& prior_bboxes,  std::vector<std::array<float, PRIOR_BOX_SIZE>>& prior_variances)
429     {
430         auto& input_prior_box = instance.prior_box_memory();
431         const int num_of_priors = (int)prior_bboxes.size() / images_count;
432
433         mem_lock<dtype> lock{ input_prior_box };
434         for (int i = 0; i < images_count; i++)
435         {
436             auto prior_box_data = lock.begin() + i*num_of_priors*prior_info_size * (variance_encoded_in_target ? 1 : 2);
437
438             for (int prior = 0; prior < num_of_priors; ++prior)
439             {
440                 int idx = prior * prior_info_size + prior_coordinates_offset;
441                 prior_bboxes[i*num_of_priors + prior] = bounding_box((float)(prior_box_data[idx]), (float)(prior_box_data[idx + 1]), (float)(prior_box_data[idx + 2]), (float)(prior_box_data[idx + 3]));
442                 idx += num_of_priors * prior_info_size;
443                 for (int j = 0; j < PRIOR_BOX_SIZE; ++j)
444                 {
445                     prior_variances[i*num_of_priors + prior][j] = variance_encoded_in_target ? 0.0f : (float)(prior_box_data[idx + j]);
446                 }
447             }
448
449         }
450     }
451
452     template<typename dtype>
453     void extract_confidences_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences, const int num_of_priors)
454     {
455         const int num_classes = instance.argument.num_classes;
456
457         const int num_of_images = (int)confidences.size();
458         auto& input_confidence = instance.confidence_memory();
459         const float confidence_threshold = instance.argument.confidence_threshold;
460
461         mem_lock<dtype> lock{ &input_confidence };
462         auto confidence_data = lock.begin();
463
464         assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]);
465
466         const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size();
467         const int input_buffer_size_x = input_buffer_size.spatial[0];
468         const int input_buffer_size_y = input_buffer_size.spatial[1];
469         const int input_buffer_size_f = input_buffer_size.feature[0];
470         const auto& input_padding = input_confidence.get_layout().data_padding;
471         const int input_padding_lower_x = input_padding.lower_size().spatial[0];
472         const int input_padding_lower_y = input_padding.lower_size().spatial[1];
473         const int stride = input_buffer_size_y * input_buffer_size_x;
474
475         for (int image = 0; image < num_of_images; ++image)
476         {
477             std::vector<std::vector<std::pair<float,int>>>& label_to_scores = confidences[image];
478             label_to_scores.resize(num_classes);
479             int idx = get_linear_feature_index(image, 0, input_buffer_size_f, input_buffer_size_y,
480                 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x);
481
482             if (stride == 1 && std::is_same<dtype, float>::value)
483             {
484                 float const* confidence_ptr_float = (float const*)(&(*confidence_data));
485                 confidence_ptr_float += idx;
486                 __m128 threshold = _mm_load_ps1(&confidence_threshold);
487                 for (int prior = 0; prior < num_of_priors; ++prior)
488                 {
489                     int cls = 0;
490                     for (; cls + 3 < num_classes; cls += 4)
491                     {
492                         __m128 scores = _mm_loadu_ps(confidence_ptr_float);
493                         confidence_ptr_float += 4;
494                         __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold));
495                         if (_mm_testz_si128(mask128, mask128))
496                         {
497                             continue;
498                         }
499                         int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128));
500                         if (mask & 1)
501                         {
502                             label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior);
503                         }
504                         if (mask & 2)
505                         {
506                             int score = _mm_extract_ps(scores, 1);
507                             float s = reinterpret_cast<float&>(score);
508                             label_to_scores[cls + 1].emplace_back(s, prior);
509                         }
510                         if (mask & 4)
511                         {
512                             int score = _mm_extract_ps(scores, 2);
513                             float s = reinterpret_cast<float&>(score);
514                             label_to_scores[cls + 2].emplace_back(s, prior);
515                         }
516                         if (mask & 8)
517                         {
518                             int score = _mm_extract_ps(scores, 3);
519                             float s = reinterpret_cast<float&>(score);
520                             label_to_scores[cls + 3].emplace_back(s, prior);
521                         }
522                     }
523                     for (; cls < num_classes; ++cls)
524                     {
525                         float score = *confidence_ptr_float;
526                         if (score > confidence_threshold)
527                         {
528                             label_to_scores[cls].emplace_back(score, prior);
529                         }
530                         ++confidence_ptr_float;
531                     }
532                 }
533             }
534             else
535             {
536                 for (int prior = 0; prior < num_of_priors; ++prior)
537                 {
538                     for (int cls = 0; cls < num_classes; ++cls)
539                     {
540                         float score = (float)confidence_data[idx];
541                         if (score > confidence_threshold)
542                         {
543                             label_to_scores[cls].emplace_back(score, prior);
544                         }
545                         idx += stride;
546                     }
547                 }
548             }
549         }
550     }
551
552     template<typename dtype>
553     void prepare_data(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>> &bboxes, std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences)
554     {
555         assert(bboxes.size() == confidences.size());
556
557         const auto& args = instance.argument;
558
559         const int num_of_images = (int)bboxes.size();
560         const int num_of_priors = instance.prior_box_memory().get_layout().size.spatial[1] / args.prior_info_size;
561         const int num_loc_classes = args.share_location ? 1 : args.num_classes;
562
563         // Extract locations per image.
564         std::vector<std::vector<std::vector<bounding_box>>> locations(num_of_images); // Per image : label -> bounding boxes.
565         extract_locations_per_image<dtype>(instance, locations, num_of_priors, num_loc_classes);
566
567         int32_t batches_in_prior_boxes = instance.prior_box_memory().get_layout().size.batch[0];
568         std::vector<bounding_box> prior_bboxes(batches_in_prior_boxes*num_of_priors); // Prior-Boxes (identical for all images since we assume all images in a batch are of same dimension).
569         std::vector<std::array<float, PRIOR_BOX_SIZE>> prior_variances(batches_in_prior_boxes*num_of_priors); // Variances per prior-box (identical for all images since we assume all images in a batch are of same dimension).
570         extract_prior_boxes_and_variances<dtype>(instance, args.variance_encoded_in_target,
571                                                  args.prior_info_size, args.prior_coordinates_offset, batches_in_prior_boxes,
572                                                  prior_bboxes, prior_variances);
573
574         // Create the decoded bounding boxes according to locations predictions and prior-boxes.
575         for (int image = 0; image < num_of_images; ++image)
576         {
577             std::vector<std::vector<bounding_box>>& bboxes_per_image = bboxes[image];
578             bboxes_per_image.resize(num_loc_classes);
579             locations[image].resize(num_loc_classes);
580             for (int cls = 0; cls < num_loc_classes; ++cls)
581             {
582                 const int label = args.share_location ? 0 : cls;
583                 if (!args.share_location && label == args.background_label_id)
584                 {
585                     continue; // Skip background class.
586                 }
587                 const std::vector<bounding_box>& label_loc_preds = locations[image][label];
588                 int label_loc_preds_size = (int)label_loc_preds.size();
589
590                 bboxes_per_image[label].clear();
591
592                 for (int i = 0; i < label_loc_preds_size; ++i)
593                 {
594                     bounding_box decoded_bbox;
595                     int32_t pb_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i;
596                     int32_t var_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i;
597                     decode_bounding_box(prior_bboxes[pb_offset], prior_variances[var_offset],
598                                         args.code_type, args.variance_encoded_in_target, label_loc_preds[i], &decoded_bbox,
599                                         args.prior_is_normalized, args.input_width, args.input_height, args.clip_before_nms);
600                     bboxes_per_image[label].emplace_back(decoded_bbox);
601                 }
602             }
603         }
604
605         // Extract confidences per image.
606         extract_confidences_per_image<dtype>(instance, confidences, num_of_priors);
607     }
608
609     event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, detection_output_inst& instance) override
610     {
611         for (auto& a : events)
612         {
613             a->wait();
614         }
615
616         auto ev = instance.get_network().get_engine().create_user_event(false);
617
618         const int num_of_images = instance.location_memory().get_layout().size.batch[0]; //batch size
619
620         std::vector<std::vector<std::vector<bounding_box>>> bboxes(num_of_images); // Per image : label -> decoded bounding boxes.
621         std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(num_of_images); // Per image : class -> confidences per bounding box.
622
623         if (instance.location_memory().get_layout().data_type == data_types::f32)
624         {
625             prepare_data<data_type_to_type<data_types::f32>::type>(instance, bboxes, confidences);
626
627             generate_detections<data_type_to_type<data_types::f32>::type>(instance, num_of_images, bboxes, confidences);
628         }
629         else
630         {
631             prepare_data<data_type_to_type<data_types::f16>::type>(instance, bboxes, confidences);
632
633             generate_detections<data_type_to_type<data_types::f16>::type>(instance, num_of_images, bboxes, confidences);
634         }
635
636         dynamic_cast<cldnn::user_event*>(ev.get())->set(); // set as complete
637         // TODO: consider refactoring create_user_event() to return cldnn::user_event*
638         return ev;
639     }
640
641     static primitive_impl* create(const detection_output_node& arg)
642     {
643         return new detection_output_cpu(arg);
644     }
645 };
646
647 primitive_impl* runDetectOutCpu(const detection_output_node& arg)
648 {
649     return new detection_output_cpu(arg);
650 }
651
652 }}