inference-engine/thirdparty/clDNN/src/gpu/detection_output_cpu.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 #include "detection_output_inst.h"
  18 #include "kernel.h"
  19 #include "network_impl.h"
  20 #include "implementation_map.h"
  21 #include "math_utils.h"
  22
  23 #include <algorithm>
  24 #include <stdexcept>
  25 #include <string>
  26 #include <type_traits>
  27 #include <xmmintrin.h>
  28
  29 #ifdef FIX_OPENMP_RELEASE_ISSUE
  30 #ifdef OPENMP_FOUND
  31 #include <omp.h>
  32 #endif
  33 #endif
  34
  35 namespace cldnn { namespace gpu {
  36
  37 namespace {
  38     struct bounding_box
  39     {
  40         float xmin;
  41         float ymin;
  42         float xmax;
  43         float ymax;
  44
  45         bounding_box() : xmin(0), ymin(0), xmax(0), ymax(0) {}
  46
  47         bounding_box(const float xmin, const float ymin, const float xmax, const float ymax) :
  48             xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {}
  49
  50         // Computes the area of a bounding box.
  51         float area() const
  52         {
  53             return (xmax - xmin) * (ymax - ymin);
  54         }
  55     };
  56 }
  57
  58 /************************ Detection Output CPU ************************/
  59 struct detection_output_cpu : typed_primitive_impl<detection_output>
  60 {
  61     const detection_output_node& outer;
  62
  63     detection_output_cpu(const detection_output_node& outer)
  64         : outer(outer)
  65     {}
  66
  67     static void decode_bounding_box(
  68         const bounding_box& prior_bbox, const std::array<float, PRIOR_BOX_SIZE>& prior_variance,
  69         const prior_box_code_type code_type, const bool variance_encoded_in_target,
  70         const bounding_box& bbox, bounding_box* decoded_bbox,
  71         const bool prior_is_normalized, const size_t image_width, const size_t image_height, const bool clip_before_nms)
  72     {
  73         float prior_bbox_xmin = prior_bbox.xmin;
  74         float prior_bbox_ymin = prior_bbox.ymin;
  75         float prior_bbox_xmax = prior_bbox.xmax;
  76         float prior_bbox_ymax = prior_bbox.ymax;
  77
  78         float bbox_xmin = bbox.xmin;
  79         float bbox_ymin = bbox.ymin;
  80         float bbox_xmax = bbox.xmax;
  81         float bbox_ymax = bbox.ymax;
  82
  83         if (!prior_is_normalized) {
  84             prior_bbox_xmin /= image_width;
  85             prior_bbox_ymin /= image_height;
  86             prior_bbox_xmax /= image_width;
  87             prior_bbox_ymax /= image_height;
  88         }
  89
  90         switch (code_type)
  91         {
  92             case prior_box_code_type::corner:
  93             {
  94                 if (variance_encoded_in_target)
  95                 {
  96                     // variance is encoded in target, we simply need to add the offset predictions.
  97                     decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin;
  98                     decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin;
  99                     decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax;
 100                     decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax;
 101                 }
 102                 else
 103                 {
 104                     // variance is encoded in bbox, we need to scale the offset accordingly.
 105                     decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin;
 106                     decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin;
 107                     decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax;
 108                     decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax;
 109                 }
 110                 break;
 111             }
 112             case prior_box_code_type::center_size:
 113             {
 114                 const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
 115                 assert(prior_width > 0);
 116                 const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
 117                 assert(prior_height > 0);
 118                 const float prior_center_x = (prior_bbox_xmin + prior_bbox_xmax) / 2.f;
 119                 const float prior_center_y = (prior_bbox_ymin + prior_bbox_ymax) / 2.f;
 120                 float decode_bbox_center_x, decode_bbox_center_y;
 121                 float decode_bbox_width, decode_bbox_height;
 122                 if (variance_encoded_in_target)
 123                 {
 124                     // variance is encoded in target, we simply need to restore the offset predictions.
 125                     decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
 126                     decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
 127                     decode_bbox_width = (exp(bbox_xmax) * prior_width);
 128                     decode_bbox_height = (exp(bbox_ymax) * prior_height);
 129                 }
 130                 else
 131                 {
 132                     // variance is encoded in bbox, we need to scale the offset accordingly.
 133                     decode_bbox_center_x = prior_variance[0] * bbox_xmin * prior_width + prior_center_x;
 134                     decode_bbox_center_y = prior_variance[1] * bbox_ymin * prior_height + prior_center_y;
 135                     decode_bbox_width = (exp(prior_variance[2] * bbox_xmax) * prior_width);
 136                     decode_bbox_height = (exp(prior_variance[3] * bbox_ymax) * prior_height);
 137                 }
 138                 decoded_bbox->xmin = decode_bbox_center_x - decode_bbox_width  / 2.0f;
 139                 decoded_bbox->ymin = decode_bbox_center_y - decode_bbox_height / 2.0f;
 140                 decoded_bbox->xmax = decode_bbox_center_x + decode_bbox_width  / 2.0f;
 141                 decoded_bbox->ymax = decode_bbox_center_y + decode_bbox_height / 2.0f;
 142                 break;
 143             }
 144             case prior_box_code_type::corner_size:
 145             {
 146                 const float prior_width = prior_bbox_xmax - prior_bbox_xmin;
 147                 assert(prior_width > 0);
 148                 const float prior_height = prior_bbox_ymax - prior_bbox_ymin;
 149                 assert(prior_height > 0);
 150                 if (variance_encoded_in_target)
 151                 {
 152                     // variance is encoded in target, we simply need to add the offset predictions.
 153                     decoded_bbox->xmin = prior_bbox_xmin + bbox_xmin * prior_width;
 154                     decoded_bbox->ymin = prior_bbox_ymin + bbox_ymin * prior_height;
 155                     decoded_bbox->xmax = prior_bbox_xmax + bbox_xmax * prior_width;
 156                     decoded_bbox->ymax = prior_bbox_ymax + bbox_ymax * prior_height;
 157                 }
 158                 else
 159                 {
 160                     // variance is encoded in bbox, we need to scale the offset accordingly.
 161                     decoded_bbox->xmin = prior_bbox_xmin + prior_variance[0] * bbox_xmin * prior_width;
 162                     decoded_bbox->ymin = prior_bbox_ymin + prior_variance[1] * bbox_ymin * prior_height;
 163                     decoded_bbox->xmax = prior_bbox_xmax + prior_variance[2] * bbox_xmax * prior_width;
 164                     decoded_bbox->ymax = prior_bbox_ymax + prior_variance[3] * bbox_ymax * prior_height;
 165                 }
 166                 break;
 167             }
 168             default:
 169             {
 170                 assert(0);
 171             }
 172         }
 173
 174         if (clip_before_nms)
 175         {
 176             decoded_bbox->xmin = std::max(0.0f, std::min(1.0f, decoded_bbox->xmin));
 177             decoded_bbox->ymin = std::max(0.0f, std::min(1.0f, decoded_bbox->ymin));
 178             decoded_bbox->xmax = std::max(0.0f, std::min(1.0f, decoded_bbox->xmax));
 179             decoded_bbox->ymax = std::max(0.0f, std::min(1.0f, decoded_bbox->ymax));
 180         }
 181     }
 182
 183     static void apply_nms(const std::vector<bounding_box>& bboxes,
 184         std::vector<std::pair<float,int>>& scores,
 185         const float nms_threshold, const float eta, const int top_k)
 186     {
 187         // Sort the scores in descending order and keep top_k scores if needed.
 188         if ((top_k != -1) && ((int)scores.size() > top_k))
 189         {
 190             std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return (p1.first > p2.first) || (p1.first == p2.first && p1.second < p2.second); });
 191             scores.resize(top_k);
 192         }
 193         else
 194         {
 195             std::stable_sort(scores.begin(), scores.end(), [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) { return p1.first > p2.first; });
 196         }
 197
 198         // NMS
 199         float adaptive_threshold = nms_threshold;
 200         int post_nms_count = 0;
 201
 202         for (auto score_index : scores)
 203         {
 204             const int idx = score_index.second;
 205             bounding_box box1(bboxes[idx]);
 206             bool keep = true;
 207             for (int i = 0; i < post_nms_count; ++i)
 208             {
 209                 if (!keep)
 210                 {
 211                     break;
 212                 }
 213                 bounding_box box2(bboxes[scores[i].second]);
 214                 bool intersecting = (box1.xmin < box2.xmax) & (box2.xmin < box1.xmax) & (box1.ymin < box2.ymax) & (box2.ymin < box1.ymax);
 215                 float overlap = 0.0f;
 216                 if (intersecting)
 217                 {
 218                     const float intersect_width = std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin);
 219                     const float intersect_height = std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin);
 220                     const float intersect_size = intersect_width * intersect_height;
 221                     overlap = intersect_size / (box1.area() + box2.area() - intersect_size);
 222                 }
 223                 keep = (overlap <= adaptive_threshold);
 224             }
 225             if (keep)
 226             {
 227                 scores[post_nms_count] = score_index;
 228                 ++post_nms_count;
 229             }
 230             if (keep && eta < 1 && adaptive_threshold > 0.5)
 231             {
 232                 adaptive_threshold *= eta;
 233             }
 234         }
 235         scores.resize(post_nms_count); // scores holds only the items that were kept after the NMS.
 236     }
 237
 238     template<typename dtype>
 239     void generate_detections(const detection_output_inst& instance, const int num_of_images, const std::vector<std::vector<std::vector<bounding_box>>>& all_bboxes, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences)
 240     {
 241         mem_lock<dtype> lock{ instance.output_memory() };
 242         auto out_ptr = lock.begin();
 243
 244         const auto& args = instance.argument;
 245         std::vector<std::vector<std::vector<std::pair<float,int>>>> final_detections; // Per image -> For each label: Pair (score, prior index)
 246         for (int image = 0; image < num_of_images; ++image)
 247         {
 248             const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
 249             std::vector<std::vector<std::pair<float,int>>>& conf_per_image = confidences[image];
 250             int num_det = 0;
 251 #ifdef FIX_OPENMP_RELEASE_ISSUE
 252 #ifdef OPENMP_FOUND
 253             int num_available_threads = omp_get_max_threads();
 254             //half available threads usage shows the best perf results for both SKL (4c8t) and APL (4c4t) for this part of detection output
 255             int num_threads_to_use = (omp_in_parallel() == 0) ? num_available_threads/2 : 1;
 256             #pragma omp parallel for num_threads(num_threads_to_use) reduction(+:num_det)
 257 #endif
 258 #endif
 259             for (int cls = 0; cls < (int)args.num_classes; ++cls)
 260             {
 261                 if ((int)cls == args.background_label_id)
 262                 {
 263                     conf_per_image[cls].clear();
 264                     continue; // Skip background class.
 265                 }
 266                 std::vector<std::pair<float,int>>& scores = conf_per_image[cls];
 267                 const int label = args.share_location ? 0 : cls;
 268                 apply_nms(bboxes_per_image[label], scores, args.nms_threshold, args.eta, args.top_k);
 269                 num_det += (int)scores.size();
 270             }
 271             if (num_det > args.keep_top_k)
 272             {
 273                 std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
 274                 score_index_pairs.reserve(num_det);
 275                 for (int label = 0; label < (int)args.num_classes; ++label)
 276                 {
 277                     std::vector<std::pair<float, int>>& scores = confidences[image][label];
 278                     for (std::pair<float, int> score_index : scores)
 279                     {
 280                         score_index_pairs.emplace_back(score_index.first, std::make_pair(label, score_index.second));
 281                     }
 282                 }
 283
 284                 // Keep top k results per image.
 285                 auto sort_function = [](const std::pair<float, std::pair<int, int>>& p1, const std::pair<float, std::pair<int, int>>& p2) { return p1.first > p2.first; };
 286                 if ((int)score_index_pairs.size() > args.keep_top_k)
 287                 {
 288                     std::partial_sort(score_index_pairs.begin(), score_index_pairs.begin() + args.keep_top_k, score_index_pairs.end(), sort_function);
 289                     score_index_pairs.resize(args.keep_top_k);
 290                 }
 291                 else
 292                 {
 293                     std::sort(score_index_pairs.begin(), score_index_pairs.end(), sort_function);
 294                 }
 295
 296                 // Store the new indices.
 297                 std::vector<std::vector<std::pair<float,int>>> new_indices(args.num_classes);
 298                 for (int j = 0; j < (int)score_index_pairs.size(); ++j)
 299                 {
 300                     int label = score_index_pairs[j].second.first;
 301                     int idx = score_index_pairs[j].second.second;
 302                     new_indices[label].emplace_back(score_index_pairs[j].first, idx);
 303                 }
 304                 final_detections.emplace_back(new_indices);
 305             }
 306             else
 307             {
 308                 final_detections.emplace_back(confidences[image]);
 309             }
 310         }
 311
 312         int count = 0;
 313         for (int image = 0; image < num_of_images; ++image)
 314         {
 315             const std::vector<std::vector<bounding_box> >& bboxes_per_image = all_bboxes[image];
 316             auto& final_detections_per_image = final_detections[image];
 317             for (int label = 0; label < (int)final_detections_per_image.size(); ++label)
 318             {
 319                 int loc_label = args.share_location ? 0 : label;
 320                 const std::vector<bounding_box>& bboxes = bboxes_per_image[loc_label];
 321                 const std::vector<std::pair<float,int>>& label_detections = final_detections_per_image[label];
 322                 for (std::pair<float,int> score_prior : label_detections)
 323                 {
 324                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)(float)image;
 325                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = args.decrease_label_id ? ((dtype)((float)label - 1.0f))
 326                                                                                             : (dtype)(float)label;
 327                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)score_prior.first;
 328                     const bounding_box& bbox = bboxes[score_prior.second];
 329                     float xmin = bbox.xmin;
 330                     float ymin = bbox.ymin;
 331                     float xmax = bbox.xmax;
 332                     float ymax = bbox.ymax;
 333
 334                     if (args.clip_after_nms)
 335                     {
 336                         xmin = std::max(0.0f, std::min(1.0f, xmin));
 337                         ymin = std::max(0.0f, std::min(1.0f, ymin));
 338                         xmax = std::max(0.0f, std::min(1.0f, xmax));
 339                         ymax = std::max(0.0f, std::min(1.0f, ymax));
 340                     }
 341
 342                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)xmin;
 343                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)ymin;
 344                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)xmax;
 345                     out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)ymax;
 346                     ++count;
 347                 }
 348             }
 349         }
 350
 351         //In case number of detections is smaller than keep_top_k fill the rest of the buffer with invalid image id (-1).
 352         while (count < num_of_images*args.keep_top_k)
 353         {
 354             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE] = (dtype)-1.f;
 355             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 1] = (dtype)0.f;
 356             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 2] = (dtype)0.f;
 357             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 3] = (dtype)0.f;
 358             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 4] = (dtype)0.f;
 359             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 5] = (dtype)0.f;
 360             out_ptr[count * DETECTION_OUTPUT_ROW_SIZE + 6] = (dtype)0.f;
 361             ++count;
 362         }
 363     }
 364
 365     // Compute the linear index taking the padding into account.
 366     static inline int get_linear_feature_index(const int batch_id, const int feature_id, const int input_buffer_size_f, const int input_buffer_size_y,
 367         const int input_buffer_size_x, const int input_padding_lower_y, const int input_padding_lower_x)
 368     {
 369         // This helper function assumes input layout with x_size = 1 and y_size = 1;
 370         // Location and confidence inputs should be tensors with size {b,f,1,1}.
 371         // This is validated in detection output primitive instance creation.
 372
 373         int input_idx = (batch_id * input_buffer_size_f + feature_id) * input_buffer_size_y * input_buffer_size_x;
 374         input_idx += input_padding_lower_y * input_buffer_size_x + input_padding_lower_x;
 375
 376         return input_idx;
 377     }
 378
 379     template<typename dtype>
 380     void extract_locations_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>>& locations, const int num_of_priors, const int num_loc_classes)
 381     {
 382         const bool share_location = instance.argument.share_location;
 383         auto& input_location = instance.location_memory();
 384         const int num_of_images = (int)locations.size();
 385
 386         mem_lock<dtype> lock{ input_location };
 387         auto location_data = lock.begin();
 388
 389         assert(num_of_priors * num_loc_classes * PRIOR_BOX_SIZE == input_location.get_layout().size.feature[0]);
 390
 391         const auto& input_buffer_size = input_location.get_layout().get_buffer_size();
 392         const int input_buffer_size_x = input_buffer_size.spatial[0];
 393         const int input_buffer_size_y = input_buffer_size.spatial[1];
 394         const int input_buffer_size_f = input_buffer_size.feature[0];
 395         const auto& input_padding = input_location.get_layout().data_padding;
 396         const int input_padding_lower_x = input_padding.lower_size().spatial[0];
 397         const int input_padding_lower_y = input_padding.lower_size().spatial[1];
 398
 399         for (int image = 0; image < num_of_images; ++image)
 400         {
 401             std::vector<std::vector<bounding_box>>& label_to_bbox = locations[image];
 402             label_to_bbox.resize(num_loc_classes);
 403             for (int cls = 0; cls < num_loc_classes; ++cls)
 404             {
 405                 int label = share_location ? 0 : cls;
 406                 auto & bboxes = label_to_bbox[label];
 407                 bboxes.resize(num_of_priors);
 408
 409                 for (int prior = 0; prior < num_of_priors; ++prior)
 410                 {
 411                     int idx = prior * num_loc_classes * PRIOR_BOX_SIZE;
 412                     bboxes[prior].xmin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE, input_buffer_size_f, input_buffer_size_y,
 413                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
 414                     bboxes[prior].ymin = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 1, input_buffer_size_f, input_buffer_size_y,
 415                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
 416                     bboxes[prior].xmax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 2, input_buffer_size_f, input_buffer_size_y,
 417                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
 418                     bboxes[prior].ymax = (float)(location_data[get_linear_feature_index(image, idx + cls * PRIOR_BOX_SIZE + 3, input_buffer_size_f, input_buffer_size_y,
 419                                                                                         input_buffer_size_x, input_padding_lower_y, input_padding_lower_x)]);
 420                 }
 421             }
 422         }
 423     }
 424
 425     template<typename dtype>
 426     void extract_prior_boxes_and_variances(const detection_output_inst& instance, const bool variance_encoded_in_target,
 427                                            const int32_t prior_info_size, const int32_t prior_coordinates_offset, const int32_t images_count,
 428                                            std::vector<bounding_box>& prior_bboxes,  std::vector<std::array<float, PRIOR_BOX_SIZE>>& prior_variances)
 429     {
 430         auto& input_prior_box = instance.prior_box_memory();
 431         const int num_of_priors = (int)prior_bboxes.size() / images_count;
 432
 433         mem_lock<dtype> lock{ input_prior_box };
 434         for (int i = 0; i < images_count; i++)
 435         {
 436             auto prior_box_data = lock.begin() + i*num_of_priors*prior_info_size * (variance_encoded_in_target ? 1 : 2);
 437
 438             for (int prior = 0; prior < num_of_priors; ++prior)
 439             {
 440                 int idx = prior * prior_info_size + prior_coordinates_offset;
 441                 prior_bboxes[i*num_of_priors + prior] = bounding_box((float)(prior_box_data[idx]), (float)(prior_box_data[idx + 1]), (float)(prior_box_data[idx + 2]), (float)(prior_box_data[idx + 3]));
 442                 idx += num_of_priors * prior_info_size;
 443                 for (int j = 0; j < PRIOR_BOX_SIZE; ++j)
 444                 {
 445                     prior_variances[i*num_of_priors + prior][j] = variance_encoded_in_target ? 0.0f : (float)(prior_box_data[idx + j]);
 446                 }
 447             }
 448
 449         }
 450     }
 451
 452     template<typename dtype>
 453     void extract_confidences_per_image(const detection_output_inst& instance, std::vector<std::vector<std::vector<std::pair<float,int>>>>& confidences, const int num_of_priors)
 454     {
 455         const int num_classes = instance.argument.num_classes;
 456
 457         const int num_of_images = (int)confidences.size();
 458         auto& input_confidence = instance.confidence_memory();
 459         const float confidence_threshold = instance.argument.confidence_threshold;
 460
 461         mem_lock<dtype> lock{ &input_confidence };
 462         auto confidence_data = lock.begin();
 463
 464         assert(num_of_priors * num_classes == input_confidence.get_layout().size.feature[0]);
 465
 466         const auto& input_buffer_size = input_confidence.get_layout().get_buffer_size();
 467         const int input_buffer_size_x = input_buffer_size.spatial[0];
 468         const int input_buffer_size_y = input_buffer_size.spatial[1];
 469         const int input_buffer_size_f = input_buffer_size.feature[0];
 470         const auto& input_padding = input_confidence.get_layout().data_padding;
 471         const int input_padding_lower_x = input_padding.lower_size().spatial[0];
 472         const int input_padding_lower_y = input_padding.lower_size().spatial[1];
 473         const int stride = input_buffer_size_y * input_buffer_size_x;
 474
 475         for (int image = 0; image < num_of_images; ++image)
 476         {
 477             std::vector<std::vector<std::pair<float,int>>>& label_to_scores = confidences[image];
 478             label_to_scores.resize(num_classes);
 479             int idx = get_linear_feature_index(image, 0, input_buffer_size_f, input_buffer_size_y,
 480                 input_buffer_size_x, input_padding_lower_y, input_padding_lower_x);
 481
 482             if (stride == 1 && std::is_same<dtype, float>::value)
 483             {
 484                 float const* confidence_ptr_float = (float const*)(&(*confidence_data));
 485                 confidence_ptr_float += idx;
 486                 __m128 threshold = _mm_load_ps1(&confidence_threshold);
 487                 for (int prior = 0; prior < num_of_priors; ++prior)
 488                 {
 489                     int cls = 0;
 490                     for (; cls + 3 < num_classes; cls += 4)
 491                     {
 492                         __m128 scores = _mm_loadu_ps(confidence_ptr_float);
 493                         confidence_ptr_float += 4;
 494                         __m128i mask128 = _mm_castps_si128(_mm_cmpgt_ps(scores, threshold));
 495                         if (_mm_testz_si128(mask128, mask128))
 496                         {
 497                             continue;
 498                         }
 499                         int mask = _mm_movemask_ps(_mm_castsi128_ps(mask128));
 500                         if (mask & 1)
 501                         {
 502                             label_to_scores[cls + 0].emplace_back(_mm_cvtss_f32(scores), prior);
 503                         }
 504                         if (mask & 2)
 505                         {
 506                             int score = _mm_extract_ps(scores, 1);
 507                             float s = reinterpret_cast<float&>(score);
 508                             label_to_scores[cls + 1].emplace_back(s, prior);
 509                         }
 510                         if (mask & 4)
 511                         {
 512                             int score = _mm_extract_ps(scores, 2);
 513                             float s = reinterpret_cast<float&>(score);
 514                             label_to_scores[cls + 2].emplace_back(s, prior);
 515                         }
 516                         if (mask & 8)
 517                         {
 518                             int score = _mm_extract_ps(scores, 3);
 519                             float s = reinterpret_cast<float&>(score);
 520                             label_to_scores[cls + 3].emplace_back(s, prior);
 521                         }
 522                     }
 523                     for (; cls < num_classes; ++cls)
 524                     {
 525                         float score = *confidence_ptr_float;
 526                         if (score > confidence_threshold)
 527                         {
 528                             label_to_scores[cls].emplace_back(score, prior);
 529                         }
 530                         ++confidence_ptr_float;
 531                     }
 532                 }
 533             }
 534             else
 535             {
 536                 for (int prior = 0; prior < num_of_priors; ++prior)
 537                 {
 538                     for (int cls = 0; cls < num_classes; ++cls)
 539                     {
 540                         float score = (float)confidence_data[idx];
 541                         if (score > confidence_threshold)
 542                         {
 543                             label_to_scores[cls].emplace_back(score, prior);
 544                         }
 545                         idx += stride;
 546                     }
 547                 }
 548             }
 549         }
 550     }
 551
 552     template<typename dtype>
 553     void prepare_data(const detection_output_inst& instance, std::vector<std::vector<std::vector<bounding_box>>> &bboxes, std::vector<std::vector<std::vector<std::pair<float, int>>>>& confidences)
 554     {
 555         assert(bboxes.size() == confidences.size());
 556
 557         const auto& args = instance.argument;
 558
 559         const int num_of_images = (int)bboxes.size();
 560         const int num_of_priors = instance.prior_box_memory().get_layout().size.spatial[1] / args.prior_info_size;
 561         const int num_loc_classes = args.share_location ? 1 : args.num_classes;
 562
 563         // Extract locations per image.
 564         std::vector<std::vector<std::vector<bounding_box>>> locations(num_of_images); // Per image : label -> bounding boxes.
 565         extract_locations_per_image<dtype>(instance, locations, num_of_priors, num_loc_classes);
 566
 567         int32_t batches_in_prior_boxes = instance.prior_box_memory().get_layout().size.batch[0];
 568         std::vector<bounding_box> prior_bboxes(batches_in_prior_boxes*num_of_priors); // Prior-Boxes (identical for all images since we assume all images in a batch are of same dimension).
 569         std::vector<std::array<float, PRIOR_BOX_SIZE>> prior_variances(batches_in_prior_boxes*num_of_priors); // Variances per prior-box (identical for all images since we assume all images in a batch are of same dimension).
 570         extract_prior_boxes_and_variances<dtype>(instance, args.variance_encoded_in_target,
 571                                                  args.prior_info_size, args.prior_coordinates_offset, batches_in_prior_boxes,
 572                                                  prior_bboxes, prior_variances);
 573
 574         // Create the decoded bounding boxes according to locations predictions and prior-boxes.
 575         for (int image = 0; image < num_of_images; ++image)
 576         {
 577             std::vector<std::vector<bounding_box>>& bboxes_per_image = bboxes[image];
 578             bboxes_per_image.resize(num_loc_classes);
 579             locations[image].resize(num_loc_classes);
 580             for (int cls = 0; cls < num_loc_classes; ++cls)
 581             {
 582                 const int label = args.share_location ? 0 : cls;
 583                 if (!args.share_location && label == args.background_label_id)
 584                 {
 585                     continue; // Skip background class.
 586                 }
 587                 const std::vector<bounding_box>& label_loc_preds = locations[image][label];
 588                 int label_loc_preds_size = (int)label_loc_preds.size();
 589
 590                 bboxes_per_image[label].clear();
 591
 592                 for (int i = 0; i < label_loc_preds_size; ++i)
 593                 {
 594                     bounding_box decoded_bbox;
 595                     int32_t pb_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i;
 596                     int32_t var_offset = (batches_in_prior_boxes > 1) ? (image*num_of_priors + i) : i;
 597                     decode_bounding_box(prior_bboxes[pb_offset], prior_variances[var_offset],
 598                                         args.code_type, args.variance_encoded_in_target, label_loc_preds[i], &decoded_bbox,
 599                                         args.prior_is_normalized, args.input_width, args.input_height, args.clip_before_nms);
 600                     bboxes_per_image[label].emplace_back(decoded_bbox);
 601                 }
 602             }
 603         }
 604
 605         // Extract confidences per image.
 606         extract_confidences_per_image<dtype>(instance, confidences, num_of_priors);
 607     }
 608
 609     event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, detection_output_inst& instance) override
 610     {
 611         for (auto& a : events)
 612         {
 613             a->wait();
 614         }
 615
 616         auto ev = instance.get_network().get_engine().create_user_event(false);
 617
 618         const int num_of_images = instance.location_memory().get_layout().size.batch[0]; //batch size
 619
 620         std::vector<std::vector<std::vector<bounding_box>>> bboxes(num_of_images); // Per image : label -> decoded bounding boxes.
 621         std::vector<std::vector<std::vector<std::pair<float, int>>>> confidences(num_of_images); // Per image : class -> confidences per bounding box.
 622
 623         if (instance.location_memory().get_layout().data_type == data_types::f32)
 624         {
 625             prepare_data<data_type_to_type<data_types::f32>::type>(instance, bboxes, confidences);
 626
 627             generate_detections<data_type_to_type<data_types::f32>::type>(instance, num_of_images, bboxes, confidences);
 628         }
 629         else
 630         {
 631             prepare_data<data_type_to_type<data_types::f16>::type>(instance, bboxes, confidences);
 632
 633             generate_detections<data_type_to_type<data_types::f16>::type>(instance, num_of_images, bboxes, confidences);
 634         }
 635
 636         dynamic_cast<cldnn::user_event*>(ev.get())->set(); // set as complete
 637         // TODO: consider refactoring create_user_event() to return cldnn::user_event*
 638         return ev;
 639     }
 640
 641     static primitive_impl* create(const detection_output_node& arg)
 642     {
 643         return new detection_output_cpu(arg);
 644     }
 645 };
 646
 647 primitive_impl* runDetectOutCpu(const detection_output_node& arg)
 648 {
 649     return new detection_output_cpu(arg);
 650 }
 651
 652 }}