inference-engine/thirdparty/clDNN/api/CPP/detection_output.hpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18 #pragma once
#include <limits>
#include <stdexcept>
#include "../C/detection_output.h"
#include "../C/detection_output_sort.h"
#include "primitive.hpp"
  23
  24 namespace cldnn
  25 {
  26 /// @addtogroup cpp_api C++ API
  27 /// @{
  28 /// @addtogroup cpp_topology Network Topology
  29 /// @{
  30 /// @addtogroup cpp_primitives Primitives
  31 /// @{
  32
  33 /// @brief Select method for coding the prior-boxes in the @ref detection output layer.
  34 enum class prior_box_code_type : int32_t
  35 {
  36     corner      = cldnn_code_type_corner,
  37     center_size = cldnn_code_type_center_size,
  38     corner_size = cldnn_code_type_corner_size
  39 };
  40
  41 /// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression.
  42 /// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax].
  43 /// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1.
  44 struct detection_output : public primitive_base<detection_output, CLDNN_PRIMITIVE_DESC(detection_output)>
  45 {
  46     CLDNN_DECLARE_PRIMITIVE(detection_output)
  47
  48     /// @brief Constructs detection output primitive.
  49     /// @param id This primitive id.
  50     /// @param input_location Input location primitive id.
  51     /// @param input_confidence Input confidence primitive id.
  52     /// @param input_prior_box Input prior-box primitive id.
  53     /// @param num_classes Number of classes to be predicted.
  54     /// @param keep_top_k Number of total bounding boxes to be kept per image after NMS step.
  55     /// @param share_location If true bounding box are shared among different classes.
  56     /// @param background_label_id Background label id (-1 if there is no background class).
  57     /// @param nms_threshold Threshold for NMS step.
  58     /// @param top_k Maximum number of results to be kept in NMS.
  59     /// @param eta Used for adaptive NMS.
  60     /// @param code_type Type of coding method for bounding box.
  61     /// @param variance_encoded_in_target If true, variance is encoded in target; otherwise we need to adjust the predicted offset accordingly.
  62     /// @param confidence_threshold Only keep detections with confidences larger than this threshold.
  63     detection_output(
  64         const primitive_id& id,
  65         const primitive_id& input_location,
  66         const primitive_id& input_confidence,
  67         const primitive_id& input_prior_box,
  68         const uint32_t num_classes,
  69         const uint32_t keep_top_k,
  70         const bool share_location = true,
  71         const int background_label_id = 0,
  72         const float nms_threshold = 0.3,
  73         const int top_k = -1,
  74         const float eta = 1.f,
  75         const prior_box_code_type code_type = prior_box_code_type::corner,
  76         const bool variance_encoded_in_target = false,
  77         const float confidence_threshold = -std::numeric_limits<float>::max(),
  78         const int32_t prior_info_size = 4,
  79         const int32_t prior_coordinates_offset = 0,
  80         const bool prior_is_normalized = true,
  81         const int32_t input_width = -1,
  82         const int32_t input_height = -1,
  83         const bool decrease_label_id = false,
  84         const bool clip_before_nms = false,
  85         const bool clip_after_nms = false,
  86         const padding& output_padding = padding()
  87         )
  88         : primitive_base(id, { input_location, input_confidence, input_prior_box }, output_padding)
  89         , num_classes(num_classes)
  90         , keep_top_k(keep_top_k)
  91         , share_location(share_location)
  92         , background_label_id(background_label_id)
  93         , nms_threshold(nms_threshold)
  94         , top_k(top_k)
  95         , eta(eta)
  96         , code_type(code_type)
  97         , variance_encoded_in_target(variance_encoded_in_target)
  98         , confidence_threshold(confidence_threshold)
  99         , prior_info_size(prior_info_size)
 100         , prior_coordinates_offset(prior_coordinates_offset)
 101         , prior_is_normalized(prior_is_normalized)
 102         , input_width(input_width)
 103         , input_height(input_height)
 104         , decrease_label_id(decrease_label_id)
 105         , clip_before_nms(clip_before_nms)
 106         , clip_after_nms(clip_after_nms)
 107     {
 108         if (decrease_label_id && background_label_id != 0)
 109             throw std::invalid_argument("Cannot use decrease_label_id and background_label_id parameter simultaneously.");
 110     }
 111
 112     /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{detection_output}
 113     detection_output(const dto* dto)
 114         : primitive_base(dto)
 115         , num_classes(dto->num_classes)
 116         , keep_top_k(dto->keep_top_k)
 117         , share_location(dto->share_location != 0)
 118         , background_label_id(dto->background_label_id)
 119         , nms_threshold(dto->nms_threshold)
 120         , top_k(dto->top_k)
 121         , eta(dto->eta)
 122         , code_type(static_cast<prior_box_code_type>(dto->code_type))
 123         , variance_encoded_in_target(dto->variance_encoded_in_target != 0)
 124         , confidence_threshold(dto->confidence_threshold)
 125         , prior_info_size(dto->prior_info_size)
 126         , prior_coordinates_offset(dto->prior_coordinates_offset)
 127         , prior_is_normalized(dto->prior_is_normalized != 0)
 128         , input_width(dto->input_width)
 129         , input_height(dto->input_height)
 130         , decrease_label_id(dto->decrease_label_id != 0)
 131         , clip_before_nms(dto->clip_before_nms != 0)
 132         , clip_after_nms(dto->clip_after_nms != 0)
 133     {
 134         if (decrease_label_id && background_label_id != 0)
 135             throw std::invalid_argument("Cannot use decrease_label_id and background_label_id parameter simultaneously.");
 136     }
 137
 138     /// @brief Number of classes to be predicted.
 139     const uint32_t num_classes;
 140     /// @brief Number of total bounding boxes to be kept per image after NMS step.
 141     const int keep_top_k;
 142     /// @brief If true, bounding box are shared among different classes.
 143     const bool share_location;
 144     /// @brief Background label id (-1 if there is no background class).
 145     const int background_label_id;
 146     /// @brief Threshold for NMS step.
 147     const float nms_threshold;
 148     /// @brief Maximum number of results to be kept in NMS.
 149     const int top_k;
 150     /// @brief Used for adaptive NMS.
 151     const float eta;
 152     /// @brief Type of coding method for bounding box.
 153     const prior_box_code_type code_type;
 154     /// @brief If true, variance is encoded in target; otherwise we need to adjust the predicted offset accordingly.
 155     const bool variance_encoded_in_target;
 156     /// @brief Only keep detections with confidences larger than this threshold.
 157     const float confidence_threshold;
 158     /// @brief Number of elements in a single prior description (4 if priors calculated using PriorBox layer, 5 - if Proposal)
 159     const int32_t prior_info_size;
 160     /// @brief Offset of the box coordinates w.r.t. the beginning of a prior info record
 161     const int32_t prior_coordinates_offset;
 162     /// @brief If true, priors are normalized to [0; 1] range.
 163     const bool prior_is_normalized;
 164     /// @brief Width of input image.
 165     const int32_t input_width;
 166     /// @brief Height of input image.
 167     const int32_t input_height;
 168     /// @brief Decrease label id to skip background label equal to 0. Can't be used simultaneously with background_label_id.
 169     const bool decrease_label_id;
 170     /// @brief Clip decoded boxes right after decoding
 171     const bool clip_before_nms;
 172     /// @brief Clip decoded boxes after nms step
 173     const bool clip_after_nms;
 174
 175 protected:
 176     void update_dto(dto& dto) const override
 177     {
 178         dto.num_classes = num_classes;
 179         dto.share_location = share_location;
 180         dto.background_label_id = background_label_id;
 181         dto.nms_threshold = nms_threshold;
 182         dto.top_k = top_k;
 183         dto.eta = eta;
 184         dto.code_type = static_cast<int32_t>(code_type);
 185         dto.variance_encoded_in_target = variance_encoded_in_target;
 186         dto.keep_top_k = keep_top_k;
 187         dto.confidence_threshold = confidence_threshold;
 188         dto.prior_info_size = prior_info_size;
 189         dto.prior_coordinates_offset = prior_coordinates_offset;
 190         dto.prior_is_normalized = prior_is_normalized;
 191         dto.input_width = input_width;
 192         dto.input_height = input_height;
 193         dto.decrease_label_id = decrease_label_id;
 194         dto.clip_before_nms = clip_before_nms;
 195         dto.clip_after_nms = clip_after_nms;
 196     }
 197 };
 198
 199 /// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression.
 200 /// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax].
 201 /// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1.
 202 struct detection_output_sort : public primitive_base<detection_output_sort, CLDNN_PRIMITIVE_DESC(detection_output_sort)>
 203 {
 204     CLDNN_DECLARE_PRIMITIVE(detection_output_sort)
 205
 206     /// @brief Constructs detection output primitive.
 207     /// @param id This primitive id.
 208     /// @param input_bboxes Input bounding boxes primitive id.
 209     /// @param num_images Number of images to be predicted.
 210     /// @param num_classes Number of classes to be predicted.
 211     /// @param keep_top_k Number of total bounding boxes to be kept per image after NMS step.
 212     /// @param share_location If true bounding box are shared among different classes.
 213     /// @param top_k Maximum number of results to be kept in NMS.
 214     /// @param output_padding Output padding.
 215     detection_output_sort(
 216         const primitive_id& id,
 217         const primitive_id& input_bboxes,
 218         const uint32_t num_images,
 219         const uint32_t num_classes,
 220         const uint32_t keep_top_k,
 221         const bool share_location = true,
 222         const int top_k = -1,
 223         const int background_label_id = -1,
 224         const padding& output_padding = padding()
 225     )
 226     : primitive_base(id, { input_bboxes }, output_padding)
 227     , num_images(num_images)
 228     , num_classes(num_classes)
 229     , keep_top_k(keep_top_k)
 230     , share_location(share_location)
 231     , top_k(top_k)
 232     , background_label_id(background_label_id)
 233     {}
 234
 235     /// @brief Constructs a copy from C API @CLDNN_PRIMITIVE_DESC{detection_output}
 236     detection_output_sort(const dto* dto)
 237         : primitive_base(dto)
 238         , num_images(dto->num_images)
 239         , num_classes(dto->num_classes)
 240         , keep_top_k(dto->keep_top_k)
 241         , share_location(dto->share_location != 0)
 242         , top_k(dto->top_k)
 243         , background_label_id(dto->background_label_id)
 244     {}
 245
 246     /// @brief Number of classes to be predicted.
 247     const uint32_t num_images;
 248     /// @brief Number of classes to be predicted.
 249     const uint32_t num_classes;
 250     /// @brief Number of total bounding boxes to be kept per image after NMS step.
 251     const int keep_top_k;
 252     /// @brief If true, bounding box are shared among different classes.
 253     const bool share_location;
 254     /// @brief Maximum number of results to be kept in NMS.
 255     const int top_k;
 256     /// @brief Background label id (-1 if there is no background class).
 257     const int background_label_id;
 258
 259
 260 protected:
 261     void update_dto(dto& dto) const override
 262     {
 263         dto.num_classes = num_classes;
 264         dto.num_images = num_images;
 265         dto.keep_top_k = keep_top_k;
 266         dto.share_location = share_location;
 267         dto.top_k = top_k;
 268         dto.background_label_id = background_label_id;
 269     }
 270 };
 271 /// @}
 272 /// @}
 273 /// @}
 274 }