Publishing 2019 R1 content

[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / detection_output.cpp
diff --git a/inference-engine/thirdparty/clDNN/src/detection_output.cpp b/inference-engine/thirdparty/clDNN/src/detection_output.cpp

index e8fa392..4d121df 100644 (file)
--- a/inference-engine/thirdparty/clDNN/src/detection_output.cpp
+++ b/inference-engine/thirdparty/clDNN/src/detection_output.cpp
@@ -30,16 +30,48 @@ primitive_type_id detection_output_type_id()
  
  layout detection_output_inst::calc_output_layout(detection_output_node const& node)
  {
+    assert((bool)node.get_primitive()->output_data_type == false
+           && "Output data type forcing is not supported for "
+              "detection_output_node!");
      CLDNN_ERROR_NOT_EQUAL(node.id(), "Detection output layer input number", node.get_dependencies().size(), "expected number of inputs", static_cast<size_t>(3), "");
  
      auto input_layout = node.location().get_output_layout();
  
      // Batch size and feature size are 1.
-    // Number of bounding boxes to be kept is set to keep_top_k*batch size. 
-    // If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1. 
+    // Number of bounding boxes to be kept is set to keep_top_k*batch size.
+    // If number of detections is lower than top_k, will write dummy results at the end with image_id=-1. NOTE(review): the non-GPU return below sizes rows by keep_top_k - confirm which limit is meant here.
      // Each row is a 7 dimension vector, which stores:
      // [image_id, label, confidence, xmin, ymin, xmax, ymax]
-    return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0]) };
+    int output_size = (int)input_layout.get_linear_size() / PRIOR_BOX_SIZE; // linear size of the location input divided by PRIOR_BOX_SIZE
+    int num_classes = node.get_primitive()->num_classes;
+
+    if (node.get_primitive()->share_location) // with share_location: drop one class when background_label_id is 0, then scale output_size by the class count
+    {
+        num_classes = (node.get_primitive()->background_label_id == 0) ? node.get_primitive()->num_classes - 1 : node.get_primitive()->num_classes;
+        output_size *= num_classes;
+    }
+
+    if (node.get_primitive()->top_k != -1) // top_k == -1 skips the cap below
+    {
+        int top_k = node.get_primitive()->top_k * num_classes * input_layout.size.batch[0]; // cap: top_k entries for every class of every batch item
+        if (top_k < output_size)
+        {
+            output_size = top_k;
+        }
+    }
+
+    output_size *= DETECTION_OUTPUT_ROW_SIZE; // convert the entry count into an element count (one row per entry)
+    // Add space for number of output results per image - needed in the next detection output step
+    output_size += ((input_layout.size.batch[0] + 15) / 16) * 16; // batch count rounded up to a multiple of 16 extra elements
+
+    if (node.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled()) // the build option selects which of the two output shapes is reported
+    {
+        return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, 1, output_size) }; // flat buffer holding output_size elements
+    }
+    else
+    {
+        return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0]) }; // same layout the removed line above returned unconditionally
+    }
  }
  
  std::string detection_output_inst::to_string(detection_output_node const& node)
@@ -50,12 +82,13 @@ std::string detection_output_inst::to_string(detection_output_node const& node)
      auto variance_encoded    = desc->variance_encoded_in_target ? "true" : "false";
      auto prior_is_normalized = desc->prior_is_normalized ? "true" : "false";
      auto decrease_label_id   = desc->decrease_label_id ? "true" : "false";
-    auto clip                = desc->clip ? "true" : "false";
+    auto clip_before_nms     = desc->clip_before_nms ? "true" : "false";
+    auto clip_after_nms      = desc->clip_after_nms ? "true" : "false";
      auto& input_location     = node.location();
      auto& input_prior_box    = node.prior_box();
      auto& input_confidence   = node.confidence();
  
-    
+
      std::stringstream primitive_description;
      std::string       str_code_type;
  
@@ -74,7 +107,7 @@ std::string detection_output_inst::to_string(detection_output_node const& node)
          str_code_type = "not supported code type";
          break;
      }
-    
+
      json_composite detec_out_info;
      detec_out_info.add("input location id", input_location.id());
      detec_out_info.add("input confidence id", input_confidence.id());
@@ -95,7 +128,8 @@ std::string detection_output_inst::to_string(detection_output_node const& node)
      detec_out_info.add("input_width", desc->input_width);
      detec_out_info.add("input_height", desc->input_height);
      detec_out_info.add("decrease_label_id", decrease_label_id);
-    detec_out_info.add("clip", clip);
+    detec_out_info.add("clip_before_nms", clip_before_nms);
+    detec_out_info.add("clip_after_nms", clip_after_nms);
      detec_out_info.dump(primitive_description);
  
      node_info->add("dection output info", detec_out_info);
@@ -125,11 +159,77 @@ detection_output_inst::typed_primitive_inst(network_impl& network, detection_out
      auto desc              = node.get_primitive();
      int prior_feature_size = desc->variance_encoded_in_target ? 1 : 2;
      tensor prior_box_size = prior_box_layout.size;
-    CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box batch size", prior_box_size.batch[0], "expected value", 1, "");
      CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box spatial X", prior_box_size.spatial[0], "expected value", 1, "");
      CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box feature size", prior_box_size.feature[0], "expected value", prior_feature_size, "");
  
      CLDNN_ERROR_BOOL(node.id(), "Detection output layer padding", node.is_padded(), "Detection output layer doesn't support output padding.");
      CLDNN_ERROR_BOOL(node.id(), "Detection output layer Prior-box input padding", node.get_dependency(2).is_padded(), "Detection output layer doesn't support input padding in Prior-Box input");
  }
+
+/************************ Detection Output keep_top_k part ************************/
+
+primitive_type_id detection_output_sort_type_id() // Returns the type id of the detection_output_sort primitive.
+{
+    static primitive_type_base<detection_output_sort> instance; // function-local static: every call returns the address of this one object
+    return &instance;
+}
+
+layout detection_output_sort_inst::calc_output_layout(detection_output_sort_node const& node) // Computes the output layout: keep_top_k * num_images rows of DETECTION_OUTPUT_ROW_SIZE values.
+{
+    assert((bool)node.get_primitive()->output_data_type == false
+           && "Output data type forcing is not supported for "
+              "detection_output_sort_node!");
+    CLDNN_ERROR_NOT_EQUAL(node.id(), "Detection output layer input number", node.get_dependencies().size(), "expected number of inputs", static_cast<size_t>(1), "");
+
+    auto input_layout = node.input().get_output_layout();
+    int keep_top_k = node.as<detection_output_sort>().get_primitive()->keep_top_k;
+    int num_images = node.as<detection_output_sort>().get_primitive()->num_images;
+
+    // If detection output sort is used as the second part of detection output (num_images == 0), take the layout, keep_top_k and image count from the preceding detection_output node instead.
+    if (num_images == 0)
+    {
+        CLDNN_ERROR_BOOL(node.id(), "node.get_dependency(0).is_type<detection_output>()", !node.get_dependency(0).is_type<detection_output>(), "Cannot calculate output layout."); // presumably raises when dependency 0 is not a detection_output - confirm macro semantics
+        input_layout = node.get_dependency(0).as<detection_output>().location().get_output_layout(); // layout of that node's location input, not of its output
+        keep_top_k = node.get_dependency(0).as<detection_output>().get_primitive()->keep_top_k;
+        num_images = input_layout.size.batch[0]; // one image per batch entry of the location input
+    }
+    // Batch size and feature size are 1.
+    // Number of bounding boxes to be kept is set to keep_top_k*batch size.
+    // If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1.
+    // Each row is a 7 dimension vector, which stores:
+    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
+    return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, keep_top_k * num_images) };
+}
+
+std::string detection_output_sort_inst::to_string(detection_output_sort_node const& node) // Returns a textual dump of the node: its generic JSON description plus the sort-specific parameters.
+{
+    auto node_info = node.desc_to_json();
+    auto desc = node.get_primitive();
+
+    auto& input_bboxes = node.input();
+
+    std::stringstream primitive_description;
+
+    json_composite detec_out_info;
+    detec_out_info.add("input bboxes id", input_bboxes.id());
+    detec_out_info.add("num_images:", desc->num_images); // own key: must not share the "num_classes:" label used on the next line
+    detec_out_info.add("num_classes:", desc->num_classes);
+    detec_out_info.add("keep_top_k", desc->keep_top_k);
+    detec_out_info.add("share_location", desc->share_location);
+    detec_out_info.add("top_k", desc->top_k);
+    detec_out_info.dump(primitive_description); // the parameter block is written to the stream on its own first
+
+    node_info->add("dection output info", detec_out_info); // key spelling kept as-is to match the detection_output dump in this file
+    node_info->dump(primitive_description); // then the full node description, which now also contains the block added above
+
+    return primitive_description.str();
+}
+
+detection_output_sort_inst::typed_primitive_inst(network_impl& network, detection_output_sort_node const& node) // Constructs the instance, then checks the input format and output padding.
+    :parent(network, node)
+{
+    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input memory format", node.get_dependency(0).get_output_layout().format.value, "expected bfyx input format", format::bfyx); // the only input (dependency 0) is checked against bfyx
+
+    CLDNN_ERROR_BOOL(node.id(), "Detection output layer padding", node.is_padded(), "Detection output layer doesn't support output padding."); // label spelling matches the detection_output constructor's check
+}
  }