Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / detection_output.cpp
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "detection_output_inst.h"
18 #include "primitive_type_base.h"
19 #include "network_impl.h"
20 #include "error_handler.h"
21 #include "json_object.h"
22
23 namespace cldnn
24 {
// Returns the singleton type-id object used to register/identify the
// detection_output primitive inside the clDNN primitive type system.
primitive_type_id detection_output_type_id()
{
    static primitive_type_base<detection_output> instance;
    return &instance;
}
30
// Computes the output layout for a detection_output node.
// Expects exactly 3 inputs: location predictions, confidence scores and
// prior boxes. Returns a bfyx layout: a flat intermediate buffer on the
// GPU path, or the final [keep_top_k * batch] x DETECTION_OUTPUT_ROW_SIZE
// result buffer otherwise.
layout detection_output_inst::calc_output_layout(detection_output_node const& node)
{
    assert((bool)node.get_primitive()->output_data_type == false
           && "Output data type forcing is not supported for "
              "detection_output_node!");
    // Exactly three dependencies are required (location, confidence, prior box).
    CLDNN_ERROR_NOT_EQUAL(node.id(), "Detection output layer input number", node.get_dependencies().size(), "expected number of inputs", static_cast<size_t>(3), "");

    auto input_layout = node.location().get_output_layout();

    // Batch size and feature size are 1.
    // Number of bounding boxes to be kept is set to keep_top_k*batch size.
    // If number of detections is lower than top_k, will write dummy results at the end with image_id=-1.
    // Each row is a 7 dimension vector, which stores:
    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
    int output_size = (int)input_layout.get_linear_size() / PRIOR_BOX_SIZE;
    int num_classes = node.get_primitive()->num_classes;

    if (node.get_primitive()->share_location)
    {
        // NOTE(review): presumably the background class (when its label is 0)
        // contributes no boxes, so it is dropped from the class count — confirm
        // against the kernel implementation.
        num_classes = (node.get_primitive()->background_label_id == 0) ? node.get_primitive()->num_classes - 1 : node.get_primitive()->num_classes;
        output_size *= num_classes;
    }

    if (node.get_primitive()->top_k != -1)
    {
        // top_k caps candidate boxes per class per image; shrink the buffer
        // only if the cap is smaller than the uncapped size.
        int top_k = node.get_primitive()->top_k * num_classes * input_layout.size.batch[0];
        if (top_k < output_size)
        {
            output_size = top_k;
        }
    }

    output_size *= DETECTION_OUTPUT_ROW_SIZE;
    // Add space for number of output results per image - needed in the next detection output step
    output_size += ((input_layout.size.batch[0] + 15) / 16) * 16;  // batch count rounded up to a multiple of 16

    if (node.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled())
    {
        // GPU path: flat intermediate buffer, consumed by detection_output_sort.
        return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, 1, output_size) };
    }
    else
    {
        // CPU path: final results, one DETECTION_OUTPUT_ROW_SIZE row per kept box.
        return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0]) };
    }
}
76
77 std::string detection_output_inst::to_string(detection_output_node const& node)
78 {
79     auto node_info           = node.desc_to_json();
80     auto desc                = node.get_primitive();
81     auto share_location      = desc->share_location ? "true" : "false";
82     auto variance_encoded    = desc->variance_encoded_in_target ? "true" : "false";
83     auto prior_is_normalized = desc->prior_is_normalized ? "true" : "false";
84     auto decrease_label_id   = desc->decrease_label_id ? "true" : "false";
85     auto clip_before_nms     = desc->clip_before_nms ? "true" : "false";
86     auto clip_after_nms      = desc->clip_after_nms ? "true" : "false";
87     auto& input_location     = node.location();
88     auto& input_prior_box    = node.prior_box();
89     auto& input_confidence   = node.confidence();
90
91
92     std::stringstream primitive_description;
93     std::string       str_code_type;
94
95     switch (desc->code_type)
96     {
97     case prior_box_code_type::corner:
98         str_code_type = "corner";
99         break;
100     case prior_box_code_type::center_size:
101         str_code_type = "center size";
102         break;
103     case prior_box_code_type::corner_size:
104         str_code_type = "corner size";
105         break;
106     default:
107         str_code_type = "not supported code type";
108         break;
109     }
110
111     json_composite detec_out_info;
112     detec_out_info.add("input location id", input_location.id());
113     detec_out_info.add("input confidence id", input_confidence.id());
114     detec_out_info.add("input prior box id", input_prior_box.id());
115     detec_out_info.add("num_classes:", desc->num_classes);
116     detec_out_info.add("keep_top_k", desc->keep_top_k);
117     detec_out_info.add("share_location", share_location);
118     detec_out_info.add("background_label_id", desc->background_label_id);
119     detec_out_info.add("nms_treshold", desc->nms_threshold);
120     detec_out_info.add("top_k", desc->top_k);
121     detec_out_info.add("eta", desc->eta);
122     detec_out_info.add("code_type", str_code_type);
123     detec_out_info.add("variance_encoded", variance_encoded);
124     detec_out_info.add("confidence_threshold", desc->confidence_threshold);
125     detec_out_info.add("prior_info_size", desc->prior_info_size);
126     detec_out_info.add("prior_coordinates_offset", desc->prior_coordinates_offset);
127     detec_out_info.add("prior_is_normalized", prior_is_normalized);
128     detec_out_info.add("input_width", desc->input_width);
129     detec_out_info.add("input_height", desc->input_height);
130     detec_out_info.add("decrease_label_id", decrease_label_id);
131     detec_out_info.add("clip_before_nms", clip_before_nms);
132     detec_out_info.add("clip_after_nms", clip_after_nms);
133     detec_out_info.dump(primitive_description);
134
135     node_info->add("dection output info", detec_out_info);
136     node_info->dump(primitive_description);
137
138     return primitive_description.str();
139 }
140
141 detection_output_inst::typed_primitive_inst(network_impl& network, detection_output_node const& node)
142     :parent(network, node)
143 {
144     auto location_layout = node.location().get_output_layout();
145     auto confidence_layout = node.confidence().get_output_layout();
146     auto prior_box_layout = node.prior_box().get_output_layout();
147     CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Location memory format", location_layout.format.value, "expected bfyx input format", format::bfyx );
148     CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Confidence memory format", confidence_layout.format.value, "expected bfyx input format", format::bfyx );
149     CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Prior box memory format", prior_box_layout.format.value, "expected bfyx input format", format::bfyx );
150
151     tensor location_size = location_layout.size;
152     CLDNN_ERROR_NOT_EQUAL(node.id(), "Location input dimensions", (location_size.feature[0] * location_size.batch[0]), "detection output layer dimensions", (int)location_layout.count(), "Location input/ detection output dims mismatch");
153
154     tensor confidence_size = confidence_layout.size;
155     CLDNN_ERROR_NOT_EQUAL(node.id(), "Confidence input dimensions", (confidence_size.feature[0] * confidence_size.batch[0]), "detection output layer dimensions", (int)confidence_layout.count(), "Confidence input/detection output dims mistmach");
156
157     CLDNN_ERROR_NOT_EQUAL(node.id(), "Confidence batch size", confidence_size.batch[0], "location input batch size", location_size.batch[0], "Batch sizes mismatch.");
158
159     auto desc              = node.get_primitive();
160     int prior_feature_size = desc->variance_encoded_in_target ? 1 : 2;
161     tensor prior_box_size = prior_box_layout.size;
162     CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box spatial X", prior_box_size.spatial[0], "expected value", 1, "");
163     CLDNN_ERROR_NOT_EQUAL(node.id(), "Prior box feature size", prior_box_size.feature[0], "expected value", prior_feature_size, "");
164
165     CLDNN_ERROR_BOOL(node.id(), "Detection output layer padding", node.is_padded(), "Detection output layer doesn't support output padding.");
166     CLDNN_ERROR_BOOL(node.id(), "Detection output layer Prior-box input padding", node.get_dependency(2).is_padded(), "Detection output layer doesn't support input padding in Prior-Box input");
167 }
168
169 /************************ Detection Output keep_top_k part ************************/
170
// Returns the singleton type-id object used to register/identify the
// detection_output_sort primitive (the keep_top_k second stage).
primitive_type_id detection_output_sort_type_id()
{
    static primitive_type_base<detection_output_sort> instance;
    return &instance;
}
176
// Computes the output layout for a detection_output_sort node (the
// keep_top_k stage). Expects exactly 1 input. When num_images is 0 the node
// is assumed to be chained after a detection_output node and the sizing
// parameters are pulled from that producer instead of the primitive itself.
layout detection_output_sort_inst::calc_output_layout(detection_output_sort_node const& node)
{
    assert((bool)node.get_primitive()->output_data_type == false
           && "Output data type forcing is not supported for "
              "detection_output_sort_node!");
    CLDNN_ERROR_NOT_EQUAL(node.id(), "Detection output layer input number", node.get_dependencies().size(), "expected number of inputs", static_cast<size_t>(1), "");

    auto input_layout = node.input().get_output_layout();
    int keep_top_k = node.as<detection_output_sort>().get_primitive()->keep_top_k;
    int num_images = node.as<detection_output_sort>().get_primitive()->num_images;

    // If detection output sort is used as a second part of detection output get proper info from detection output node
    if (num_images == 0)
    {
        CLDNN_ERROR_BOOL(node.id(), "node.get_dependency(0).is_type<detection_output>()", !node.get_dependency(0).is_type<detection_output>(), "Cannot calculate output layout.");
        // Use the producer's location layout and keep_top_k; batch gives the image count.
        input_layout = node.get_dependency(0).as<detection_output>().location().get_output_layout();
        keep_top_k = node.get_dependency(0).as<detection_output>().get_primitive()->keep_top_k;
        num_images = input_layout.size.batch[0];
    }
    // Batch size and feature size are 1.
    // Number of bounding boxes to be kept is set to keep_top_k*batch size.
    // If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1.
    // Each row is a 7 dimension vector, which stores:
    // [image_id, label, confidence, xmin, ymin, xmax, ymax]
    return{ input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, keep_top_k * num_images) };
}
203
204 std::string detection_output_sort_inst::to_string(detection_output_sort_node const& node)
205 {
206     auto node_info = node.desc_to_json();
207     auto desc = node.get_primitive();
208
209     auto& input_bboxes = node.input();
210
211     std::stringstream primitive_description;
212
213     json_composite detec_out_info;
214     detec_out_info.add("input bboxes id", input_bboxes.id());
215     detec_out_info.add("num_classes:", desc->num_images);
216     detec_out_info.add("num_classes:", desc->num_classes);
217     detec_out_info.add("keep_top_k", desc->keep_top_k);
218     detec_out_info.add("share_location", desc->share_location);
219     detec_out_info.add("top_k", desc->top_k);
220     detec_out_info.dump(primitive_description);
221
222     node_info->add("dection output info", detec_out_info);
223     node_info->dump(primitive_description);
224
225     return primitive_description.str();
226 }
227
228 detection_output_sort_inst::typed_primitive_inst(network_impl& network, detection_output_sort_node const& node)
229     :parent(network, node)
230 {
231     CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(), "Input memory format", node.get_dependency(0).get_output_layout().format.value, "expected bfyx input format", format::bfyx);
232
233     CLDNN_ERROR_BOOL(node.id(), "Detecion output layer padding", node.is_padded(), "Detection output layer doesn't support output padding.");
234 }
235 }