1 // Copyright (c) 2018 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
// Compile-time constants used by the detection-output helpers in this file.
// Names referenced but not defined in the visible part of the file (INPUT0_*, CONF_*, TOP_K,
// KEEP_TOP_K, NUM_CLASSES, NUM_IMAGES, SHARE_LOCATION, BACKGROUND_LABEL_ID, ...) are presumably
// supplied by the build/JIT step -- TODO confirm.
// NOTE(review): every line below starts with a stray number (15, 16, 18, ...) that looks like an
// original line number fused into the text by an extraction step -- confirm against the pristine file.
15 #define PRIOR_BOX_SIZE 4 // Each prior-box consists of [xmin, ymin, xmax, ymax].
16 #define OUTPUT_ROW_SIZE 7 // Each detection consists of [image_id, label, confidence, xmin, ymin, xmax, ymax].
// Box-encoding selectors. get_decoded_bbox compares CODE_TYPE against the first two; its remaining
// branch presumably handles CODE_TYPE_CORNER_SIZE -- TODO confirm.
18 #define CODE_TYPE_CORNER 0
19 #define CODE_TYPE_CENTER_SIZE 1
20 #define CODE_TYPE_CORNER_SIZE 2
// 1 when the background label is 0 and locations are shared between classes, otherwise 0.
22 #define HIDDEN_CLASS ((BACKGROUND_LABEL_ID == 0 && SHARE_LOCATION)? 1 : 0)
23 #define NUM_OF_IMAGES INPUT0_BATCH_NUM
// Number of per-class location sets: 1 when SHARE_LOCATION is set, otherwise NUM_CLASSES.
24 #define NUM_LOC_CLASSES ((SHARE_LOCATION)? 1 : NUM_CLASSES)
// Number of classes after removing the hidden class (if HIDDEN_CLASS is 1).
25 #define NUM_CLASSES_OUT ((HIDDEN_CLASS == 1)? NUM_CLASSES - 1 : NUM_CLASSES)
// INPUT0_LENGTH divided by (images * location classes * PRIOR_BOX_SIZE).
26 #define NUM_OF_PRIORS (INPUT0_LENGTH / (NUM_OF_IMAGES * NUM_LOC_CLASSES * PRIOR_BOX_SIZE))
// NUM_OF_PRIORS / 256 + 1 (integer division).
// NOTE(review): 256 is presumably the work-group size -- confirm against the host code.
27 #define NUM_OF_ITEMS ((NUM_OF_PRIORS / 256) + 1)
// NUM_OF_PRIORS / NUM_OF_ITEMS, rounded up.
28 #define NUM_OF_ITERATIONS ((NUM_OF_PRIORS % NUM_OF_ITEMS == 0)? (NUM_OF_PRIORS / NUM_OF_ITEMS) : ((NUM_OF_PRIORS / NUM_OF_ITEMS) + 1))
// Row length and row count of INPUT0, derived from its pitches.
30 #define X_SIZE INPUT0_Y_PITCH
31 #define Y_SIZE (INPUT0_FEATURE_PITCH/INPUT0_Y_PITCH)
// Linear offset produced by the leading Y/X padding of INPUT0.
32 #define LOCATION_PADDING (INPUT0_PAD_BEFORE_SIZE_Y * X_SIZE + INPUT0_PAD_BEFORE_SIZE_X)
// Stride (in elements) between consecutive location components; see get_decoded_bbox.
33 #define LOC_XY_SIZE_PRODUCT (X_SIZE * Y_SIZE)
// Same two quantities for the confidence input.
34 #define CONF_PADDING (CONF_PADDING_Y * CONF_SIZE_X + CONF_PADDING_X)
35 #define CONF_XY_SIZE_PRODUCT (CONF_SIZE_X * CONF_SIZE_Y)
// Total number of prior-box coordinates; get_decoded_bbox uses it as the index at which the
// prior variances start inside input_prior_box.
37 #define NUM_OF_PRIOR_COMPONENTS (NUM_OF_PRIORS * PRIOR_BOX_SIZE)
38 #define NUM_OF_IMAGE_CONF (INPUT0_LENGTH/NUM_OF_IMAGES/PRIOR_BOX_SIZE)
// min(TOP_K, NUM_OF_PRIORS) when TOP_K is not -1, otherwise NUM_OF_PRIORS.
40 #define SCORES_COUNT (((TOP_K != -1) && (TOP_K < NUM_OF_PRIORS))? TOP_K : NUM_OF_PRIORS)
// NUM_OF_IMAGES rounded up to a multiple of 16.
42 #define OUTPUT_OFFSET (((NUM_OF_IMAGES + 15) / 16) * 16)
// NOTE(review): 2 matches the position of 'confidence' in the OUTPUT_ROW_SIZE layout above -- confirm intent.
43 #define SCORE_OFFSET 2
// NUM_IMAGES rounded up to a multiple of 16.
// NOTE(review): this and the macros below use NUM_IMAGES while OUTPUT_OFFSET uses NUM_OF_IMAGES;
// NUM_IMAGES is not defined in the visible part of the file -- confirm it is provided externally.
45 #define INPUT_OFFSET (((NUM_IMAGES + 15) / 16) * 16)
// Number of OUTPUT_ROW_SIZE-wide rows that fit in INPUT0 after INPUT_OFFSET.
46 #define INPUT_BBOXES_COUNT ((INPUT0_LENGTH - INPUT_OFFSET) / OUTPUT_ROW_SIZE)
47 #define NUM_CLASSES_IN NUM_CLASSES_OUT
48 #define BBOXES_NUM_BASED_TOP_K (TOP_K * NUM_CLASSES_IN * NUM_IMAGES)
// min(BBOXES_NUM_BASED_TOP_K, INPUT_BBOXES_COUNT) when TOP_K is not -1, otherwise INPUT_BBOXES_COUNT.
49 #define INPUT_BBOXES_LENGTH (((TOP_K != -1) && (BBOXES_NUM_BASED_TOP_K < INPUT_BBOXES_COUNT))? BBOXES_NUM_BASED_TOP_K : INPUT_BBOXES_COUNT)
// Bounding boxes per (image, class) pair and per image.
50 #define NUM_OF_CLASS_BBOXES (INPUT_BBOXES_LENGTH / (NUM_IMAGES * NUM_CLASSES_IN))
51 #define NUM_OF_IMAGE_BBOXES (INPUT_BBOXES_LENGTH / NUM_IMAGES)
// NUM_CLASSES_IN / 256 + 1 (integer division).
52 #define NUM_OF_ITEMS_SORT ((NUM_CLASSES_IN / 256) + 1)
55 // Number of bboxes to keep in output: min(KEEP_TOP_K, NUM_OF_IMAGE_BBOXES).
56 #define KEEP_BBOXES_NUM ((KEEP_TOP_K < NUM_OF_IMAGE_BBOXES)? KEEP_TOP_K : NUM_OF_IMAGE_BBOXES)
// Decodes one bounding box from a prior box and the predicted location values and writes the
// result to decoded_bbox[0..3].
//
// decoded_bbox    - output, 4 elements written.
// input_location  - location predictions; the 4 components of one box are read
//                   LOC_XY_SIZE_PRODUCT elements apart.
// input_prior_box - prior boxes; variances are read starting at index NUM_OF_PRIOR_COMPONENTS.
// idx_prior, idx_class, idx_image - select which prior / class / image is decoded.
//
// NOTE(review): the text of this function looks truncated (no braces or `else` lines are visible,
// the location_offset expression ends in `*`, some assignments lack their left-hand side) --
// presumably lost in extraction; confirm against the pristine file before editing the code.
58 void FUNC(get_decoded_bbox)(UNIT_TYPE* decoded_bbox, __global UNIT_TYPE* input_location, __global UNIT_TYPE* input_prior_box, const uint idx_prior, const uint idx_class, const uint idx_image)
// Index of the first coordinate of the selected prior inside input_prior_box.
60 const uint prior_offset = idx_prior * PRIOR_INFO_SIZE + PRIOR_COORD_OFFSET;
// Index of the first location component for (idx_image, idx_prior, idx_class).
61 uint location_offset =
62 (NUM_LOC_CLASSES * (idx_prior * PRIOR_BOX_SIZE) + idx_image * INPUT0_FEATURE_NUM + idx_class * PRIOR_BOX_SIZE) *
// Local copy of the prior's 4 coordinates so they can be rescaled below.
66 UNIT_TYPE prior_bboxes[4] = {
67 input_prior_box[prior_offset],
68 input_prior_box[prior_offset + 1],
69 input_prior_box[prior_offset + 2],
70 input_prior_box[prior_offset + 3]};
// Priors that are not normalized are divided by the image dimensions.
72 if (!PRIOR_IS_NORMALIZED)
74 prior_bboxes[0] /= IMAGE_WIDTH;
75 prior_bboxes[1] /= IMAGE_HEIGH;
76 prior_bboxes[2] /= IMAGE_WIDTH;
77 prior_bboxes[3] /= IMAGE_HEIGH;
// Corner encoding: each predicted value is an offset for the matching prior coordinate.
80 if (CODE_TYPE == CODE_TYPE_CORNER)
82 if (VARIANCE_ENCODED_IN_TARGET)
84 // variance is encoded in target, we simply need to add the offset predictions.
85 for(uint i = 0; i < PRIOR_BOX_SIZE; i++)
89 input_location[location_offset];
// Advance to the next location component.
91 location_offset += LOC_XY_SIZE_PRODUCT;
96 // variance is encoded in bbox, we need to scale the offset accordingly.
97 for(uint i = 0; i < PRIOR_BOX_SIZE; i++)
// NOTE(review): the variance index does not depend on idx_prior, so every prior reads the same
// 4 variance values -- confirm this is intended.
100 mad(input_prior_box[NUM_OF_PRIOR_COMPONENTS + i], // prior variances are placed after prior bboxes
101 input_location[location_offset],
104 location_offset += LOC_XY_SIZE_PRODUCT;
// Center-size encoding: predictions describe the box center and size relative to the prior.
108 else if (CODE_TYPE == CODE_TYPE_CENTER_SIZE)
110 const UNIT_TYPE prior_width = prior_bboxes[2] - prior_bboxes[0];
111 const UNIT_TYPE prior_height = prior_bboxes[3] - prior_bboxes[1];
112 const UNIT_TYPE prior_center_x = (prior_bboxes[0] + prior_bboxes[2]) / 2;
113 const UNIT_TYPE prior_center_y = (prior_bboxes[1] + prior_bboxes[3]) / 2;
// Despite their names, in this branch the first two values are used as center offsets and the
// last two as exponents for the width/height scale.
114 const UNIT_TYPE bbox_xmin = input_location[location_offset];
115 const UNIT_TYPE bbox_ymin = input_location[location_offset + LOC_XY_SIZE_PRODUCT];
116 const UNIT_TYPE bbox_xmax = input_location[location_offset + 2 * LOC_XY_SIZE_PRODUCT];
117 const UNIT_TYPE bbox_ymax = input_location[location_offset + 3 * LOC_XY_SIZE_PRODUCT];
118 UNIT_TYPE decode_bbox_center_x, decode_bbox_center_y;
// These hold HALF of the decoded width/height (note the division by 2 below).
119 UNIT_TYPE decode_bbox_width, decode_bbox_height;
121 if (VARIANCE_ENCODED_IN_TARGET)
123 // variance is encoded in target, we simply need to restore the offset predictions.
124 decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
125 decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
126 decode_bbox_width = (exp(bbox_xmax) * prior_width) / 2;
127 decode_bbox_height = (exp(bbox_ymax) * prior_height) / 2;
131 // variance is encoded in bbox, we need to scale the offset accordingly.
132 decode_bbox_center_x = input_prior_box[NUM_OF_PRIOR_COMPONENTS] * bbox_xmin * prior_width + prior_center_x;
133 decode_bbox_center_y = input_prior_box[NUM_OF_PRIOR_COMPONENTS + 1] * bbox_ymin * prior_height + prior_center_y;
134 decode_bbox_width = (exp(input_prior_box[NUM_OF_PRIOR_COMPONENTS + 2] * bbox_xmax) * prior_width) / 2;
135 decode_bbox_height = (exp(input_prior_box[NUM_OF_PRIOR_COMPONENTS + 3] * bbox_ymax) * prior_height) / 2;
// Convert center +/- half extent into [xmin, ymin, xmax, ymax].
138 decoded_bbox[0] = decode_bbox_center_x - decode_bbox_width;
139 decoded_bbox[1] = decode_bbox_center_y - decode_bbox_height;
140 decoded_bbox[2] = decode_bbox_center_x + decode_bbox_width;
141 decoded_bbox[3] = decode_bbox_center_y + decode_bbox_height;
// Remaining branch (presumably CODE_TYPE_CORNER_SIZE -- the `else` line is not visible):
// corner offsets are scaled by the prior's width/height before being added.
145 const UNIT_TYPE prior_width = prior_bboxes[2] - prior_bboxes[0];
146 const UNIT_TYPE prior_height = prior_bboxes[3] - prior_bboxes[1];
147 const UNIT_TYPE bbox_xmin = input_location[location_offset];
148 const UNIT_TYPE bbox_ymin = input_location[location_offset + LOC_XY_SIZE_PRODUCT];
149 const UNIT_TYPE bbox_xmax = input_location[location_offset + 2 * LOC_XY_SIZE_PRODUCT];
150 const UNIT_TYPE bbox_ymax = input_location[location_offset + 3 * LOC_XY_SIZE_PRODUCT];
152 if (VARIANCE_ENCODED_IN_TARGET)
154 // variance is encoded in target, we simply need to add the offset predictions.
155 decoded_bbox[0] = prior_bboxes[0] + bbox_xmin * prior_width;
156 decoded_bbox[1] = prior_bboxes[1] + bbox_ymin * prior_height;
157 decoded_bbox[2] = prior_bboxes[2] + bbox_xmax * prior_width;
158 decoded_bbox[3] = prior_bboxes[3] + bbox_ymax * prior_height;
162 // variance is encoded in bbox, we need to scale the offset accordingly.
163 decoded_bbox[0] = prior_bboxes[0] + input_prior_box[NUM_OF_PRIOR_COMPONENTS] * bbox_xmin * prior_width;
164 decoded_bbox[1] = prior_bboxes[1] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 1] * bbox_ymin * prior_height;
165 decoded_bbox[2] = prior_bboxes[2] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 2] * bbox_xmax * prior_width;
166 decoded_bbox[3] = prior_bboxes[3] + input_prior_box[NUM_OF_PRIOR_COMPONENTS + 3] * bbox_ymax * prior_height;
// Returns the confidence value for (idx_image, idx_prior, idx_class) when it is strictly greater
// than CONFIDENCE_THRESHOLD, and 0 otherwise.
//
// input_confidence - confidence values, indexed by the offset computed below.
// idx_prior, idx_class, idx_image - select which prior / class / image is read.
//
// NOTE(review): the tail of the confidence_offset expression (after the trailing `+`) and the
// function's braces are not visible here -- presumably lost in extraction; confirm against the
// pristine file.
171 UNIT_TYPE FUNC(get_score)(__global UNIT_TYPE* input_confidence, const uint idx_prior, const uint idx_class, const uint idx_image)
173 const uint confidence_offset = // offset in kernel input 'input_confidence'
// Classes of one prior are adjacent; priors of one image follow each other; each step is
// CONF_XY_SIZE_PRODUCT elements wide.
174 (idx_prior * NUM_CLASSES + idx_image * NUM_OF_PRIORS * NUM_CLASSES + idx_class) *
175 CONF_XY_SIZE_PRODUCT +
// A value exactly equal to CONFIDENCE_THRESHOLD is treated as below the threshold (strict `>`).
178 return (input_confidence[confidence_offset] > CONFIDENCE_THRESHOLD)? input_confidence[confidence_offset] : 0;