/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
  defined(DIM_Y) && defined(DIM_Z)
/** This function normalizes the input 2D tensor across the first dimension with respect to mean and
 * standard deviation of the same dimension.
 *
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
 *            -DVEC_SIZE=16
 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g.
 *            -DDATA_TYPE=float
 * @attention Normalization epsilon parameter should be given as a preprocessor argument with
 *            -DEPSILON=value. e.g. -DEPSILON=0.001f
 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value,
 *            -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
 *
 * @param[in]  input_ptr                            Pointer to the first source tensor. Supported
 *                                                  data types: F16/F32
 * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension
 *                                                  (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension
 *                                                  (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension
 *                                                  (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first
 *                                                  source tensor
 * @param[out] output_ptr                           (Optional) Pointer to the destination tensor.
 *                                                  Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X
 *                                                  dimension (in bytes)
 * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements
 *                                                  along X processed per workitem(in bytes)
 * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y
 *                                                  dimension (in bytes)
 * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements
 *                                                  along Y processed per workitem(in bytes)
 * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z
 *                                                  dimension (in bytes)
 * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements
 *                                                  along Z processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in
 *                                                  the destination tensor
 * @param[in]  gamma_ptr                            (Optional) Pointer to the gamma tensor.
 *                                                  Supported data types: same as @p input_ptr
 * @param[in]  gamma_stride_x                       (Optional) Stride of the gamma tensor in X
 *                                                  dimension (in bytes)
 * @param[in]  gamma_step_x                         (Optional) output_stride_x * number of elements
 *                                                  along X processed per workitem(in bytes)
 * @param[in]  gamma_offset_first_element_in_bytes  (Optional) The offset of the first element in
 *                                                  the gamma tensor
 * @param[in]  beta_ptr                             (Optional) Pointer to the beta tensor. Supported
 *                                                  data types: same as @p input_ptr
 * @param[in]  beta_stride_x                        (Optional) Stride of the beta tensor in X
 *                                                  dimension (in bytes)
 * @param[in]  beta_step_x                          (Optional) output_stride_x * number of elements
 *                                                  along X processed per workitem(in bytes)
 * @param[in]  beta_offset_first_element_in_bytes   (Optional) The offset of the first element in
 *                                                  the beta tensor
 */
// NOTE(review): when IN_PLACE is defined, the trailing comma after the input declaration leaves a
// dangling comma in the parameter list unless GAMMA or BETA is also defined — confirm the build
// never combines IN_PLACE with neither GAMMA nor BETA.
__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
#ifndef IN_PLACE
                                        TENSOR4D_DECLARATION(output)
#endif /* IN_PLACE */
#ifdef GAMMA
                                        ,
                                        VECTOR_DECLARATION(gamma)
#endif /* GAMMA */
#ifdef BETA
                                        ,
                                        VECTOR_DECLARATION(beta)
#endif /* BETA */
)
{
  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
#ifndef IN_PLACE
  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
#endif /* IN_PLACE */

  // Accumulators for the per-(channel, batch) plane: sum of values and sum of squares,
  // used below to derive mean and variance.
  float sum = 0.f;
  float sum_sq = 0.f;

#if defined(NHWC)
  // NHWC layout: channel is the innermost (X) dimension, so each work-item walks the
  // Y (width) and Z (height) dimensions scalar-by-scalar for its channel/batch.
  const int ch = get_global_id(0);    // Current channel
  const int batch = get_global_id(2); // Current batch
  const int elements_plane = DIM_Y * DIM_Z;

  for (int i_w = 0; i_w < DIM_Y; ++i_w)
  {
    for (int i_h = 0; i_h < DIM_Z; ++i_h)
    {
      float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
      sum += data;
      sum_sq += data * data;
    }
  }
#else  // !defined(NHWC)
  // NCHW layout: the plane is contiguous along X, so accumulate with VEC_SIZE-wide vectors
  // and reduce at the end.
  const int ch = get_global_id(2) % DIM_Z;    // Current channel
  const int batch = get_global_id(2) / DIM_Z; // Current batch
  const int elements_plane = DIM_X * DIM_Y;

  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
  part_sum = 0;
  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
  part_sum_sq = 0;
  // Calculate partial sum
  for (int y = 0; y < DIM_Y; ++y)
  {
    int x = 0;
    for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
    {
      // Load data
      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
      data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
      part_sum += data;
      part_sum_sq += data * data;
    }
    // Left-overs loop: remaining scalars when DIM_X is not a multiple of VEC_SIZE.
    for (; x < DIM_X; ++x)
    {
      DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
      part_sum.s0 += data;
      part_sum_sq.s0 += data * data;
    }
  }
// Perform pairwise reduction of the vector accumulators down to lane .s0.
#if VEC_SIZE > 8
  part_sum.s01234567 += part_sum.s89abcdef;
  part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
#endif // VEC_SIZE > 8
#if VEC_SIZE > 4
  part_sum.s0123 += part_sum.s4567;
  part_sum_sq.s0123 += part_sum_sq.s4567;
#endif // VEC_SIZE > 4
#if VEC_SIZE > 2
  part_sum.s01 += part_sum.s23;
  part_sum_sq.s01 += part_sum_sq.s23;
#endif // VEC_SIZE > 2
  part_sum.s0 += part_sum.s1;
  part_sum_sq.s0 += part_sum_sq.s1;

  sum = (float)part_sum.s0;
  sum_sq = (float)part_sum_sq.s0;

#endif // defined(NHWC)

  // Mean and (biased) variance of the plane; the multiplier folds gamma and 1/stddev together.
  const float mean_float = (sum / elements_plane);
  const DATA_TYPE mean = (DATA_TYPE)mean_float;
  const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float);
#if defined(GAMMA)
  const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON);
  const DATA_TYPE multip = (DATA_TYPE)multip_float;
#else  // !defined(GAMMA)
  // NOTE(review): with GAMMA undefined the scale is 0, so every output becomes `beta` and the
  // normalized signal is discarded — confirm this is intended (a scale of 1/sqrt(var+EPSILON)
  // would be the usual un-scaled instance normalization).
  const DATA_TYPE multip = (DATA_TYPE)0;
#endif // defined(GAMMA)
#if defined(BETA)
  const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch);
#else  // !defined(BETA)
  const DATA_TYPE beta = 0;
#endif // defined(BETA)

#if defined(NHWC)
  // Normalization pass (NHWC): revisit the plane scalar-by-scalar and apply (x - mean) * multip + beta.
  for (int i_w = 0; i_w < DIM_Y; ++i_w)
  {
    for (int i_h = 0; i_h < DIM_Z; ++i_h)
    {
      __global DATA_TYPE *input_address =
        (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
#ifdef IN_PLACE
      __global DATA_TYPE *output_address = input_address;
#else  /* !IN_PLACE */
      __global DATA_TYPE *output_address =
        (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
#endif /* IN_PLACE */
      *(output_address) = (*(input_address)-mean) * multip + beta;
    }
  }
#else  // !defined(NHWC)
  // Normalization pass (NCHW): vectorized along X with a scalar left-overs loop.
  for (int y = 0; y < DIM_Y; ++y)
  {
    int x = 0;
    for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
    {
      __global DATA_TYPE *input_address =
        (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
#ifdef IN_PLACE
      __global DATA_TYPE *output_address = input_address;
#else  /* !IN_PLACE */
      __global DATA_TYPE *output_address =
        (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
#endif /* IN_PLACE */

      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
      data = VLOAD(VEC_SIZE)(0, input_address);

      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
      res = (data - mean) * multip + beta;
      VSTORE(VEC_SIZE)
      (res, 0, output_address);
    }
    // Left-overs loop
    for (; x < DIM_X; ++x)
    {
      __global DATA_TYPE *input_address =
        (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
#ifdef IN_PLACE
      __global DATA_TYPE *output_address = input_address;
#else  /* !IN_PLACE */
      __global DATA_TYPE *output_address =
        (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
#endif /* IN_PLACE */
      *(output_address) = (*(input_address)-mean) * multip + beta;
    }
  }
#endif // defined(NHWC)
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
          defined(DIM_Y) && defined(DIM_Z) */