2 * Copyright (c) 2017-2018 ARM Limited.
4 * SPDX-License-Identifier: MIT
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// Arithmetic-operator abstraction layer.
// In fixed-point builds (QS8/QS16, FIXED_POINT_POSITION defined) the *_OP macros
// map to the saturating fixed-point helpers from fixed_point.h; otherwise they
// map to the plain floating-point operators/builtins.
// NOTE(review): the embedded original line numbers are non-contiguous, so this
// excerpt has dropped lines (braces, #if guards, etc.); annotations below flag
// the spots where structure is visibly missing.
26 #ifdef FIXED_POINT_POSITION
28 #include "fixed_point.h"
29 #define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
30 #define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
31 #define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
32 #define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
33 #define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
34 #define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
// MINVAL expands to the minimum representable value of DATA_TYPE (e.g. char_MIN)
// via token pasting; it seeds the running-max accumulators below.
36 #define MIN_VAL_EXPAND(type) type##_MIN
37 #define MIN_VAL(type) MIN_VAL_EXPAND(type)
38 #define MINVAL MIN_VAL(DATA_TYPE)
39 #define SELECT_DATA_TYPE EXPAND(DATA_TYPE)
41 #else /* FIXED_POINT_POSITION */
43 #define MAX_OP(x, y, type, size) max((x), (y))
44 #define ADD_OP(x, y, type, size) ((x) + (y))
45 #define SUB_OP(x, y, type, size) ((x) - (y))
46 #define MUL_OP(x, y, type, size) ((x) * (y))
47 #define DIV_OP(x, y, type, size) ((x) / (y))
48 #define EXP_OP(x, type, size) exp((x))
// NOTE(review): two MINVAL/SELECT_DATA_TYPE pairs follow with no visible guard
// between them — the original presumably selects the half-precision pair with
// an #ifdef USE_F16 (or similar) that was dropped from this excerpt. Confirm
// against the upstream file before editing.
51 #define MINVAL -HALF_MAX
52 #define SELECT_DATA_TYPE short
54 #define MINVAL -FLT_MAX
55 #define SELECT_DATA_TYPE int
58 #endif /* FIXED_POINT_POSITION */
60 /* Number of workitems in dimension 0. */
// NOTE(review): this guarded block is empty here; the original presumably
// supplies a default "#define GRID_SIZE <n>" inside it — TODO confirm.
61 #if !defined(GRID_SIZE)
63 #endif /* !defined(GRID_SIZE) */
65 /* Vector size, i.e. number of vector elements. */
// Per-VECTOR_SIZE constants: type_min_ seeds max reductions, idx__ is the
// lane-index vector used for tail masking in NON_MULTIPLE_OF_VECTOR_SIZE paths.
// NOTE(review): the opening "#if VECTOR_SIZE == 2" (and its VECTOR_SIZE /
// LOG_VECTOR_SIZE defines) is missing from this excerpt — only the #elif
// chain is visible.
67 __constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
68 __constant uint2 idx__ = (uint2)(0, 1);
70 #elif VECTOR_SIZE == 4
71 __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
72 __constant uint4 idx__ = (uint4)(0, 1, 2, 3);
74 #elif VECTOR_SIZE == 8
75 __constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
76 __constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
78 #else /* VECTOR_SIZE DEFAULT */
79 #define VECTOR_SIZE 16
80 #define LOG_VECTOR_SIZE 4
81 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
82 __constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
84 #endif /* VECTOR_SIZE END */
// Fixed 16-wide variants, independent of VECTOR_SIZE (used by the norm kernel
// and the 4-wide parallel kernel's tail handling).
86 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
87 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
88 __constant uint4 idx4 = (uint4)(0, 1, 2, 3);
90 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
92 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
93 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
95 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
96 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
97 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
98 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
99 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
100 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
101 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
102 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
103 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
104 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
105 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
106 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
107 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
108 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
109 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
110 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
111 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
112 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
113 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
114 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
115 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
116 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
117 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
118 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
// Normalization step of softmax: divides every element of a row by the row's
// sum of exponentials produced by the shift/exp/sum kernel, yielding the final
// probabilities. One work-item normalizes 16 elements (vload16/vstore16).
// NOTE(review): the embedded numbering skips lines here — the kernel body's
// opening/closing braces are not visible in this excerpt.
120 __kernel void softmax_layer_norm(
121 TENSOR3D_DECLARATION(src),
122 TENSOR3D_DECLARATION(sum),
123 TENSOR3D_DECLARATION(dst))
125 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
126 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
127 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
// Load this row's sum of exponentials (one scalar per row, indexed by the
// Y global id). Despite the old comment, this is the SUM, not the max.
130 DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
// Load 16 exp-shifted values, divide element-wise by the row sum, store.
// DIV_OP is a saturating fixed-point division in QS8/QS16 builds.
131 VEC_DATA_TYPE(DATA_TYPE, 16)
132 data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
133 vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
136 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
137 * then gets the exponent of each element as sums all elements across each row.
139 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
140 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
141 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
142 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
144 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
145 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
146 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
147 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
148 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
149 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
150 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
151 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
152 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
153 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
154 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
155 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
156 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
157 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
158 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
159 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
160 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
161 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
162 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
163 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
164 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
165 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
166 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
167 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
168 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
169 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
170 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
171 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
172 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
173 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
174 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
175 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
176 * @param[in] width Input image width
// Serial (one work-item per row) fused kernel: computes the row max, then
// exp(beta * (x - max)) for each element, storing the exponentials to dst and
// the per-row sum of exponentials to sum. The max/sum outputs feed
// softmax_layer_norm.
// NOTE(review): the embedded numbering skips lines throughout this kernel —
// braces, the VSTORE(VECTOR_SIZE) macro names before the "(data, 0, ...)"
// argument lines, the sum1D zero-initializer, and the "#elif VECTOR_SIZE >= N"
// guards of the reduction ladders are all missing from this excerpt.
178 __kernel void softmax_layer_max_shift_exp_sum_serial(
179 TENSOR3D_DECLARATION(src),
180 TENSOR3D_DECLARATION(maxo),
181 TENSOR3D_DECLARATION(dst),
182 TENSOR3D_DECLARATION(sum),
185 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
186 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
187 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
188 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
// Broadcast the BETA scaling factor across a whole vector lane-wise.
192 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
193 beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
196 // Initialize local maximum
197 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
198 max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
200 // Calculate max of row
// width_ = number of full VECTOR_SIZE-wide chunks in the row.
201 const uint width_ = width >> LOG_VECTOR_SIZE;
202 for(uint i = 0; i < width_; i++)
204 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
205 data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
206 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
// Tail handling: load the last (partial) vector and mask out-of-row lanes to
// type_min_ so they cannot win the max. widx is true for lanes whose global
// index is still < width.
209 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
210 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
211 data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
212 VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
213 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
214 max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
215 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
// Pairwise halving reduction of the max vector down to a scalar in lane s0.
// NOTE(review): the "#if VECTOR_SIZE >= 8" / ">= 4" openers matching the
// #endif comments below are missing from this excerpt.
217 // Perform max reduction
218 #if VECTOR_SIZE == 16
219 max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
220 #endif /* VECTOR SIZE 16 END */
222 max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
223 #endif /* VECTOR SIZE 8 END */
225 max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
226 #endif /* VECTOR SIZE 4 END */
227 max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
// Publish the row maximum for consumers of the maxo tensor.
229 *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
233 // Load max value of 1D logits vector (row)
234 DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
// sum1D accumulates the per-lane sum of exponentials.
// NOTE(review): its "= 0" initializer line is missing from this excerpt.
237 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
240 // Shift values, exp and sum
241 for(uint i = 0; i < width_; i++)
243 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
244 data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
// Numerically stable softmax: subtract the row max before exponentiating.
245 data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
247 data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
249 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
// NOTE(review): the VSTORE(VECTOR_SIZE) macro name preceding this argument
// list is missing from this excerpt — this line is the store to dst.
251 (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
252 sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
// Tail: same shift/exp, but zero out lanes past the row end so they do not
// contribute to the sum, then store and accumulate.
255 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
256 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
257 data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
258 data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
260 data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
262 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
263 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
264 data = select(0, data, widx);
266 (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
267 sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
268 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
// Pairwise halving reduction of sum1D down to a scalar (mirrors the max
// reduction above; same missing #if openers in this excerpt).
270 // Perform sum reduction
271 #if VECTOR_SIZE == 16
272 sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
273 #endif /* VECTOR SIZE 16 END */
275 sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
276 #endif /* VECTOR SIZE 8 END */
278 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
279 #endif /* VECTOR SIZE 4 END */
280 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
282 // Calculate and store result
283 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
286 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
287 * then gets the exponent of each element as sums all elements across each row.
289 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
290 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
291 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
292 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
294 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
295 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
296 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
297 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
298 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
299 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
300 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
301 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
302 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
303 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
304 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
305 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
306 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
307 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
308 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
309 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
310 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
318 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
319 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
320 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
322 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
324 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
325 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
326 * @param[in] width Input image width
// Parallel variant of the fused max/shift/exp/sum kernel: a work-group of
// GRID_SIZE work-items cooperates on ONE row. Each work-item scans a strided
// subset of the row with 4-wide vectors, partial results are combined through
// a tree reduction in __local memory (tmp_local), and the final row max / row
// sum are produced from lane 0.
// NOTE(review): the embedded numbering skips lines throughout — braces, the
// declaration/initialization of loop counter i and of sum1D, the
// VSTORE(4) macro names before the "(data, 0, ...)" lines, and the
// "#if GRID_SIZE >= N" guards around each tree-reduction step are missing
// from this excerpt.
328 __kernel void softmax_layer_max_shift_exp_sum_parallel(
329 TENSOR3D_DECLARATION(src),
330 TENSOR3D_DECLARATION(maxo),
331 TENSOR3D_DECLARATION(dst),
332 TENSOR3D_DECLARATION(sum),
335 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
336 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
337 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
338 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
// Local id selects this work-item's strided slice of the row.
340 const uint lid = get_local_id(0);
// Broadcast BETA across a 4-wide vector.
344 VEC_DATA_TYPE(DATA_TYPE, 4)
345 beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
348 // Define one temporary vector per work-item.
349 __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
350 __local DATA_TYPE max_local;
352 __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
353 VEC_DATA_TYPE(DATA_TYPE, 4)
354 max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
355 // Number of elements per work-item.
356 const uint row = width / GRID_SIZE;
357 // Number of iterations per work-item.
358 const uint width_ = row >> 2;
359 // Calculate max of row
// Strided scan: iteration i touches the 4-wide chunk at i * GRID_SIZE * 4
// (offset() adds the per-work-item step). NOTE(review): i's declaration line
// is missing from this excerpt.
361 for(; i < width_; i++)
363 VEC_DATA_TYPE(DATA_TYPE, 4)
364 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
365 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
// Remainder chunks when width is not a multiple of GRID_SIZE*4: only the
// first boundary_workitems lanes process one extra full 4-wide chunk.
367 #ifdef NON_MULTIPLE_OF_GRID_SIZE
368 // How many work-items needed to complete the computation.
369 int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
370 if(lid < boundary_workitems)
372 VEC_DATA_TYPE(DATA_TYPE, 4)
373 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
374 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
// Sub-vector tail (< 4 leftover elements): one designated work-item loads the
// final partial vector and masks invalid lanes to type_min_ via widx.
376 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
377 if(boundary_workitems == 0)
379 boundary_workitems = GRID_SIZE;
382 if(lid == (boundary_workitems - 1))
384 // Handle non multiple of 4
385 VEC_DATA_TYPE(DATA_TYPE, 4)
386 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
387 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
388 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
389 max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
391 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
392 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
// Publish this work-item's partial max, then tree-reduce across the group.
// Each step halves the number of active lanes; barriers keep tmp_local
// consistent. NOTE(review): the "#if GRID_SIZE >= 256/128/..." guards and the
// "if(lid < N)" lines around each step are missing from this excerpt.
393 tmp_local[lid] = max_val_vec;
395 barrier(CLK_LOCAL_MEM_FENCE);
401 tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
403 barrier(CLK_LOCAL_MEM_FENCE);
409 tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
411 barrier(CLK_LOCAL_MEM_FENCE);
417 tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
419 barrier(CLK_LOCAL_MEM_FENCE);
425 tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
427 barrier(CLK_LOCAL_MEM_FENCE);
433 tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
435 barrier(CLK_LOCAL_MEM_FENCE);
441 tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
443 barrier(CLK_LOCAL_MEM_FENCE);
449 tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
451 barrier(CLK_LOCAL_MEM_FENCE);
// Final step (presumably lid == 0): fold the last two partials, reduce the
// 4-wide vector to a scalar, and broadcast the row max through local memory.
455 max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
456 max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
457 max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
458 max_local = max_val_vec.s0;
460 barrier(CLK_LOCAL_MEM_FENCE);
// Phase 2: shift by the row max, scale by beta, exponentiate, store to dst,
// and accumulate the sum of exponentials in sum1D.
// NOTE(review): sum1D's name/initializer line after this declaration is
// missing from this excerpt.
465 VEC_DATA_TYPE(DATA_TYPE, 4)
467 DATA_TYPE max_val = max_local;
469 // Shift values, exp and sum
470 for(i = 0; i < width_; i++)
472 VEC_DATA_TYPE(DATA_TYPE, 4)
473 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
474 data = SUB_OP(data, max_val, DATA_TYPE, 4);
476 data = MUL_OP(data, beta, DATA_TYPE, 4);
478 data = EXP_OP(data, DATA_TYPE, 4);
// NOTE(review): the VSTORE(4) macro name preceding this argument list is
// missing from this excerpt — this line is the store to dst.
480 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
481 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
// Remainder chunks, mirroring the max-phase tail handling.
483 #ifdef NON_MULTIPLE_OF_GRID_SIZE
484 boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
485 if(lid < boundary_workitems)
487 VEC_DATA_TYPE(DATA_TYPE, 4)
488 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
489 data = SUB_OP(data, max_val, DATA_TYPE, 4);
491 data = MUL_OP(data, beta, DATA_TYPE, 4);
493 data = EXP_OP(data, DATA_TYPE, 4);
495 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
496 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
498 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
499 if(boundary_workitems == 0)
501 boundary_workitems = GRID_SIZE;
504 if(lid == (boundary_workitems - 1))
506 // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
507 VEC_DATA_TYPE(DATA_TYPE, 4)
508 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
509 data = SUB_OP(data, max_val, DATA_TYPE, 4);
511 data = MUL_OP(data, beta, DATA_TYPE, 4);
513 data = EXP_OP(data, DATA_TYPE, 4);
// Zero out-of-row lanes so they do not contribute to the stored values or sum.
514 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
515 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
516 data = select(0, data, widx);
518 (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
519 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
521 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
522 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
// Tree-reduce the partial sums across the work-group, same structure as the
// max reduction above (same missing #if GRID_SIZE guards in this excerpt).
523 tmp_local[lid] = sum1D;
525 barrier(CLK_LOCAL_MEM_FENCE);
531 tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
533 barrier(CLK_LOCAL_MEM_FENCE);
539 tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
541 barrier(CLK_LOCAL_MEM_FENCE);
547 tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
549 barrier(CLK_LOCAL_MEM_FENCE);
555 tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
557 barrier(CLK_LOCAL_MEM_FENCE);
563 tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
565 barrier(CLK_LOCAL_MEM_FENCE);
571 tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
573 barrier(CLK_LOCAL_MEM_FENCE);
579 tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
581 barrier(CLK_LOCAL_MEM_FENCE);
585 sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
586 // Reduce the final 4-wide vector of partial sums to a scalar (this is a sum reduction, not a max reduction)
587 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
588 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
589 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;