2 // Copyright (c) 2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "include/include_all.cl"
// GET_INDEX(src): expands to GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX for the tensor
// name `src`, using the identifiers d4, d3, d2 and d1, which must be in scope
// wherever the macro is expanded.
// Two alternative definitions follow; the preprocessor conditional that selects
// between them is not visible in this view.
// Variant 1: d2 and d1 are multiplied by CAT(src, _STRIDE_Y) and
// CAT(src, _STRIDE_X) before being passed on.
// NOTE(review): CAT is defined outside this file; presumably it token-pastes
// `src` with the suffix - confirm.
20 #define GET_INDEX(src) \
21 GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2 * CAT(src, _STRIDE_Y), d1 * CAT(src, _STRIDE_X))
// Variant 2: d2 and d1 are passed through unscaled.
23 #define GET_INDEX(src) \
24 GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2, d1)
// get_int16: reads packed data starting at `src + idx` and unpacks it into 16
// separate elements of `to_return`.
//   src - global buffer of UNIT_TYPE elements (read-only).
//   idx - offset, in UNIT_TYPE elements, added to `src` before the pointer is
//         cast to `const __global uint*` for the block read.
// NOTE(review): the braces, the declaration of `to_return` and the return
// statement are not visible in this view; presumably `to_return` is an int16
// that is returned to the caller - confirm.
27 int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx)
// Sub-group block read of 4 uints, reinterpreted bit-for-bit as int4.
29 int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(src + idx)));
// b selects one of the four ints that were read; f selects one of the four
// chars packed inside that int.
31 for(uint b = 0; b < 4; b++)
33 for(uint f = 0; f < 4; f++)
// View int number b as a char4 and store its f-th char in slot b * 4 + f.
35 to_return[b * 4 + f] = as_char4(int_data[b])[f];
// GET_INPUT(A, B): calls get_int16 on buffer A at the index that GET_INDEX
// yields for tensor name B (so d1..d4 must be in scope at the expansion site).
// NOTE(review): FUNC_CALL is defined outside this file; presumably it resolves
// the name produced by FUNC(get_int16) - confirm.
40 #define GET_INPUT(A, B) FUNC_CALL(get_int16)(A, GET_INDEX(B))
// Kernel eltwise_fs_bs_yx_bsv4_fsv32.
// The visible part derives tensor coordinates from the global ids, converts the
// 16 int values held in `res` to chars (with scaling and ACTIVATION applied),
// packs them four per int, and block-writes the result to `output`.
//   output       - global buffer of UNIT_TYPE elements that receives the result.
//   calibrations - float scale factors, indexed by d3 + f.
// NOTE(review): the input parameter declarations, the code that fills `res`,
// the declarations of `res`, `char_res` and `char_result`, the braces, and the
// #if lines guarding the calibration code are not visible in this view.
// The attribute below requires a sub-group size of 8.
42 __attribute__((intel_reqd_sub_group_size(8)))
43 KERNEL(eltwise_fs_bs_yx_bsv4_fsv32)(
45 __global UNIT_TYPE* output
47 , const __global float* calibrations
// OUTPUT_FEATURE_NUM rounded up to the next multiple of 32.
51 const uint of_32_aligned = ((OUTPUT_FEATURE_NUM + 31) / 32) * 32;
52 const uint d1 = get_global_id(0); // X
53 const uint d2 = get_global_id(1); // Y
// get_global_id(2) * 4 is treated as a linear index: its remainder modulo the
// aligned feature count is the feature coordinate, and its quotient, multiplied
// by 4, is the batch coordinate.
54 const uint d3 = (get_global_id(2) * 4) % of_32_aligned; // Feature
55 const uint d4 = 4 * ((get_global_id(2) * 4) / of_32_aligned); // Batch
// b selects which int of char_result is being built; f selects the char within
// it and is also the feature offset from d3 used to index `calibrations`.
// NOTE(review): b presumably corresponds to the batch within the block of 4 -
// confirm against the layout definition.
62 for(uint b = 0; b < 4; b++)
65 for(uint f = 0; f < 4; f++)
67 int res_tmp = res[b * 4 + f];
// Scale the value as a float, apply round(), and cast back to int. One branch
// uses the factor calibrations[d3 + f], the other the constant O_QF; the #if
// that opens this conditional is not visible in this view.
69 res_tmp = (int)round(((float)res_tmp) * calibrations[d3+f]);
70 #else // CALIBRATION_TERM
71 res_tmp = (int)round(((float)res_tmp) * O_QF);
72 #endif // CALIBRATION_TERM
// Convert to char and apply ACTIVATION with NL_M and NL_N (defined elsewhere).
// NOTE(review): convert_char is the non-saturating conversion; confirm that
// res_tmp is guaranteed to fit in a char here, or whether convert_char_sat was
// intended.
73 char_res[f] = ACTIVATION(convert_char(res_tmp), NL_M, NL_N);
75 // pack 4 chars into int
76 char_result[b] = as_int(char_res);
// Offset of coordinate (d4, d3, d2, d1) in OUTPUT, followed by a sub-group
// block write of char_result reinterpreted as uint4.
79 uint output_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, d4, d3, d2, d1);
80 intel_sub_group_block_write4((__global uint*)(output + output_offset), as_uint4(char_result));