1 // Copyright (c) 2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/common.cl"
18 #include "include/data_types.cl"
19 #include "include/fetch.cl"
20 #include "include/imad.cl"
23 #define BYTES_PER_READ (sizeof(int))
24 #define BYTES_PER_READ8 (8 * BYTES_PER_READ)
26 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
27 KERNEL(fully_connected_gpu_IMAD)(
28 const __global INPUT0_TYPE* input,
29 __global OUTPUT_TYPE* output,
30 const __global FILTER_TYPE* weights
32 , const __global BIAS_TYPE* biases
35 ,const __global float* quantizations
38 ,const __global float* calibrations
42 // This kernel works with linearized data w/o strides and padding
43 // so only one dimension 'F' is required
44 const uint f = get_global_id(0);
45 const uint b = get_global_id(1);
47 if (f >= OUTPUT_FEATURE_NUM) {
53 uint idx_w = ((f / SIMD_SIZE) * SIMD_SIZE) * INPUT0_FEATURE_NUM;
54 const __global INPUT0_TYPE* current_input = &input[GET_DATA_INDEX(INPUT0, b, 0, 0, 0)];
56 for (uint idx_i = 0; idx_i < INPUT0_FEATURE_NUM; idx_i += BYTES_PER_READ8) {
57 int input_data = as_int(intel_sub_group_block_read((const __global uint*)(current_input + idx_i)));
58 int8 activations; //activations of all lanes
59 activations.s0 = sub_group_broadcast(input_data, 0);
60 activations.s1 = sub_group_broadcast(input_data, 1);
61 activations.s2 = sub_group_broadcast(input_data, 2);
62 activations.s3 = sub_group_broadcast(input_data, 3);
63 activations.s4 = sub_group_broadcast(input_data, 4);
64 activations.s5 = sub_group_broadcast(input_data, 5);
65 activations.s6 = sub_group_broadcast(input_data, 6);
66 activations.s7 = sub_group_broadcast(input_data, 7);
68 int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + idx_w)));
69 idx_w += SIMD_SIZE * BYTES_PER_READ8;
71 for (int i = 0; i < 8; i++) {
72 dotProd = IMAD(dotProd, as_char4(activations[i]), as_char4(weights_data[i]));
78 const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
80 const uint bias_index = f;
83 dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]);
84 #else // CALIBRATION_TERM
85 dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * O_QF);
86 #endif // CALIBRATION_TERM
89 const uint out_index = GET_DATA_INDEX(OUTPUT, b, f, 0, 0);
90 output[out_index] = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
95 #undef BYTES_PER_READ8