Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl
1 // Copyright (c) 2016-2017 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "include/common.cl"
16
17 #include "include/data_types.cl"
18 #include "include/fetch.cl"
19 #include "include/mmad.cl"
20
// Blocking factors of the kernel.
// OUT_BLOCK_HEIGHT is the number of output rows handled per work-group.
// WEIGHTS_PER_WORKITEM currently has to stay at 4: the output stage and the
// float4 loads of the quantization/bias/calibration factors depend on it.
#define OUT_BLOCK_HEIGHT     4
#define WEIGHTS_PER_WORKITEM 4

// Loop trip counts, rounded up: input features are consumed 4 at a time and
// filter columns 8 at a time.
#define FILTER_IFM_SLICES    ((FILTER_IFM_NUM + 3) / 4)
#define FILTER_SIZE_X_SLICES ((FILTER_SIZE_X + 7) / 8)

// Constant scale applied to the accumulator in the LIGHTWEIGHT_QUANTIZATION path.
#define SCALE 0.11f
28
// QUANTIZATION turns one int accumulator element into one output byte.
// It is expanded inside the kernel's output stage and relies on these names
// being in scope at the expansion site: out, dotProd, w, h, i, bias_f, quant_f
// and, when CALIBRATION_TERM is set, calib_f.
// The macro bodies carry no trailing ';' - the call site supplies it.
#ifdef LIGHTWEIGHT_QUANTIZATION

// Fixed-scale path: accumulator * SCALE + bias, saturated to uchar.
#define QUANTIZATION \
    out[w] = convert_uchar_sat((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * SCALE + bias_f[w])

#elif NO_QUANTIZATION

// Pass-through path: the raw accumulator is saturated to uchar.
#define QUANTIZATION \
    out[w] = convert_uchar_sat(dotProd[w*OUT_BLOCK_HEIGHT + h][i])

#else

// Full path: (accumulator * quant_f * I_QF + bias) * output factor, rounded,
// converted to char, passed through ACTIVATION and reinterpreted as uchar.
// The output factor is calib_f[w] when CALIBRATION_TERM is set; calib_f is
// not declared otherwise, so O_QF is used instead, matching the kernel's
// scalar output path.
#if CALIBRATION_TERM

#define QUANTIZATION \
    out[w] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w])), NL_M, NL_N))

#else  // CALIBRATION_TERM

#define QUANTIZATION \
    out[w] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF)), NL_M, NL_N))

#endif // CALIBRATION_TERM

#endif
45
// Convolution kernel that accumulates with MMAD_8x8 into int accumulators and
// writes quantized bytes. Input is addressed through GET_DATA_BYX8_F4_INDEX,
// weights through GET_FILTER_OS_IS_Y_X8_OSV8_ISV4 and output through
// GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX. The sub-group size is fixed to 8.
//
// Work decomposition:
//   group id 0 -> first output feature `f` and batch `b`; consecutive groups
//                 advance by 8 * WEIGHTS_PER_WORKITEM output features
//   group id 1 -> first output column `x` (steps of 8)
//   group id 2 -> first output row `y` (steps of OUT_BLOCK_HEIGHT)
//
// Arguments:
//   input         - source activations
//   output        - destination buffer
//   weights       - filter data
//   biases        - bias values, loaded as 32-bit floats starting at feature `f`
//                   (assumes BIAS_TYPE is a 32-bit float - TODO confirm)
//   quantizations - quantization factors, loaded starting at feature `f`
//   calibrations  - calibration factors (present only when CALIBRATION_TERM)
//   split_idx     - not used by this kernel
__attribute__((intel_reqd_sub_group_size(8)))
KERNEL(convolution_gpu_byx8_f4_fs_bs_yx_bsv4_fsv32)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
    __global BIAS_TYPE* biases,
    __global float* quantizations,
#if CALIBRATION_TERM
    __global float* calibrations,
#endif
    uint split_idx)
{
    const uint x = get_group_id(1) * 8;
    const uint y = get_group_id(2) * OUT_BLOCK_HEIGHT;

    // Group id 0 enumerates (batch, feature block) pairs, feature block fastest.
    const uint f = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM) % OUTPUT_FEATURE_NUM;
    const uint b = (get_group_id(0) * 8 * WEIGHTS_PER_WORKITEM) / OUTPUT_FEATURE_NUM;

    // One int8 accumulator per (weight slot w, output row h) pair,
    // stored at index w * OUT_BLOCK_HEIGHT + h.
    int8 dotProd[OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 };

    // Top-left input coordinate of this block; may be negative because of padding.
    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    // k: input-feature slices (4 features each), j: filter rows,
    // i: filter-column slices (8 columns each).
    for (uint k = 0; k < FILTER_IFM_SLICES; ++k)
    {
        __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
        for (uint j = 0; j < FILTER_SIZE_Y; ++j)
        {
            const int input_offset_y = input_y + j * DILATION_SIZE_Y;

            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_SLICES)))
            for (uint i = 0; i < FILTER_SIZE_X_SLICES; i++)
            {
                int8 act_reg[OUT_BLOCK_HEIGHT]; // activations for MMAD

                // Preload the spatial data for every output row of the block.
                __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                for (uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
                {
                    uint input_idx = GET_DATA_BYX8_F4_INDEX(INPUT0, b, k * 4, input_offset_y + h * STRIDE_SIZE_Y, input_x + i * 8);

                    // Two consecutive sub-group block reads, plus a third one
                    // 8 * 8 input elements further on.
                    int2 _input_data_01 = as_int2(intel_sub_group_block_read2((__global uint*)(input + input_idx)));
                    int _input_data_2 = as_int(intel_sub_group_block_read((__global uint*)(input + input_idx + 8 * 8)));

                    // Elements 0 and 4 are the unshifted reads; the others are
                    // the same data shifted down by 1..3 * STRIDE_SIZE_X lanes,
                    // pulling in lanes from the following read.
                    act_reg[h][0] = _input_data_01[0];
                    act_reg[h][1] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 1);
                    act_reg[h][2] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 2);
                    act_reg[h][3] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 3);
                    act_reg[h][4] = _input_data_01[1];
                    act_reg[h][5] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 1);
                    act_reg[h][6] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 2);
                    act_reg[h][7] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 3);
                }

                // Iterate over the output-feature groups (8 features apart) of this work-group.
                __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
                for (uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
                {
                    uint filter_idx = GET_FILTER_OS_IS_Y_X8_OSV8_ISV4(FILTER, f + w * 8, k * 4, j, i * 8);
                    int8 _w = as_int8(intel_sub_group_block_read8((__global uint*)(weights + filter_idx)));

                    // The loaded weights are reused for every output row of the block.
                    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
                    for (uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
                    {
                        // MMAD on 8x WEIGHTS_PER_WORKITEM input channels elements for 8x outputs in WI
                        dotProd[w*OUT_BLOCK_HEIGHT + h] = MMAD_8x8(act_reg[h], _w, dotProd[w*OUT_BLOCK_HEIGHT + h]);
                    }
                }
            }
        }
    }

    // Per-feature factors for the output stage. The names quant_f, bias_f and
    // calib_f are referenced by the QUANTIZATION macro and must not be renamed.
    float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + f) ));
    float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + f) ));
#if CALIBRATION_TERM
    float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + f) ));
#endif

    __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
    for (uint h = 0; h < OUT_BLOCK_HEIGHT; h++)
    {
        const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f + get_sub_group_local_id(), y + h, x);

        // i selects one of the 8 elements of each accumulator; successive
        // elements land 32 * 4 output elements apart.
        __attribute__((opencl_unroll_hint(8)))
        for (uint i = 0; i < 8; i++)
        {

        #if WEIGHTS_PER_WORKITEM == 4

            // Pack the 4 weight slots into one uchar4 and store it with a
            // single sub-group block write. QUANTIZATION uses out, w, h and i.
            uchar4 out;
            __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
            for (uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
            {
                QUANTIZATION;
            }
            intel_sub_group_block_write_uc4((__global uchar*)(output + dst_index + 32 * 4 * i), out);

        #else

            // Scalar output path for other WEIGHTS_PER_WORKITEM values.
            // NOTE(review): quant_f/bias_f/calib_f are float4, so this path
            // presumably only works for values below 4 - confirm before use.
            __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM)))
            for (uint w = 0; w < WEIGHTS_PER_WORKITEM; w++)
            {
            #if CALIBRATION_TERM
                dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w]);
            #else  // CALIBRATION_TERM
                dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF);
            #endif // CALIBRATION_TERM
                output[dst_index + 32 * 4 * i + 8 * w] = ACTIVATION(convert_char(dotProd[w*OUT_BLOCK_HEIGHT + h][i]), NL_M, NL_N);
            }

        #endif
        }
    }

}
162
// Drop the kernel-local macros again, in the order they were defined.
#undef FILTER_IFM_SLICES
#undef FILTER_SIZE_X_SLICES

#undef OUT_BLOCK_HEIGHT
#undef WEIGHTS_PER_WORKITEM

#undef SCALE
#undef QUANTIZATION