1 // Copyright (c) 2018-2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
16 #include "include/common.cl"
17 #include "include/fetch.cl"
18 #include "include/data_types.cl"
19 #include "include/imad.cl"
21 #ifndef NON_BLOCK_LOAD
22 // block loads for inputs and weights should be fastest, but compiler seems
23 // to do better with a mix, regular loads for inputs and block loads for weights.
24 #define BLOCK_LOAD_WEIGHTS
26 // Input reading operation is always blocked.
27 #define BLOCK_LOAD_INPUTS
29 // for now kernel stride is square
30 #define K_WSTRIDE K_STRIDE
31 #define K_HSTRIDE K_STRIDE
33 // need KERNEL width for first output + STRIDE more for each additional.
34 #define IN_BLOCK_WIDTH (K_WIDTH + K_WSTRIDE * (OUT_BLOCK_WIDTH - 1))
35 #define IN_BLOCK_HEIGHT (K_HEIGHT + K_HSTRIDE * (OUT_BLOCK_HEIGHT - 1))
37 // for imad we are packing 4 8bit activations per 32 bit SIMD lane
38 // if we later add 4bit, then PACK would be 8.
41 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
42 KERNEL (convolution_gpu_imad)(
43 __global uint *inputs,
44 __global OUTPUT_TYPE *outputs,
47 ,__global BIAS_TYPE *biases
50 ,__global float *quantizations
53 ,__global float *calibrations
57 const uint oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
58 const uint or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
59 const uint fm = get_global_id(2); // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16
60 const uint fmg = get_group_id(2);
61 const uint lid = get_local_id(2);
62 const uint batch = fm / _OD;
64 uint in[IN_BLOCK_HEIGHT];
65 int out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 }; // this is the 32 bit signed accumulator that must be converted to 8 bits before final write.
67 #define NUM_FILTERS (K_HEIGHT * K_WIDTH)
72 #ifdef BLOCK_LOAD_WEIGHTS
73 int weight_addr = (fmg % (_OD / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK);
75 int weight_addr = (fmg % (_OD / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK) + lid;
78 uint input_size = (_ID * (_IH + IHPAD) * (_IW + IWPAD)) / PACK; // dividing by PACK to get right number of 32bit entities.
80 __attribute__((opencl_unroll_hint(1)))
81 for(int kd = 0; kd < (_ID / PACK); kd++) // For imad we do 4X less input feature map iterations since we are packing 4 of them in each uchar4. For now assume _ID is multiple of packing factor.
84 #ifdef BLOCK_LOAD_INPUTS
85 in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) + (or * K_STRIDE) * (_IW + IWPAD) + (oc * K_STRIDE);
87 in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) + (or * K_STRIDE) * (_IW + IWPAD) + (oc * K_STRIDE) + lid;
89 in_addr += batch * input_size; // adjust for batching
91 for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
92 #ifdef BLOCK_LOAD_INPUTS
93 in[reg] = intel_sub_group_block_read((const __global uint*) &inputs[in_addr]);
95 in[reg] = inputs[in_addr];// read SIMD_SIZE elements wide
97 in_addr += (_IW + IWPAD); // move to next row down
100 #ifdef BLOCK_LOAD_WEIGHTS
101 *((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
102 w[8]= as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
103 weight_addr += SIMD_SIZE*NUM_FILTERS;
105 for(int pf=0; pf < NUM_FILTERS; pf++) {
106 w[pf] = weights[weight_addr];
107 weight_addr += SIMD_SIZE;
112 int kr = 0; // kr = Kernel Row
115 int kc = 0; // kc = Kernel Column
118 for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
119 for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
120 uint input = sub_group_broadcast(in[br * K_HSTRIDE + kr], bc * K_WSTRIDE + kc);
122 out[br * OUT_BLOCK_WIDTH + bc] =
123 #ifdef CONVO_UNSIGNED
124 IMAD(out[br * OUT_BLOCK_WIDTH + bc], as_uchar4(input), as_char4(w[wi]));
126 IMAD(out[br * OUT_BLOCK_WIDTH + bc], as_char4(input), as_char4(w[wi]));
135 // Feature maps are an array of slices, each H,W position within the slice contains
136 // four 8bit feature maps, packed like RGBA components into a 32 bit pixel.
137 int row_size_bytes = (_OW + OWPAD) * PACK;
139 // Slice_pack is a pack of 4 feature map tiles that are [OH][OW][4]
140 // that are stored within the full [N][C/4][H][W][4] output.
141 int slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD);
143 // Dividing the feature map index by 4 gives us the slice_pack_index in each lane
144 // (each lane within block of 4 will have same index).
145 int slice_pack_index = fm / PACK;
147 // Each group of 4 simd lanes points to start of it's slice pack.
148 int slice_pack_start_addr_bytes = slice_pack_index * slice_pack_size_bytes;
150 // Make each lane within the group of 4(PACK) simd lanes point to an individual byte
151 // witihn the uchar4 at start of slice pack.
152 int slice_pack_addr_bytes = slice_pack_start_addr_bytes + (lid % PACK);
154 // Adjust to particular tile that we are working on
155 slice_pack_addr_bytes += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * row_size_bytes
156 + (oc + OUTPUT_PAD_BEFORE_SIZE_X) * PACK;
158 for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
159 for (int c = 0; c < OUT_BLOCK_WIDTH; c++) {
160 uint out_idx = slice_pack_addr_bytes + r * row_size_bytes + (c*PACK);
161 #if QUANTIZATION_TERM
162 int dotProd = out[r * OUT_BLOCK_WIDTH + c];
164 UNIT_TYPE dotProd = out[r * OUT_BLOCK_WIDTH + c];
168 const uint f = fm % _OD;
170 #error convolution_gpu_imad.cl: BIAS_PER_OUTPUT - not supported
172 const uint bias_index = f;
175 #if QUANTIZATION_TERM
178 dotProd = (UNIT_TYPE)round( ((float)dotProd * quantizations[f] * I_QF + biases[bias_index])
181 dotProd = (UNIT_TYPE)round( ((float)dotProd * quantizations[f] * I_QF + biases[bias_index])
183 #endif // CALIBRATION_TERM
185 dotProd += (UNIT_TYPE)biases[bias_index];
186 #endif // QUANTIZATION_TERM
189 #if QUANTIZATION_TERM
190 UNIT_TYPE dotProd_A = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
192 UNIT_TYPE dotProd_A = ACTIVATION(dotProd, NL_M, NL_N);
195 #ifdef CONVO_UNSIGNED
196 outputs[out_idx] = (uchar)( max((int)dotProd_A , 0) & 0xFF );
198 outputs[out_idx] = (uchar)dotProd_A & 0xFF;
200 } // for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
201 } // for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)