Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / cl_kernels / deconvolution_gpu_bfyx_opt.cl
1 // Copyright (c) 2016-2017 Intel Corporation
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "include/include_all.cl"
16
17 #define WORK_GROUP_GROUP_SIZE 16
18
19 __attribute__((reqd_work_group_size(WORK_GROUP_GROUP_SIZE, 1, 1)))
20 KERNEL(deconvolution_gpu_bfyx_opt)(
21     const __global UNIT_TYPE* input,
22     __global UNIT_TYPE* output,
23     const __global UNIT_TYPE* filter,
24 #if BIAS_TERM
25     const __global UNIT_TYPE* bias,
26 #endif
27     uint split_idx
28 #if FUSED_ELTWISE
29         , const __global UNIT_TYPE* fuse_input
30 #endif
31         )
32 {
33     UNIT_TYPE result = UNIT_VAL_ZERO;
34
35     const uint b_f          = get_global_id(2);
36     const uint batch_offset = b_f / OUTPUT_FEATURE_NUM;
37     const uint ofm_offset   = b_f % OUTPUT_FEATURE_NUM;
38
39     const uint global_x_group    = get_group_id(0);    
40     const uint global_y_group    = get_group_id(1);
41
42     const uint local_x        = get_local_id(0);  
43     const uint local_y        = get_local_id(1);  
44
45     const uint stride_x_id = global_x_group % STRIDE_SIZE_X;
46     const uint stride_y_id = global_y_group % STRIDE_SIZE_Y;
47
48     const uint id_x = (global_x_group / STRIDE_SIZE_X) * STRIDE_SIZE_X * WORK_GROUP_GROUP_SIZE + local_x * STRIDE_SIZE_X + stride_x_id;
49     
50     if (id_x >= OUTPUT_SIZE_X)
51         return;
52
53     const uint id_y = (global_y_group / STRIDE_SIZE_Y) * STRIDE_SIZE_Y + local_y * STRIDE_SIZE_Y + stride_y_id;
54     const int in_x = (int)id_x + PADDING_SIZE_X - (FILTER_SIZE_X - 1);
55     const int in_y = (int)id_y + PADDING_SIZE_Y - (FILTER_SIZE_Y - 1);
56
57     const uint start_x = (STRIDE_SIZE_X - (in_x % STRIDE_SIZE_X)) % STRIDE_SIZE_X;
58     const uint start_y = (STRIDE_SIZE_Y - (in_y % STRIDE_SIZE_Y)) % STRIDE_SIZE_Y;
59
60 #if DEPTHWISE_SEPARABLE_OPT
61     const uint in_split_offset = (ofm_offset / FILTER_OFM_NUM) * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
62 #else
63     const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
64 #endif
65     const uint input_offset = INPUT0_OFFSET + batch_offset*INPUT0_BATCH_PITCH + in_split_offset;
66 #if GROUPED && !DEPTHWISE_SEPARABLE_OPT
67     const uint filter_offset = split_idx * FILTER_LENGTH;
68 #else
69     const uint filter_offset = 0;
70 #endif
71
72     for (uint i = start_y; i < FILTER_SIZE_Y; i+=STRIDE_SIZE_Y)
73     {
74         const int input_offset_y = in_y + i;
75         const bool zero_y = (input_offset_y >= INPUT0_SIZE_Y * STRIDE_SIZE_Y) || (input_offset_y < 0);
76
77         if(!zero_y)
78         {
79             for (uint j = start_x; j < FILTER_SIZE_X; j+=STRIDE_SIZE_X)
80             {
81                 const int input_offset_x = in_x + j;
82                 const bool zero_x = (input_offset_x >= INPUT0_SIZE_X * STRIDE_SIZE_X) || (input_offset_x < 0);
83
84                 if(!zero_x)
85                 {
86                     uint fixed_input_offset_x = (uint)input_offset_x / STRIDE_SIZE_X;
87                     uint fixed_input_offset_y = (uint)input_offset_y / STRIDE_SIZE_Y;
88                     uint input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH;
89
90 #if GRADIENT
91                     uint filter_idx = filter_offset + ofm_offset*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH;
92                     for (uint h = 0; h < FILTER_OFM_NUM; h++)
93                     {
94                         result = fma(input[input_idx], filter[filter_idx], result);
95                         filter_idx += FILTER_OFM_PITCH;
96                         input_idx += INPUT0_FEATURE_PITCH;
97                     }
98 #else
99                     uint filter_idx = filter_offset + ofm_offset*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH;
100                     for (uint h = 0; h < FILTER_IFM_NUM; h++)
101                     {
102                         result = fma(input[input_idx], filter[filter_idx], result);
103                         filter_idx += FILTER_IFM_PITCH;
104                         input_idx += INPUT0_FEATURE_PITCH;
105                     }
106 #endif
107                 }
108             }
109         }
110     }
111 #if BIAS_TERM
112 #if GROUPED && !DEPTHWISE_SEPARABLE_OPT
113     const uint bias_offset = split_idx * BIAS_LENGTH;
114 #else
115     const uint bias_offset = 0;
116 #endif
117     result += bias[ofm_offset + bias_offset];
118 #endif
119     const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
120     const uint dst_index = OUTPUT_OFFSET + out_split_offset + batch_offset*OUTPUT_BATCH_PITCH + ofm_offset*OUTPUT_FEATURE_PITCH + id_y*OUTPUT_Y_PITCH + id_x*OUTPUT_X_PITCH;
121 #if FUSED_ELTWISE
122     const uint fused_index = INPUT1_OFFSET + split_idx * INPUT1_FEATURE_PITCH * FILTER_OFM_NUM + batch_offset*INPUT1_BATCH_PITCH + ofm_offset*INPUT1_FEATURE_PITCH + id_y*INPUT1_Y_PITCH + id_x*INPUT1_X_PITCH;
123 #if !GRADIENT
124         output[dst_index] = ACTIVATION(result + fuse_input[fused_index], NL_M, NL_N);
125 #else
126         output[dst_index] = result + fuse_input[fused_index];
127 #endif
128 #else
129     output[dst_index] = ACTIVATION(result, NL_M, NL_N);
130 #endif
131 }
132
133 #undef ACTIVATION
134 #undef WORK_GROUP_GROUP_SIZE