Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / actual_kernels / fused_conv_eltwise / fused_conv_eltwise_kernel_bfyx_1x1_opt.cpp
1 /*
2 // Copyright (c) 2018 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "fused_conv_eltwise_kernel_bfyx_1x1_opt.h"
18 #include "kernel_selector_utils.h"
19
20 namespace kernel_selector {
21
22     ParamsKey fused_conv_eltwise_kernel_bfyx_1x1_opt::GetSupportedKey() const
23         {
24         ParamsKey k;
25         k.EnableInputDataType(Datatype::F32);
26         k.EnableInputWeightsType(WeightsType::F32);
27         k.EnableOutputDataType(Datatype::F32);
28         k.EnableInputLayout(DataLayout::bfyx);
29         k.EnableOutputLayout(DataLayout::bfyx);
30         k.EnableTensorOffset();
31         k.EnableTensorPitches();
32         k.EnableSubGroup();
33         //k.EnableSubGroupShort(); // we need it for FP16 only. we check it on the Validate phase
34         k.EnableBiasPerFeature();
35         k.EnableNonBiasTerm();
36         k.EnableBatching();
37         k.EnableFusedConvEltwSplitSupport();
38         k.EnableFusedConvEltwiseRWOutOpt(); // data for second input are already in output
39         return k;
40         }
41
    // Size of the output tile computed per work-item:
    // out_width x out_height spatial positions across out_depth output
    // feature channels. Member order matters — callers aggregate-initialize
    // this as { width, height, depth }.
    struct block_params
    {
        int32_t out_width;
        int32_t out_height;
        int32_t out_depth;
    };
48
49     static block_params get_out_block_size(const fused_conv_eltwise_params& p)
50     {
51         auto out_depth = 8;
52
53         if (p.output.X().v == 7)
54         {
55             auto gws0 = p.output.X().v / 7;
56             auto gws1 = p.output.Y().v / 1;
57             auto gws2 = 2 * (p.output.Feature().v * p.output.Batch().v) / 8; // process 8 output channels per Workitem
58
59             auto compute_units = p.engineInfo.computeUnitsCount;
60             auto total_threads = (gws0 * gws1 * gws2) / 64;
61             if (total_threads < compute_units)
62             {
63                 out_depth /= 2;
64                 total_threads *= 2;
65             }
66             if (total_threads < compute_units)
67             {
68                 out_depth /= 2;
69                 total_threads *= 2;
70             }
71             return { 7,1,out_depth };
72         }
73         else if (p.output.X().v == 14)
74             return { 7,1,8 };
75         else if (p.output.X().v == 28)
76             return { 7,2,4 };
77         else if (p.output.X().v == 56)
78             return { 8,1,8 };
79
80         return { 1,1,1 };
81     }
82
83     std::string fused_conv_eltwise_kernel_bfyx_1x1_opt::GetKernelName(const fused_conv_eltwise_params& params) const
84     {
85         if (params.inputs[0].GetDType() == Datatype::F32)
86         {
87             return kernelName + "_fp32";
88         }
89         else
90         {
91             return kernelName + "_fp16";
92         }
93     }
94
95         bool fused_conv_eltwise_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_params& o) const
96         {
97                 if (!fused_conv_eltwise_kernel_base::Validate(p, o) ||
98                         !FusedConvolutionEltwiseCheckInput(p, o))
99                 {
100                         return false;
101                 }
102
103                 const fused_conv_eltwise_params& cp = static_cast<const fused_conv_eltwise_params&>(p);
104                 
105         if (cp.conv.stride.x != 1 || cp.conv.stride.y != 1)
106             return false;
107
108         if (cp.conv.filterSize.x != 1 || cp.conv.filterSize.y != 1)
109             return false;
110
111         if (cp.output.Feature().v % 64 != 0)
112             return false;
113
114         if (cp.conv.padding.x != 0 || cp.conv.padding.y != 0)
115             return false;
116
117         // if block sizes are 1x1, then this algorithm is probably not the best
118         auto block = get_out_block_size(cp);
119         if (block.out_width == 1 && block.out_height == 1)
120             return false;
121
122         if (cp.output.X().v % block.out_width != 0)
123             return false;
124         if (cp.output.Y().v % block.out_height != 0)
125             return false;
126
127                 return true;
128         }
129
130     std::vector<WeightsLayout> fused_conv_eltwise_kernel_bfyx_1x1_opt::GetSupportedWeightLayouts(const fused_conv_eltwise_params& p) const
131     {
132         auto block = get_out_block_size(p);
133         if (block.out_depth == 8)
134             return { WeightsLayout::os_iyx_osv64 };
135         if (block.out_depth == 4)
136             return { WeightsLayout::os_iyx_osv32 };
137         if (block.out_depth == 2)
138             return { WeightsLayout::os_iyx_osv16 };
139         else
140             return{ WeightsLayout::yxio };
141     }
142
143     fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_1x1_opt::SetDefault(const fused_conv_eltwise_params& arg, int) const
144         {
145         DispatchData runInfo = Parent::SetDefault(arg);
146
147         constexpr size_t sub_group_size = 8;
148
149         runInfo.effiency = FORCE_PRIORITY_3;
150
151         auto block = get_out_block_size(arg);
152
153         runInfo.gws0 = arg.output.X().v / block.out_width;
154         runInfo.gws1 = arg.output.Y().v / block.out_height;
155         runInfo.gws2 = 2 * (arg.output.Feature().v * arg.output.Batch().v) / block.out_depth; // process 8 output channels per Workitem
156
157         runInfo.lws0 = 1;
158         runInfo.lws1 = 1;
159         runInfo.lws2 = 2 * sub_group_size;
160
161         return runInfo;
162         }
163
164         JitConstants fused_conv_eltwise_kernel_bfyx_1x1_opt::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const
165         {
166                 auto jit = Parent::GetJitConstants(params, runInfo);
167
168         auto block = get_out_block_size(params);
169         jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
170         jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
171         jit.AddConstant(MakeJitConstant("OUT_BLOCK_DEPTH", block.out_depth));
172
173         if (!params.eltw.stride.empty())
174         {
175             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x));
176             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y));
177         }
178         else
179         {
180             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1));
181             jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1));
182         }
183
184         return jit;
185         }
186
187     KernelsData fused_conv_eltwise_kernel_bfyx_1x1_opt::GetKernelsData(const Params& params, const optional_params& options) const
188     {
189         KernelsData kd = GetCommonKernelsData(params, options);
190         if (!kd.empty())
191             kd[0].estimatedTime = FORCE_PRIORITY_1;
192         return kd;
193     }
194 }