Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / actual_kernels / convolution / convolution_kernel_yxfb_yxio_b1_block_multiple_x.cpp
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h"
18
19 namespace kernel_selector 
20 {
21
    // Work-group / sub-group size along gws0 (ofm * batch). Must divide
    // gws0 evenly; Validate() rejects configurations where ofm * batch
    // is not a multiple of 16 (SIMD16 sub-group).
    constexpr size_t local_work_size = 16;
24     ParamsKey ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetSupportedKey() const
25     {
26         ParamsKey k;
27         k.EnableInputDataType(Datatype::F32);
28         k.EnableInputWeightsType(WeightsType::F16);
29         k.EnableInputWeightsType(WeightsType::F32);
30         k.EnableOutputDataType(Datatype::F32);
31         k.EnableInputLayout(DataLayout::yxfb);
32         k.EnableOutputLayout(DataLayout::yxfb);
33         k.EnableTensorOffset();
34         k.EnableTensorPitches();
35         k.EnableBiasPerFeature();
36         k.EnableNonBiasTerm();
37         k.EnableBatching();
38         k.EnableSplitSupport();
39         k.EnableDilation();
40         k.EnableSubGroup();
41         return k;
42     }
43
44     namespace {
45         size_t GetOfmPerWorkitem(size_t filter_ofm_num, size_t localWorkSize)
46         {
47             if (filter_ofm_num % (localWorkSize * 4) == 0)
48                 return 4;
49             if (filter_ofm_num % (localWorkSize * 2) == 0)
50                 return 2;
51             return 1;
52         }
53     }
54
55     ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::SetDefault(const convolution_params& arg, int autoTuneIndex) const
56     {
57         DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
58
59         const auto filter_ofm_num = arg.weights.OFM().v;
60         const auto batch_size = arg.output.Batch().v;
61
62         runInfo.lws0 = local_work_size;
63
64         // We cannot return 8 because we are processing 4 spatial coordinates for batch1,
65         // and if we use more than 4 ofm_per_work_item we downgrade simd16 to simd8 which would break this algorithm.
66         // NOTE: We could return 8 but then we must process only 2 coordinates, which is slower than processing 4 coordinates using blockread4
67         // TODO: experiment with SIMD8 version of algorithm and check if it could be faster
68         /*if (output_feature_count % (lws * 8) == 0)
69         {
70         run_info.ofm_per_work_item = 8;
71         run_info.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(run_info.gws1) / 2.0f));
72         }
73         else*/
74         const size_t ofmPerWorkItem = GetOfmPerWorkitem(filter_ofm_num, local_work_size);
75         if (ofmPerWorkItem == 4)
76         {
77             // We compute multiple spatial coordinates "x" in a single workitem that's why we must divide
78             runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 4.0f));
79         }
80         else if (ofmPerWorkItem == 2)
81         {
82             runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
83         }
84         else
85         {
86             runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
87         }
88
89         runInfo.gws0 = filter_ofm_num * batch_size / ofmPerWorkItem;
90         
91         return runInfo;
92     }
93
94     JitConstants ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetJitConstants(const convolution_params& params, const DispatchData& kd) const
95     {
96         auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, kd);
97
98         size_t ofmPerWorkItem = GetOfmPerWorkitem(params.weights.OFM().v, local_work_size);
99         cldnn_jit.AddConstant(MakeJitConstant("USE_VECTOR", ofmPerWorkItem));
100         if (ofmPerWorkItem == 8)
101         {
102             cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 2));
103         }
104         else if (ofmPerWorkItem == 4)
105         {
106             cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 4));
107         }
108         else
109         {
110             cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 8));
111         }
112
113         cldnn_jit.AddConstant(MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem)); // how many output feature maps for a single batch will a single work item produce
114         cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0));
115         return cldnn_jit;
116     }
117
118     bool ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::Validate(const Params& p, const optional_params& o) const
119     {
120         if (!ConvolutionKernelBase::Validate(p, o))
121         {
122             return false;
123         }
124
125         const convolution_params& params = static_cast<const convolution_params&>(p);
126
127         if (!CheckPitchForSplitOnly(params))
128         {
129             return false;
130         }
131
132         const auto filter_ofm_num = params.weights.OFM().v;
133         const auto batch_size = params.output.Batch().v;
134
135         const bool bInputValidated =
136             (filter_ofm_num > 0) &&
137             (batch_size == 1) &&    // current implementation doesn't support batching 
138                                     // (subgorup is along batch*ofm and trying to block read filter/bias along batch and filter doesn't contain batching).
139             (params.output.Feature().v == filter_ofm_num);
140
141         if (!bInputValidated)
142         {
143             return false;
144         }
145
146         if ((filter_ofm_num * batch_size) % 16 != 0)
147         {
148             return false;
149         }
150
151         return true;
152     }
153
    // Entry point for kernel selection: delegates to the tuned-kernels path,
    // which builds kernel data via SetDefault/GetJitConstants above.
    KernelsData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetKernelsData(const Params& params, const optional_params& options) const
    {
        return GetTunedKernelsDataByIndex(params, options);
    }
158 }