1 // Copyright (c) 2018-2019 Intel Corporation
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
17 #include "convolution_kernel_bfyx_f16_depthwise.h"
18 #include "kernel_selector_utils.h"
21 namespace kernel_selector {
// File-local sizing constants shared by the dispatch setup and the JIT constants below.
// sub_group_size is assigned to lws1 in SetDefault and exported to the kernel as SUB_GROUP_SIZE.
22 static const size_t sub_group_size = 16;
// feature_block_size is the alignment applied to the output feature count for gws1,
// is exported to the kernel as IC_BLOCK, and is the modulus used to decide whether
// OUTPUT_LEFTOVERS is defined.
23 static const size_t feature_block_size = 16;
// Populates the ParamsKey `k` with the data types, layouts and optional features
// this kernel declares support for.
// NOTE(review): the declaration of `k` and the end of this function are not visible
// in this excerpt.
25 ParamsKey ConvolutionKernel_bfyx_f16_depthwise::GetSupportedKey() const {
// Half-precision input, weights and output.
27 k.EnableInputDataType(Datatype::F16);
28 k.EnableInputWeightsType(WeightsType::F16);
29 k.EnableOutputDataType(Datatype::F16);
// Single-precision input, weights and output.
30 k.EnableInputDataType(Datatype::F32);
31 k.EnableInputWeightsType(WeightsType::F32);
32 k.EnableOutputDataType(Datatype::F32);
// Only the bfyx_f16 layout is enabled, for both input and output.
33 k.EnableInputLayout(DataLayout::bfyx_f16);
34 k.EnableOutputLayout(DataLayout::bfyx_f16);
35 k.EnableTensorOffset();
36 k.EnableTensorPitches();
// Both the biased (per-feature) and the bias-free variants are enabled.
37 k.EnableBiasPerFeature();
38 k.EnableNonBiasTerm();
40 k.EnableGroupedConvolution();
42 k.EnableSubGroupShort();
// Validate() below additionally requires depthwise_separable_opt to be set on the params.
43 k.EnableDepthwiseSeparableOpt();
// Checks whether the given params fit this kernel. The second (optional_params)
// argument is unnamed and unused in the visible code.
// NOTE(review): the statements guarded by each condition below are not visible in
// this excerpt; presumably each one rejects the params - TODO confirm.
47 bool ConvolutionKernel_bfyx_f16_depthwise::Validate(const Params& p, const optional_params&) const {
// Unchecked downcast: no runtime type check on `p` is visible here.
48 const convolution_params& cp = static_cast<const convolution_params&>(p);
// Condition 1: depthwise_separable_opt is not set, or the input feature count matches
// neither cp.split nor cp.groups.
49 if (!cp.depthwise_separable_opt || (cp.inputs[0].Feature().v != cp.split && cp.inputs[0].Feature().v != cp.groups))
// Condition 2: the filter is not 3x3.
52 if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
// Condition 3: stride.x is neither 1 nor 2.
// NOTE(review): only stride.x is examined in the visible code; stride.y is not checked here.
55 if (cp.stride.x != 1 && cp.stride.x != 2)
// Starts from Parent::SetDefault(params) and then overrides the global work sizes,
// lws1 and the priority field for this kernel.
// NOTE(review): the remainder of the parameter list and the end of the function are
// not visible in this excerpt.
61 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_f16_depthwise::SetDefault(const convolution_params& params,
63 DispatchData runInfo = Parent::SetDefault(params);
64 const auto& out = params.output;
// gws0: (number of 8-wide blocks along output X) * output Y. The literal 8 equals
// block_width in GetJitConstants, which computes X_BLOCKS with the same CeilDiv.
66 runInfo.gws0 = CeilDiv(out.X().v, 8) * out.Y().v;
// gws1: output feature count passed through Align(..., feature_block_size).
67 runInfo.gws1 = Align(out.Feature().v, feature_block_size);
// gws2: output batch count.
68 runInfo.gws2 = out.Batch().v;
// lws1 is set to sub_group_size; other local sizes are not assigned in the visible code.
70 runInfo.lws1 = sub_group_size;
// Priority: FORCE_PRIORITY_1 when the output batch is 1. The FORCE_PRIORITY_7
// assignment is presumably the else-branch (the `else` line is not visible) - TODO confirm.
73 if (out.Batch().v == 1)
74 runInfo.effiency = FORCE_PRIORITY_1;
76 runInfo.effiency = FORCE_PRIORITY_7;
// Extends the base-class JIT constants with the fused-ops constants (when fused ops
// are present) and this kernel's blocking constants.
// NOTE(review): closing braces and the return statement are not visible in this
// excerpt, so the exact extent of the `if` bodies cannot be confirmed from here.
81 JitConstants ConvolutionKernel_bfyx_f16_depthwise::GetJitConstants(const convolution_params& params,
82 const DispatchData& kd) const {
83 auto jit = ConvolutionKernelBase::GetJitConstants(params, kd);
// Same value as the literal 8 used for gws0 in SetDefault.
85 const auto block_width = 8;
87 if (params.fused_ops.size() > 0) {
// Two fused-ops configurations: "_VEC" operates on "dst" and passes block_width as the
// fourth field; "_SCALAR" operates on "dst[i]" at x offset "(x+i)" and passes 1.
// The fourth field is presumably the vector width - confirm against FusedOpsConfiguration.
// The "(f_block*16)" index strings hard-code 16, which equals feature_block_size.
88 FusedOpsConfiguration conf_vec = {"_VEC", {"b", "(f_block*16)", "y", "x"}, "dst", block_width, true, false, true, false };
89 FusedOpsConfiguration conf_scalar = {"_SCALAR", {"b", "(f_block*16)", "y", "(x+i)"}, "dst[i]", 1, true, false, true, false };
90 jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
// Type constants are merged for both F32 (as "float") and F16 (as "half").
91 jit.Merge(MakeTypeJitConstants(Datatype::F32, "float"));
92 jit.Merge(MakeTypeJitConstants(Datatype::F16, "half"));
// Blocking constants exported to the kernel.
95 jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
96 jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, block_width)));
97 jit.AddConstant(MakeJitConstant("IC_BLOCK", feature_block_size));
// OUTPUT_LEFTOVERS is defined only when the output feature count is not a multiple
// of feature_block_size.
98 if (params.output.Feature().v % feature_block_size != 0) {
99 jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
// Forwards params and options unchanged to GetCommonKernelsData and returns its result.
// No kernel-specific processing happens in the visible code.
105 KernelsData ConvolutionKernel_bfyx_f16_depthwise::GetKernelsData(const Params& params,
106 const optional_params& options) const {
107 return GetCommonKernelsData(params, options);
110 } // namespace kernel_selector