inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp

   1 /*
   2 // Copyright (c) 2019 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18
#include "fully_connected_inst.h"
#include "primitive_gpu_base.h"
#include "implementation_map.h"
#include "kernel_selector_helper.h"
#include "fully_connected/fully_connected_kernel_selector.h"
#include "fully_connected/fully_connected_params.h"

#include "network_impl.h"
#include "error_handler.h"
#include "kernel_runner.h"

#include "api/CPP/reorder.hpp"
#include "api/CPP/input_layout.hpp"

#include <utility>
  32
  33 namespace cldnn { namespace gpu {
  34
  35
  36 struct fully_connected_gpu : typed_primitive_gpu_impl<fully_connected>
  37 {
  38     using parent = typed_primitive_gpu_impl<fully_connected>;
  39
  40     std::vector<network_impl::ptr> _reorders;   // TODO: move this reorder to graph compiler
  41     memory_impl::cptr new_input_mem;      // TODO: remove this hack
  42
  43     fully_connected_gpu(const fully_connected_node& arg, const kernel_selector::kernel_data& kd, std::vector<network_impl::ptr> reorders)
  44         : parent(arg, kd)
  45         , _reorders(reorders)
  46     {}
  47
  48 protected:
  49
  50     virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst<fully_connected>& instance, int32_t) const override
  51     {
  52         kernel::kernel_arguments_data args;
  53
  54         args.inputs     = { new_input_mem };
  55         args.output     = &instance.output_memory();
  56         args.weights    = &instance.weights_memory();
  57         args.bias       = instance.bias_term() ? &instance.bias_memory() : nullptr;
  58         args.weights_quantization_factors = instance.weights_quantization_factors_term() ? &instance.weights_quantization_factors_memory() : nullptr;
  59         args.output_calibration_factors = instance.output_calibration_factors_term() ? &instance.output_calibration_factors_memory() : nullptr;
  60
  61         return args;
  62     }
  63
  64 public:
  65
  66     event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, fully_connected_inst& instance) override
  67     {
  68         std::vector<event_impl::ptr> tmp_events(events);
  69
  70         if (_reorders.empty())
  71         {
  72             new_input_mem = &instance.input_memory();
  73         }
  74         else
  75         {
  76             auto network = _reorders[0];
  77             network->set_input_data("input", instance.input_memory());
  78             network->execute(tmp_events);
  79             auto output_id = network->get_output_ids()[0];
  80             new_input_mem = &network->get_primitive(output_id)->output_memory();
  81             tmp_events.clear();
  82             tmp_events.push_back(network->get_primitive_event(output_id));
  83         }
  84
  85         return parent::execute_impl(tmp_events, instance);
  86     }
  87
  88     static primitive_impl* create(const fully_connected_node& arg)
  89     {
  90         auto fc_params = get_weights_bias_default_params<kernel_selector::fully_connected_params>(arg);
  91         auto fc_optional_params = get_default_weights_bias_optional_params<kernel_selector::fully_connected_optional_params>(arg.get_program());
  92         fc_optional_params.allowInputReordering = true;
  93
  94         if(arg.get_primitive()->with_activation)
  95             convert_activation_func_params(arg.get_primitive(), fc_params.activation);
  96
  97         fc_params.output = fc_params.output.FlattenFeatureAndSpatials();
  98
  99         const auto primitive = arg.get_primitive();
 100
 101         if (primitive->weights_quantization_factors.size() > 0)
 102         {
 103             fc_params.int8_quantization = true;
 104             fc_params.weights_quantization_factors.push_back(convert_data_tensor(arg.weights_quantization_factors().get_output_layout()).FlattenFeatureAndSpatials());
 105             fc_params.input_quantization_factor = arg.get_input_qf();
 106
 107             if (primitive->output_calibration_factors.size() > 0)
 108             {
 109                 fc_params.output_calibration = true;
 110                 fc_params.output_calibration_factors.push_back(convert_data_tensor(arg.output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());
 111             }
 112             else
 113                 fc_params.output_quantization_factor = arg.get_output_qf();
 114         }
 115
 116         fc_optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), true);
 117
 118         auto& kernel_selector = kernel_selector::fully_connected_kernel_selector::Instance();
 119         auto best_kernels = kernel_selector.GetBestKernels(fc_params, fc_optional_params);
 120
 121         CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with this arguments");
 122
 123         const auto& new_fc_params = *static_cast<kernel_selector::fully_connected_params*>(best_kernels[0].params.get());
 124         std::vector<network_impl::ptr> reorders;
 125         if (fc_params.inputs[0].GetLayout() != new_fc_params.inputs[0].GetLayout())
 126         {
 127             const auto& input_layout = arg.input().get_output_layout();
 128             topology_impl tpl;
 129             tpl.add(std::make_shared<cldnn::input_layout>("input", input_layout));
 130             tpl.add(std::make_shared<cldnn::reorder>("reorder", "input", from_data_layout(new_fc_params.inputs[0].GetLayout()), input_layout.data_type));
 131             reorders.push_back(arg.get_program().get_engine().build_network(tpl, cldnn::build_options(), true));
 132         }
 133
 134         auto fc = new fully_connected_gpu(arg, best_kernels[0], reorders);
 135
 136         return fc;
 137     };
 138 };
 139
 140
 141 namespace {
 142     struct attach {
 143         attach() {
 144             auto val_fw = fully_connected_gpu::create;
 145
 146             implementation_map<fully_connected>::add({
 147                 { std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), val_fw },
 148                 { std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), val_fw },
 149                 { std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw },
 150                 { std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw },
 151                 { std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw },
 152                 { std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw },
 153                 { std::make_tuple(engine_types::ocl, data_types::i8,  format::bfyx), val_fw },
 154                 // MMAD
 155                 { std::make_tuple(engine_types::ocl, data_types::i8,  format::byxf_af32), val_fw },
 156                 { std::make_tuple(engine_types::ocl, data_types::i8,  format::fs_bs_yx_bsv4_fsv32), val_fw },
 157                 // IMAD
 158                 { std::make_tuple(engine_types::ocl, data_types::i8,  format::b_fs_yx_fsv4), val_fw },
 159                 { std::make_tuple(engine_types::ocl, data_types::u8,  format::b_fs_yx_fsv4), val_fw },
 160             });
 161         }
 162         ~attach() {}
 163     };
 164     attach attach_impl;
 165 }
 166 } }