From 0ec07b2c3bb8b1f326ce698b24b3d3707a0d8db9 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 22 Jun 2020 17:09:39 +0300 Subject: [PATCH] [IE CLDNN] fsv4 to fsv16 conv (#1030) --- ...nvolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp | 1 + .../convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl | 24 ++++ .../graph_optimizer/remove_redundant_reorders.cpp | 2 +- .../thirdparty/clDNN/src/layout_optimizer.cpp | 7 + .../tests/test_cases/convolution_gpu_test.cpp | 144 ++++++++++++++++++++- 5 files changed, 176 insertions(+), 2 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp index 792dc48..81c6869 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv32.cpp @@ -32,6 +32,7 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetSupportedKey() const k.EnableOutputDataType(Datatype::F32); k.EnableInputWeightsType(WeightsType::INT8); k.EnableInputLayout(DataLayout::bfyx); + k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); k.EnableTensorOffset(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl index b30b27a..9f21243 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl @@ -114,7 +114,19 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)( #if ASYMMETRIC_WEIGHTS_QUANTIZATION ACCUMULATOR_TYPE_VEC acc_assym_weights = 0; #endif + +#if INPUT0_LAYOUT_BFYX const int input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + input_y * INPUT0_Y_PITCH; +#elif INPUT0_LAYOUT_B_FS_YX_FSV4 + const int fsv = 4; + const int input_x_pitch = fsv; + const int input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X); + const int input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y); + const int input_total_f_size = INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM; + const int input_b_pitch = input_fs_pitch * ((input_total_f_size + fsv - 1) / fsv); + const int input_offset = b * input_b_pitch + input_y * input_y_pitch; +#endif + int filter_idx = fg * FILTER_SIZE_X * FILTER_SIZE_Y * ISV * OSV; #if ASYMMETRIC_WEIGHTS_QUANTIZATION char4 multiplier; @@ -156,6 +168,7 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)( bool x_cross_fm = x_chunk + lid < 0 || x_chunk + lid >= INPUT0_SIZE_X; if (!x_cross_fm) { + #if INPUT0_LAYOUT_BFYX MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0; __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM))) for (int i = 0; i < INPUT0_FEATURE_NUM; i++) { @@ -164,6 +177,11 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)( + (x_chunk + lid)* INPUT0_X_PITCH]; } slm_block[c + lid] = AS_PACKED_IN_TYPE(src); + #elif INPUT0_LAYOUT_B_FS_YX_FSV4 + const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch; + PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]); + slm_block[c + lid] = src; + #endif } else { #if ASYMMETRIC_DATA_QUANTIZATION slm_block[c + lid] = AS_PACKED_IN_TYPE(zp); @@ -178,6 +196,7 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)( const int x_chunk = x_wg_start + LWS1*SLM_CHUNK_SIZE; bool x_cross_fm = x_chunk + lid >= INPUT0_SIZE_X; if (!x_cross_fm) { + #if INPUT0_LAYOUT_BFYX MAKE_VECTOR_TYPE(INPUT0_TYPE, ISV) src = 0; __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM))) for (int i = 0; i < INPUT0_FEATURE_NUM; i++) { @@ -186,6 +205,11 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)( + (x_chunk + lid)* INPUT0_X_PITCH]; } slm_block_tail[lid] = AS_PACKED_IN_TYPE(src); + #elif INPUT0_LAYOUT_B_FS_YX_FSV4 + const __global uint* ptr = input + input_offset + kh * DILATION_SIZE_Y * input_y_pitch + (x_chunk + lid) * input_x_pitch; + PACKED_IN_TYPE src = AS_PACKED_IN_TYPE(ptr[0]); + slm_block_tail[lid] = src; + #endif } else { #if ASYMMETRIC_DATA_QUANTIZATION slm_block_tail[lid] = AS_PACKED_IN_TYPE(zp); diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp index ef1b478..1b588ee 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp @@ -84,7 +84,7 @@ void remove_redundant_reorders::run(program_impl& p) { auto output_padded = static_cast(output_layout.data_padding); auto can_omit_padding = (output_layout.format == format::b_fs_yx_fsv16 || output_layout.format == format::b_fs_yx_fsv32) && - input.get_output_layout().format == format::bfyx; + (input.get_output_layout().format == format::bfyx || input.get_output_layout().format == format::b_fs_yx_fsv4); if (output_padded && !can_omit_padding) { if (input.get_users().size() != 1) diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp index 10d1389..1802fdc 100644 --- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp +++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp @@ -214,6 +214,13 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3))) return true; + if (next.is_type() && + fmt_prev == format::b_fs_yx_fsv4 && + ((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || + (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && + (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))))) + return true; + if (next.is_type() && fmt_prev == format::bfyx && fmt_next == format::b_fs_yx_fsv16) return true; diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp index 734e65d..2e9d23f 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp @@ -7838,7 +7838,7 @@ public: return "conv"; } - void run_expect(const VVVVF& expected) { + virtual void run_expect(const VVVVF& expected) { auto engine = get_test_engine(); auto topo = build_topology(engine); @@ -8118,6 +8118,135 @@ static std::string to_string_convolution_all_params(const testing::TestParamInfo } template +class convolution_random_test_fsv4_input : public convolution_random_test_base { +public: + using parent = convolution_random_test_base; + topology build_topology(const cldnn::engine& engine) override { + auto input_lay = layout(this->input_type(), format::b_fs_yx_fsv4, this->input_size()); + auto wei_lay = layout(this->weights_type(), format::bfyx, this->weights_size()); + + auto wei_mem = memory::allocate(engine, wei_lay); + auto wei_flat = flatten_4d(format::bfyx, this->_weights); + set_values(wei_mem, wei_flat); + layout reordered_layout = layout{this->input_type(), this->input_format(), this->input_size(), this->padding_size()}; + auto topo = topology(); + topo.add(input_layout("input", input_lay)); + topo.add(reorder("input_reorder", "input", reordered_layout)); + std::string input_id = "input_reorder"; + if (this->has_input_zp()) { + auto input_zp_lay = layout(this->input_type(), format::bfyx, tensor(feature(this->input_features()))); + auto input_zp_mem = memory::allocate(engine, input_zp_lay); + set_values(input_zp_mem, this->_input_zp); + topo.add(data("input_zp", input_zp_mem)); + topo.add(eltwise("input_asymm", { "input_reorder", "input_zp" }, eltwise_mode::sub)); + input_id = "input_asymm"; + } + topo.add(data("weights", wei_mem)); + std::string weights_id = "weights"; + if (this->has_weights_zp()) { + auto weights_zp_lay = layout(this->weights_type(), format::bfyx, tensor(batch(this->output_features()))); + auto weights_zp_mem = memory::allocate(engine, weights_zp_lay); + set_values(weights_zp_mem, this->_weights_zp); + topo.add(data("weights_zp", weights_zp_mem)); + topo.add(eltwise("weights_asymm", { "weights", "weights_zp" }, eltwise_mode::sub)); + weights_id = "weights_asymm"; + } + if (!this->has_bias()) { + auto conv_prim = convolution( + "conv", + input_id, + { weights_id }, + static_cast(this->groups()), + tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)), + tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)), + tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y))); + conv_prim.output_data_type = this->output_type(); + topo.add(conv_prim); + } else { + auto bias_lay = layout(this->output_type(), format::bfyx, tensor(feature(this->output_features()))); + auto bias_mem = memory::allocate(engine, bias_lay); + set_values(bias_mem, this->_bias); + topo.add(data("bias", bias_mem)); + auto conv_prim = convolution( + "conv", + input_id, + { weights_id }, + { "bias" }, + static_cast(this->groups()), + tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)), + tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)), + tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y))); + conv_prim.output_data_type = this->output_type(); + topo.add(conv_prim); + } + + return topo; + } + void run_expect(const VVVVF& expected) override { + auto engine = get_test_engine(); + + auto topo = this->build_topology(engine); + + auto build_opts = build_options( + build_option::optimize_data(true), + build_option::force_implementations({ {"conv", { this->input_format(), ""}} }) + ); + auto prog = program(engine, topo, build_opts); + + auto net = network(prog, 0); + + auto input_lay = layout(this->input_type(), format::b_fs_yx_fsv4, this->input_size()); + auto input_mem = memory::allocate(engine, input_lay); + std::vector input_flat(input_lay.get_linear_size(), static_cast(0)); + for (size_t bi = 0; bi < this->batch_num(); ++bi) + for (size_t fi = 0; fi < this->input_features(); ++fi) + for (size_t yi = 0; yi < this->input_y(); ++yi) + for (size_t xi = 0; xi < this->input_x(); ++xi) { + tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0)); + size_t offset = input_lay.get_linear_offset(coords); + input_flat[offset] = this->_input[bi][fi][yi][xi]; + } + set_values(input_mem, input_flat); + + net.set_input_data("input", input_mem); + auto result = net.execute(); + auto out_mem = result.at(this->output_primitive_id()).get_memory(); + auto out_lay = out_mem.get_layout(); + auto out_ptr = out_mem.cldnn::memory::template pointer(); + + std::stringstream description; + for (auto i : net.get_primitives_info()) { + if (i.original_id == "conv") { + std::cout << i.kernel_id << std::endl; + description << " kernel: " << i.kernel_id << std::endl; + } + } + description << " executed: "; + for (auto e : net.get_executed_primitive_ids()) { + description << e << ", "; + } + + ASSERT_EQ(out_lay.data_type, this->output_type()); + ASSERT_EQ(out_lay.size.batch[0], expected.size()); + ASSERT_EQ(out_lay.size.feature[0], expected[0].size()); + ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size()); + ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size()); + + for (size_t bi = 0; bi < this->batch_num(); ++bi) + for (size_t fi = 0; fi < this->output_features(); ++fi) + for (size_t yi = 0; yi < expected[0][0].size(); ++yi) + for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) { + tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0)); + size_t offset = out_lay.get_linear_offset(coords); + + ASSERT_EQ(out_ptr[offset], expected[bi][fi][yi][xi]) + << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi << std::endl + << description.str(); + } + } +}; + +template class convolution_scale_random_test : public convolution_random_test_base { public: using parent = convolution_random_test_base; @@ -8172,6 +8301,9 @@ class convolution_random_smoke_test : public testing::TestWithParam; using convolution_random_test_u8s8f32 = convolution_random_test_base; +using convolution_random_test_fsv4_input_s8s8f32 = convolution_random_test_fsv4_input; +using convolution_random_test_fsv4_input_u8s8f32 = convolution_random_test_fsv4_input; + using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test; using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test; @@ -8265,6 +8397,16 @@ TEST_P(convolution_random_smoke_test, u8s8f32_scale) { ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam())); } +TEST_P(convolution_random_smoke_test, s8s8f32_fsv4_input) { + convolution_random_test_fsv4_input_s8s8f32 test; + ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam())); +} + +TEST_P(convolution_random_smoke_test, u8s8f32_fsv4_input) { + convolution_random_test_fsv4_input_u8s8f32 test; + ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam())); +} + INSTANTIATE_TEST_CASE_P( basic, convolution_random_smoke_test, -- 2.7.4