if (cp.output.Feature().pad.before % fsv != 0)
return false;
+ // Input feature padding must be a multiple of fsv to keep block alignment
+ if (cp.inputs[0].Feature().pad.before % fsv != 0)
+ return false;
+
return true;
}
if (cp.output.Feature().pad.before % fsv != 0)
return false;
+ // Input feature padding must be a multiple of fsv to keep block alignment
+ if (cp.inputs[0].Feature().pad.before % fsv != 0)
+ return false;
+
return true;
}
if (cp.output.Feature().pad.before % fsv != 0)
return false;
+ // Input feature padding must be a multiple of fsv to keep block alignment
+ if (cp.inputs[0].Feature().pad.before % fsv != 0)
+ return false;
+
return true;
}
if (pp.output.Feature().pad.before % 32 != 0)
return false;
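+ // Input feature padding must be a multiple of 32 (the feature slice size) to keep block alignment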
+ if (pp.inputs[0].Feature().pad.before % 32 != 0)
+ return false;
+
return true;
}
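All four checks above encode the same block-alignment invariant. A minimal sketch of that predicate, with a hypothetical helper name (`fsv` is 16 or 32 depending on the layout):

```cpp
#include <cstddef>

// In a *_fsv{N} blocked layout, features are stored in slices of fsv values,
// so feature padding can only be expressed as a whole number of slices.
inline bool feature_pad_is_block_aligned(std::size_t pad_before, std::size_t fsv) {
    return pad_before % fsv == 0;  // e.g. pad 32, fsv 16 -> aligned; pad 8, fsv 16 -> rejected
}
```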
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
#define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
// In some cases the input padding may be bigger than needed; these macros describe the offset into that padding.
#define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
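A short worked example of the offset-into-padding definition above, under assumed values:

```cpp
// Assumed values, for illustration only:
//   INPUT0_PAD_BEFORE_SIZE_X = 3   // buffer allocated with 3 padded columns
//   PADDING_SIZE_X           = 1   // the kernel logically needs only 1 column
// Then INPUT0_PADDING_OFFSET_SIZE_X = 3 - 1 = 2, i.e. a read at logical
// x = -PADDING_SIZE_X begins 2 columns into the allocated padding.
```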
uint input_offset = oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X;
input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING;
input_offset += INPUT0_PAD_BEFORE_FEATURE_NUM * INPUT0_FEATURE_PITCH;
- input_offset += b * INPUT0_BATCH_PITCH;
+ input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * INPUT0_BATCH_PITCH;
uint weight_offset = 0;
weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV;
// ========================================================================
// Store results:
+ // Calculate offset to first output element
+ const uint out_pitch_x = FSV;
+ const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+ const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+ const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
+
uint output_offset = 0;
- output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
- output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
- output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
- output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+ output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+ output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+ output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b;
+ output_offset += (pad_before_fs + fs) * out_pitch_fs;
const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
#undef OUTPUT_SIZE_X_WITH_PADDING
#undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
#undef INPUT_BLOCK_WIDTH_EL_CNT
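For orientation, here is a hedged sketch of the pitch arithmetic these kernel hunks implement for an fs_b_yx_fsv layout; all names are illustrative, and the `*_with_pad` sizes already include before/after padding:

```cpp
#include <cstddef>

struct PaddedDims { std::size_t x_with_pad, y_with_pad, b_with_pad, fsv; };

// Linear offset of element (fs, b, y, x, f_in_slice) in an fs_b_yx_fsv buffer.
std::size_t linear_offset(const PaddedDims& d,
                          std::size_t fs, std::size_t b, std::size_t y,
                          std::size_t x, std::size_t f_in_slice) {
    const std::size_t pitch_x  = d.fsv;                  // step from x to x+1
    const std::size_t pitch_y  = pitch_x * d.x_with_pad; // step from y to y+1
    const std::size_t pitch_b  = pitch_y * d.y_with_pad; // step from b to b+1
    // The fix above: once batch padding exists, the feature-slice pitch must
    // span the *padded* batch range, not the logical BATCH_NUM.
    const std::size_t pitch_fs = pitch_b * d.b_with_pad; // step from fs to fs+1
    return fs * pitch_fs + b * pitch_b + y * pitch_y + x * pitch_x + f_in_slice;
}
```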
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
#define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
#define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
// In some cases the input padding may be bigger than needed; these macros describe the offset into that padding.
#define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
out[out_i] = UNIT_VAL_ZERO;
}
+ // Calculate offset to first input data element
+ const uint in_pitch_x = FSV;
+ const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING;
+ const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING;
+ const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING;
+
uint input_offset = 0;
- input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV;
- input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV;
- input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+ input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x;
+ input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y;
+ input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b;
+ input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs;
uint weight_offset = 0;
weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV;
// ====================================================================
// Move temporary input offset to next row
- tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV;
+ tmp_input_offset += DILATION_SIZE_Y * in_pitch_y;
uint tmp_weight_offset = weight_offset;
weight_offset += FILTER_SIZE_X * FSV;
}
// Move input offset to next input feature slice
- input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+ input_offset += in_pitch_fs;
// Move weight offset to next input feature slice (FSV input features)
// minus offset added by moving FILTER_SIZE_Y times to new row
weight_offset += FSV * FILTER_SIZE_Y * FILTER_SIZE_X * FSV // FSV * input filter feature pitch
// ========================================================================
// Store results:
+ // Calculate offset to first output element
+ const uint out_pitch_x = FSV;
+ const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+ const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+ const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
uint output_offset = 0;
- output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
- output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
- output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
- output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+ output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+ output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+ output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b;
+ output_offset += (fs + pad_before_fs) * out_pitch_fs;
const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
#undef OUTPUT_SIZE_X_WITH_PADDING
#undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
#define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
#define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
// In some cases the input padding may be bigger than needed; these macros describe the offset into that padding.
#define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
out[out_i] = UNIT_VAL_ZERO;
}
+ // Calculate offset to first input data element
+ const uint in_pitch_x = FSV;
+ const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING;
+ const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING;
+ const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING;
+
uint input_offset = 0;
- input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV;
- input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV;
- input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+ input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x;
+ input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y;
+ input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b;
+ input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs;
uint weight_offset = 0;
weight_offset += fs * ALIGNED_IFM_NUM * FSV;
}
}
// Move temporary input offset to next strided row
- tmp_input_offset += INPUT0_SIZE_X_WITH_PADDING * FSV * STRIDE_SIZE_Y;
+ tmp_input_offset += in_pitch_y * STRIDE_SIZE_Y;
}
// ========================================================================
// Move input offset to next input feature slice
- input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
+ input_offset += in_pitch_fs;
}
// ========================================================================
// ========================================================================
// Store results:
+ // Calculate offset to first output element
+ const uint out_pitch_x = FSV;
+ const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+ const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+ const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
+
uint output_offset = 0;
- output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
- output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
- output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
- output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+ output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+ output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+ output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b;
+ output_offset += (pad_before_fs + fs) * out_pitch_fs;
const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
UNIT_BLOCK_WRITE2(output, output_offset + out_x * FSV, tmp_write);
}
// Move output offset to next row
- output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING;
+ output_offset += out_pitch_y;
}
}
else
}
}
// Move output offset to next row
- output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING;
+ output_offset += out_pitch_y;
}
}
// ========================================================================
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
#undef OUTPUT_SIZE_X_WITH_PADDING
#undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
#define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
#define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
// In some cases the input padding may be bigger than needed; these macros describe the offset into that padding.
#define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X)
out[out_i] = UNIT_VAL_ZERO;
}
+ // Calculate offset to first input data element
+ const uint in_pitch_x = FSV;
+ const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING;
+ const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING;
+ const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING;
+
uint input_offset = 0;
- input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV;
- input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV;
- input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
- input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM;
+ input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x;
+ input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y;
+ input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b;
+ input_offset += (fs + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs;
uint weight_offset = 0;
// ====================================================================
// Move temporary input offset to next row
- tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV;
+ tmp_input_offset += DILATION_SIZE_Y * in_pitch_y;
uint tmp_weight_offset = weight_offset;
// ========================================================================
// Store results:
+ // Calculate offset to first output element
+ const uint out_pitch_x = FSV;
+ const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING;
+ const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING;
+ const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING;
+
const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV);
uint output_offset = 0;
- output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV;
- output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING;
- output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING;
- output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM;
+ output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x;
+ output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y;
+ output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b;
+ output_offset += (pad_before_fs + fs) * out_pitch_fs;
const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM;
const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X;
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
#undef OUTPUT_SIZE_X_WITH_PADDING
#undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
+#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
+
#define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X)
#define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y)
+#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM)
// Kernel works only for a sub-group size of 16 with a feature slice size of 32 and processes 2 features per WI
#define REQD_SUB_GROUP_SIZE 16
const uint x_pitch = REQD_FEATURE_SLICE_SIZE; // difference in location between (x+1) and (x)
const uint y_pitch = x_pitch * INPUT0_SIZE_X_WITH_PADDING; // difference in location between (y+1) and (y)
const uint b_pitch = y_pitch * INPUT0_SIZE_Y_WITH_PADDING; // difference in location between (b+1) and (b)
- const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM; // difference in location between (fs+1) and (fs)
+ const uint fs_pitch = b_pitch * INPUT0_SIZE_B_WITH_PADDING; // difference in location between (fs+1) and (fs)
const int offset_x = (int)out_x*STRIDE_SIZE_X - PADDING_SIZE_X;
const int offset_y = (int)out_y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
- const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch + INPUT0_PAD_BEFORE_SIZE_Y * y_pitch;
+ const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch +
+ INPUT0_PAD_BEFORE_SIZE_Y * y_pitch +
+ INPUT0_PAD_BEFORE_BATCH_NUM * b_pitch +
+ INPUT0_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE * fs_pitch;
const size_t fs_offset = fs * fs_pitch; // locate beginning of feature tile
const size_t b_offset = b * b_pitch; // locate beginning of batch
+
#ifdef CHECK_BOUNDRY
if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
const size_t out_x_pitch = REQD_FEATURE_SLICE_SIZE;
const size_t out_y_pitch = out_x_pitch * OUTPUT_SIZE_X_WITH_PADDING;
const size_t out_b_pitch = out_y_pitch * OUTPUT_SIZE_Y_WITH_PADDING;
- const size_t out_fs_pitch = out_b_pitch * OUTPUT_BATCH_NUM;
+ const size_t out_fs_pitch = out_b_pitch * OUTPUT_SIZE_B_WITH_PADDING;
const size_t out_pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE);
const size_t out_x_offset = (out_x + OUTPUT_PAD_BEFORE_SIZE_X) * out_x_pitch;
const size_t out_y_offset = (out_y + OUTPUT_PAD_BEFORE_SIZE_Y) * out_y_pitch;
- const size_t out_b_offset = b * out_b_pitch;
+ const size_t out_b_offset = (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_b_pitch;
const size_t out_fs_offset = (fs + out_pad_before_fs) * out_fs_pitch;
-
const size_t output_offset = out_fs_offset + out_b_offset + out_y_offset + out_x_offset;
const bool full_f = OUTPUT_FEATURE_NUM % REQD_FEATURE_SLICE_SIZE == 0 ||
#undef OUTPUT_VEC2
#undef TO_OUTPUT_VEC2
+
+#undef INPUT0_SIZE_X_WITH_PADDING
+#undef INPUT0_SIZE_Y_WITH_PADDING
+#undef INPUT0_SIZE_B_WITH_PADDING
+
+#undef OUTPUT_SIZE_X_WITH_PADDING
+#undef OUTPUT_SIZE_Y_WITH_PADDING
+#undef OUTPUT_SIZE_B_WITH_PADDING
+
+#undef REQD_SUB_GROUP_SIZE
+#undef REQD_FEATURE_SLICE_SIZE
+#undef REQD_FEATURES_PER_WORK_ITEM
definitions.push_back({ safe_index_func_name, safe_index_func_val });
definitions.push_back({ index_func_name, index_func_val });
} else {
- definitions.push_back({ safe_index_func_name, "(f)" });
- definitions.push_back({ index_func_name, "(f)" });
+ definitions.push_back({ safe_index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" });
+ definitions.push_back({ index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" });
}
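For illustration, assuming a tensor with `Feature().pad.before == 16`, the generated pair of definitions would look roughly like this (the macro names are hypothetical):

```cpp
// Hypothetical jit output for a tensor with Feature().pad.before == 16:
#define INPUT1_GET_INDEX_SAFE(f) (16 + (f))
#define INPUT1_GET_INDEX(f) (16 + (f))
```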
} else {
definitions.push_back({ safe_index_func_name, safe_index_func_val });
using namespace cldnn;
-// ToDo remove friendship relation from program_node
-void prepare_buffer_fusing::run(program_impl& p) {
- bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
- /*
- We need to take care of proper ordering by types.
- 1. Concats
- 2. Crops
- 3. Others
- Concat before crops is needed because of the crop fusing padding requirments.
- If crop is before concat there can be padding mismtach, since concat changes padding.
- */
- auto can_optimize = [](const program_node* node) {
- if (node->is_output() || (!node->get_fused_activations_funcs().empty())) {
- return false;
+namespace {
+
+struct concat_noop_optimization : pattern_match_optimization_typed<concat_noop_optimization, concatenation> {
+ // Removes concatenation nodes with a single input.
+ using base = pattern_match_optimization_typed<concat_noop_optimization, concatenation>;
+ using base::base;
+
+ bool match(concatenation_node& node);
+ bool optimize(concatenation_node& node);
+};
+
+struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in_place_optimization, concatenation> {
+ // Performs in-place concat optimization.
+ // The paddings of the predecessors are updated so that they all use a single buffer, which is the output of the concatenation.
+ // The concatenation can then be optimized out, as the memory will be correctly filled by the previous nodes.
+ // If one of the dependencies is also an optimized-out concatenation, a cascade adjustment is performed to update it.
+ // This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
+ using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
+ using base::base;
+
+ // Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to `need_reoptimization`.
+ void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
+ bool match(concatenation_node& node);
+ bool optimize(concatenation_node& node) {
+ std::list<concatenation_node*> need_reopt;
+ optimize_cascade(node, need_reopt);
+ while (!need_reopt.empty()) {
+ auto& prop = *need_reopt.front();
+ need_reopt.pop_front();
+ if (match(prop))
+ optimize_cascade(prop, need_reopt);
+ else
+ // TODO: Revert extra padding when cascade adjustment failed.
+ prop.can_be_optimized(false);
}
- return true;
- };
+ return false; // node not invalidated
+ }
+};
+
+bool concat_noop_optimization::match(concatenation_node& node) {
+ if (node.is_output() && !get_program().is_debug_build())
+ return false;
+ return node.get_dependencies().size() == 1 &&
+ !node.has_fused_primitives() &&
+ node.get_fused_activations_funcs().empty();
+}
- // [1] First try to optimize all concats
- auto node_itr = p.get_processing_order().begin();
- while (node_itr != p.get_processing_order().end()) {
- auto& node = (*node_itr++);
- if (!can_optimize(node))
- continue;
- program_helpers::do_for_types<concatenation>(*node, [&p, is_debug](concatenation_node& node) {
- // For in place concatenation input layouts and data types must match
- auto output_format = node.get_output_layout().format;
- auto output_datatype = node.get_output_layout().data_type;
- // we need to avoid mixing padded and unpadded buffer
- bool all_dependencies_padded = true;
- bool all_dependencies_unpadded = true;
- for (auto& input : node.get_dependencies()) {
- if (input->type() == reshape::type_id())
- // reshapes should be optimized out
- return;
+bool concat_noop_optimization::optimize(concatenation_node& node) {
+ auto& dep = node.get_dependency(0);
+ dep.merge_output_padding(node.get_output_layout().data_padding);
+ prog.extract_and_remove(node);
+ // Node has been removed, so no further optimizations.
+ return true;
+}
- layout l = input->get_output_layout();
- if (static_cast<bool>(l.data_padding))
- all_dependencies_unpadded = false;
- else
- all_dependencies_padded = false;
+bool concat_in_place_optimization::match(concatenation_node& node) {
+ if (node.is_output() && !get_program().is_debug_build())
+ return false;
+ if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
+ return false;
- if (output_format != l.format || output_datatype != l.data_type)
- return;
+ // For in place concatenation input layouts and data types must match.
+ auto output_format = node.get_output_layout().format;
+ auto output_datatype = node.get_output_layout().data_type;
+ auto concat_axis = node.get_primitive()->axis;
- if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
- return;
+ for (auto& input : node.get_dependencies()) {
+ if (input->is_type<reshape>())
+ // reshapes should be optimized out.
+ return false;
- if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
- return;
+ layout l = input->get_output_layout();
- if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
- (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
- return;
+ if (output_format != l.format || output_datatype != l.data_type)
+ return false;
- // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
- if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
- return;
+ // TODO: Below condition should be moved to program_node::supports_padding.
+ // This however will require updating the algorithm, as it may make cascade adjustment impossible in some cases.
+ // It would however make normal optimizations possible in others, so this is a trade-off to be investigated.
+ if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
+ return false;
- if (l.format == format::bs_fs_yx_bsv16_fsv16)
- return;
+ if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
+ return false;
- if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
- return;
- }
+ if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
+ (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
+ return false;
- auto concat_axis = node.get_primitive()->axis;
- auto padd = node.get_output_layout().data_padding;
+ // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
+ if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
+ return false;
- tensor lower_padd = padd.lower_size();
- tensor upper_padd = padd.upper_size();
+ if (l.format == format::bs_fs_yx_bsv16_fsv16)
+ return false;
- auto upper_padd_val =
- node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis];
- tensor lower_padd_offset = lower_padd;
+ if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
+ return false;
+ }
- std::list<std::pair<const std::vector<program_node*>, tensor>> stack = {
- std::make_pair(node.get_dependencies(), tensor(0))};
- while (!stack.empty()) {
- auto nodes_list = stack.front();
- stack.pop_front();
+ auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
+ lower_padd_in_axis = std::max(lower_padd_in_axis,
+ node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]);
+
+ // check if concatenation in place can be applied for inputs set
+ size_t idx = 0;
+ for (auto input : node.get_dependencies()) {
+ // Inverted condition: if any of this node's inputs is used by more than one primitive
+ // and is not an optimized concatenation, then do not fuse buffers.
+ // TODO: we need to add padding support for all optimized kernels to remove this condition
+ if (!input->is_type<pooling>() && !input->is_type<convolution>() &&
+ !input->is_type<activation>() && !input->is_type<deconvolution>() &&
+ !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>() &&
+ !input->is_type<resample>())
+ return false;
- // if concatenation has only one input it does nothing, remove the node
- if (node.get_dependencies().size() == 1) {
- p.extract_and_remove(node);
- return;
- }
+ // if an input is marked as network output, prevent optimizations
+ // which would affect a form of its output (unless debug flag is set),
+ // we also need to restrict input types to those which support padding on all axis
+ if ((input->is_output() && !get_program().is_debug_build()) ||
+ !input->is_padding_supported(concat_axis, lower_padd_in_axis))
+ return false;
- auto cascade_adjustment = nodes_list.second;
- upper_padd.raw[concat_axis] = upper_padd_val;
- lower_padd = lower_padd_offset;
-
- auto lower_padd_in_axis = lower_padd.raw[concat_axis] + cascade_adjustment.raw[concat_axis];
- auto first_input_format = nodes_list.first[0]->get_output_layout().format;
-
- // check if concatenation in place can be applied for inputs set
- for (auto input : nodes_list.first) {
- // reverted condition - if any of this node's inputs is used by more than one primitive
- // and is not optimized concatenation then do not fuse buffers
- // todo: we need add padding support for all optimized kernels to remove this condition
- if (!input->is_type<pooling>() && !input->is_type<convolution>() &&
- !input->is_type<activation>() && !input->is_type<deconvolution>() &&
- !input->is_type<concatenation>() && !input->is_type<crop>() && !input->is_type<scale>() &&
- !input->is_type<resample>())
- return;
+ // TODO: Investigate if this condition is needed
+ if (input->get_users().size() > 2)
+ return false;
- // if an input is marked as network output, prevent optimizations
- // which would affect a form of its output (unless debug flag is set),
- // we also need to restrict input types to those which support padding on all axis
- if ((input->is_output() && !is_debug) || input->get_users().size() > 2 ||
- !input->is_padding_supported(concat_axis, lower_padd_in_axis))
- return;
+ // Check that the input isn't an optimized-out concatenation along a different axis.
+ if (input->is_type<concatenation>() && input->can_be_optimized() &&
+ input->as<concatenation>().get_primitive()->axis != concat_axis)
+ return false;
- if (input->get_users().size() > 1) {
- auto user_count = input->get_users().size();
- for (auto& user : input->get_users())
- if (user->is_type<concatenation>())
- user_count--;
- if (user_count != 1) // user_cout == 0 means that input will be used only by concatenations, so
- // we cannot apply concat in place for it
- return;
- }
-
- // check if all inputs have the same format
- if (input->get_output_layout().format != first_input_format)
- return;
+ // Check that the input isn't an optimized-out non-concatenation.
+ if (!input->is_type<concatenation>() && input->can_be_optimized())
+ return false;
- lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis];
- }
+ size_t concat_users = 0;
+ for (auto& user : input->get_users())
+ if (user->is_type<concatenation>())
+ concat_users += 1;
- // check if it is worth doing concat in place, in case the following primitive is convolution
- // with different input padding than concatenation's input users' convolutions,
- // it is likely that convolution's implementation will be a reference one, due to mismatched padding
- // and performance gain by doing in place concat is nullified by slower convolution implementation
- // this should be handled by more advanced tuning mechanism on the topology level
- auto& users = node.get_users();
- if (users.size() == 1) {
- auto& user = users.front();
- if (node.get_output_layout().format == format::bfyx && user->type() == convolution::type_id()) {
- auto out_input_offsets = user->as<convolution>().get_primitive()->input_offset;
-
- std::vector<tensor> in_input_offsets;
- for (auto& in_user : nodes_list.first) {
- if (in_user->type() == convolution::type_id())
- in_input_offsets.push_back(in_user->as<convolution>().get_primitive()->input_offset);
- }
-
- for (auto& in_input_offset : in_input_offsets) {
- if (in_input_offset.spatial[0] != out_input_offsets.spatial[0] &&
- in_input_offset.spatial[1] != out_input_offsets.spatial[1])
- return;
- }
- } else if (user->type() == fused_conv_eltwise::type_id()) {
- if (!user->as<fused_conv_eltwise>().get_fused_primitives().empty() &&
- user->as<fused_conv_eltwise>().get_fused_primitives().begin()->node->is_type<depth_to_space>())
- return;
- }
- }
+ // If input is used by more than one concatenation then they may require different paddings.
+ if (concat_users != 1)
+ return false;
- // apply concatenation in place optimization
- for (auto input : nodes_list.first) {
- auto input_lenght = input->get_output_layout().size.raw[concat_axis];
+ auto input_padd = input->get_output_layout().data_padding;
- bool optimized_concat_input = false;
- if (input->type() == concatenation::type_id() && input->can_be_optimized()) {
- if (input->as<concatenation>().get_primitive()->axis != node.get_primitive()->axis)
- return;
- optimized_concat_input = true;
- } else if (input->can_be_optimized()) {
- return;
- }
+ // Check that there isn't already some padding between inputs in concat axis.
+ // If node has already been optimized we skip this check - this is just cascade adjustment.
+ if (!node.can_be_optimized()) {
+ if (idx != node.get_dependencies().size() - 1 && input_padd.upper_size().raw[concat_axis] != 0)
+ return false;
+ if (idx != 0 && input_padd.lower_size().raw[concat_axis] != 0)
+ return false;
+ }
- // shrink upper pad so it points at the end of the input's buffer
- //
- // |--- lower padd ---| |---------- upper padd -----------|
- // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
- upper_padd.raw[concat_axis] -= input_lenght;
+ lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis];
+ idx += 1;
+ }
- // adjust padding sizes for cascade concatenations
- auto lower_padd_tmp = lower_padd;
- lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis];
- auto upper_padd_tmp = upper_padd;
- upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis];
+ return true;
+}
- // set new padding for input
- input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes()));
+void concat_in_place_optimization::optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization) {
+ auto concat_axis = node.get_primitive()->axis;
- // move lower padd further
- //
- // |-------------- lower padd -------------|---------- upper padd -----------|
- // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+ // Select output padding by propagating all required input paddings.
+ auto padd = node.get_output_layout().data_padding;
+ for (auto input : node.get_dependencies()) {
+ padd = padding::max(padd, input->get_output_layout().data_padding);
+ }
- lower_padd.raw[concat_axis] += input_lenght;
+ auto lower_padd = padd.lower_size();
+ auto upper_padd = padd.upper_size();
- if (optimized_concat_input && !input->get_dependencies().empty())
- stack.push_back(std::make_pair(input->get_dependencies(),
- input->get_output_layout().data_padding.lower_size()));
- }
- }
+ // For cascade adjustment, override the padding in the concat axis with the output padding.
+ // Otherwise match(...) has already checked that only the first/last input has lower/upper padding.
+ if (node.can_be_optimized()) {
+ lower_padd.raw[concat_axis] = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
+ upper_padd.raw[concat_axis] = node.get_output_layout().data_padding.upper_size().raw[concat_axis];
+ }
+ node.set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes()));
- node.can_be_optimized(true);
- for (auto dep : node.get_users()) {
- dep->can_share_buffer(false);
- }
- if (!all_dependencies_padded && !all_dependencies_unpadded)
- node.can_share_buffer(false);
- });
+ upper_padd.raw[concat_axis] += node.get_output_layout().size.raw[concat_axis];
+
+ // apply concatenation in place optimization
+ for (auto input : node.get_dependencies()) {
+ auto input_length = input->get_output_layout().size.raw[concat_axis];
+
+ if (input->is_type<concatenation>() && input->can_be_optimized())
+ need_reoptimization.push_back(&input->as<concatenation>());
+
+ // shrink upper pad so it points at the end of the input's buffer
+ //
+ // |--- lower padd ---| |---------- upper padd -----------|
+ // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+ upper_padd.raw[concat_axis] -= input_length;
+
+ // set new padding for input
+ input->set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes()));
+
+ // move lower padd further
+ //
+ // |-------------- lower padd -------------|---------- upper padd -----------|
+ // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --|
+ lower_padd.raw[concat_axis] += input_length;
+ }
+
+ node.can_be_optimized(true);
+ for (auto dep : node.get_users()) {
+ dep->can_share_buffer(false);
}
+}
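A worked example of the padding walk in `optimize_cascade`, with assumed sizes:

```cpp
// Assumed: concat of two inputs along f with sizes 3 and 5, no outer padding.
//   start:  lower = 0, upper = 0 + (3 + 5) = 8
//   input1: upper -= 3 -> padding {lower: 0, upper: 5}; then lower += 3
//   input2: upper -= 5 -> padding {lower: 3, upper: 0}; then lower += 5
// Each input now writes a disjoint slice of the shared 8-feature buffer,
// so the concatenation itself no longer needs to copy anything.
```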
+
+} // namespace
+
+// ToDo remove friendship relation from program_node
+void prepare_buffer_fusing::run(program_impl& p) {
+ bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
+ /*
+ We need to take care of proper ordering by types.
+ 1. Concats
+ 2. Crops
+ 3. Others
+ Concat before crops is needed because of the crop fusing padding requirements.
+ If crop is before concat there can be a padding mismatch, since concat changes padding.
+ */
+ auto can_optimize = [](const program_node* node) {
+ if (node->is_output() || (!node->get_fused_activations_funcs().empty())) {
+ return false;
+ }
+ return true;
+ };
+
+ // [1] First try to optimize all concats
+ run_node_optimizations<concat_noop_optimization,
+ concat_in_place_optimization>(p);
// [2] Then try to optimize all crops
- node_itr = p.get_processing_order().begin();
+ auto node_itr = p.get_processing_order().begin();
while (node_itr != p.get_processing_order().end()) {
auto& node = (*node_itr++);
if (!can_optimize(node))
/*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
};
+
+// Base class for performing pattern match style optimizations.
+// Uses the CRTP idiom: the implementing class should be passed as template parameter `Impl`
+// and should overload the match and optimize methods.
+template <typename Impl>
+struct pattern_match_optimization {
+ pattern_match_optimization(program_impl& prog)
+ : prog(prog)
+ {}
+
+ // Returns whether optimization can be performed for specified node.
+ bool match(program_node& node) {
+ return static_cast<Impl*>(this)->match(node);
+ }
+ // Returns whether optimization invalidated the node and no further optimizations should execute.
+ bool optimize(program_node& node) {
+ // TODO: Add program optimizer class that would take responsibility of modifying program.
+ // Then use it to provide more complex control over pattern-matches, ie:
+ // new node added - run applicable optimizations on it as well;
+ // node deleted - don't do more optimizations;
+ return static_cast<Impl*>(this)->optimize(node);
+ }
+ // Returns whether optimization invalidated the node and no further optimizations should execute.
+ bool match_and_optimize(program_node& node) {
+ if (!match(node))
+ return false;
+ return optimize(node);
+ }
+
+ program_impl& get_program() { return prog; }
+
+ program_impl& prog;
+};
+
+// Class for pattern-match optimizations that provides support for matching
+// single primitive type `Prim`.
+// Implementing class `Impl` is expected to overload:
+// bool match(typed_program_node<Prim>&)
+// bool optimize(typed_program_node<Prim>&)
+// Uses CRTP idiom, implementing class should be passed as template parameter `Impl`.
+template <typename Impl, typename Prim>
+struct pattern_match_optimization_typed : pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>> {
+ using base = pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>>;
+
+ using base::base;
+
+ // Returns whether optimization can be performed for specified node.
+ bool match(program_node& node) {
+ if (!node.is_type<Prim>())
+ return false;
+ return static_cast<Impl*>(this)->match(node.as<Prim>());
+ }
+ // Should be overloaded by implementation class to match specified primitive.
+ bool match(typed_program_node<Prim>& node) {
+ return false;
+ }
+
+ // Returns whether optimization invalidated the node and no further optimizations should execute.
+ bool optimize(program_node& node) {
+ return static_cast<Impl*>(this)->optimize(node.as<Prim>());
+ }
+ // Should be overloaded by implementation class to optimize specified primitive.
+ bool optimize(typed_program_node<Prim>& node) {
+ return false;
+ }
+};
+
+// Runs the pattern-match optimizations passed as arguments on `node`.
+inline bool run_node_optimizations(program_node& /*node*/) {
+ return false;
+}
+
+template <typename Opt, typename... Rest>
+bool run_node_optimizations(program_node& node, Opt&& opt, Rest&&... rest) {
+ if (opt.match_and_optimize(node))
+ return true;
+ return run_node_optimizations(node, std::forward<Rest>(rest)...);
+}
+
+// Runs pattern-match optimizations `Opts` on `node`.
+// Optimizations should have constructor with single argument `program_impl&`.
+template <typename... Opts>
+bool run_node_optimizations(program_impl& p, program_node& node) {
+ return run_node_optimizations<Opts...>(node, Opts(p)...);
+}
+
+// Runs specified pattern-match optimizations on whole program, in processing order.
+template <typename... Opts>
+void run_node_optimizations(program_impl& p, Opts&&... opts) {
+ auto it = p.get_processing_order().begin();
+ while (it != p.get_processing_order().end()) {
+ auto node = *it++;
+ run_node_optimizations(*node, std::forward<Opts>(opts)...);
+ }
+}
+
+template <typename... Opts>
+void run_node_optimizations(program_impl& p) {
+ run_node_optimizations(p, Opts(p)...);
+}
+
} // namespace cldnn
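A minimal usage sketch of this framework; the optimization below is hypothetical (removing reorders whose input and output layouts already match) and only illustrates the CRTP contract:

```cpp
// Hypothetical optimization built on the helpers above.
struct reorder_noop_optimization
    : pattern_match_optimization_typed<reorder_noop_optimization, reorder> {
    using base = pattern_match_optimization_typed<reorder_noop_optimization, reorder>;
    using base::base;

    bool match(reorder_node& node) {
        return node.get_dependencies().size() == 1 &&
               node.get_output_layout() == node.get_dependency(0).get_output_layout();
    }
    bool optimize(reorder_node& node) {
        get_program().extract_and_remove(node);
        return true;  // node removed, stop running further optimizations on it
    }
};

// Run it (possibly together with others) over the whole program:
//   run_node_optimizations<reorder_noop_optimization,
//                          concat_noop_optimization>(p);
```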
/*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
}
}
+TEST(depth_concatenate_f32_gpu, test06_padded_input) {
+ // input1 - activation - concatenation - concatenation - reorder
+ // / /
+ // input2 - activation - convolution* /
+ //
+ // *Convolution has an input offset, so its padding should be propagated both back to the reorders and to the second concatenation.
+ // As a result both concatenations should be optimized out and the convolution should use an optimized implementation.
+ const int32_t input_f = 32;
+ const int32_t output_f = 3 * input_f;
+
+ const auto& engine = get_test_engine();
+ auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+ auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+
+ auto input1_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+ auto input2_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+ set_values(input1, flatten_4d(format::bfyx, input1_data));
+ set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+ auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {input_f, input_f, 3, 3} });
+ // Construct weights for a convolution that just doubles input values.
+ VVVVF<FLOAT16> weights_data;
+ weights_data.resize(input_f);
+ for (size_t oi = 0; oi < input_f; ++oi) {
+ weights_data[oi].resize(input_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+ weights_data[oi][oi][1][1] = 2.f;
+ }
+ set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+ topology topology;
+ topology.add(input_layout("input1", input1.get_layout()));
+ topology.add(input_layout("input2", input2.get_layout()));
+ topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+ topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+ topology.add(data("weights", weights));
+ topology.add(convolution("conv", "actv2", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+ topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+ topology.add(concatenation("depth2", { "depth1", "conv" }, concatenation::along_f));
+ topology.add(reorder("output", "depth2", format::bfyx, data_types::f32));
+
+ cldnn::build_options options;
+ options.set_option(cldnn::build_option::optimize_data(true));
+ options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+ network network(engine, topology, options);
+
+ network.set_input_data("input1", input1);
+ network.set_input_data("input2", input2);
+
+ auto outputs = network.execute({});
+ EXPECT_EQ(outputs.size(), size_t(1));
+ EXPECT_EQ(outputs.begin()->first, "output");
+ // Check that all concatenations have been optimized out.
+ auto executed_primitives = network.get_executed_primitives();
+ EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+ EXPECT_TRUE(executed_primitives.count("depth2") == 0);
+ // Check that the convolution was able to use an optimized kernel.
+ for (auto& info : network.get_primitives_info()) {
+ if (info.original_id == "conv") {
+ EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+ }
+ }
+
+ auto output = outputs.at("output").get_memory();
+ auto output_ptr = output.pointer<float>();
+ ASSERT_EQ(output.count(), output_f);
+ for (size_t i = 0; i < output_f; ++i) {
+ auto& val = output_ptr[i];
+ float ref;
+ if (i < input_f)
+ ref = 0.75f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+ else if (i < 2 * input_f)
+ ref = 0.5f * static_cast<float>(input2_data[0][i % input_f][0][0]);
+ else
+ ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+ EXPECT_EQ(val, ref) << " at i=" << i;
+ }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_padded_output) {
+ // input1 - activation - concatenation - convolution - reorder
+ // input2 - activation /
+ //
+ // *Convolution has an input offset, so its padding should be propagated back to the activations.
+ const int32_t input_f = 32;
+ const int32_t output_f = 2 * input_f;
+
+ const auto& engine = get_test_engine();
+ auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+ auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+
+ auto input1_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+ auto input2_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+ set_values(input1, flatten_4d(format::bfyx, input1_data));
+ set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+ auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {output_f, output_f, 3, 3} });
+ // Construct weights for a convolution that just doubles input values.
+ VVVVF<FLOAT16> weights_data;
+ weights_data.resize(output_f);
+ for (size_t oi = 0; oi < output_f; ++oi) {
+ weights_data[oi].resize(output_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+ weights_data[oi][oi][1][1] = 2.f;
+ }
+ set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+ topology topology;
+ topology.add(input_layout("input1", input1.get_layout()));
+ topology.add(input_layout("input2", input2.get_layout()));
+ topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+ topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+ topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+ topology.add(data("weights", weights));
+ topology.add(convolution("conv", "depth1", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+ topology.add(reorder("output", "conv", format::bfyx, data_types::f32));
+
+ cldnn::build_options options;
+ options.set_option(cldnn::build_option::optimize_data(true));
+ options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+ network network(engine, topology, options);
+
+ network.set_input_data("input1", input1);
+ network.set_input_data("input2", input2);
+
+ auto outputs = network.execute({});
+ EXPECT_EQ(outputs.size(), size_t(1));
+ EXPECT_EQ(outputs.begin()->first, "output");
+ // Check that the concatenation has been optimized out.
+ auto executed_primitives = network.get_executed_primitives();
+ EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+ // Check that the convolution was able to use an optimized kernel.
+ for (auto& info : network.get_primitives_info()) {
+ if (info.original_id == "conv") {
+ EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+ }
+ }
+
+ auto output = outputs.at("output").get_memory();
+ auto output_ptr = output.pointer<float>();
+ ASSERT_EQ(output.count(), output_f);
+ for (size_t i = 0; i < output_f; ++i) {
+ auto& val = output_ptr[i];
+ float ref;
+ if (i < input_f)
+ ref = 1.5f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+ else
+ ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+ EXPECT_EQ(val, ref) << " at i=" << i;
+ }
+}
+
+TEST(depth_concatenate_f32_gpu, test08_concat_is_output) {
+ // input1 - activation - concatenation
+ // input2 - activation /
+ //
+ // As the concatenation is a network output, it should not be optimized out.
+ const int32_t input_f = 16;
+ const int32_t output_f = 2 * input_f;
+
+ const auto& engine = get_test_engine();
+ auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} });
+ auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} });
+
+ auto input1_data = generate_random_4d<float>(1, input_f, 1, 1, -1, 1);
+ auto input2_data = generate_random_4d<float>(1, input_f, 1, 1, -1, 1);
+ set_values(input1, flatten_4d(format::bfyx, input1_data));
+ set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+ topology topology;
+ topology.add(input_layout("input1", input1.get_layout()));
+ topology.add(input_layout("input2", input2.get_layout()));
+ topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+ topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+ topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+
+ cldnn::build_options options;
+ options.set_option(cldnn::build_option::optimize_data(true));
+ network network(engine, topology, options);
+
+ network.set_input_data("input1", input1);
+ network.set_input_data("input2", input2);
+
+ auto outputs = network.execute({});
+ EXPECT_EQ(outputs.size(), size_t(1));
+ EXPECT_EQ(outputs.begin()->first, "depth1");
+ // Check that the concatenation hasn't been optimized out.
+ auto executed_primitives = network.get_executed_primitives();
+ EXPECT_TRUE(executed_primitives.count("depth1") == 1);
+
+ auto output = outputs.at("depth1").get_memory();
+ auto output_ptr = output.pointer<float>();
+ ASSERT_EQ(output.count(), output_f);
+ for (size_t i = 0; i < output_f; ++i) {
+ auto& val = output_ptr[i];
+ float ref;
+ if (i < input_f)
+ ref = 0.75f * input1_data[0][i % input_f][0][0];
+ else
+ ref = 0.5f * input2_data[0][i % input_f][0][0];
+
+ EXPECT_EQ(val, ref) << " at i=" << i;
+ }
+}
+
TEST(depth_concatenate_f32_gpu, concat_with_different_format_inputs) {
const auto& engine = get_test_engine();
build_options build_opt;