void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
{
+#ifdef HAVE_OPENCL
+ ocl_exec_cache.clear();
+#endif
+
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
}
#ifdef HAVE_OPENCL
- bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+ struct OpenCLExecInfo
{
- std::vector<UMat> inputs;
- std::vector<UMat> outputs;
+ std::string kernel_name;
+ std::string build_opts;
+ size_t local_size[2];
+ size_t global_size[2];
- inputs_.getUMatVector(inputs);
- outputs_.getUMatVector(outputs);
+ OpenCLExecInfo()
+ {
+ local_size[0] = local_size[1] = 0;
+ global_size[0] = global_size[1] = 0;
+ }
+ };
+ std::vector<OpenCLExecInfo> ocl_exec_cache;
+
+ void ocl_prepare(const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
+ {
+ CV_TRACE_FUNCTION();
CV_Assert(outputs.size() == finalSliceRanges.size());
+ ocl_exec_cache.resize(outputs.size());
const UMat& input = inputs[0];
- if (input.dims > 5)
- {
- CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
- return false;
- }
+ const int dims = input.dims;
size_t WSZ = 128;
- const int dims = input.dims;
const int elemSize = (int)input.elemSize();
String opts0 = cv::format(
"-DDIMS=%d -DELEMSIZE=%d",
{
opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
}
- String kname = cv::format("slice_%d", dims);
for (size_t i = 0; i < outputs.size(); i++)
{
- UMat& output = outputs[i];
+ OpenCLExecInfo& ocl = ocl_exec_cache[i];
+
+ const UMat& output = outputs[i];
const std::vector<Range>& range = finalSliceRanges[i];
String opts = opts0;
CV_CheckEQ(range[d].size(), (int)output.size[d], "");
}
+ const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64;
+
int block_dims = 0;
size_t block_size = elemSize;
for (int i = dims - 1; i >= 0; --i)
break;
block_size *= output.size[i];
block_dims++;
+ if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
+ break;
}
const size_t total = output.total() * elemSize;
size_t num_blocks = total / block_size;
- if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
+ if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG))
{
// use 1D copy mode
opts += cv::format(" -DUSE_COPY_1D=1");
opts += cv::format(" -DWSZ=%d", (int)WSZ);
- size_t local[] = { WSZ, 1 };
- size_t global[] = { WSZ, num_blocks };
+ std::ostringstream kernel_suffix;
+ kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size;
+ kernel_suffix << "__src_";
+ for (int d = 0; d < dims; d++)
+ {
+ kernel_suffix << input.size[dims - 1 - d] << '_';
+ }
+ kernel_suffix << '_';
+ /*for (int d = 0; d < dims; d++)
+ {
+ kernel_suffix << input.step[dims - 1 - d] << '_';
+ }
+ kernel_suffix << '_';*/
- ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
+ kernel_suffix << "dst_";
+ for (int d = 0; d < dims; d++)
+ {
+ kernel_suffix << output.size[dims - 1 - d] << '_';
+ }
+ /*kernel_suffix << '_';
+ for (int d = 0; d < dims; d++)
+ {
+ kernel_suffix << output.step[dims - 1 - d] << '_';
+ }*/
+ kernel_suffix << "_slice_";
+ for (int d = 0; d < dims; d++)
+ {
+ kernel_suffix << range[dims - 1 - d].start << '_';
+ }
+ for (int d = 0; d < dims; d++)
+ {
+ kernel_suffix << '_' << range[dims - 1 - d].end;
+ }
+
+ std::string kernel_suffix_str = kernel_suffix.str();
+ opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str());
+
+ ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str());
+ ocl.build_opts = opts;
+ ocl.local_size[0] = WSZ;
+ ocl.local_size[1] = 1;
+ ocl.global_size[0] = WSZ;
+ ocl.global_size[1] = num_blocks;
+ } // for outputs.size()
+ } // ocl_prepare
+
+ bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+ {
+ CV_TRACE_FUNCTION();
+
+ std::vector<UMat> inputs;
+ std::vector<UMat> outputs;
+
+ inputs_.getUMatVector(inputs);
+ outputs_.getUMatVector(outputs);
+
+ CV_Assert(outputs.size() == finalSliceRanges.size());
+
+ const UMat& input = inputs[0];
+ const int dims = input.dims;
+ if (dims > 5)
+ {
+ CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU");
+ return false;
+ }
+
+ if (ocl_exec_cache.empty())
+ {
+ ocl_prepare(inputs, outputs);
+ }
+ CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), "");
+
+ for (size_t i = 0; i < outputs.size(); i++)
+ {
+ const OpenCLExecInfo& ocl = ocl_exec_cache[i];
+
+ UMat& output = outputs[i];
+
+ ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts);
if (kernel.empty())
return false;
bool ret = kernel.args(
ocl::KernelArg::PtrReadOnly(input),
ocl::KernelArg::PtrWriteOnly(output)
)
- .run(2, global, local, false);
+ .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
if (!ret)
return false;
} // for outputs.size()
return true;
- }
+ } // forward_ocl
#endif
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
#define BLOCK_COLS_X4 (BLOCK_COLS / 4)
#define BLOCK_COLS_X16 (BLOCK_COLS / 16)
-#ifdef USE_COPY_1D
-
-static inline
-__attribute__((always_inline))
-void copy_block_1d(
+__attribute__((reqd_work_group_size(WSZ, 1, 1)))
+__kernel void
+CONCAT(slice_, SLICE_KERNEL_SUFFIX)(
__global const uchar* src0,
- const uint src_offset,
- __global uchar* dst0,
- const uint dst_offset
+ __global uchar* dst0
)
{
- __global const uchar* src = src0 + src_offset;
- __global uchar* dst = dst0 + dst_offset;
+ uint block_id = get_global_id(1);
+ uint dst_offset0 = block_id * BLOCK_SIZE;
+ uint src_offset0 = 0;
+
+ { // calculate src_offset0
+
+#define CALC_SRC_INDEX(dim) \
+ { \
+ uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
+ CONCAT(idx_, dim) = block_id / plane_sz; \
+ block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
+ }
+#define UPDATE_SRC_OFFSET(dim) \
+ src_offset0 = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset0);
+/*
+ if (get_global_id(0) == 0 && get_global_id(1) == 0) \
+ printf("(%d, %d): @%d src_offset0=%d idx_dim=%d block_id=%d\n", \
+ get_global_id(0), get_global_id(1), \
+ dim, src_offset0, CONCAT(idx_, dim), block_id \
+ );
+*/
+
+#if DIMS > 5
+#error "invalid configuration"
+#endif
+#if DIMS > 4
+ uint idx_4 = 0;
+#if BLOCK_DIMS <= 4
+ CALC_SRC_INDEX(4)
+#endif
+ UPDATE_SRC_OFFSET(4)
+#endif
+#if DIMS > 3
+ uint idx_3 = 0;
+#if BLOCK_DIMS <= 3
+ CALC_SRC_INDEX(3)
+#endif
+ UPDATE_SRC_OFFSET(3)
+#endif
+#if DIMS > 2
+ uint idx_2 = 0;
+#if BLOCK_DIMS <= 2
+ CALC_SRC_INDEX(2)
+#endif
+ UPDATE_SRC_OFFSET(2)
+#endif
+#if DIMS > 1
+ uint idx_1 = 0;
+#if BLOCK_DIMS <= 1
+ CALC_SRC_INDEX(1)
+#endif
+ UPDATE_SRC_OFFSET(1)
+#endif
+#if DIMS > 0
+ uint idx_0 = 0;
+ UPDATE_SRC_OFFSET(0)
+#endif
+
+/*
+ if (get_global_id(0) == 0)
+ printf("(%d, %d): src_offset0=%d dst_offset0=%d\n",
+ get_global_id(0), get_global_id(1),
+ src_offset0, dst_offset0
+ );
+*/
+
+ } // calculate src_offset0
+
+#ifdef USE_COPY_1D
+ { // copy_block_1d
+ __global const uchar* src = src0 + src_offset0;
+ __global uchar* dst = dst0 + dst_offset0;
uint processed = 0;
uint i = get_local_id(0) * 16; // uchar16
while (i < BLOCK_COLS_X16 * 16)
{
- uint4 idx = (uint4)(i, i + 16 * WSZ, i + 32 * WSZ, i + 48 * WSZ);
- idx = select((uint4)i, idx, idx < (BLOCK_COLS_X16 * 16));
+ uint4 idx0 = (uint4)i;
+ uint4 idx = idx0 + (uint4)(0, 16 * WSZ, 32 * WSZ, 48 * WSZ);
+ idx = select(idx0, idx, idx < (BLOCK_COLS_X16 * 16));
uchar16 a0 = vload16(0, src + idx.s0);
uchar16 a1 = vload16(0, src + idx.s1);
uint i = get_local_id(0) * 4 + processed; // uchar4
while (i < BLOCK_COLS_X4 * 4)
{
- uint4 idx = (uint4)(i, i + 4 * WSZ, i + 8 * WSZ, i + 12 * WSZ);
- idx = select((uint4)i, idx, idx < (BLOCK_COLS_X4 * 4));
+ uint4 idx0 = (uint4)i;
+ uint4 idx = idx0 + (uint4)(0, 4 * WSZ, 8 * WSZ, 12 * WSZ);
+ idx = select(idx0, idx, idx < (BLOCK_COLS_X4 * 4));
uchar4 a0 = vload4(0, src + idx.s0);
uchar4 a1 = vload4(0, src + idx.s1);
}
}
#endif
-}
+ } // copy_block_1d
-#else // USE_COPY_1D
+#else
-static inline
-__attribute__((always_inline))
-void copy_block_2d(
- __global const uchar* src0,
- const uint src_offset0,
- __global uchar* dst0,
- const uint dst_offset0
-)
-{
+ { // copy_block_2d
__global const uchar* src = src0 + src_offset0;
__global uchar* dst = dst0 + dst_offset0;
#endif // BLOCK_COLS_FILL_X4 != BLOCK_COLS
i += WSZ * 4;
}
-}
-
-#endif // USE_COPY_1D
-
-__kernel void
-CONCAT(slice_, DIMS)(
- __global const uchar* src,
- __global uchar* dst
-)
-{
- uint block_id = get_global_id(1);
-
- uint dst_offset = block_id * BLOCK_SIZE;
-
- uint src_offset = 0;
-
-#define CALC_SRC_INDEX(dim) \
- { \
- uint plane_sz = CONCAT(DST_STEP_, dim) / BLOCK_SIZE; \
- CONCAT(idx_, dim) = block_id / plane_sz; \
- block_id = block_id - CONCAT(idx_, dim) * plane_sz; \
- }
-#define UPDATE_SRC_OFFSET(dim) \
- src_offset = mad24((uint)(CONCAT(idx_, dim) + CONCAT(SRC_START_, dim)), (uint)CONCAT(SRC_STEP_, dim), (uint)src_offset);
-/*
- if (get_global_id(0) == 0 && get_global_id(1) == 0) \
- printf("(%d, %d): @%d src_offset=%d idx_dim=%d block_id=%d\n", \
- get_global_id(0), get_global_id(1), \
- dim, src_offset, CONCAT(idx_, dim), block_id \
- );
-*/
-
-#if DIMS > 5
-#error "invalid configuration"
-#endif
-#if DIMS > 4
- uint idx_4 = 0;
-#if BLOCK_DIMS <= 4
- CALC_SRC_INDEX(4)
-#endif
- UPDATE_SRC_OFFSET(4)
-#endif
-#if DIMS > 3
- uint idx_3 = 0;
-#if BLOCK_DIMS <= 3
- CALC_SRC_INDEX(3)
-#endif
- UPDATE_SRC_OFFSET(3)
-#endif
-#if DIMS > 2
- uint idx_2 = 0;
-#if BLOCK_DIMS <= 2
- CALC_SRC_INDEX(2)
-#endif
- UPDATE_SRC_OFFSET(2)
-#endif
-#if DIMS > 1
- uint idx_1 = 0;
-#if BLOCK_DIMS <= 1
- CALC_SRC_INDEX(1)
-#endif
- UPDATE_SRC_OFFSET(1)
-#endif
-#if DIMS > 0
- uint idx_0 = 0;
- UPDATE_SRC_OFFSET(0)
-#endif
-
-/*
- if (get_global_id(0) == 0)
- printf("(%d, %d): src_offset=%d dst_offset=%d\n",
- get_global_id(0), get_global_id(1),
- src_offset, dst_offset
- );
-*/
-
-#ifdef USE_COPY_1D
- copy_block_1d(src, src_offset, dst, dst_offset);
-#else
- copy_block_2d(src, src_offset, dst, dst_offset);
+ } // copy_block_2d
#endif
}