k.args(clBuffer, step, cols, rows, clImageY, clImageUV);
- size_t globalsize[] = { (size_t)cols, (size_t)rows };
+ size_t globalsize[] = { (size_t)cols/2, (size_t)rows/2 };
return k.run(2, globalsize, 0, false);
}
-#endif // HAVE_VA_INTEL && HAVE_OPENCL
+#endif // HAVE_VA_INTEL
} // namespace cv::va_intel::ocl
#endif
#ifdef HAVE_DNN_NGRAPH
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
- return sliceRanges.size() == 1;
+ return sliceRanges.size() == 1 && !hasSteps;
+ #endif
++#ifdef HAVE_CUDA
++ if (backendId == DNN_BACKEND_CUDA)
++ return !hasSteps;
+#endif
- return backendId == DNN_BACKEND_OPENCV ||
- backendId == DNN_BACKEND_CUDA;
+ return backendId == DNN_BACKEND_OPENCV;
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
}
#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_CUDA
+ Ptr<BackendNode> initCUDA( // Builds the CUDA backend node (cuda4dnn::SliceOp) for this layer from the start offsets stored in finalSliceRanges.
+ void *context_, // opaque pointer; reinterpreted below as csl::CSLContext*
+ const std::vector<Ptr<BackendWrapper>>& inputs, // not referenced in this body
+ const std::vector<Ptr<BackendWrapper>>& outputs // not referenced in this body
+ ) override
+ {
+ auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+ std::vector<std::vector<std::size_t>> offsets; // one vector of per-range start positions for each entry of finalSliceRanges
+ for (const auto& ranges : finalSliceRanges)
+ {
+ std::vector<std::size_t> offsets_i;
+ for (const auto& range : ranges)
+ offsets_i.push_back(range.start); // only range.start is read here; range.end is not used
+ offsets.push_back(std::move(offsets_i));
+ }
+
+ return make_cuda_node<cuda4dnn::SliceOp>(preferableTarget, std::move(context->stream), std::move(offsets)); // NOTE(review): context->stream is passed through std::move - confirm the context is not expected to keep a usable stream afterwards
+ }
+#endif
+
+
+ private:
+ void getSliceRecursive(const Mat &inpMat, std::vector<int> &inpIdx, // Copies a strided slice of inpMat into outputs one element at a time, recursing over dimensions dim .. dimsNum-1.
+ const std::vector<Range> &sliceRanges, // sliceRanges[dim] supplies the [start, end) interval walked at this level
+ const std::vector<int> &sliceSteps, int dim, int dimsNum, // an empty sliceSteps means a step of 1 for every dimension
+ Mat &outputs, std::vector<int> &outIdx) // inpIdx / outIdx are index buffers; entry [dim] is overwritten on every iteration
+ {
+ int begin = sliceRanges[dim].start;
+ int end = sliceRanges[dim].end;
+ int step = !sliceSteps.empty() ? sliceSteps[dim] : 1; // NOTE(review): a step <= 0 with begin < end would never reach `end` - presumably callers guarantee step > 0, confirm
+
+ const bool is32F = inpMat.depth() == CV_32F;
+
+ // TODO optimization is required (for 2D tail case at least)
+ for (int k = begin, j = 0; k < end; k += step, j++) // k walks the input positions, j counts the matching output positions from 0
+ {
+ inpIdx[dim] = k;
+ outIdx[dim] = j;
+
+ if (dim + 1 < dimsNum) // more dimensions remain: descend with the indices for this dimension fixed
+ getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx);
+ else
+ {
+ if (is32F)
+ outputs.at<float>(outIdx.data()) = inpMat.at<float>(inpIdx.data());
+ else
+ outputs.at<short>(outIdx.data()) = inpMat.at<short>(inpIdx.data()); // 16F emulation: any non-CV_32F input is copied as raw 16-bit values
+ }
+ }
+ }
+
protected:
// The actual non-negative values determined from @p sliceRanges depend on the input size.
std::vector<std::vector<Range> > finalSliceRanges;
l1 = 0.046;
lInf = 0.023;
}
+ else if (target == DNN_TARGET_CUDA_FP16)
+ {
+ l1 = 0.0042;
+ lInf = 0.021;
+ }
+ // The OpenCL kernels use the native_ math functions which have
+ // implementation defined accuracy, so we use relaxed thresholds. See
+ // https://github.com/opencv/opencv/issues/9821 for more details.
+ else if (target == DNN_TARGET_OPENCL)
+ {
+ l1 = 0.02;
+ lInf = 0.02;
+ }
runTorchNet("net_conv_gemm_lrn", "", false, true, true, l1, lInf);
}