template<typename Dtype>
OCL4DNNPool<Dtype>::~OCL4DNNPool()
{
- mask_idx_.release();
+ // nothing
}
template<typename Dtype>
bool ret = true;
size_t global[] = { 128 * 128 };
size_t local[] = { 128 };
- cl_uint argIdx = 0;
// support 2D case
switch (pool_method_)
{
case LIBDNN_POOLING_METHOD_MAX:
{
- if (top_mask.empty() && mask_idx_.empty())
- {
- mask_idx_.create(1, count_, CV_32FC1);
- }
- ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
- cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+ bool haveMask = !top_mask.empty();
+ ocl::Kernel oclk_max_pool_forward(
+ haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
+ ocl::dnn::ocl4dnn_pooling_oclsrc,
+ format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+ " -D STRIDE_W=%d -D STRIDE_H=%d"
+ " -D PAD_W=%d -D PAD_H=%d%s",
+ kernel_w_, kernel_h_,
+ stride_w_, stride_h_,
+ pad_w_, pad_h_,
+ haveMask ? " -D HAVE_MASK=1" : ""
+ ));
if (oclk_max_pool_forward.empty())
return false;
- argIdx = 0;
- oclk_max_pool_forward.set(argIdx++, count_);
- oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
- oclk_max_pool_forward.set(argIdx++, batch_size_);
- oclk_max_pool_forward.set(argIdx++, channels_);
- oclk_max_pool_forward.set(argIdx++, height_);
- oclk_max_pool_forward.set(argIdx++, width_);
- oclk_max_pool_forward.set(argIdx++, pooled_height_);
- oclk_max_pool_forward.set(argIdx++, pooled_width_);
- oclk_max_pool_forward.set(argIdx++, kernel_h_);
- oclk_max_pool_forward.set(argIdx++, kernel_w_);
- oclk_max_pool_forward.set(argIdx++, stride_h_);
- oclk_max_pool_forward.set(argIdx++, stride_w_);
- oclk_max_pool_forward.set(argIdx++, pad_h_);
- oclk_max_pool_forward.set(argIdx++, pad_w_);
- oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
- oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
- if (mask_idx_.empty())
- oclk_max_pool_forward.set(argIdx++, (void *)NULL);
- else
- oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
- oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+ oclk_max_pool_forward.args(
+ count_,
+ ocl::KernelArg::PtrReadOnly(bottom),
+ batch_size_,
+ channels_,
+ height_,
+ width_,
+ pooled_height_,
+ pooled_width_,
+ ocl::KernelArg::PtrWriteOnly(top),
+ ocl::KernelArg::PtrWriteOnly(top_mask)
+ );
ret = oclk_max_pool_forward.run(1, global, local, false);
}
break;
case LIBDNN_POOLING_METHOD_AVE:
{
+ CV_Assert(top_mask.empty());
+
ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
- cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+ ocl::dnn::ocl4dnn_pooling_oclsrc,
+ format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+ " -D STRIDE_W=%d -D STRIDE_H=%d"
+ " -D PAD_W=%d -D PAD_H=%d",
+ kernel_w_, kernel_h_,
+ stride_w_, stride_h_,
+ pad_w_, pad_h_
+ ));
if (oclk_ave_pool_forward.empty())
return false;
- argIdx = 0;
- oclk_ave_pool_forward.set(argIdx++, count_);
- oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
- oclk_ave_pool_forward.set(argIdx++, batch_size_);
- oclk_ave_pool_forward.set(argIdx++, channels_);
- oclk_ave_pool_forward.set(argIdx++, height_);
- oclk_ave_pool_forward.set(argIdx++, width_);
- oclk_ave_pool_forward.set(argIdx++, pooled_height_);
- oclk_ave_pool_forward.set(argIdx++, pooled_width_);
- oclk_ave_pool_forward.set(argIdx++, kernel_h_);
- oclk_ave_pool_forward.set(argIdx++, kernel_w_);
- oclk_ave_pool_forward.set(argIdx++, stride_h_);
- oclk_ave_pool_forward.set(argIdx++, stride_w_);
- oclk_ave_pool_forward.set(argIdx++, pad_h_);
- oclk_ave_pool_forward.set(argIdx++, pad_w_);
- oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ oclk_ave_pool_forward.args(
+ count_,
+ ocl::KernelArg::PtrReadOnly(bottom),
+ batch_size_,
+ channels_,
+ height_,
+ width_,
+ pooled_height_,
+ pooled_width_,
+ ocl::KernelArg::PtrWriteOnly(top)
+ );
ret = oclk_ave_pool_forward.run(1, global, local, false);
}
break;
case LIBDNN_POOLING_METHOD_STO:
{
+ CV_Assert(top_mask.empty());
+
ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
- cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+ ocl::dnn::ocl4dnn_pooling_oclsrc,
+ format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
+ " -D STRIDE_W=%d -D STRIDE_H=%d",
+ kernel_w_, kernel_h_,
+ stride_w_, stride_h_
+ ));
+
if (oclk_sto_pool_forward.empty())
return false;
- argIdx = 0;
- oclk_sto_pool_forward.set(argIdx++, count_);
- oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
- oclk_sto_pool_forward.set(argIdx++, batch_size_);
- oclk_sto_pool_forward.set(argIdx++, channels_);
- oclk_sto_pool_forward.set(argIdx++, height_);
- oclk_sto_pool_forward.set(argIdx++, width_);
- oclk_sto_pool_forward.set(argIdx++, pooled_height_);
- oclk_sto_pool_forward.set(argIdx++, pooled_width_);
- oclk_sto_pool_forward.set(argIdx++, kernel_h_);
- oclk_sto_pool_forward.set(argIdx++, kernel_w_);
- oclk_sto_pool_forward.set(argIdx++, stride_h_);
- oclk_sto_pool_forward.set(argIdx++, stride_w_);
- oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ oclk_sto_pool_forward.args(
+ count_,
+ ocl::KernelArg::PtrReadOnly(bottom),
+ batch_size_,
+ channels_,
+ height_,
+ width_,
+ pooled_height_,
+ pooled_width_,
+ ocl::KernelArg::PtrWriteOnly(top)
+ );
ret = oclk_sto_pool_forward.run(1, global, local, false);
}
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
-void TEMPLATE(max_pool_forward_impl, Dtype)(
+#if defined KERNEL_MAX_POOL
+
+__kernel void
+#ifdef HAVE_MASK
+ TEMPLATE(max_pool_forward_mask, Dtype)
+#else
+ TEMPLATE(max_pool_forward, Dtype)
+#endif
+(
const int nthreads, __global const Dtype* bottom_data, const int num,
const int channels, const int height, const int width,
- const int pooled_height, const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
- const int pad_w,
- __global Dtype* top_data,
- const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+ const int pooled_height, const int pooled_width,
+ __global Dtype* top_data
+#ifdef HAVE_MASK
+ , __global Dtype* mask
+#endif
+)
{
for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0))
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
- int hstart = ph * stride_h - pad_h;
- int wstart = pw * stride_w - pad_w;
- const int hend = min(hstart + kernel_h, height);
- const int wend = min(wstart + kernel_w, width);
+ int hstart = ph * STRIDE_H - PAD_H;
+ int wstart = pw * STRIDE_W - PAD_W;
+ const int hend = min(hstart + KERNEL_H, height);
+ const int wend = min(wstart + KERNEL_W, width);
hstart = max(hstart, (int)0);
wstart = max(wstart, (int)0);
Dtype maxval = -FLT_MAX;
}
}
top_data[index] = maxval;
- if (!no_mask) {
- if (use_mask == 1) {
- mask[index] = maxidx;
- } else {
- top_mask[index] = maxidx;
- }
- }
+#ifdef HAVE_MASK
+ mask[index] = maxidx;
+#endif
}
}
-__kernel void TEMPLATE(max_pool_forward, Dtype)(
- const int nthreads, __global const Dtype* bottom_data, const int num,
- const int channels, const int height, const int width,
- const int pooled_height, const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
- const int pad_w,
- __global Dtype* top_data,
- const int use_mask, __global int* mask, __global Dtype* top_mask)
-{
- TEMPLATE(max_pool_forward_impl, Dtype)(
- nthreads, bottom_data, num, channels, height, width,
- pooled_height, pooled_width, kernel_h,
- kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
- );
-}
+#elif defined KERNEL_AVE_POOL
__kernel void TEMPLATE(ave_pool_forward, Dtype)(
const int nthreads, __global const Dtype* const bottom_data, const int num,
const int channels, const int height, const int width,
- const int pooled_height, const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
- const int pad_w, __global Dtype* top_data)
+ const int pooled_height, const int pooled_width,
+ __global Dtype* top_data)
{
for (int index = get_global_id(0); index < nthreads;
index += get_global_size(0))
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
- int hstart = ph * stride_h - pad_h;
- int wstart = pw * stride_w - pad_w;
- int hend = min(hstart + kernel_h, height + pad_h);
- int wend = min(wstart + kernel_w, width + pad_w);
+ int hstart = ph * STRIDE_H - PAD_H;
+ int wstart = pw * STRIDE_W - PAD_W;
+ int hend = min(hstart + KERNEL_H, height + PAD_H);
+ int wend = min(wstart + KERNEL_W, width + PAD_W);
const int pool_size = (hend - hstart) * (wend - wstart);
hstart = max(hstart, (int)0);
wstart = max(wstart, (int)0);
}
}
+#elif defined KERNEL_STO_POOL
+
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
const int nthreads, __global const Dtype* const bottom_data, const int num,
const int channels, const int height, const int width,
- const int pooled_height, const int pooled_width, const int kernel_h,
- const int kernel_w, const int stride_h, const int stride_w,
+ const int pooled_height, const int pooled_width,
__global Dtype* top_data)
{
for (int index = get_global_id(0); index < nthreads;
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
- const int hstart = ph * stride_h;
- const int hend = min(hstart + kernel_h, height);
- const int wstart = pw * stride_w;
- const int wend = min(wstart + kernel_w, width);
+ const int hstart = ph * STRIDE_H;
+ const int hend = min(hstart + KERNEL_H, height);
+ const int wstart = pw * STRIDE_W;
+ const int wend = min(wstart + KERNEL_W, width);
// We set cumsum to be 0 to avoid divide-by-zero problems
Dtype cumsum = FLT_MIN;
Dtype cumvalues = 0.;
// First pass: get sum
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
- cumsum += bottom_slice[h * width + w];
- cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+ Dtype v = bottom_slice[h * width + w];
+ cumsum += v;
+ cumvalues += v * v;
}
}
top_data[index] = cumvalues / cumsum;
}
}
+
+#endif // KERNEL_*