From 5a88e8cf61b99cb3c9aabb1001fe651c27d68f96 Mon Sep 17 00:00:00 2001
From: Andrey Morozov
Date: Tue, 27 Jul 2010 09:25:46 +0000
Subject: [PATCH] optimized gpumat::copyTo()

---
Reviewer notes (text between "---" and the first "diff --git" line is
ignored when the patch is applied):

 * The patch moves kernel_copy_to_with_mask from above the shift_and_sizeof
   declaration to just after the shift_and_sizeof definition that ends in
   "enum { shift = 3 };", so the kernel can refer to that helper.
 * The only change inside the kernel is the row offset: the division
   "step_mat / sizeof(T)" becomes the shift
   "step_mat >> shift_and_sizeof<T>::shift". This is equivalent only if
   shift_and_sizeof<T>::shift == log2(sizeof(T)); only the "shift = 3"
   entry is visible in this patch -- TODO confirm against the full file.
 * NOTE(review): this copy was recovered from a rendering in which all
   line breaks were collapsed and angle-bracket tokens were stripped.
   Indentation, blank context lines and the template argument lists
   ("<typename T>", "<T>") are reconstructed so that they agree with the
   hunk line counts; verify against the repository before applying.

 modules/gpu/src/cuda/matrix_operations.cu | 45 ++++++++++++++++---------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index ebaece8..0c56fcc 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -54,28 +54,6 @@ namespace mat_operators
 {
     __constant__ double scalar_d[4];
 
-    ///////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////// CopyTo /////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////
-
-    template <typename T>
-    __global__ void kernel_copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
-    {
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if ((x < cols * channels ) && (y < rows))
-            if (mask[y * step_mask + x / channels] != 0)
-            {
-                size_t idx = y * (step_mat / sizeof(T)) + x;
-                mat_dst[idx] = mat_src[idx];
-            }
-    }
-
-
-    ///////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////// SetTo //////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////
 
     template <typename T>
     class shift_and_sizeof;
@@ -129,6 +107,29 @@ namespace mat_operators
         enum { shift = 3 };
     };
 
+
+    ///////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////// CopyTo /////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
+    template <typename T>
+    __global__ void kernel_copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
+    {
+        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if ((x < cols * channels ) && (y < rows))
+            if (mask[y * step_mask + x / channels] != 0)
+            {
+                size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
+                mat_dst[idx] = mat_src[idx];
+            }
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////// SetTo //////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
     template <typename T>
     __global__ void kernel_set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
     {
-- 
2.7.4