}\r
}\r
\r
- template <typename T>\r
- __global__ void resize_area_scan_x(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer)\r
+ enum ScanKind { exclusive, inclusive } ;\r
+\r
+ template <ScanKind Kind , class T>\r
+ __device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x )\r
{\r
- typedef typename scan_traits<T>::scan_line_type W;\r
- extern __shared__ W line[];\r
- scan_x(src,fx,fy, buffer,line, 0);\r
+ const unsigned int lane = idx & 31;\r
+\r
+ if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx];\r
+ if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx];\r
+ if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx];\r
+ if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx];\r
+ if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx];\r
+\r
+ if( Kind == inclusive )\r
+ return ptr [idx ];\r
+ else\r
+ return (lane > 0) ? ptr [idx - 1] : 0;\r
}\r
\r
- template <typename T>\r
- __global__ void resize_area_scan_y(const DevMem2D_<T> src, DevMem2D_<T> dst, int fx, int fy, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer)\r
+ template <ScanKind Kind , class T>\r
+ __device__ __forceinline__ T scan_block( volatile T *ptr)\r
{\r
- typedef typename scan_traits<T>::scan_line_type W;\r
- extern __shared__ W line[];\r
- scan_y(buffer,fx, fy, dst, line, 0);\r
+ const unsigned int idx = threadIdx.x;\r
+ const unsigned int lane = idx & 31;\r
+ const unsigned int warp = idx >> 5;\r
+\r
+ T val = scan_warp <Kind>( ptr , idx );\r
+ __syncthreads ();\r
+\r
+ if( lane == 31 )\r
+ ptr [ warp ] = ptr [idx ];\r
+\r
+ __syncthreads ();\r
+\r
+ if( warp == 0 )\r
+ scan_warp<inclusive>( ptr , idx );\r
+\r
+ __syncthreads ();\r
+\r
+ if ( warp > 0)\r
+ val = ptr [warp -1] + val;\r
+\r
+ __syncthreads ();\r
+\r
+ ptr[idx] = val;\r
+\r
+ __syncthreads ();\r
+\r
+ return val ;\r
}\r
\r
- template <typename T> struct InterAreaDispatcherStream\r
+ template<typename T, typename W>\r
+ __global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines)\r
{\r
- static void call(const DevMem2D_<T> src, int fx, int fy, DevMem2D_<T> dst, DevMem2D_<typename scan_traits<T>::scan_line_type> buffer, cudaStream_t stream)\r
+ extern __shared__ W sbuf[];\r
+\r
+ const unsigned int tid = threadIdx. x;\r
+\r
+ // load line-block on shared memory\r
+ int y = blockIdx.x / thred_lines;\r
+ int input_stride = (blockIdx.x - y * thred_lines) * blockDim.x;\r
+ int x = input_stride + tid;\r
+\r
+ // store global data in shared memory\r
+ sbuf[tid] = src(y, x);\r
+ __syncthreads();\r
+\r
+ scan_block<inclusive, W>(sbuf);\r
+\r
+ float scale = __fdividef(1.f, fx);\r
+ int out_stride = input_stride / fx;\r
+ int count = blockDim.x / fx;\r
+\r
+ if (tid < count)\r
{\r
- resize_area_scan_x<T><<<src.rows, (src.cols >> 1), src.cols * sizeof(typename scan_traits<T>::scan_line_type) >>>(src, dst, fx, fy, buffer);\r
+ int start_idx = (tid == 0)? 0 : tid * fx - 1;\r
+ int end_idx = tid * fx + fx - 1;\r
\r
- resize_area_scan_y<T><<<dst.cols, (src.rows >> 1), src.rows * sizeof(typename scan_traits<T>::scan_line_type) >>>(src, dst, fx, fy, buffer);\r
- cudaSafeCall( cudaGetLastError() );\r
+ W start = (tid == 0)? (W)0:sbuf[start_idx];\r
+ W end = sbuf[end_idx];\r
\r
- if (stream == 0)\r
- cudaSafeCall( cudaDeviceSynchronize() );\r
+ if (blockIdx.x == 0)\r
+ printf("%d~~~~~~~~ start_idx %d, end_idx %d, start %f, end %f\n",\r
+ tid, start_idx, end_idx, start, end);\r
+\r
+ dst(y, out_stride + tid) = (end - start);\r
}\r
- };\r
+ }\r
+\r
+ template<typename T, typename W>\r
+ __global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines)\r
+ {\r
+ extern __shared__ W sbuf[];\r
+\r
+ const unsigned int tid = threadIdx. x;\r
+\r
+ // load line-block on shared memory\r
+ int x = blockIdx.x / thred_lines;\r
+\r
+ int global_stride = (blockIdx.x % thred_lines) * blockDim.x;\r
+ if (!tid) printf("STRIDE : %d", global_stride);\r
+ int y = global_stride + tid;\r
+\r
+ // store global data in shared memory\r
+\r
+ sbuf[tid] = src(y, x);\r
+ __syncthreads();\r
+ scan_block<inclusive, W>(sbuf);\r
+\r
+ float scale = __fdividef(1.f, fx * fy);\r
+ int out_stride = global_stride / fx;\r
+ int count = blockDim.x / fx;\r
+\r
+ if (tid < count)\r
+ {\r
+ int start_idx = (tid == 0)? 0 : tid * fx - 1;\r
+ int end_idx = tid * fx + fx - 1;\r
+\r
+ W start = (tid == 0)? (W)0:sbuf[start_idx];\r
+ W end = sbuf[end_idx];\r
+\r
+ if (blockIdx.x == 0)\r
+ printf("!!!!!!!!%d~~~~~~~~ start_idx %d, end_idx %d, start %f, end %f\n",\r
+ tid, start_idx, end_idx, start, end);\r
+\r
+ dst(out_stride + tid, x) = saturate_cast<T>((end - start) * scale);\r
+ }\r
+ }\r
\r
template <typename T>\r
void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy,\r
{\r
(void)interpolation;\r
\r
+ //TODO: add assert to picture size\r
int iscale_x = round(fx);\r
int iscale_y = round(fy);\r
\r
- InterAreaDispatcherStream<T>::call(src, iscale_x, iscale_y, dst, buffer, stream);\r
+ const int warps = 4;\r
+ const int threads = 32 * warps;\r
+\r
+ int thred_lines = divUp(src.cols, threads);\r
+ int blocks = src.rows * thred_lines;\r
+\r
+ printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",\r
+ src.cols, warps, threads, thred_lines, blocks);\r
+\r
+ typedef typename scan_traits<T>::scan_line_type smem_type;\r
+\r
+ resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>\r
+ (src, buffer, iscale_x, iscale_y, thred_lines);\r
+\r
+ thred_lines = divUp(src.rows, threads);\r
+ blocks = dst.cols * thred_lines;\r
+\r
+ printf("device code executed for Y coordinate with:\nwarps %d, threads %d, thred_lines %d, blocks %d\n",\r
+ warps, threads, thred_lines, blocks);\r
+\r
+ resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>\r
+ (buffer, dst, iscale_x, iscale_y, thred_lines);\r
+\r
+ cudaSafeCall( cudaGetLastError() );\r
+\r
+ if (stream == 0)\r
+ cudaSafeCall( cudaDeviceSynchronize() );\r
}\r
\r
template void resize_area_gpu<uchar>(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream);\r