From: Lukasz Wesolowski Date: Wed, 20 Feb 2019 23:52:24 +0000 (-0800) Subject: Modify TileOp GPU implementation to expose more concurrency and better utilize GPU... X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~1181 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3e44880d4d91792ceb8235d7db0787028efe62cd;p=platform%2Fupstream%2Fpytorch.git Modify TileOp GPU implementation to expose more concurrency and better utilize GPU memory bandwidth (#17275) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17275 Previous implementation used a memcpy inside the kernel. It is more efficient to reduce the data fetched per thread to a single word from memory. This exposes more concurrency and takes advantage of GPU memory coalescing support. Reviewed By: takatosp1 Differential Revision: D14120147 fbshipit-source-id: c4734003d4342e55147c5b858f232a006af60b68 --- diff --git a/caffe2/operators/tile_op.cu b/caffe2/operators/tile_op.cu index cfd4e52..b8f7820 100644 --- a/caffe2/operators/tile_op.cu +++ b/caffe2/operators/tile_op.cu @@ -5,19 +5,17 @@ namespace caffe2 { namespace { +template <typename T> __global__ void TileCopyKernel( - int item_size, int outer_dim, int inner_dim, int tiles, - const char* input_data, - char* output_data) { - CUDA_1D_KERNEL_LOOP(index, outer_dim * tiles) { - int i = index / tiles; - int t = index % tiles; - const char* input_ptr = input_data + inner_dim * item_size * i; - char* output_ptr = output_data + (i * tiles + t) * inner_dim * item_size; - memcpy(output_ptr, input_ptr, inner_dim * item_size); + const T* input_data, + T* output_data) { + CUDA_1D_KERNEL_LOOP(index, outer_dim * inner_dim * tiles) { + int col = index % inner_dim; + int row = index / (inner_dim * tiles); + output_data[index] = input_data[row * inner_dim + col]; } } @@ -58,12 +56,16 @@ void TileOp<CUDAContext>::DoTile( int inner_dim, const char* input_data, char* output_data) { - TileCopyKernel<<< - std::min(outer_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS), 
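CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - item_size, outer_dim, inner_dim, tiles_, input_data, output_data); + TileCopyKernel<float> + <<<std::min(outer_dim * inner_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS), + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + outer_dim, + inner_dim, + tiles_, + reinterpret_cast<const float*>(input_data), + reinterpret_cast<float*>(output_data)); } template <>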