Modify TileOp GPU implementation to expose more concurrency and better utilize GPU...
author: Lukasz Wesolowski <lwesolowski@fb.com>
Wed, 20 Feb 2019 23:52:24 +0000 (15:52 -0800)
committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Thu, 21 Feb 2019 00:02:14 +0000 (16:02 -0800)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17275

Previous implementation used a memcpy inside the kernel. It is more efficient to reduce the data fetched per thread to a single word from memory. This exposes more concurrency and takes advantage of GPU memory coalescing support.

Reviewed By: takatosp1

Differential Revision: D14120147

fbshipit-source-id: c4734003d4342e55147c5b858f232a006af60b68

caffe2/operators/tile_op.cu

index cfd4e52..b8f7820 100644 (file)
@@ -5,19 +5,17 @@
 
 namespace caffe2 {
 namespace {
+template <typename T>
 __global__ void TileCopyKernel(
-    int item_size,
     int outer_dim,
     int inner_dim,
     int tiles,
-    const char* input_data,
-    char* output_data) {
-  CUDA_1D_KERNEL_LOOP(index, outer_dim * tiles) {
-    int i = index / tiles;
-    int t = index % tiles;
-    const char* input_ptr = input_data + inner_dim * item_size * i;
-    char* output_ptr = output_data + (i * tiles + t) * inner_dim * item_size;
-    memcpy(output_ptr, input_ptr, inner_dim * item_size);
+    const T* input_data,
+    T* output_data) {
+  CUDA_1D_KERNEL_LOOP(index, outer_dim * inner_dim * tiles) {
+    int col = index % inner_dim;
+    int row = index / (inner_dim * tiles);
+    output_data[index] = input_data[row * inner_dim + col];
   }
 }
 
@@ -58,12 +56,16 @@ void TileOp<CUDAContext>::DoTile(
     int inner_dim,
     const char* input_data,
     char* output_data) {
-  TileCopyKernel<<<
-      std::min(outer_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      item_size, outer_dim, inner_dim, tiles_, input_data, output_data);
+  TileCopyKernel<float>
+      <<<std::min(outer_dim * inner_dim * tiles_, CAFFE_MAXIMUM_NUM_BLOCKS),
+         CAFFE_CUDA_NUM_THREADS,
+         0,
+         context_.cuda_stream()>>>(
+          outer_dim,
+          inner_dim,
+          tiles_,
+          reinterpret_cast<const float*>(input_data),
+          reinterpret_cast<float*>(output_data));
 }
 
 template <>