From f1da9892e94acc8578bc17567e9618f5fbf6cb8d Mon Sep 17 00:00:00 2001
From: James Reed
Date: Thu, 14 Feb 2019 15:58:06 -0800
Subject: [PATCH] Generalize catArray for contiguous inputs and dim != 0
 (#17032)

Summary:
I noticed that we were sinking a lot of time into `cat` operations in machine translation on CPU, and drilled down to find that we were doing the cat element by element, even though all the inputs were contiguous. The reason was that we were concatenating along a dimension that was not 0, which kept us from taking the fast `memcpy` branch. This PR generalizes that branch.

Quick benchmark script:
```
import torch, time

tensors = [torch.rand(6, 2, 1024) for i in range(5)]

NITER = 1000

s = time.time()
for i in range(NITER):
    torch.cat(tensors, dim=1)
print('time per iter ', (time.time() - s) / NITER)
```

Before:
```
time per iter 8.089399337768554e-05
```

After:
```
time per iter 2.183413505554199e-05
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17032

Differential Revision: D14090038

Pulled By: jamesr66a

fbshipit-source-id: 2c733a84915896008ac95f2233f44894bd2573de
---
 aten/src/TH/generic/THTensor.cpp | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp
index ea42c1e..4c87853 100644
--- a/aten/src/TH/generic/THTensor.cpp
+++ b/aten/src/TH/generic/THTensor.cpp
@@ -779,24 +779,41 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
   }
   allContiguous = allContiguous && THTensor_(isContiguous)(result);
 
-  // First path is for contiguous inputs along dim 0
+  // First path is for contiguous inputs
   // Second path for non-contiguous
   int64_t offset;
-  if (dimension == 0 && allContiguous) {
+  if (allContiguous) {
+    int64_t outer = 1, inner = 1;
+
+    // Outer is the product of dimensions from the left up to (and not
+    // including) the concatenation dimension. This becomes the number of
+    // times we have to replicate the memcpy call.
+    for (int i = 0; i < dimension; ++i) {
+      outer *= size[i];
+    }
+
+    // Inner is the product of dimensions to the right of the concatenation
+    // dimension. We go on to multiply this by the size of the concat
+    // dimension for each input tensor.
+    for (int i = dimension + 1; i < size.size(); ++i) {
+      inner *= size[i];
+    }
+
     scalar_t* result_data = THStorage_(data)(THTensor_getStoragePtr(result)) + result->storage_offset();
     offset = 0;
-    for (int j = 0; j < numInputs; j++) {
-      if (!should_skip(inputs[j])) {
-        THTensor* input0 = inputs[j];
-        scalar_t* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset();
-        int64_t input0_size = THTensor_(nElement)(input0);
-        // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this.
-        if (input0_size != 0) {
-          memcpy(result_data + offset, input0_data, input0_size*sizeof(scalar_t));
-        }
-        offset += input0_size;
-      }
-    }
+    for (int o = 0; o < outer; ++o) {
+      for (int j = 0; j < numInputs; ++j) {
+        if (!should_skip(inputs[j])) {
+          THTensor* input0 = inputs[j];
+          scalar_t* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset();
+          int64_t local_inner = inner * input0->size(dimension);
+          if (local_inner != 0) {
+            memcpy(result_data + offset, input0_data + o*local_inner, local_inner*sizeof(scalar_t));
+          } // local_inner != 0
+          offset += local_inner;
+        } // should_skip
+      } // for j
+    } // for o
   } else {
     offset = 0;
     for (int j = 0; j < numInputs; j++) {
-- 
2.7.4
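
For reference, a minimal Python sketch of the outer/inner block-copy scheme the patch implements: for contiguous inputs that agree on every non-concat dimension, `cat` along any `dim` reduces to `outer` rounds of one flat block copy per input. This is illustrative only, not the TH code above; the function name `cat_contiguous` and its variables are hypothetical, and the assertion just checks the sketch against `torch.cat` on the benchmark shapes.

```
import torch

def cat_contiguous(tensors, dim):
    # Illustrative sketch; assumes all inputs are contiguous and share
    # every dimension except `dim`.
    sizes = list(tensors[0].shape)
    sizes[dim] = sum(t.shape[dim] for t in tensors)
    result = torch.empty(sizes, dtype=tensors[0].dtype)
    flat_out = result.view(-1)

    # outer: product of dims to the left of `dim` -> number of copy rounds
    outer = 1
    for s in sizes[:dim]:
        outer *= s
    # inner: product of dims to the right of `dim`; multiplied by each
    # input's size along `dim` to get that input's contiguous block length
    inner = 1
    for s in sizes[dim + 1:]:
        inner *= s

    offset = 0
    for o in range(outer):
        for t in tensors:
            local_inner = inner * t.shape[dim]  # block length for this input
            block = t.contiguous().view(-1)[o * local_inner:(o + 1) * local_inner]
            flat_out[offset:offset + local_inner] = block  # stands in for memcpy
            offset += local_inner
    return result

tensors = [torch.rand(6, 2, 1024) for _ in range(5)]
assert torch.equal(cat_contiguous(tensors, dim=1), torch.cat(tensors, dim=1))
```

Each slice assignment here plays the role of the single `memcpy` in the patched branch. The old branch handled only `dim == 0`, which is the special case where `outer == 1` and the loop collapses to one copy per input.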