From f1da9892e94acc8578bc17567e9618f5fbf6cb8d Mon Sep 17 00:00:00 2001
From: James Reed
Date: Thu, 14 Feb 2019 15:58:06 -0800
Subject: [PATCH] Generalize catArray for contiguous inputs and dim != 0
 (#17032)

Summary:
I noticed that we were sinking a lot of time into `cat` operations in machine translation on CPU, and drilled down to find that we were doing the cat element by element, even though all the inputs were contiguous. The reason was that we were concatenating along a dimension that was not 0, which kept us from taking the fast `memcpy` branch. This PR generalizes that branch.

Quick benchmark script:
```
import torch, time

tensors = [torch.rand(6, 2, 1024) for i in range(5)]

NITER = 1000

s = time.time()
for i in range(NITER):
    torch.cat(tensors, dim=1)
print('time per iter ', (time.time() - s) / NITER)
```

Before:
```
time per iter 8.089399337768554e-05
```

After:
```
time per iter 2.183413505554199e-05
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17032

Differential Revision: D14090038

Pulled By: jamesr66a

fbshipit-source-id: 2c733a84915896008ac95f2233f44894bd2573de
---
 aten/src/TH/generic/THTensor.cpp | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp
index ea42c1e..4c87853 100644
--- a/aten/src/TH/generic/THTensor.cpp
+++ b/aten/src/TH/generic/THTensor.cpp
@@ -779,24 +779,41 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int
   }
   allContiguous = allContiguous && THTensor_(isContiguous)(result);
 
-  // First path is for contiguous inputs along dim 0
+  // First path is for contiguous inputs
   // Second path for non-contiguous
   int64_t offset;
-  if (dimension == 0 && allContiguous) {
+  if (allContiguous) {
+    int64_t outer = 1, inner = 1;
+
+    // Outer is the product of dimensions from the left up to (and not
+    // including) the concatenation dimension. This becomes the number of
+    // times we have to replicate the memcpy call.
+    for (int i = 0; i < dimension; ++i) {
+      outer *= size[i];
+    }
+
+    // Inner is the product of dimensions to the right of the concatenation
+    // dimension. We go on to multiply this by the size of the concat
+    // dimension for each input tensor.
+    for (int i = dimension + 1; i < size.size(); ++i) {
+      inner *= size[i];
+    }
+
     scalar_t* result_data = THStorage_(data)(THTensor_getStoragePtr(result)) + result->storage_offset();
     offset = 0;
-    for (int j = 0; j < numInputs; j++) {
-      if (!should_skip(inputs[j])) {
-        THTensor* input0 = inputs[j];
-        scalar_t* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset();
-        int64_t input0_size = THTensor_(nElement)(input0);
-        // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this.
-        if (input0_size != 0) {
-          memcpy(result_data + offset, input0_data, input0_size*sizeof(scalar_t));
-        }
-        offset += input0_size;
-      }
-    }
+    for (int o = 0; o < outer; ++o) {
+      for (int j = 0; j < numInputs; ++j) {
+        if (!should_skip(inputs[j])) {
+          THTensor* input0 = inputs[j];
+          scalar_t* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset();
+          int64_t local_inner = inner * input0->size(dimension);
+          if (local_inner != 0) {
+            memcpy(result_data + offset, input0_data + o*local_inner, local_inner*sizeof(scalar_t));
+          } // local_inner != 0
+          offset += local_inner;
+        } // should_skip
+      } // for j
+    } // for o
   } else {
     offset = 0;
     for (int j = 0; j < numInputs; j++) {
-- 
2.7.4
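
For reference, a minimal Python sketch of the outer/inner block-copy scheme the patch implements: for contiguous inputs that agree on every non-concat dimension, `cat` along any `dim` reduces to `outer` rounds of one flat block copy per input. This is illustrative only, not the TH code above; the function name `cat_contiguous` and its variables are hypothetical, and the assertion just checks the sketch against `torch.cat` on the benchmark shapes.

```
import torch

def cat_contiguous(tensors, dim):
    # Illustrative sketch; assumes all inputs are contiguous and share
    # every dimension except `dim`.
    sizes = list(tensors[0].shape)
    sizes[dim] = sum(t.shape[dim] for t in tensors)
    result = torch.empty(sizes, dtype=tensors[0].dtype)
    flat_out = result.view(-1)

    # outer: product of dims to the left of `dim` -> number of copy rounds
    outer = 1
    for s in sizes[:dim]:
        outer *= s
    # inner: product of dims to the right of `dim`; multiplied by each
    # input's size along `dim` to get that input's contiguous block length
    inner = 1
    for s in sizes[dim + 1:]:
        inner *= s

    offset = 0
    for o in range(outer):
        for t in tensors:
            local_inner = inner * t.shape[dim]  # block length for this input
            block = t.contiguous().view(-1)[o * local_inner:(o + 1) * local_inner]
            flat_out[offset:offset + local_inner] = block  # stands in for memcpy
            offset += local_inner
    return result

tensors = [torch.rand(6, 2, 1024) for _ in range(5)]
assert torch.equal(cat_contiguous(tensors, dim=1), torch.cat(tensors, dim=1))
```

Each slice assignment here plays the role of the single `memcpy` in the patched branch. The old branch handled only `dim == 0`, which is the special case where `outer == 1` and the loop collapses to one copy per input.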