From d6b91075dc79af5022206dac730732fd1edcb488 Mon Sep 17 00:00:00 2001
From: Owen Anderson
Date: Wed, 17 Apr 2019 15:33:07 -0700
Subject: [PATCH] Eliminate type dispatch from copy_kernel, and use memcpy
 directly rather than implementing our own copy. (#19198)

Summary:
It turns out that copying bytes is the same no matter what type they're
interpreted as, and memcpy is already vectorized on every platform of note.
Paring this down to the simplest implementation saves just over 4KB off
libtorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/19198

Differential Revision: D14922656

Pulled By: resistor

fbshipit-source-id: bb03899dd8f6b857847b822061e7aeb18c19e7b4
---
 aten/src/ATen/native/cpu/CopyKernel.cpp | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp
index 4f157bc..38f6ee6 100644
--- a/aten/src/ATen/native/cpu/CopyKernel.cpp
+++ b/aten/src/ATen/native/cpu/CopyKernel.cpp
@@ -14,20 +14,17 @@ namespace {
 constexpr int64_t COPY_GRAIN_SIZE = 20000;
 
 static void copy_kernel_impl(Tensor& dst, const Tensor& src) {
-  AT_DISPATCH_ALL_TYPES_AND2(
-    at::ScalarType::Half, at::ScalarType::Bool, dst.scalar_type(), "copy_kernel_impl", [&]() {
-      scalar_t* self_ptr = dst.data<scalar_t>();
-      scalar_t* src_ptr = src.data<scalar_t>();
-
-      auto sample = [&](int64_t begin, int64_t end) {
-        int64_t len = end - begin;
-        scalar_t* self_seg = self_ptr + begin;
-        scalar_t* src_seg = src_ptr + begin;
-        at::vec256::convert(src_seg, self_seg, len);
-      };
-
-      parallel_for(0, dst.numel(), COPY_GRAIN_SIZE, sample);
-    });
+  char* self_ptr = (char*)dst.data_ptr();
+  char* src_ptr = (char*)src.data_ptr();
+
+  auto sample = [=](int64_t begin, int64_t end) {
+    int64_t len = end - begin;
+    char* self_seg = self_ptr + begin;
+    char* src_seg = src_ptr + begin;
+    memcpy(self_seg, src_seg, len);
+  };
+
+  parallel_for(0, dst.nbytes(), COPY_GRAIN_SIZE, sample);
 }
 
 } // anonymous namespace
-- 
2.7.4