From d6b91075dc79af5022206dac730732fd1edcb488 Mon Sep 17 00:00:00 2001
From: Owen Anderson <owen.anderson@oculus.com>
Date: Wed, 17 Apr 2019 15:33:07 -0700
Subject: [PATCH] Eliminate type dispatch from copy_kernel, and use memcpy
 directly rather than implementing our own copy. (#19198)

Summary:
It turns out that copying bytes is the same no matter what type
they're interpreted as, and memcpy is already vectorized on every
platform of note.  Paring this down to the simplest implementation
saves just over 4KB off libtorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/19198

Differential Revision: D14922656

Pulled By: resistor

fbshipit-source-id: bb03899dd8f6b857847b822061e7aeb18c19e7b4
---
 aten/src/ATen/native/cpu/CopyKernel.cpp | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp
index 4f157bc..38f6ee6 100644
--- a/aten/src/ATen/native/cpu/CopyKernel.cpp
+++ b/aten/src/ATen/native/cpu/CopyKernel.cpp
@@ -14,20 +14,17 @@ namespace {
 constexpr int64_t COPY_GRAIN_SIZE = 20000;
 
 static void copy_kernel_impl(Tensor& dst, const Tensor& src) {
-  AT_DISPATCH_ALL_TYPES_AND2(
-    at::ScalarType::Half, at::ScalarType::Bool, dst.scalar_type(), "copy_kernel_impl", [&]() {
-      scalar_t* self_ptr = dst.data<scalar_t>();
-      scalar_t* src_ptr = src.data<scalar_t>();
-
-      auto sample = [&](int64_t begin, int64_t end) {
-        int64_t len = end - begin;
-        scalar_t* self_seg = self_ptr + begin;
-        scalar_t* src_seg = src_ptr + begin;
-        at::vec256::convert<scalar_t, scalar_t>(src_seg, self_seg, len);
-    };
-
-    parallel_for(0, dst.numel(), COPY_GRAIN_SIZE, sample);
-  });
+  // Bytes copy the same whatever the dtype, so work on raw bytes; [begin, end)
+  // are byte offsets. Assumes src holds at least dst.nbytes() bytes (TODO confirm).
+  const char* src_bytes = static_cast<const char*>(src.data_ptr());
+  char* dst_bytes = static_cast<char*>(dst.data_ptr());
+
+  // Pointers are captured by value; each call copies one byte sub-range.
+  auto copy_range = [=](int64_t begin, int64_t end) {
+    memcpy(dst_bytes + begin, src_bytes + begin, end - begin);
+  };
+
+  parallel_for(0, dst.nbytes(), COPY_GRAIN_SIZE, copy_range);
 }
 
 } // anonymous namespace
-- 
2.7.4