From 043e363c6c2bea3652ace954e4e70707cad0cf5e Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 5 Apr 2019 07:18:38 -0700 Subject: [PATCH] Cache device on TensorImpl; clean up TensorImpl constructors. (#18833) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18833 ghimport-source-id: 6f2be25fcc5e6be3ffe20582e604bd2c1fbab66b Stack from [ghstack](https://github.com/ezyang/ghstack): * **#18833 [STACK] Cache device on TensorImpl; clean up TensorImpl constructors.** * #18832 [STACK] Disallow changing the device of a tensor via set_. * #18831 [STACK] Stop swapping in Storages of the wrong device for Tensors. 1) We cache device on TensorImpl. This means we can access the device without a virtual function and allows us to more easily extend TensorImpls (because they don't need to figure out how to store the Device for themselves). 2) Clean up TensorImpl APIs. We had a constructor that took a TensorTypeId and an allocator and would allocate a Storage based on the recognized types of TensorTypeIds. Instead, we just have two different constructors: one for types with a storage, one without. Reviewed By: dzhulgakov Differential Revision: D14766230 fbshipit-source-id: 745b8db84dcd6cb58f1a8675ad3ff8d033bc50df --- aten/src/ATen/SparseTensorImpl.cpp | 19 ++++++-- aten/src/ATen/SparseTensorImpl.h | 8 ++-- aten/src/ATen/function_wrapper.py | 22 ++++----- aten/src/ATen/test/xla_tensor_test.cpp | 6 +-- c10/core/TensorImpl.cpp | 22 ++++----- c10/core/TensorImpl.h | 87 +++++++++++++++++++--------------- c10/core/UndefinedTensorImpl.cpp | 2 +- torch/csrc/autograd/variable.cpp | 7 +-- torch/csrc/autograd/variable.h | 3 -- torch/csrc/jit/interpreter.cpp | 30 ------------ 10 files changed, 92 insertions(+), 114 deletions(-) diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 48ca1b3..ca04de7 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -31,11 +31,22 @@ namespace { // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type) - : TensorImpl(type_id, data_type, nullptr, false) + : SparseTensorImpl(type_id, data_type + , at::empty({1, 0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(ScalarType::Long)) + , at::empty({0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(data_type))) {} + +SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type, at::Tensor indices, at::Tensor values) + : TensorImpl(type_id, data_type, values.device(), false) , sparse_dim_(1) , dense_dim_(0) - , indices_(at::empty({1, 0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(ScalarType::Long))) - , values_(at::empty({0}, at::initialTensorOptions().device(sparseTensorIdToDeviceType(type_id)).dtype(data_type))) {} + , indices_(std::move(indices)) + , values_(std::move(values)) { + // we proxy to this constructor so we can initialize the device correctly, but really only indices/values of this shape are allowed. 
+ AT_ASSERT(indices_.sizes() == IntArrayRef({1, 0})); + AT_ASSERT(values_.sizes() == IntArrayRef({0})); + AT_ASSERT(values_.device() == indices_.device()); + AT_ASSERT(values_.device() == device()); +} IntArrayRef SparseTensorImpl::sizes() const { return sizes_; @@ -111,6 +122,8 @@ void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, cons indices_ = indices; values_ = values; + AT_ASSERT(device() == values_.device()); + AT_ASSERT(values_.device() == indices_.device()); coalesced_ = false; } diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index a5de5a1..23ba71f 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -206,15 +206,13 @@ public: impl->dense_dim_ = dense_dim(); impl->indices_ = indices(); impl->values_ = values(); + impl->device_opt_ = device(); impl->coalesced_ = coalesced(); impl->refresh_numel(); return impl; } - private: - int64_t get_device_slow() const override { - return values_.get_device(); - } - +private: + explicit SparseTensorImpl(at::TensorTypeId, const caffe2::TypeMeta&, at::Tensor indices, at::Tensor values); }; } // namespace at diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index ce585e3..8dd59a9 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -178,11 +178,6 @@ if(${check_name}.is_sparse()) { return static_cast(this)->${api_name}(${sparse_actuals}); }""") -BUFFER_DEFINITION = CodeTemplate("""\ -auto ${name}_ = c10::make_intrusive( - ${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), ${THTensor}_new(), false).release(); -auto ${name} = Tensor(${name}_, false);""") - CONDITIONAL_INITIALIZER = CodeTemplate("""\ if (${name}.defined()) { ${initializer} @@ -323,18 +318,17 @@ CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') ALLOC_NOARGS_WRAP = { 'THTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', + '(c10::Storage(caffe2::TypeMeta::Make<${ScalarType}>(), 0, allocator(), true),' + '${Backend}TensorId(), false).release()', 'THBoolTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Byte), allocator(), false).release()', + '(c10::Storage(scalarTypeToTypeMeta(ScalarType::Byte), 0, allocator(), true),' + '${Backend}TensorId(), false).release()', 'THIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), allocator(), false).release()', + '(c10::Storage(scalarTypeToTypeMeta(ScalarType::Long), 0, allocator(), true),' + '${Backend}TensorId(), false).release()', 'THIntegerTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Int), allocator(), false).release()', - 'THDenseTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', - 'THDenseIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), ' - 'allocator(), false).release()' + '(c10::Storage(scalarTypeToTypeMeta(ScalarType::Int), 0, allocator(), true),' + '${Backend}TensorId(), false).release()', } ALLOC_WRAP = { diff --git a/aten/src/ATen/test/xla_tensor_test.cpp b/aten/src/ATen/test/xla_tensor_test.cpp index 030461a..bef48c9 100644 --- a/aten/src/ATen/test/xla_tensor_test.cpp +++ b/aten/src/ATen/test/xla_tensor_test.cpp @@ -24,11 +24,11 @@ struct XLAAllocator final : public at::Allocator { TEST(XlaTensorTest, TestNoStorage) { XLAAllocator allocator; - auto storage = Storage(caffe2::TypeMeta::Make(), 0, &allocator, true); auto tensor_impl = c10::make_intrusive( - std::move(storage), XLATensorId(), + caffe2::TypeMeta::Make(), + at::Device(DeviceType::XLA, 0), /*is_variable=*/false); at::Tensor t(std::move(tensor_impl)); - ASSERT_TRUE(t.device() == DeviceType::XLA); + ASSERT_TRUE(t.device() == at::Device(DeviceType::XLA, 0)); } diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 446d206..258d402 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -33,26 +33,26 @@ const at::Tensor& TensorImpl::grad() const { } } -TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, data_type, is_variable) { - // Variables, UndefinedTensors and SparseTensors don't have storages. 
-  if (!is_variable && type_id != UndefinedTensorId() && data_type.id() != caffe2::TypeIdentifier::uninitialized()
-      && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) {
-    storage_ = Storage(data_type, 0, allocator, true);
-  }
-}
-
 TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable)
-  : TensorImpl(std::move(storage), type_id, storage.dtype(), is_variable) {}
+  : TensorImpl(std::move(storage), type_id, storage.dtype(), storage.device(), is_variable) {}
+
+TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, c10::optional<c10::Device> device_opt, bool is_variable)
+  : TensorImpl({}, type_id, data_type, std::move(device_opt), is_variable) {}
 
-TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable)
+TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type,
+                       c10::optional<c10::Device> device_opt, bool is_variable)
     : storage_(std::move(storage)),
       sizes_{0},
       storage_offset_(0),
       numel_(0),
       data_type_(data_type),
+      device_opt_(device_opt),
       type_id_(type_id),
       is_variable_(is_variable) {
+  AT_ASSERT(type_id == UndefinedTensorId() || data_type.id() == caffe2::TypeIdentifier::uninitialized() ||
+            device_opt_.has_value());
+  // we would also like to check that non-cpu devices have an index, but some Caffe2 operators create
+  // Storages with default devices.
   strides_.push_back(1);
 }
 
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 24ddaac..bef73f0 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -213,23 +213,21 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   TensorImpl() = delete;
 
   /**
-   * Construct a 1-dim 0-size tensor with the given settings.
-   * The provided allocator will be used to allocate data on
-   * subsequent resize.
+   * Construct a 1-dim 0-size tensor backed by the given storage.
    */
-  TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable);
+  TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable);
 
   /**
-   * Construct a 1-dim 0-size tensor backed by the given storage.
+   * Construct a 1-dim 0-size tensor that doesn't have a storage.
    */
-  TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable);
+  TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, c10::optional<c10::Device> device_opt, bool is_variable);
 
  private:
   // This constructor is private, because the data_type is redundant with
   // storage. Still, we pass it in separately because it's easier to write
   // the initializer list if we're not worried about storage being moved out
   // from under us.
-  TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable);
+  TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, c10::optional<c10::Device>, bool is_variable);
 
  public:
   TensorImpl(const TensorImpl&) = delete;
@@ -361,31 +359,25 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   }
 
   int64_t get_device() const {
-    // NB: This method is not virtual and tries to avoid dispatches in the common case for perf.
-    const auto tid = type_id();
-    if (tid == CUDATensorId() || tid == HIPTensorId() || tid == MSNPUTensorId() || tid == XLATensorId()) {
-      // TODO: #12934 investigate caching device on TensorImpl to avoid this vdispatch.
-      return storage().device().index();
+    if (device_opt_.has_value()) {
+      // See NOTE [c10::optional operator usage in CUDA]
+      return (*device_opt_).index();
     }
-    return get_device_slow();
+
+    AT_ERROR(
+        "tensor with backend ", toString(tensorTypeIdToBackend(type_id())),
+        " does not have a device");
   }
 
   Device device() const {
-    // Special case the common case for performance reasons
-    // TODO: This is a little convoluted so it would be good to investigate
-    // caching device on TensorImpl (#12934) to speed up device() calls in all cases.
-    const auto tid = type_id();
-    if (tid == CPUTensorId() || tid == CUDATensorId() || tid == HIPTensorId() || tid == MSNPUTensorId() ||
-        tid == XLATensorId()) {
-      // NB: storage(), not storage_, b/c of Variable.
-      const auto& mystorage = storage();
-      if (mystorage) {
-        return mystorage.device();
-      }
+    if (device_opt_.has_value()) {
+      // See NOTE [c10::optional operator usage in CUDA]
+      return *device_opt_;
     }
-    const auto device_type = computeDeviceType(tid);
-    bool not_cpu = device_type != DeviceType::CPU;
-    return Device(device_type, not_cpu ? get_device() : -1);
+
+    AT_ERROR(
+        "tensor with backend ", toString(tensorTypeIdToBackend(type_id())),
+        " does not have a device");
   }
 
   Layout layout() const {
@@ -873,15 +865,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   }
 
  private:
-  // As an optimization, get_device handles the typical CUDA Tensor case and
-  // calls get_device_slow if the tensor stores its device somewhere else
-  // (VariableImpl, SparseTensorImpl). This methods does a virtual dispatch
-  // that makes it 10-20ns slower than the special-cased CUDA Tensor case.
-  virtual int64_t get_device_slow() const {
-    AT_ERROR(
-      "get_device is not implemented for tensors with ",
-      toString(tensorTypeIdToBackend(type_id())),
-      " backend");
+  // See NOTE [c10::optional operator usage in CUDA]
+  // We probably don't want to expose this publicly until
+  // the note is addressed.
+  c10::optional<c10::Device> device_opt() const {
+    return device_opt_;
   }
 
  public:
@@ -891,7 +879,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   */
   DeviceType device_type() const {
     AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged
-    return storage_.device_type();
+    AT_ASSERT(device_opt_.has_value());
+    // See NOTE [c10::optional operator usage in CUDA]
+    return (*device_opt_).type();
   }
 
   /**
@@ -899,7 +889,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   * device).
*/ Device GetDevice() const { - return storage_.device(); + // See NOTE [c10::optional operator usage in CUDA] + return *device_opt_; } /** @@ -1125,6 +1116,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ storage_ = src.storage(); data_type_ = src.dtype(); + device_opt_ = src.device_opt(); storage_offset_ = src.storage_offset(); } @@ -1143,6 +1135,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { storage_.UniqueStorageShareExternalPointer( std::move(data_ptr), data_type, capacity); data_type_ = data_type; + device_opt_ = storage_.device(); storage_offset_ = 0; } else { int64_t numel = capacity / data_type.itemsize(); @@ -1154,6 +1147,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { /*allocator=*/nullptr, /*resizable=*/false); data_type_ = data_type; + device_opt_ = storage_.device(); storage_offset_ = 0; } } @@ -1184,6 +1178,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } } data_type_ = meta; + // NB: device is not changed // We can reuse the existing buffer if the current data does not have // a special destructor and the new data doesn't have a special @@ -1219,6 +1214,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } storage_.set_numel(numel_); AT_ASSERT(storage_offset_ == 0); // because we just reallocated + device_opt_ = storage_.device(); return storage_.data(); } } @@ -1264,6 +1260,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { AT_CHECK(allow_tensor_metadata_change(), "set_storage is not allowed on Tensor created from .data or .detach()"); storage_ = std::move(storage); data_type_ = storage_.dtype(); + device_opt_ = storage_.device(); } private: @@ -1401,6 +1398,17 @@ protected: // agree with the type meta in storage caffe2::TypeMeta data_type_; + // NOTE [c10::optional operator usage in CUDA] + // Our optional definition doesn't compile in .cu file if `value()` or + // `operator->` are used. Instead, we always use `operator*`. + // See https://github.com/pytorch/pytorch/issues/18496 for more info. + // If this is too burdensome to maintain, we can just + // manually implement this with an additional bool. + + // INVARIANT: When storage is non-null, this Device must + // agree with the type meta in storage. + c10::optional device_opt_; + // You get to have eight byte-size fields here, before you // should pack this into a bitfield. TensorTypeId type_id_; @@ -1475,12 +1483,13 @@ protected: // storage offset // numel // data type pointer +// (optional) device // autograd metadata pointer // PyObject pointer // miscellaneous bitfield // static_assert(sizeof(void*) != sizeof(int64_t) || // if 64-bit... - sizeof(TensorImpl) == sizeof(int64_t) * 26, + sizeof(TensorImpl) == sizeof(int64_t) * 27, "You changed the size of TensorImpl on 64-bit arch." "See Note [TensorImpl size constraints] on how to proceed."); diff --git a/c10/core/UndefinedTensorImpl.cpp b/c10/core/UndefinedTensorImpl.cpp index badc5ec..4721254 100644 --- a/c10/core/UndefinedTensorImpl.cpp +++ b/c10/core/UndefinedTensorImpl.cpp @@ -5,7 +5,7 @@ namespace c10 { // should this use the globalContext? Can it get a context passed in somehow? 
UndefinedTensorImpl::UndefinedTensorImpl() -: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is variable */ false) { +: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), c10::nullopt, /* is variable */ false) { } IntArrayRef UndefinedTensorImpl::sizes() const { diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 4747296..2a7e1fb 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, std::unique_ptr autograd_meta, bool requires_grad, Edge gradient_edge) - : TensorImpl(data.type_id(), data.dtype(), /*allocator=*/nullptr, /* is variable */ true), + : TensorImpl(data.type_id(), data.dtype(), data.device(), /* is variable */ true), data_(std::move(data)) { autograd_meta->grad_fn_ = std::move(gradient_edge.function); autograd_meta->requires_grad_ = false; @@ -103,10 +103,6 @@ int64_t Variable::Impl::storage_offset() const { return data_.storage_offset(); } -int64_t Variable::Impl::get_device_slow() const { - return data_.get_device(); -} - std::shared_ptr Variable::grad_accumulator() const { auto autograd_meta = get_autograd_meta(); if (autograd_meta->grad_fn_) { @@ -172,6 +168,7 @@ void Variable::Impl::set_data(const at::Tensor &new_data) { // Updates metadata data_type_ = new_data.type().typeMeta(); + device_opt_ = new_data.device(); type_id_ = new_data.dispatch_type().type_id(); is_variable_ = true; diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 6640819..18f6a22 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -439,9 +439,6 @@ struct TORCH_API Variable::Impl : public at::TensorImpl { /// The underlying data tensor for this Variable. /// This field will be removed once VariableImpl and TensorImpl are merged. at::Tensor data_; - - private: - int64_t get_device_slow() const override; }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index eda0242..b33bb1a 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -309,36 +309,6 @@ struct PreprocessGraph { size_t n_outputs; }; -// Sometimes we want to pass things that are not tensors. Instead of -// coming up with some "superclass" for tensor, which is annoying since -// 99% of values are at::Tensor, we instead we create a fake subclass of -// TensorImpl that can be subclassed to hold arbitrary things -// Note: this is currently unused but will probably be useful in the future, -// so we keep it around -struct ContainerTensor : public at::TensorImpl { - public: - ContainerTensor() - : TensorImpl( - at::UndefinedTensorId(), - caffe2::TypeMeta(), - nullptr, - /* is_variable */ false) {} - - ~ContainerTensor() override = default; - at::IntArrayRef sizes() const override { - throw std::runtime_error("sizes() on ContainerTensor"); - } - at::IntArrayRef strides() const override { - throw std::runtime_error("strides() on ContainerTensor"); - } - int64_t dim() const override { - throw std::runtime_error("dim() on ContainerTensor"); - } - const at::Storage& storage() const override { - throw std::runtime_error("storage() on ContainerTensor"); - } -}; - // We need some lists for inputs and outputs. To keep all the memory // contiguous we allocate a single vector and use offsets into the vector // which are stored in the ListHandle struct -- 2.7.4
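Usage sketch (editorial note, not part of the patch): the fragment below illustrates how the two remaining public TensorImpl constructors described in the summary are meant to be called after this change, one storage-backed (device derived from the storage) and one storage-less (device passed explicitly and cached in device_opt_). The float dtype, the device index, and the example() wrapper are illustrative assumptions, not code from the diff; the headers and TensorTypeIds are assumed to be the post-patch c10 ones.

    // Illustrative only; assumes post-patch c10 headers and registered TensorTypeIds.
    #include <c10/core/TensorImpl.h>

    void example(c10::Allocator* allocator) {
      using namespace c10;

      // Storage-backed tensor: the cached device comes from storage.device().
      Storage cpu_storage(caffe2::TypeMeta::Make<float>(), 0, allocator, /*resizable=*/true);
      auto cpu_impl = c10::make_intrusive<TensorImpl>(
          std::move(cpu_storage), CPUTensorId(), /*is_variable=*/false);

      // Storage-less tensor: the device must be supplied up front, so device()
      // no longer needs a virtual get_device_slow() fallback.
      auto xla_impl = c10::make_intrusive<TensorImpl>(
          XLATensorId(), caffe2::TypeMeta::Make<float>(),
          Device(DeviceType::XLA, 0), /*is_variable=*/false);
    }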