From c4bb09cc42069412fa9f4f1ff033ad4696c5e6db Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Sat, 30 Mar 2019 00:03:44 -0700 Subject: [PATCH] Add functor- and function-based kernel registration API (#18162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18162 - Adds the API to register a functor- and function-based kernel. - Change the experimental c10 ops to use this new API instead of the old one - Deletes the old APIs in KernelRegistration.h and OpSchemaRegistration.h Reviewed By: dzhulgakov Differential Revision: D14514239 fbshipit-source-id: 35b2f6e8f62964e54886450a6a5fac812ed20f26 --- aten/src/ATen/core/dispatch/DispatchTable.h | 5 +- aten/src/ATen/core/dispatch/Dispatcher.cpp | 4 +- aten/src/ATen/core/dispatch/Dispatcher.h | 6 +- aten/src/ATen/core/dispatch/KernelCache.h | 2 + aten/src/ATen/core/dispatch/KernelRegistration.h | 279 --------- aten/src/ATen/core/dispatch/OpSchemaRegistration.h | 46 -- aten/src/ATen/core/dispatch/README.md | 2 - aten/src/ATen/core/op_registration/base.h | 6 +- .../ATen/core/op_registration/kernel_function.h | 49 ++ .../core/op_registration/kernel_function_test.cpp | 539 ++++++++++++++++ .../src/ATen/core/op_registration/kernel_functor.h | 187 ++++++ .../core/op_registration/kernel_functor_test.cpp | 693 +++++++++++++++++++++ .../ATen/core/op_registration/kernel_stackbased.h | 25 +- .../ATen/core/op_registration/op_registration.h | 8 +- aten/src/ATen/core/op_registration/test_helpers.h | 10 +- c10/util/C++17.h | 2 +- c10/util/Metaprogramming.h | 10 + caffe2/operators/CMakeLists.txt | 17 - caffe2/operators/experimental/c10/cpu/add_cpu.cc | 30 +- .../experimental/c10/cpu/averaged_loss_cpu.cc | 88 +-- .../experimental/c10/cpu/batch_gather_cpu.cc | 29 +- .../experimental/c10/cpu/batch_matmul_cpu.cc | 497 ++++++++------- caffe2/operators/experimental/c10/cpu/cast_cpu.cc | 31 +- .../operators/experimental/c10/cpu/concat_cpu.cc | 34 +- .../experimental/c10/cpu/enforce_finite_cpu.cc | 29 +- .../experimental/c10/cpu/expand_dims_cpu.cc | 109 ++-- caffe2/operators/experimental/c10/cpu/fc_cpu.cc | 248 ++++---- .../operators/experimental/c10/cpu/filler_cpu.cc | 128 +++- .../operators/experimental/c10/cpu/flatten_cpu.cc | 31 +- caffe2/operators/experimental/c10/cpu/mul_cpu.cc | 32 +- caffe2/operators/experimental/c10/cpu/relu_cpu.cc | 28 +- .../operators/experimental/c10/cpu/sigmoid_cpu.cc | 30 +- .../cpu/sigmoid_cross_entropy_with_logits_cpu.cc | 34 +- .../experimental/c10/cpu/sparse_lengths_sum_cpu.cc | 31 +- .../experimental/c10/cpu/stop_gradient_cpu.cc | 30 +- caffe2/operators/experimental/c10/schemas/add.cc | 28 - caffe2/operators/experimental/c10/schemas/add.h | 11 - .../experimental/c10/schemas/averaged_loss.cc | 25 - .../experimental/c10/schemas/averaged_loss.h | 11 - .../experimental/c10/schemas/batch_gather.cc | 26 - .../experimental/c10/schemas/batch_gather.h | 11 - .../experimental/c10/schemas/batch_matmul.cc | 30 - .../experimental/c10/schemas/batch_matmul.h | 11 - caffe2/operators/experimental/c10/schemas/cast.cc | 29 - caffe2/operators/experimental/c10/schemas/cast.h | 11 - .../operators/experimental/c10/schemas/concat.cc | 29 - caffe2/operators/experimental/c10/schemas/concat.h | 11 - .../experimental/c10/schemas/enforce_finite.cc | 24 - .../experimental/c10/schemas/enforce_finite.h | 11 - .../experimental/c10/schemas/expand_dims.cc | 28 - .../experimental/c10/schemas/expand_dims.h | 11 - caffe2/operators/experimental/c10/schemas/fc.cc | 29 - caffe2/operators/experimental/c10/schemas/fc.h | 11 - .../operators/experimental/c10/schemas/filler.cc | 103 --- caffe2/operators/experimental/c10/schemas/filler.h | 15 - .../operators/experimental/c10/schemas/flatten.cc | 26 - .../operators/experimental/c10/schemas/flatten.h | 11 - caffe2/operators/experimental/c10/schemas/mul.cc | 29 - caffe2/operators/experimental/c10/schemas/mul.h | 11 - caffe2/operators/experimental/c10/schemas/relu.cc | 25 - caffe2/operators/experimental/c10/schemas/relu.h | 11 - .../operators/experimental/c10/schemas/sigmoid.cc | 25 - .../operators/experimental/c10/schemas/sigmoid.h | 11 - .../schemas/sigmoid_cross_entropy_with_logits.cc | 29 - .../schemas/sigmoid_cross_entropy_with_logits.h | 11 - .../experimental/c10/schemas/sparse_lengths_sum.cc | 27 - .../experimental/c10/schemas/sparse_lengths_sum.h | 11 - .../experimental/c10/schemas/stop_gradient.cc | 25 - .../experimental/c10/schemas/stop_gradient.h | 11 - 69 files changed, 2380 insertions(+), 1677 deletions(-) delete mode 100644 aten/src/ATen/core/dispatch/KernelRegistration.h delete mode 100644 aten/src/ATen/core/dispatch/OpSchemaRegistration.h create mode 100644 aten/src/ATen/core/op_registration/kernel_function.h create mode 100644 aten/src/ATen/core/op_registration/kernel_function_test.cpp create mode 100644 aten/src/ATen/core/op_registration/kernel_functor.h create mode 100644 aten/src/ATen/core/op_registration/kernel_functor_test.cpp delete mode 100644 caffe2/operators/experimental/c10/schemas/add.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/add.h delete mode 100644 caffe2/operators/experimental/c10/schemas/averaged_loss.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/averaged_loss.h delete mode 100644 caffe2/operators/experimental/c10/schemas/batch_gather.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/batch_gather.h delete mode 100644 caffe2/operators/experimental/c10/schemas/batch_matmul.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/batch_matmul.h delete mode 100644 caffe2/operators/experimental/c10/schemas/cast.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/cast.h delete mode 100644 caffe2/operators/experimental/c10/schemas/concat.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/concat.h delete mode 100644 caffe2/operators/experimental/c10/schemas/enforce_finite.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/enforce_finite.h delete mode 100644 caffe2/operators/experimental/c10/schemas/expand_dims.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/expand_dims.h delete mode 100644 caffe2/operators/experimental/c10/schemas/fc.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/fc.h delete mode 100644 caffe2/operators/experimental/c10/schemas/filler.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/filler.h delete mode 100644 caffe2/operators/experimental/c10/schemas/flatten.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/flatten.h delete mode 100644 caffe2/operators/experimental/c10/schemas/mul.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/mul.h delete mode 100644 caffe2/operators/experimental/c10/schemas/relu.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/relu.h delete mode 100644 caffe2/operators/experimental/c10/schemas/sigmoid.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/sigmoid.h delete mode 100644 caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.h delete mode 100644 caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.h delete mode 100644 caffe2/operators/experimental/c10/schemas/stop_gradient.cc delete mode 100644 caffe2/operators/experimental/c10/schemas/stop_gradient.h diff --git a/aten/src/ATen/core/dispatch/DispatchTable.h b/aten/src/ATen/core/dispatch/DispatchTable.h index 78b048b..15865cc 100644 --- a/aten/src/ATen/core/dispatch/DispatchTable.h +++ b/aten/src/ATen/core/dispatch/DispatchTable.h @@ -14,6 +14,7 @@ #include #include #include +#include namespace c10 { @@ -23,7 +24,7 @@ namespace c10 { * so we can create a new cache instance when a kernel is looked up * from the dispatch table. */ -using KernelCacheCreatorFunction = std::unique_ptr (); +using KernelCacheCreatorFunction = std::function ()>; /** * The dispatch table stores a pointer to a kernel function and a pointer * to a function initializing a cache for the kernel. If the kernel wants @@ -34,7 +35,7 @@ using KernelCacheCreatorFunction = std::unique_ptr (); */ struct DispatchTableEntry final { /*not-nullable*/ KernelFunction* kernel_func; - /*not-nullable*/ KernelCacheCreatorFunction* cache_creator_func; + /*not-nullable*/ KernelCacheCreatorFunction cache_creator_func; }; namespace detail { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 85a148f..8626fae 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -103,9 +103,9 @@ void Dispatcher::deregisterSchema(const OperatorHandle& op) { } } -void Dispatcher::registerKernel(const OperatorHandle& op, TensorTypeId dispatch_key, KernelFunction* kernel_func, KernelCacheCreatorFunction* cache_creator_func) { +void Dispatcher::registerKernel(const OperatorHandle& op, TensorTypeId dispatch_key, KernelFunction* kernel_func, KernelCacheCreatorFunction cache_creator_func) { // note: this doesn't need the mutex because write operations on the list keep iterators intact. - op.operatorDefIterator_->dispatchTable.registerKernel(std::move(dispatch_key), DispatchTableEntry{kernel_func, cache_creator_func}); + op.operatorDefIterator_->dispatchTable.registerKernel(std::move(dispatch_key), DispatchTableEntry{kernel_func, std::move(cache_creator_func)}); } void Dispatcher::deregisterKernel(const OperatorHandle& op, TensorTypeId dispatch_key) { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 35b32a3..08645a2 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -39,8 +39,8 @@ public: } private: - explicit OpKernel(KernelFunction* kernel, KernelCacheCreatorFunction* cache_creator) - : kernel_(kernel), cache_((*cache_creator)()) {} + explicit OpKernel(KernelFunction* kernel, const KernelCacheCreatorFunction& cache_creator) + : kernel_(kernel), cache_(cache_creator()) {} friend class Dispatcher; KernelFunction* kernel_; @@ -126,7 +126,7 @@ public: /** * Register an operator to the dispatch table for an operator. */ - void registerKernel(const OperatorHandle& op, TensorTypeId dispatch_key, KernelFunction* kernel_func, KernelCacheCreatorFunction* cache_creator_func); + void registerKernel(const OperatorHandle& op, TensorTypeId dispatch_key, KernelFunction* kernel_func, KernelCacheCreatorFunction cache_creator_func); /** * Remove an operator from the dispatch table for an operator. diff --git a/aten/src/ATen/core/dispatch/KernelCache.h b/aten/src/ATen/core/dispatch/KernelCache.h index e879b9d..3ff270b 100644 --- a/aten/src/ATen/core/dispatch/KernelCache.h +++ b/aten/src/ATen/core/dispatch/KernelCache.h @@ -1,5 +1,7 @@ #pragma once +#include + namespace c10 { /** diff --git a/aten/src/ATen/core/dispatch/KernelRegistration.h b/aten/src/ATen/core/dispatch/KernelRegistration.h deleted file mode 100644 index 4fef909..0000000 --- a/aten/src/ATen/core/dispatch/KernelRegistration.h +++ /dev/null @@ -1,279 +0,0 @@ -#pragma once - -#include -#include - -/** - * To register your own kernel for an operator, do in one (!) cpp file: - * C10_REGISTER_KERNEL(OperatorHandle) - * .kernel() - * .dispatchKey(dispatch_key); - * - * Example: - * - * Tensor my_kernel_cpu(Tensor in) {...} - * - * C10_REGISTER_KERNEL(MyOpSchema) - * .kernel() - * .dispatchKey(CPUTensorId()); - */ - -namespace c10 { - -// TODO Test different order for builder -// TODO Test no dispatch key defined - -/** - * Class which, on construction, registers an operator in the dispatch table. The intent is that - * this class is constructed at static initialization time so that operators automatically get - * registered when a dlopen() occurs. - * - * You shouldn't call this directly; instead, use the C10_REGISTER_KERNEL macros. - */ -class KernelRegistrar final { -public: - using OpHandleGetter = const OperatorHandle& (); - - /** - * @param op The operator to register the kernel for - * @param dispatch_key The dispatch key to register the function to - * @param kernel The concrete function implementation to register - * @param cache_creator A function initializing the cache for the kernel - */ - explicit KernelRegistrar(OpHandleGetter *op, TensorTypeId dispatch_key, KernelFunction* kernel, KernelCacheCreatorFunction* cache_creator) - : op_(std::move(op)), dispatch_key_(std::move(dispatch_key)), owns_registration_(true) { - Dispatcher::singleton().registerKernel(op_(), dispatch_key_, kernel, cache_creator); - } - - KernelRegistrar(KernelRegistrar&& rhs) - : op_(std::move(rhs.op_)), dispatch_key_(std::move(rhs.dispatch_key_)), owns_registration_(true) { - rhs.owns_registration_ = false; - } - - // not needed for now - KernelRegistrar& operator=(KernelRegistrar&& rhs) = delete; - - ~KernelRegistrar() { - if (owns_registration_) { - Dispatcher::singleton().deregisterKernel(op_(), dispatch_key_); - } - } - -private: - OpHandleGetter *op_; - const TensorTypeId dispatch_key_; - bool owns_registration_; - - C10_DISABLE_COPY_AND_ASSIGN(KernelRegistrar); -}; - -namespace detail { -// ivalue_to_arg_type: Take an IValue that is an argument to a kernel and -// cast it to the type that should be passed to the kernel function. -// Examples: If the IValue contains a plain type like an int, return that. -// If the IValue contains an IntList, return it as ArrayRef. -template -struct ivalue_to_arg_type { - static T call(const IValue& v) { - return std::move(v).to(); - } -}; -template -struct ivalue_to_arg_type> { - static ArrayRef call(const IValue& v) { - return v.to>>()->elements(); - } -}; - -// call_with_ivalue_args: Take a function pointer and an ArrayRef -// containing the arguments to call the function pointer with, and call it. -// The extra_args are appended as additional arguments at the end of the function call. -// Example: -// int myfunc(int a, ArrayRef b, string c); -// int main() { -// std::vector ivalue_args = {IValue(2), IntList::create(3, 4)}; -// call_with_ivalue_args(ivalue_args, "extra_arg"); -// } -template -typename guts::function_traits::return_type call_with_ivalue_args_(ArrayRef ivalue_args, guts::index_sequence, ExtraArgs&&... extra_args) { - using IValueArgTypes = typename guts::function_traits::parameter_types; - return (*func)(ivalue_to_arg_type>>>::call(ivalue_args[ivalue_arg_indices])..., std::forward(extra_args)...); -} - -template -typename guts::function_traits::return_type call_with_ivalue_args(ArrayRef ivalue_args, ExtraArgs&&... extra_args) { - constexpr size_t num_ivalue_args = guts::function_traits::number_of_parameters - sizeof...(ExtraArgs); - AT_ASSERTM(num_ivalue_args == ivalue_args.size(), "Wrong number of ivalue arguments"); - return call_with_ivalue_args_(ivalue_args, guts::make_index_sequence(), std::forward(extra_args)...); -} - -template -struct push_outputs final { - static void call(OutputType&& output, Stack* stack) { - push_outputs>(std::tuple(std::move(output)), stack); - } -}; -template -struct push_outputs> final { - static void call(std::tuple&& output, Stack* stack) { - for (size_t i = 0; i < sizeof...(OutputTypes); ++i) { - torch::jit::push(return_type_to_ivalue(std::move(output))); - } - } -}; - -// SFINAE over (1) does the operator kernel have a cache and (2) does it return a value or void -template struct wrap_kernel {}; -// SFINAE version for kernels with output and with cache -template -struct wrap_kernel::value && !std::is_same::return_type>::value>> final { - static typename guts::function_traits::return_type call(Stack* stack, KernelCache* cache) { - constexpr size_t num_inputs = guts::function_traits::number_of_parameters - 1; // -1 because it takes the kernel cache as last argument - auto output = call_with_ivalue_args(torch::jit::last(*stack, num_inputs), static_cast(cache)); - push_outputs::return_type>(std::move(output), stack); - } -}; -// SFINAE version for kernels with output and without a cache -template -struct wrap_kernel::value && !std::is_same::return_type>::value>> final { - static typename guts::function_traits::return_type call(Stack* stack, c10::KernelCache* /*cache*/) { - constexpr size_t num_inputs = guts::function_traits::number_of_parameters; - auto output = call_with_ivalue_args(torch::jit::last(*stack, num_inputs)); - push_outputs::return_type>(std::move(output), stack); - } -}; -// SFINAE version for kernels without output and with a cache -template -struct wrap_kernel::value && std::is_same::return_type>::value>> final { - static typename guts::function_traits::return_type call(Stack* stack, c10::KernelCache* cache) { - constexpr size_t num_inputs = guts::function_traits::number_of_parameters - 1; // -1 because it takes the kernel cache as last argument - call_with_ivalue_args(torch::jit::last(*stack, num_inputs), static_cast(cache)); - } -}; -// SFINAE version for kernels without output and without a cache -template -struct wrap_kernel::value && std::is_same::return_type>::value>> final { - static typename guts::function_traits::return_type call(Stack* stack, c10::KernelCache* /*cache*/) { - constexpr size_t num_inputs = guts::function_traits::number_of_parameters; - call_with_ivalue_args(torch::jit::last(*stack, num_inputs)); - } -}; -} - -/** - * Helper class for building a KernelRegistrar. This permits "keyword-argument" like syntax - * when performing operator registration, e.g., as in: - * - * C10_REGISTER_KERNEL(::ops::add_notensor) - * .kernel(&add_notensor_op) - * .dispatchKey("bla"); - * - * Expanded, this macro invocation looks like: - * - * static KernelRegistrar<::ops::add_notensor> _anon0 = - * KernelRegistrationBuilder<::ops::add_notensor, false, false>() - * .kernel(&add_notensor_op) - * .dispatchKey("bla"); - * - * The resulting full expression is implicitly convertible to a KernelRegistrar. - */ -template -class KernelRegistrationBuilder final { -private: - static constexpr uint64_t DISPATCH_KEY_PRESENT = 0x01 << 0; - static constexpr uint64_t KERNEL_PRESENT = 0x01 << 1; - static constexpr uint64_t CACHE_PRESENT = 0x01 << 2; - - using OpHandleGetter = KernelRegistrar::OpHandleGetter; - - static std::unique_ptr defaultCacheCreator() { - return nullptr; - } - - template - static std::unique_ptr cacheCreator() { - static_assert(std::is_default_constructible::value, "Cache class must be default constructible"); - return guts::make_unique(); - } - - OpHandleGetter *op_; - c10::optional dispatch_key_; - KernelFunction* kernel_; - KernelCacheCreatorFunction* cache_creator_; - - public: - constexpr explicit KernelRegistrationBuilder(OpHandleGetter *op) - : KernelRegistrationBuilder(std::move(op), c10::nullopt, nullptr, &defaultCacheCreator) {} - - constexpr explicit KernelRegistrationBuilder( - OpHandleGetter *op, - c10::optional dispatch_key, - KernelFunction* kernel, - KernelCacheCreatorFunction* cache_creator) - : op_(std::move(op)), dispatch_key_(std::move(dispatch_key)), kernel_(kernel), cache_creator_(cache_creator) {} - - /** - * Implicit coercion to KernelRegistrar that finalizes the builder and - * creates the object. - * @return Produced KernelRegistrar - */ - operator KernelRegistrar() && { - static_assert(FieldsPresentFlags & KERNEL_PRESENT, "Forgot to call .kernel() in kernel registration"); - static_assert(FieldsPresentFlags & DISPATCH_KEY_PRESENT, "Forgot to call .dispatchKey() in kernel registration"); - return KernelRegistrar(op_, std::move(*dispatch_key_), kernel_, cache_creator_); - } - - /** - * Specify the dispatch key for this dispatch registration - * @param dispatch_key dispatch key to register the function to - * @return "this" for method chaining - */ - AT_CPP14_CONSTEXPR KernelRegistrationBuilder dispatchKey(TensorTypeId dispatch_key) && { - static_assert(!(FieldsPresentFlags & DISPATCH_KEY_PRESENT), "Tried to define kernel twice in same op registration"); - return KernelRegistrationBuilder(std::move(op_), std::move(dispatch_key), kernel_, cache_creator_); - } - - /** - * Specify the concrete function implementation for this dispatch registration - * @param kernel concrete function implementation to be registered - * @return "this" for method chaining - */ - template - AT_CPP14_CONSTEXPR KernelRegistrationBuilder kernel() && { - static_assert(!(FieldsPresentFlags & KERNEL_PRESENT), "Tried to define kernel twice in same op registration"); - // TODO Better error message when kernel function mismatches, one common mismatch is missing cache parameter or cache parameter present while not expected. - return KernelRegistrationBuilder(std::move(op_), std::move(dispatch_key_), kernel_func, cache_creator_); - } - - /** - * Specify the concrete function implementation for this dispatch registration - * @param kernel concrete function implementation to be registered - * @return "this" for method chaining - */ - template - AT_CPP14_CONSTEXPR KernelRegistrationBuilder kernel() && { - // TODO Better error message if FuncType is not a func type - return std::move(*this).template kernel<&detail::wrap_kernel::call>(); - } - - /** - * Specify the dispatch key for this dispatch registration - * @param dispatch_key dispatch key to register the function to - * @return "this" for method chaining - */ - template - AT_CPP14_CONSTEXPR KernelRegistrationBuilder withCache() && { - static_assert(!(FieldsPresentFlags & CACHE_PRESENT), "Tried to define cache twice in same op registration"); - static_assert(std::is_base_of::value, "Cache must inherit from c10::KernelCache"); - - static_assert(!(FieldsPresentFlags & KERNEL_PRESENT), "Cannot set the cache after the kernel function is already set. Please call .withCache() first and .kernel() later in the chain."); - - return KernelRegistrationBuilder(std::move(op_), std::move(dispatch_key_), kernel_, &cacheCreator); - } -}; - -} // namespace c10 - -// NB: Semicolon after applying this macro is MANDATORY -#define C10_REGISTER_KERNEL(OperatorHandle) \ - static KernelRegistrar MACRO_CONCAT(__kernelRegistrationBuilder_, __COUNTER__) = KernelRegistrationBuilder(OperatorHandle) diff --git a/aten/src/ATen/core/dispatch/OpSchemaRegistration.h b/aten/src/ATen/core/dispatch/OpSchemaRegistration.h deleted file mode 100644 index 2e07c3a..0000000 --- a/aten/src/ATen/core/dispatch/OpSchemaRegistration.h +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include - -namespace c10 { -namespace detail { -class OpSchemaRegistrar final { -public: - explicit OpSchemaRegistrar(FunctionSchema schema) - : opHandle_(c10::Dispatcher::singleton().registerSchema(std::move(schema))) {} - - ~OpSchemaRegistrar() { - c10::Dispatcher::singleton().deregisterSchema(opHandle_); - } - - const c10::OperatorHandle& opHandle() const { - return opHandle_; - } - -private: - c10::OperatorHandle opHandle_; -}; -} // namespace detail -} // namespace c10 - -/** - * Macro for defining an operator schema. Every operator schema must - * invoke C10_DECLARE_OP_SCHEMA in a header and C10_DEFINE_OP_SCHEMA in one (!) - * cpp file. Internally, this arranges for the dispatch table for - * the operator to be created. - */ -#define C10_DECLARE_OP_SCHEMA(Name) \ - CAFFE2_API const c10::OperatorHandle& Name(); \ - -#define C10_DEFINE_OP_SCHEMA(Name, Schema) \ - C10_EXPORT const c10::OperatorHandle& Name() { \ - /* must be meyers singleton to make sure this is registered before any */ \ - /* kernels referencing it are registered. */ \ - static ::c10::detail::OpSchemaRegistrar registrar(Schema); \ - return registrar.opHandle(); \ - } \ - namespace { \ - /* to make sure the schema is registered even if it is not referenced by */ \ - /* a kernel registration, call it in a global object. */ \ - const c10::OperatorHandle& _c10_op_registration_instance_##Name = Name(); \ - } diff --git a/aten/src/ATen/core/dispatch/README.md b/aten/src/ATen/core/dispatch/README.md index e29c93c..d2f935e 100644 --- a/aten/src/ATen/core/dispatch/README.md +++ b/aten/src/ATen/core/dispatch/README.md @@ -8,5 +8,3 @@ This folder contains the following files: - DispatchTable.h: Implementation of the actual dispatch mechanism. Hash table with kernels, lookup, ... - KernelCache.h: An interface operator kernels can use to inherit from if they need to keep around a cache between invocations - KernelFunction.h: The core interface (i.e. function pointer) for calling a kernel -- OpSchemaRegistration.h (deprecated): The mechanisms to register new operators with the c10 dispatcher -- KernelRegistration.h (deprecated): The mechanisms to register kernels with the c10 dispatcher diff --git a/aten/src/ATen/core/op_registration/base.h b/aten/src/ATen/core/op_registration/base.h index 8a9430a..3c4ea05 100644 --- a/aten/src/ATen/core/op_registration/base.h +++ b/aten/src/ATen/core/op_registration/base.h @@ -22,9 +22,9 @@ namespace detail { // You shouldn't call this directly; instead, use the RegisterOperators class. class OperatorRegistrar final { public: - explicit OperatorRegistrar(FunctionSchema&& schema, TensorTypeId dispatch_key, KernelFunction* kernel, KernelCacheCreatorFunction* cache_creator) + explicit OperatorRegistrar(FunctionSchema&& schema, TensorTypeId dispatch_key, KernelFunction* kernel, KernelCacheCreatorFunction&& cache_creator) : op_(Dispatcher::singleton().registerSchema(std::move(schema))), dispatch_key_(std::move(dispatch_key)), owns_registration_(true) { - Dispatcher::singleton().registerKernel(op_, dispatch_key_, kernel, cache_creator); + Dispatcher::singleton().registerKernel(op_, dispatch_key_, kernel, std::move(cache_creator)); } OperatorRegistrar(OperatorRegistrar&& rhs) noexcept @@ -55,7 +55,7 @@ namespace detail { struct KernelRegistrationConfig final { TensorTypeId dispatch_key; KernelFunction* kernel_func = nullptr; - KernelCacheCreatorFunction* cache_creator_func = nullptr; + KernelCacheCreatorFunction cache_creator_func = nullptr; }; // is_registration_config_parameter is a concept that returns true_type iff its argument is diff --git a/aten/src/ATen/core/op_registration/kernel_function.h b/aten/src/ATen/core/op_registration/kernel_function.h new file mode 100644 index 0000000..ba14d5a --- /dev/null +++ b/aten/src/ATen/core/op_registration/kernel_function.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +namespace c10 { +namespace detail { + // WrapKernelFunction: Wraps a compile time function pointer into a kernel functor. + // Since it is a compile time function pointer, many compilers can inline it + // into the wrapper and you don't get any performance overhead for wrapping. + template class WrapKernelFunction_ {}; + template + class WrapKernelFunction_> final : public c10::OperatorKernel { + public: + auto operator()(Parameters&&... args) -> decltype((*kernel_func)(std::forward(args)...)) { + return (*kernel_func)(std::forward(args)...); + } + }; + template::value>> + struct WrapKernelFunction final { + using type = WrapKernelFunction_< + FuncType, + kernel_func, + typename guts::function_traits::return_type, + typename guts::function_traits::parameter_types + >; + }; +} + +/** + * Use this to register an operator whose kernel is implemented by a function: + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", + * > c10::kernel(), + * > c10::dispatchKey(CPUTensorId())); + */ +template +inline constexpr auto kernel() -> +// enable_if: only enable it if FuncType is actually a function, but not a stack based KernelFunction. +guts::enable_if_t::value && !std::is_same::value, +decltype(kernel::type>())> { + return kernel::type>(); +} + +} diff --git a/aten/src/ATen/core/op_registration/kernel_function_test.cpp b/aten/src/ATen/core/op_registration/kernel_function_test.cpp new file mode 100644 index 0000000..02231d3 --- /dev/null +++ b/aten/src/ATen/core/op_registration/kernel_function_test.cpp @@ -0,0 +1,539 @@ +#include +#include + +#include +#include + +using c10::RegisterOperators; +using c10::FunctionSchema; +using c10::Argument; +using c10::IntType; +using c10::ListType; +using c10::kernel; +using c10::dispatchKey; +using c10::TensorTypeId; +using c10::KernelCache; +using c10::Stack; +using c10::guts::make_unique; +using c10::ivalue::TensorList; +using c10::ivalue::IntList; +using c10::intrusive_ptr; +using c10::ArrayRef; +using std::unique_ptr; +using at::Tensor; + +namespace { + +C10_DECLARE_TENSOR_TYPE(TensorType1); +C10_DEFINE_TENSOR_TYPE(TensorType1); +C10_DECLARE_TENSOR_TYPE(TensorType2); +C10_DEFINE_TENSOR_TYPE(TensorType2); + +void errorKernel(const Tensor&) { + EXPECT_TRUE(false); // this kernel should never be called +} + +FunctionSchema errorOpSchema( + "_test::error", + "", + (std::vector{Argument("dummy")}), + (std::vector{})); + +int incrementKernel(const Tensor& tensor, int input) { + return input + 1; +} + +int decrementKernel(const Tensor& tensor, int input) { + return input - 1; +} + +FunctionSchema opSchema( + "_test::my_op", + "", + (std::vector{Argument("dummy"), + Argument("input", IntType::get())}), + (std::vector{Argument("output", IntType::get())})); + +void expectCallsIncrement(TensorTypeId type_id) { + // assert that schema and cpu kernel are present + auto op = c10::Dispatcher::singleton().findSchema("_test::my_op", ""); + ASSERT_TRUE(op.has_value()); + auto result = callOp(*op, dummyTensor(type_id), 5); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(6, result[0].toInt()); +} + +void expectCallsDecrement(TensorTypeId type_id) { + // assert that schema and cpu kernel are present + auto op = c10::Dispatcher::singleton().findSchema("_test::my_op", ""); + ASSERT_TRUE(op.has_value()); + auto result = callOp(*op, dummyTensor(type_id), 5); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(4, result[0].toInt()); +} + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType1())); + expectCallsIncrement(TensorType1()); +} + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { + auto registrar = RegisterOperators() + .op(opSchema, kernel(), dispatchKey(TensorType1())) + .op(opSchema, kernel(), dispatchKey(TensorType2())) + .op(errorOpSchema, kernel(), dispatchKey(TensorType1())) + .op(errorOpSchema, kernel(), dispatchKey(TensorType2())); + expectCallsIncrement(TensorType1()); +} + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { + auto registrar1 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op(errorOpSchema, kernel(), dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op(errorOpSchema, kernel(), dispatchKey(TensorType2())); + expectCallsIncrement(TensorType1()); +} + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { + { + auto registrar1 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType1())); + { + auto registrar2 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType2())); + + // assert that schema and cpu kernel are present + expectCallsIncrement(TensorType1()); + expectCallsDecrement(TensorType2()); + } + + // now registrar2 is destructed. Assert that schema is still present but cpu kernel is not + expectCallsIncrement(TensorType1()); + expectDoesntFindKernel("_test::my_op", TensorType2()); + } + + // now both registrars are destructed. Assert that the whole schema is gone + expectDoesntFindOperator("_test::my_op"); +} + +bool was_called = false; + +void kernelWithoutOutput(const Tensor&) { + was_called = true; +} + +FunctionSchema opWithoutOutputSchema( + "_test::no_return", + "", + (std::vector{Argument("dummy")}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators().op(opWithoutOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::no_return", ""); + ASSERT_TRUE(op.has_value()); + was_called = false; + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_TRUE(was_called); + EXPECT_EQ(0, result.size()); +} + +std::tuple<> kernelWithZeroOutputs(const Tensor&) { + was_called = true; + return std::make_tuple(); +} + +FunctionSchema opWithZeroOutputsSchema( + "_test::zero_outputs", + "", + (std::vector{Argument("dummy")}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators().op(opWithZeroOutputsSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); + ASSERT_TRUE(op.has_value()); + was_called = false; + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_TRUE(was_called); + EXPECT_EQ(0, result.size()); +} + +int kernelWithIntOutput(Tensor, int a, int b) { + return a + b; +} + +FunctionSchema opWithIntOutputSchema( + "_test::int_output", + "", + (std::vector{Argument("dummy"), + Argument("a", IntType::get()), + Argument("b", IntType::get())}), + (std::vector{Argument("sum", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1()), 3, 6); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(9, result[0].toInt()); +} + +Tensor kernelWithTensorOutput(const Tensor& input) { + return input; +} + +FunctionSchema opWithTensorOutput( + "_test::returning_tensor", + "", + (std::vector{Argument("input")}), + (std::vector{Argument("output")})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType1(), result[0].toTensor().type_id()); + + result = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); +} + +std::vector kernelWithTensorListOutput(const Tensor& input1, const Tensor& input2, const Tensor& input3) { + return {input1, input2, input3}; +} + +FunctionSchema opWithTensorListOutputSchema( + "_test::list_output", + "", + (std::vector{Argument("input1"), + Argument("input2"), + Argument("input3")}), + (std::vector{Argument("output", ListType::ofTensors())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorListOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1()), dummyTensor(TensorType2()), dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(3, result[0].toTensorListRef().size()); + EXPECT_EQ(TensorType1(), result[0].toTensorListRef()[0].type_id()); + EXPECT_EQ(TensorType2(), result[0].toTensorListRef()[1].type_id()); + EXPECT_EQ(TensorType1(), result[0].toTensorListRef()[2].type_id()); +} + +std::vector kernelWithIntListOutput(const Tensor&, int input1, int input2, int input3) { + return {input1, input2, input3}; +} + +FunctionSchema opWithIntListOutputSchema( + "_test::list_output", + "", + (std::vector{Argument("dummy"), + Argument("input1", IntType::get()), + Argument("input2", IntType::get()), + Argument("input3", IntType::get())}), + (std::vector{Argument("output", ListType::ofInts())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntListOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1()), 2, 4, 6); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(3, result[0].toIntListRef().size()); + EXPECT_EQ(2, result[0].toIntListRef()[0]); + EXPECT_EQ(4, result[0].toIntListRef()[1]); + EXPECT_EQ(6, result[0].toIntListRef()[2]); +} + +std::tuple> kernelWithMultipleOutputs(Tensor) { + return std::tuple>( + dummyTensor(TensorType2()), 5, {dummyTensor(TensorType1()), dummyTensor(TensorType2())} + ); +} + +FunctionSchema opWithMultipleOutputsSchema( + "_test::multiple_outputs", + "", + (std::vector{Argument("dummy")}), + (std::vector{Argument("output1"), + Argument("output2", IntType::get()), + Argument("output3", ListType::ofTensors())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithMultipleOutputsSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(3, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); + EXPECT_EQ(5, result[1].toInt()); + EXPECT_EQ(2, result[2].toTensorListRef().size()); + EXPECT_EQ(TensorType1(), result[2].toTensorListRef()[0].type_id()); + EXPECT_EQ(TensorType2(), result[2].toTensorListRef()[1].type_id()); +} + +Tensor kernelWithTensorInputByReferenceWithOutput(const Tensor& input1) { + return input1; +} + +Tensor kernelWithTensorInputByValueWithOutput(Tensor input1) { + return input1; +} + +FunctionSchema opWithTensorInputWithOutput( + "_test::tensor_input", + "", + (std::vector{Argument("input")}), + (std::vector{Argument("output")})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType1(), result[0].toTensor().type_id()); + + result = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); +} + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType1(), result[0].toTensor().type_id()); + + result = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); +} + +Tensor captured_input; + +void kernelWithTensorInputByReferenceWithoutOutput(const Tensor& input1) { + captured_input = input1; +} + +void kernelWithTensorInputByValueWithoutOutput(Tensor input1) { + captured_input = input1; +} + +FunctionSchema opWithTensorInputWithoutOutput( + "_test::tensor_input", + "", + (std::vector{Argument("input")}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType1(), captured_input.type_id()); + + outputs = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType2(), captured_input.type_id()); +} + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType1(), captured_input.type_id()); + + outputs = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType2(), captured_input.type_id()); +} + +int captured_int_input = 0; + +void kernelWithIntInputWithoutOutput(Tensor, int input1) { + captured_int_input = input1; +} + +FunctionSchema opWithIntInputWithoutOutput( + "_test::int_input", + "", + (std::vector{Argument("dummy"), + Argument("input", IntType::get())}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntInputWithoutOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); + ASSERT_TRUE(op.has_value()); + + captured_int_input = 0; + auto outputs = callOp(*op, dummyTensor(TensorType1()), 3); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(3, captured_int_input); +} + +int kernelWithIntInputWithOutput(Tensor, int input1) { + return input1 + 1; +} + +FunctionSchema opWithIntInputWithOutput( + "_test::int_input", + "", + (std::vector{Argument("dummy"), + Argument("input", IntType::get())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntInputWithOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1()), 3); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(4, outputs[0].toInt()); +} + +int captured_input_list_size = 0; + +void kernelWithIntListInputWithoutOutput(Tensor, ArrayRef input1) { + captured_input_list_size = input1.size(); +} + +FunctionSchema opWithIntListInputWithoutOutput( + "_test::int_list_input", + "", + (std::vector{Argument("dummy"), + Argument("input", ListType::ofInts())}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntListInputWithoutOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); + ASSERT_TRUE(op.has_value()); + + captured_input_list_size = 0; + auto outputs = callOp(*op, dummyTensor(TensorType1()), IntList::create({2, 4, 6})); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(3, captured_input_list_size); +} + +int kernelWithIntListInputWithOutput(Tensor, ArrayRef input1) { + return input1.size(); +} + +FunctionSchema opWithIntListInputWithOutput( + "_test::int_list_input", + "", + (std::vector{Argument("dummy"), + Argument("input", ListType::ofInts())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntListInputWithOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1()), IntList::create({2, 4, 6})); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(3, outputs[0].toInt()); +} + +void kernelWithTensorListInputWithoutOutput(ArrayRef input1) { + captured_input_list_size = input1.size(); +} + +FunctionSchema opWithTensorListInputWithoutOutput( + "_test::tensor_list_input", + "", + (std::vector{Argument("input", ListType::ofTensors())}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorListInputWithoutOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); + ASSERT_TRUE(op.has_value()); + + captured_input_list_size = 0; + auto outputs = callOp(*op, TensorList::create({dummyTensor(TensorType1()), dummyTensor(TensorType1())})); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(2, captured_input_list_size); +} + +int kernelWithTensorListInputWithOutput(ArrayRef input1) { + return input1.size(); +} + +FunctionSchema opWithTensorListInputWithOutput( + "_test::tensor_list_input", + "", + (std::vector{Argument("input", ListType::ofTensors())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorListInputWithOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, TensorList::create({dummyTensor(TensorType1()), dummyTensor(TensorType1())})); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(2, outputs[0].toInt()); +} + +} diff --git a/aten/src/ATen/core/op_registration/kernel_functor.h b/aten/src/ATen/core/op_registration/kernel_functor.h new file mode 100644 index 0000000..19014a9 --- /dev/null +++ b/aten/src/ATen/core/op_registration/kernel_functor.h @@ -0,0 +1,187 @@ +#pragma once + +#include + +namespace c10 { +/** + * Inherit from OperatorKernel to implement a c10 kernel. + * + * Example: + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * + * The kernel class is allowed to have members to cache things between calls + * but it is not allowed to change behavior based on the cache. + * The cache is purely a performance optimization and the kernel must + * return the same outputs regardless of what's in the cache. + * + * See below for how to register this kernel with PyTorch. + */ +class OperatorKernel : public KernelCache {}; + +namespace detail { + // ivalue_to_arg_type: Take an IValue that is an argument to a kernel and + // cast it to the type that should be passed to the kernel function. + // Examples: If the IValue contains a plain type like an int, return that. + // If the IValue contains an IntList, return it as ArrayRef. + template + struct ivalue_to_arg_type { + static T call(const IValue& v) { + return std::move(v).to(); + } + }; + template + struct ivalue_to_arg_type> { + static ArrayRef call(const IValue& v) { + return v.to>>()->elements(); + } + }; + + template + IValue return_type_to_ivalue(T&& t) { + return IValue(std::forward(t)); + } + + template + typename guts::infer_function_traits_t::return_type call_functor_with_ivalue_args_(Functor* functor, ArrayRef ivalue_args, guts::index_sequence) { + using IValueArgTypes = typename guts::infer_function_traits_t::parameter_types; + return (*functor)(ivalue_to_arg_type>>>::call(ivalue_args[ivalue_arg_indices])...); + } + + template + typename guts::infer_function_traits_t::return_type call_functor_with_ivalue_args(Functor* functor, ArrayRef ivalue_args) { + constexpr size_t num_ivalue_args = guts::infer_function_traits_t::number_of_parameters; + AT_ASSERTM(num_ivalue_args == ivalue_args.size(), "Wrong number of ivalue arguments"); + return call_functor_with_ivalue_args_(functor, ivalue_args, guts::make_index_sequence()); + } + + template + struct push_outputs final { + static void call(OutputType&& output, Stack* stack) { + torch::jit::push(*stack, return_type_to_ivalue(std::move(output))); + } + }; + template + struct push_outputs> final { + static void call(std::tuple&& output, Stack* stack) { + call_(std::move(output), stack, guts::make_index_sequence()); + } + + private: + template + static void call_(std::tuple&& output, Stack* stack, guts::index_sequence) { + (void)(stack); // silence compiler warning of weird compilers somehow thinking this parameter is unused. + // iterate over all outputs and push them + (void)std::initializer_list{( + torch::jit::push(*stack, return_type_to_ivalue(std::move(std::get(output)))) + , 0)...}; + } + }; + + template struct wrap_kernel_functor final {}; + + // SFINAE version for kernels that return an output + template + struct wrap_kernel_functor::return_type>::value>> final { + static void call(Stack* stack, KernelCache* cache) { + static_assert(std::is_base_of::value, "Kernel functor must inherit from c10::OperatorKernel"); + + constexpr size_t num_inputs = guts::infer_function_traits_t::number_of_parameters; + KernelFunctor* functor = static_cast(cache); + auto output = call_functor_with_ivalue_args(functor, torch::jit::last(*stack, num_inputs)); + torch::jit::pop(*stack, num_inputs); + push_outputs::return_type>::call(std::move(output), stack); + } + }; + + // SFINAE version for kernels that don't return an output + template + struct wrap_kernel_functor::return_type>::value>> final { + static void call(Stack* stack, KernelCache* cache) { + static_assert(std::is_base_of::value, "Kernel functor must inherit from c10::OperatorKernel."); + + constexpr size_t num_inputs = guts::infer_function_traits_t::number_of_parameters; + KernelFunctor* functor = static_cast(cache); + call_functor_with_ivalue_args(functor, torch::jit::last(*stack, num_inputs)); + torch::jit::pop(*stack, num_inputs); + } + }; + + template + class KernelFactory final { + public: + static_assert(std::is_base_of::value, "Kernel functor must inherit from c10::OperatorKernel."); + static_assert(std::is_constructible::value, "Wrong argument types for constructor of kernel functor."); + + explicit constexpr KernelFactory(Args... args) + : constructor_parameters_(std::move(args)...) {} + + std::unique_ptr operator()() const { + return guts::apply( + [] (const Args&... params) {return guts::make_unique(params...); }, + constructor_parameters_); + } + + private: + std::tuple constructor_parameters_; + }; +} + +/** + * Use this to register an operator whose kernel is implemented as a functor + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", + * > c10::kernel(), + * > c10::dispatchKey(CPUTensorId())); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", + * > c10::kernel("some_configuration", 3, true), + * > c10::dispatchKey(CPUTensorId())); + */ +template +inline constexpr auto kernel(ConstructorParameters&&... constructorParameters) +// enable_if: only enable it if KernelFunctor is actually a functor and inherits from c10::OperatorKernel +-> guts::enable_if_t< +guts::is_functor::value && std::is_base_of::value, +decltype(kernel( + &detail::wrap_kernel_functor::call, + detail::KernelFactory...>(std::forward(constructorParameters)...) +))> { + static_assert(std::is_constructible::value, "KernelFunctor cannot be constructed with the given arguments"); + + return kernel( + &detail::wrap_kernel_functor::call, + detail::KernelFactory...>(std::forward(constructorParameters)...) + ); +} + +} diff --git a/aten/src/ATen/core/op_registration/kernel_functor_test.cpp b/aten/src/ATen/core/op_registration/kernel_functor_test.cpp new file mode 100644 index 0000000..3e638ff --- /dev/null +++ b/aten/src/ATen/core/op_registration/kernel_functor_test.cpp @@ -0,0 +1,693 @@ +#include +#include + +#include +#include + +using c10::RegisterOperators; +using c10::FunctionSchema; +using c10::OperatorKernel; +using c10::Argument; +using c10::IntType; +using c10::ListType; +using c10::kernel; +using c10::dispatchKey; +using c10::TensorTypeId; +using c10::KernelCache; +using c10::Stack; +using c10::guts::make_unique; +using c10::ivalue::TensorList; +using c10::ivalue::IntList; +using c10::intrusive_ptr; +using c10::ArrayRef; +using std::unique_ptr; +using at::Tensor; + +namespace { + +C10_DECLARE_TENSOR_TYPE(TensorType1); +C10_DEFINE_TENSOR_TYPE(TensorType1); +C10_DECLARE_TENSOR_TYPE(TensorType2); +C10_DEFINE_TENSOR_TYPE(TensorType2); + +struct ErrorKernel final : public OperatorKernel { + void operator()(const Tensor&) { + EXPECT_TRUE(false); // this kernel should never be called + } +}; + +FunctionSchema errorOpSchema( + "_test::error", + "", + (std::vector{Argument("dummy")}), + (std::vector{})); + +struct IncrementKernel final : OperatorKernel { + int operator()(const Tensor& tensor, int input) { + return input + 1; + } +}; + +struct DecrementKernel final : OperatorKernel { + int operator()(const Tensor& tensor, int input) { + return input - 1; + } +}; + +FunctionSchema opSchema( + "_test::my_op", + "", + (std::vector{Argument("dummy"), + Argument("input", IntType::get())}), + (std::vector{Argument("output", IntType::get())})); + +void expectCallsIncrement(TensorTypeId type_id) { + // assert that schema and cpu kernel are present + auto op = c10::Dispatcher::singleton().findSchema("_test::my_op", ""); + ASSERT_TRUE(op.has_value()); + auto result = callOp(*op, dummyTensor(type_id), 5); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(6, result[0].toInt()); +} + +void expectCallsDecrement(TensorTypeId type_id) { + // assert that schema and cpu kernel are present + auto op = c10::Dispatcher::singleton().findSchema("_test::my_op", ""); + ASSERT_TRUE(op.has_value()); + auto result = callOp(*op, dummyTensor(type_id), 5); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(4, result[0].toInt()); +} + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType1())); + expectCallsIncrement(TensorType1()); +} + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { + auto registrar = RegisterOperators() + .op(opSchema, kernel(), dispatchKey(TensorType1())) + .op(opSchema, kernel(), dispatchKey(TensorType2())) + .op(errorOpSchema, kernel(), dispatchKey(TensorType1())) + .op(errorOpSchema, kernel(), dispatchKey(TensorType2())); + expectCallsIncrement(TensorType1()); +} + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { + auto registrar1 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op(errorOpSchema, kernel(), dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op(errorOpSchema, kernel(), dispatchKey(TensorType2())); + expectCallsIncrement(TensorType1()); +} + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { + { + auto registrar1 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType1())); + { + auto registrar2 = RegisterOperators().op(opSchema, kernel(), dispatchKey(TensorType2())); + + // assert that schema and cpu kernel are present + expectCallsIncrement(TensorType1()); + expectCallsDecrement(TensorType2()); + } + + // now registrar2 is destructed. Assert that schema is still present but cpu kernel is not + expectCallsIncrement(TensorType1()); + expectDoesntFindKernel("_test::my_op", TensorType2()); + } + + // now both registrars are destructed. Assert that the whole schema is gone + expectDoesntFindOperator("_test::my_op"); +} + +bool was_called = false; + +struct KernelWithoutOutput final : OperatorKernel { + void operator()(const Tensor&) { + was_called = true; + } +}; + +FunctionSchema opWithoutOutputSchema( + "_test::no_return", + "", + (std::vector{Argument("dummy")}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators().op(opWithoutOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::no_return", ""); + ASSERT_TRUE(op.has_value()); + was_called = false; + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_TRUE(was_called); + EXPECT_EQ(0, result.size()); +} + +struct KernelWithZeroOutputs final : OperatorKernel { + std::tuple<> operator()(const Tensor&) { + was_called = true; + return std::make_tuple(); + } +}; + +FunctionSchema opWithZeroOutputsSchema( + "_test::zero_outputs", + "", + (std::vector{Argument("dummy")}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators().op(opWithZeroOutputsSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); + ASSERT_TRUE(op.has_value()); + was_called = false; + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_TRUE(was_called); + EXPECT_EQ(0, result.size()); +} + +struct KernelWithIntOutput final : OperatorKernel { + int operator()(Tensor, int a, int b) { + return a + b; + } +}; + + +FunctionSchema opWithIntOutputSchema( + "_test::int_output", + "", + (std::vector{Argument("dummy"), + Argument("a", IntType::get()), + Argument("b", IntType::get())}), + (std::vector{Argument("sum", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1()), 3, 6); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(9, result[0].toInt()); +} + +struct KernelWithTensorOutput final : OperatorKernel { + Tensor operator()(const Tensor& input) { + return input; + } +}; + +FunctionSchema opWithTensorOutput( + "_test::returning_tensor", + "", + (std::vector{Argument("input")}), + (std::vector{Argument("output")})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType1(), result[0].toTensor().type_id()); + + result = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); +} + +struct KernelWithTensorListOutput final : OperatorKernel { + std::vector operator()(const Tensor& input1, const Tensor& input2, const Tensor& input3) { + return {input1, input2, input3}; + } +}; + +FunctionSchema opWithTensorListOutputSchema( + "_test::list_output", + "", + (std::vector{Argument("input1"), + Argument("input2"), + Argument("input3")}), + (std::vector{Argument("output", ListType::ofTensors())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorListOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1()), dummyTensor(TensorType2()), dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(3, result[0].toTensorListRef().size()); + EXPECT_EQ(TensorType1(), result[0].toTensorListRef()[0].type_id()); + EXPECT_EQ(TensorType2(), result[0].toTensorListRef()[1].type_id()); + EXPECT_EQ(TensorType1(), result[0].toTensorListRef()[2].type_id()); +} + +struct KernelWithIntListOutput final : OperatorKernel { + std::vector operator()(const Tensor&, int input1, int input2, int input3) { + return {input1, input2, input3}; + } +}; + +FunctionSchema opWithIntListOutputSchema( + "_test::list_output", + "", + (std::vector{Argument("dummy"), + Argument("input1", IntType::get()), + Argument("input2", IntType::get()), + Argument("input3", IntType::get())}), + (std::vector{Argument("output", ListType::ofInts())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntListOutputSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1()), 2, 4, 6); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(3, result[0].toIntListRef().size()); + EXPECT_EQ(2, result[0].toIntListRef()[0]); + EXPECT_EQ(4, result[0].toIntListRef()[1]); + EXPECT_EQ(6, result[0].toIntListRef()[2]); +} + +struct KernelWithMultipleOutputs final : OperatorKernel { + std::tuple> operator()(Tensor) { + return std::tuple>( + dummyTensor(TensorType2()), 5, {dummyTensor(TensorType1()), dummyTensor(TensorType2())} + ); + } +}; + +FunctionSchema opWithMultipleOutputsSchema( + "_test::multiple_outputs", + "", + (std::vector{Argument("dummy")}), + (std::vector{Argument("output1"), + Argument("output2", IntType::get()), + Argument("output3", ListType::ofTensors())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithMultipleOutputsSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(3, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); + EXPECT_EQ(5, result[1].toInt()); + EXPECT_EQ(2, result[2].toTensorListRef().size()); + EXPECT_EQ(TensorType1(), result[2].toTensorListRef()[0].type_id()); + EXPECT_EQ(TensorType2(), result[2].toTensorListRef()[1].type_id()); +} + +struct KernelWithTensorInputByReferenceWithOutput final : OperatorKernel { + Tensor operator()(const Tensor& input1) { + return input1; + } +}; + +struct KernelWithTensorInputByValueWithOutput final : OperatorKernel { + Tensor operator()(Tensor input1) { + return input1; + } +}; + +FunctionSchema opWithTensorInputWithOutput( + "_test::tensor_input", + "", + (std::vector{Argument("input")}), + (std::vector{Argument("output")})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType1(), result[0].toTensor().type_id()); + + result = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); +} + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto result = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType1(), result[0].toTensor().type_id()); + + result = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(1, result.size()); + EXPECT_EQ(TensorType2(), result[0].toTensor().type_id()); +} + +Tensor captured_input; + +struct KernelWithTensorInputByReferenceWithoutOutput final : OperatorKernel { + void operator()(const Tensor& input1) { + captured_input = input1; + } +}; + +struct KernelWithTensorInputByValueWithoutOutput final : OperatorKernel { + void operator()(Tensor input1) { + captured_input = input1; + } +}; + +FunctionSchema opWithTensorInputWithoutOutput( + "_test::tensor_input", + "", + (std::vector{Argument("input")}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType1(), captured_input.type_id()); + + outputs = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType2(), captured_input.type_id()); +} + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType1())) + .op(opWithTensorInputWithoutOutput, kernel(), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType1(), captured_input.type_id()); + + outputs = callOp(*op, dummyTensor(TensorType2())); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(TensorType2(), captured_input.type_id()); +} + +int captured_int_input = 0; + +struct KernelWithIntInputWithoutOutput final : OperatorKernel { + void operator()(Tensor, int input1) { + captured_int_input = input1; + } +}; + +FunctionSchema opWithIntInputWithoutOutput( + "_test::int_input", + "", + (std::vector{Argument("dummy"), + Argument("input", IntType::get())}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntInputWithoutOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); + ASSERT_TRUE(op.has_value()); + + captured_int_input = 0; + auto outputs = callOp(*op, dummyTensor(TensorType1()), 3); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(3, captured_int_input); +} + +struct KernelWithIntInputWithOutput final : OperatorKernel { + int operator()(Tensor, int input1) { + return input1 + 1; + } +}; + +FunctionSchema opWithIntInputWithOutput( + "_test::int_input", + "", + (std::vector{Argument("dummy"), + Argument("input", IntType::get())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntInputWithOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1()), 3); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(4, outputs[0].toInt()); +} + +int captured_input_list_size = 0; + +struct KernelWithIntListInputWithoutOutput final : OperatorKernel { + void operator()(Tensor, ArrayRef input1) { + captured_input_list_size = input1.size(); + } +}; + +FunctionSchema opWithIntListInputWithoutOutput( + "_test::int_list_input", + "", + (std::vector{Argument("dummy"), + Argument("input", ListType::ofInts())}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntListInputWithoutOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); + ASSERT_TRUE(op.has_value()); + + captured_input_list_size = 0; + auto outputs = callOp(*op, dummyTensor(TensorType1()), IntList::create({2, 4, 6})); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(3, captured_input_list_size); +} + +struct KernelWithIntListInputWithOutput final : OperatorKernel { + int operator()(Tensor, ArrayRef input1) { + return input1.size(); + } +}; + +FunctionSchema opWithIntListInputWithOutput( + "_test::int_list_input", + "", + (std::vector{Argument("dummy"), + Argument("input", ListType::ofInts())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithIntListInputWithOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1()), IntList::create({2, 4, 6})); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(3, outputs[0].toInt()); +} + +struct KernelWithTensorListInputWithoutOutput final : OperatorKernel { + void operator()(ArrayRef input1) { + captured_input_list_size = input1.size(); + } +}; + +FunctionSchema opWithTensorListInputWithoutOutput( + "_test::tensor_list_input", + "", + (std::vector{Argument("input", ListType::ofTensors())}), + (std::vector{})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorListInputWithoutOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); + ASSERT_TRUE(op.has_value()); + + captured_input_list_size = 0; + auto outputs = callOp(*op, TensorList::create({dummyTensor(TensorType1()), dummyTensor(TensorType1())})); + EXPECT_EQ(0, outputs.size()); + EXPECT_EQ(2, captured_input_list_size); +} + +struct KernelWithTensorListInputWithOutput final : OperatorKernel { + int operator()(ArrayRef input1) { + return input1.size(); + } +}; + +FunctionSchema opWithTensorListInputWithOutput( + "_test::tensor_list_input", + "", + (std::vector{Argument("input", ListType::ofTensors())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithTensorListInputWithOutput, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, TensorList::create({dummyTensor(TensorType1()), dummyTensor(TensorType1())})); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(2, outputs[0].toInt()); +} + +class KernelWithCache final : public OperatorKernel { +public: + KernelWithCache(): counter(3) {} + + int64_t operator()(Tensor) { + return ++counter; + } +private: + int64_t counter; +}; + +FunctionSchema opWithCacheSchema( + "_test::cache_op", + "", + (std::vector{Argument("input")}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithCache_thenCacheIsKeptCorrectly) { + auto registrar = RegisterOperators() + .op(opWithCacheSchema, kernel(), dispatchKey(TensorType1())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::cache_op", ""); + ASSERT_TRUE(op.has_value()); + + // expect first time calling returns a 4 (4 is the initial value in the cache) + auto stack = makeStack(dummyTensor(TensorType1())); + auto kernel = c10::Dispatcher::singleton().lookup(*op, &stack); + kernel.call(&stack); + EXPECT_EQ(1, stack.size()); + EXPECT_EQ(4, stack[0].toInt()); + + // expect second time calling returns a 5 + stack = makeStack(dummyTensor(TensorType1())); + kernel.call(&stack); + EXPECT_EQ(1, stack.size()); + EXPECT_EQ(5, stack[0].toInt()); + + // expect third time calling returns a 6 + stack = makeStack(dummyTensor(TensorType1())); + kernel.call(&stack); + EXPECT_EQ(1, stack.size()); + EXPECT_EQ(6, stack[0].toInt()); +} + +class KernelWithConstructorArg final : public OperatorKernel { +public: + explicit KernelWithConstructorArg(int64_t offset) + : offset_(offset) {} + + int64_t operator()(const Tensor&, int64_t input) { + return input + offset_; + } + +private: + int64_t offset_; +}; + +FunctionSchema opWithConstructorArgsSchema( + "_test::offset_op", + "", + (std::vector{Argument("tensor"), + Argument("input", IntType::get())}), + (std::vector{Argument("output", IntType::get())})); + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithConstructorArg_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithConstructorArgsSchema, kernel(2), dispatchKey(TensorType1())) + .op(opWithConstructorArgsSchema, kernel(4), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::offset_op", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1()), 4); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(6, outputs[0].toInt()); + + outputs = callOp(*op, dummyTensor(TensorType2()), 4); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(8, outputs[0].toInt()); +} + +class KernelWithMultipleConstructorArgs final : public OperatorKernel { +public: + explicit KernelWithMultipleConstructorArgs(int64_t offset1, int64_t offset2) + : offset_(offset1 + offset2) {} + + int64_t operator()(const Tensor&, int64_t input) { + return input + offset_; + } + +private: + int64_t offset_; +}; + +TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithMultipleConstructorArgs_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op(opWithConstructorArgsSchema, kernel(2, 3), dispatchKey(TensorType1())) + .op(opWithConstructorArgsSchema, kernel(4, 5), dispatchKey(TensorType2())); + + auto op = c10::Dispatcher::singleton().findSchema("_test::offset_op", ""); + ASSERT_TRUE(op.has_value()); + + auto outputs = callOp(*op, dummyTensor(TensorType1()), 4); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(9, outputs[0].toInt()); + + outputs = callOp(*op, dummyTensor(TensorType2()), 4); + EXPECT_EQ(1, outputs.size()); + EXPECT_EQ(13, outputs[0].toInt()); +} + + +} diff --git a/aten/src/ATen/core/op_registration/kernel_stackbased.h b/aten/src/ATen/core/op_registration/kernel_stackbased.h index 09750aa..412a5dd 100644 --- a/aten/src/ATen/core/op_registration/kernel_stackbased.h +++ b/aten/src/ATen/core/op_registration/kernel_stackbased.h @@ -16,22 +16,30 @@ namespace c10 { namespace detail { + + template struct KernelRegistrationConfigParameter final { - explicit constexpr KernelRegistrationConfigParameter(KernelFunction* kernel_func, KernelCacheCreatorFunction* cache_creator_func) - : kernel_func_(kernel_func), cache_creator_func_(std::move(cache_creator_func)) { + template + constexpr KernelRegistrationConfigParameter(KernelFunction* kernel_func, KernelCacheCreatorFunction__&& cache_creator_func) + : kernel_func_(kernel_func), cache_creator_func_(std::forward(cache_creator_func)) { } - void apply(KernelRegistrationConfig* registration) const { + void apply(KernelRegistrationConfig* registration) const & { registration->kernel_func = kernel_func_; registration->cache_creator_func = cache_creator_func_; } + void apply(KernelRegistrationConfig* registration) && { + registration->kernel_func = kernel_func_; + registration->cache_creator_func = std::move(cache_creator_func_); + } + private: KernelFunction* kernel_func_; - KernelCacheCreatorFunction* cache_creator_func_; + KernelCacheCreatorFunction_ cache_creator_func_; }; - static_assert(is_registration_config_parameter::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); + static_assert(is_registration_config_parameter>::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); } /** @@ -52,8 +60,11 @@ namespace detail { * > c10::kernel(&my_kernel_cpu, &my_cache_creator), * > c10::dispatchKey(CPUTensorId())); */ -inline constexpr detail::KernelRegistrationConfigParameter kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction* cache_creator) { - return detail::KernelRegistrationConfigParameter(kernel_func, cache_creator); +template +inline constexpr detail::KernelRegistrationConfigParameter> kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction_&& cache_creator) { + static_assert(detail::is_registration_config_parameter>>::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); + + return {kernel_func, std::forward(cache_creator)}; } } diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index d6bd887..96e12ce 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include namespace c10 { @@ -63,15 +65,15 @@ public: guts::enable_if_t>...>::value, RegisterOperators> op(FunctionSchema schema, ConfigParameters&&... configParameters) && { detail::KernelRegistrationConfig config = detail::make_registration_config(std::forward(configParameters)...); - registrars_.emplace_back(std::move(schema), config.dispatch_key, config.kernel_func, config.cache_creator_func); + registrars_.emplace_back(std::move(schema), config.dispatch_key, config.kernel_func, std::move(config.cache_creator_func)); return std::move(*this); } // TODO error if dispatch key is not specified - // TODO Add functor, function and lambda based kernel APIs + // TODO Add deprecated function and lambda based kernel APIs private: - std::vector registrars_; + std::vector registrars_; }; } diff --git a/aten/src/ATen/core/op_registration/test_helpers.h b/aten/src/ATen/core/op_registration/test_helpers.h index ed9d35a..595912c 100644 --- a/aten/src/ATen/core/op_registration/test_helpers.h +++ b/aten/src/ATen/core/op_registration/test_helpers.h @@ -8,11 +8,11 @@ #include template -std::vector makeStack(Inputs&&... inputs) { +inline std::vector makeStack(Inputs&&... inputs) { return {std::forward(inputs)...}; } -at::Tensor dummyTensor(c10::TensorTypeId dispatch_key) { +inline at::Tensor dummyTensor(c10::TensorTypeId dispatch_key) { auto* allocator = c10::GetCPUAllocator(); int64_t nelements = 1; auto dtype = caffe2::TypeMeta::Make(); @@ -26,21 +26,21 @@ at::Tensor dummyTensor(c10::TensorTypeId dispatch_key) { } template -std::vector callOp(const c10::OperatorHandle& op, Args... args) { +inline std::vector callOp(const c10::OperatorHandle& op, Args... args) { auto stack = makeStack(std::forward(args)...); auto kernel = c10::Dispatcher::singleton().lookup(op, &stack); kernel.call(&stack); return stack; } -void expectDoesntFindKernel(const char* op_name, c10::TensorTypeId dispatch_key) { +inline void expectDoesntFindKernel(const char* op_name, c10::TensorTypeId dispatch_key) { auto op = c10::Dispatcher::singleton().findSchema(op_name, ""); EXPECT_ANY_THROW( callOp(*op, dummyTensor(dispatch_key), 5); ); } -void expectDoesntFindOperator(const char* op_name) { +inline void expectDoesntFindOperator(const char* op_name) { auto op = c10::Dispatcher::singleton().findSchema(op_name, ""); EXPECT_FALSE(op.has_value()); } diff --git a/c10/util/C++17.h b/c10/util/C++17.h index c1c375d..2d7345b 100644 --- a/c10/util/C++17.h +++ b/c10/util/C++17.h @@ -183,7 +183,7 @@ template using void_t = typename make_void::type; template inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(c10::guts::forward(f), c10::guts::forward(t)); + return std::apply(std::forward(f), std::forward(t)); } #else diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index 8fa7724..60e4553 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -44,6 +44,16 @@ struct function_traits { }; /** + * Evaluates to true_type, iff the given class is a Functor + * (i.e. has a call operator with some set of arguments) + */ + +template +struct is_functor : std::false_type {}; +template +struct is_functor>::value>> : std::true_type {}; + +/** * infer_function_traits: creates a `function_traits` type for a simple * function (pointer) or functor (lambda/struct). Currently does not support * class methods. diff --git a/caffe2/operators/CMakeLists.txt b/caffe2/operators/CMakeLists.txt index 262655b..80db17f 100644 --- a/caffe2/operators/CMakeLists.txt +++ b/caffe2/operators/CMakeLists.txt @@ -59,23 +59,6 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c1 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/fc_cpu.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/enforce_finite_cpu.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/add_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/sigmoid.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/filler.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/expand_dims.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/mul.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/relu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/stop_gradient.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/enforce_finite.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/cast.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/averaged_loss.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/batch_matmul.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/batch_gather.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/fc.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/concat.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/sparse_lengths_sum.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/add.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/schemas/flatten.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) # exclude test files and gpu files diff --git a/caffe2/operators/experimental/c10/cpu/add_cpu.cc b/caffe2/operators/experimental/c10/cpu/add_cpu.cc index 9ec1aa1..8c67e0d 100644 --- a/caffe2/operators/experimental/c10/cpu/add_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/add_cpu.cc @@ -1,6 +1,6 @@ -#include +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/operators/elementwise_ops_utils.h" -#include "caffe2/operators/experimental/c10/schemas/add.h" #include "caffe2/utils/math.h" using caffe2::BaseContext; @@ -69,11 +69,25 @@ void add_op_cpu_impl( C.mutable_data(), static_cast(&context)); } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Add", + "", + (std::vector{ + c10::Argument("input1"), + c10::Argument("input2"), + c10::Argument("output"), + c10::Argument("legacy_broadcast", BoolType::get()), + c10::Argument("axis", IntType::get())}), + (std::vector{})), + c10::kernel), &add_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Add) - .kernel), &caffe2::add_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Add", + C10Add_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc index cc5823d..cd8f090 100644 --- a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/averaged_loss.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -10,45 +10,53 @@ using std::vector; namespace caffe2 { namespace { -struct Cache final : public c10::KernelCache { - at::Tensor scratch = at::Tensor(C10Tensor(empty({}, CPU))); -}; - template -void averaged_loss_op_cpu_impl( - const at::Tensor& X_, - const at::Tensor& sum_, - Cache* state) { - Tensor X{C10Tensor(X_)}; - Tensor sum{C10Tensor(sum_)}; - CPUContext context; - - sum.Resize(vector()); - - T* data = sum.template mutable_data(); - - Tensor scratch(state->scratch); - caffe2::math::Sum( - X.numel(), - X.template data(), - data, - static_cast(&context), - &scratch); - if (X.numel() > 0) { - caffe2::math::Scale( - 1, - static_cast(1.) / X.numel(), - sum.template data(), +class averaged_loss_cpu final : public c10::OperatorKernel { + public: + void operator()(const at::Tensor& X_, const at::Tensor& sum_) { + Tensor X{C10Tensor(X_)}; + Tensor sum{C10Tensor(sum_)}; + CPUContext context; + + sum.Resize(vector()); + + T* data = sum.template mutable_data(); + + Tensor scratch(scratch_); + caffe2::math::Sum( + X.numel(), + X.template data(), data, - static_cast(&context)); + static_cast(&context), + &scratch); + if (X.numel() > 0) { + caffe2::math::Scale( + 1, + static_cast(1.) / X.numel(), + sum.template data(), + data, + static_cast(&context)); + } } -} + + private: + at::Tensor scratch_ = at::Tensor(C10Tensor(empty({}, CPU))); +}; + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::AveragedLoss", + "", + (std::vector{c10::Argument("input"), + c10::Argument("output")}), + (std::vector{})), + c10::kernel>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::AveragedLoss) - .withCache() - .kernel), &caffe2::averaged_loss_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::AveragedLoss", + C10AveragedLoss_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc index a34c098..c41ead2 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/batch_gather.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -63,11 +63,22 @@ void batch_gather_op_cpu(const at::Tensor& data, default: throw std::runtime_error(string() + "Unsupported dtype: " + toString(data.scalar_type())); } } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::BatchGather", + "", + (std::vector{c10::Argument("data"), + c10::Argument("indices"), + c10::Argument("output")}), + (std::vector{})), + c10::kernel(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::BatchGather) - .kernel() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::BatchGather", + C10BatchGather_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc index dfe8523..4970f5e 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/batch_matmul.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -16,264 +16,277 @@ struct Cache final : public c10::KernelCache { }; template -void batch_matmul_op_cpu_impl( - const at::Tensor& A_, - const at::Tensor& B_, - const at::Tensor& Y_, - int64_t trans_a, - int64_t trans_b, - int64_t broadcast, - Cache* cache) { - Tensor A{C10Tensor(A_)}; - Tensor B{C10Tensor(B_)}; - Tensor Y{C10Tensor(Y_)}; - CPUContext context; - using Engine = caffe2::DefaultEngine; - - auto ndims_A = A.dim(); - auto dims_A = A.sizes().vec(); - auto ndims_B = B.dim(); - auto dims_B = B.sizes().vec(); +class batch_matmul_cpu final : public c10::OperatorKernel { + public: + void operator()( + const at::Tensor& A_, + const at::Tensor& B_, + const at::Tensor& Y_, + int64_t trans_a, + int64_t trans_b, + int64_t broadcast) { + Tensor A{C10Tensor(A_)}; + Tensor B{C10Tensor(B_)}; + Tensor Y{C10Tensor(Y_)}; + CPUContext context; + using Engine = caffe2::DefaultEngine; - auto noBroadcastErrorMsg = [](size_t dim1, size_t dim2) { - std::stringstream ss; - ss << "Inputs with dimensions A = "; - ss << dim1; - ss << " and B = "; - ss << dim2; - ss << " is not supported with broadcast=0. Did you forget to set the " - "broadcast flag?"; - return ss.str(); - }; + auto ndims_A = A.dim(); + auto dims_A = A.sizes().vec(); + auto ndims_B = B.dim(); + auto dims_B = B.sizes().vec(); - // These should all be false if we're not broadcasting. - bool dimMismatch = ndims_A != ndims_B; - bool dimsLessThan1D = ndims_A < 2; - CAFFE_ENFORCE( - broadcast || (!dimMismatch && !dimsLessThan1D), - noBroadcastErrorMsg(ndims_A, ndims_B)); + auto noBroadcastErrorMsg = [](size_t dim1, size_t dim2) { + std::stringstream ss; + ss << "Inputs with dimensions A = "; + ss << dim1; + ss << " and B = "; + ss << dim2; + ss << " is not supported with broadcast=0. Did you forget to set the " + "broadcast flag?"; + return ss.str(); + }; - auto* data_A = A.template data(); - auto* data_B = B.template data(); + // These should all be false if we're not broadcasting. + bool dimMismatch = ndims_A != ndims_B; + bool dimsLessThan1D = ndims_A < 2; + CAFFE_ENFORCE( + broadcast || (!dimMismatch && !dimsLessThan1D), + noBroadcastErrorMsg(ndims_A, ndims_B)); - auto dimMismatchErrorString = [](size_t dimnum1, - size_t dim1, - size_t dimnum2, - size_t dim2, - bool trans_a_, - bool trans_b_) { - std::stringstream ss; - ss << "Expected dimension "; - ss << dimnum1; - ss << " of tensor A with value "; - ss << dim1; - ss << " to match dimension "; - ss << dimnum2; - ss << " of tensor B with value "; - ss << dim2; - ss << ". trans_a = "; - ss << trans_a_; - ss << " trans_b = "; - ss << trans_b_; - return ss.str(); - }; - - if (ndims_A == 1 && ndims_B == 1) { - // vector-vector - CAFFE_ENFORCE_EQ( - dims_A[0], - dims_B[0], - "Vector-vector product requires each of the vectors to " - "be the same size."); - Y.Resize(1); - math::Dot( - dims_A[0], data_A, data_B, Y.template mutable_data(), static_cast(&context)); - } else { - bool A_broadcasted = false, B_broadcasted = false; - if (ndims_A == 1) { - dims_A.insert(dims_A.begin(), 1); - ndims_A = 2; - A_broadcasted = true; - } - if (ndims_B == 1) { - dims_B.push_back(1); - ndims_B = 2; - B_broadcasted = true; - } - // matrix-matrix with batches - // [B1..., M, K] * [B2..., K, N] -> [B..., M, N] - // In the event that A or B are one-dimensional, the trailing or leading - // 1 is not added to the output tensor's size. + auto* data_A = A.template data(); + auto* data_B = B.template data(); - // First step: partition the tensors into inner and outer blocks. - // Ignoring the last two dimensions of A and B, ensure that one of the - // tensors' dimensions is a suffix of the other. For example, - // [4, x, x] is a suffix of [2, 3, 4, x, x]. In this example, the - // dimensions of size 2 and 3 will be broadcasted, so we partition into - // 2*3=6 individual instances of batched GEMM with A and B \in [4, x, x]. - size_t num_inner_dims = std::min(ndims_A, ndims_B); - for (size_t i = 2; i < num_inner_dims; ++i) { - auto first_r_itr = dims_A.rbegin(); - auto second_r_itr = dims_B.rbegin(); - CAFFE_ENFORCE_EQ( - *(first_r_itr + i), - *(second_r_itr + i), - dimMismatchErrorString( - ndims_A - i - 1, - *(first_r_itr + i), - ndims_B - i - 1, - *(second_r_itr + i), - trans_a, - trans_b)); - } - size_t num_outer_dims = std::max(ndims_A, ndims_B) - num_inner_dims; + auto dimMismatchErrorString = [](size_t dimnum1, + size_t dim1, + size_t dimnum2, + size_t dim2, + bool trans_a_, + bool trans_b_) { + std::stringstream ss; + ss << "Expected dimension "; + ss << dimnum1; + ss << " of tensor A with value "; + ss << dim1; + ss << " to match dimension "; + ss << dimnum2; + ss << " of tensor B with value "; + ss << dim2; + ss << ". trans_a = "; + ss << trans_a_; + ss << " trans_b = "; + ss << trans_b_; + return ss.str(); + }; - // Standard M, N, and K parameters respecting GEMM API and transpose - // flags - size_t M, N, K, K_dim; - if (trans_a) { - M = dims_A[ndims_A - 1]; - K = dims_A[ndims_A - 2]; - K_dim = ndims_A - 2; - } else { - M = dims_A[ndims_A - 2]; - K = dims_A[ndims_A - 1]; - K_dim = ndims_A - 1; - } - if (trans_b) { - N = dims_B[ndims_B - 2]; + if (ndims_A == 1 && ndims_B == 1) { + // vector-vector CAFFE_ENFORCE_EQ( - K, - dims_B[ndims_B - 1], - dimMismatchErrorString( - K_dim, - K, - ndims_B - 1, - dims_B[ndims_B - 1], - trans_a, - trans_b)); + dims_A[0], + dims_B[0], + "Vector-vector product requires each of the vectors to " + "be the same size."); + Y.Resize(1); + math::Dot( + dims_A[0], + data_A, + data_B, + Y.template mutable_data(), + static_cast(&context)); } else { - N = dims_B[ndims_B - 1]; - CAFFE_ENFORCE_EQ( - K, - dims_B[ndims_B - 2], - dimMismatchErrorString( - K_dim, - K, - ndims_B - 2, - dims_B[ndims_B - 2], - trans_a, - trans_b)); - } + bool A_broadcasted = false, B_broadcasted = false; + if (ndims_A == 1) { + dims_A.insert(dims_A.begin(), 1); + ndims_A = 2; + A_broadcasted = true; + } + if (ndims_B == 1) { + dims_B.push_back(1); + ndims_B = 2; + B_broadcasted = true; + } + // matrix-matrix with batches + // [B1..., M, K] * [B2..., K, N] -> [B..., M, N] + // In the event that A or B are one-dimensional, the trailing or leading + // 1 is not added to the output tensor's size. - // Calculate output tensor shapes [B..., (M), (N)] - // Batch dimensions will be broadcasted out to those of the longer tensor - // A or B. Either M or N are optional if A or B, respectively are 1-D. - std::vector new_dims; - if (ndims_A >= ndims_B) { - new_dims.assign(dims_A.begin(), dims_A.end() - 2); - } else { - new_dims.assign(dims_B.begin(), dims_B.end() - 2); - } - if (!A_broadcasted) { - new_dims.push_back(M); - } else { - new_dims.push_back(1); - } - if (!B_broadcasted) { - new_dims.push_back(N); - } else { - new_dims.push_back(1); - } + // First step: partition the tensors into inner and outer blocks. + // Ignoring the last two dimensions of A and B, ensure that one of the + // tensors' dimensions is a suffix of the other. For example, + // [4, x, x] is a suffix of [2, 3, 4, x, x]. In this example, the + // dimensions of size 2 and 3 will be broadcasted, so we partition into + // 2*3=6 individual instances of batched GEMM with A and B \in [4, x, x]. + size_t num_inner_dims = std::min(ndims_A, ndims_B); + for (size_t i = 2; i < num_inner_dims; ++i) { + auto first_r_itr = dims_A.rbegin(); + auto second_r_itr = dims_B.rbegin(); + CAFFE_ENFORCE_EQ( + *(first_r_itr + i), + *(second_r_itr + i), + dimMismatchErrorString( + ndims_A - i - 1, + *(first_r_itr + i), + ndims_B - i - 1, + *(second_r_itr + i), + trans_a, + trans_b)); + } + size_t num_outer_dims = std::max(ndims_A, ndims_B) - num_inner_dims; - // Calculate strides. Continuing our example above, - // [4, M, K] * [2, 3, 4, K, N] = [2, 3, 4, M, N] - // We calculate this as follows: - // 1) Treat the outer batch dimensions as flattened, i.e. view the B - // tensor here as [6, 4, K, N] and Y as [6, 4, M, N]. The same rea- - // soning is analogous for the case where # dims A >= # dims B. - // 2) Perform this operation: - // for i in range(6): - // Y[i, :, :, :] = BatchMatMul(A, B[i, :, :, :]) - size_t A_stride = 1; // How far to increment A pointer each itr - size_t B_stride = 1; // How far to increment B pointer each itr - size_t Y_stride = 1; // How far to increment Y pointer each itr - // How many "inner batches" we have. That is, the product of sizes for - // the slices excluding M, K, and N, for their respective matrices. - size_t num_sub_batches = 1; - if (ndims_A >= ndims_B) { - auto first_r_itr = dims_A.rbegin(); - auto output_r_itr = new_dims.rbegin(); - for (size_t i = 0; i < num_inner_dims; ++i) { - A_stride *= *(first_r_itr + i); - Y_stride *= *(output_r_itr + i); - if (i >= 2) { - num_sub_batches *= *(first_r_itr + i); - } + // Standard M, N, and K parameters respecting GEMM API and transpose + // flags + size_t M, N, K, K_dim; + if (trans_a) { + M = dims_A[ndims_A - 1]; + K = dims_A[ndims_A - 2]; + K_dim = ndims_A - 2; + } else { + M = dims_A[ndims_A - 2]; + K = dims_A[ndims_A - 1]; + K_dim = ndims_A - 1; } - B_stride = 0; - } else { - A_stride = 0; - auto second_r_itr = dims_B.rbegin(); - auto output_r_itr = new_dims.rbegin(); - for (size_t i = 0; i < num_inner_dims; ++i) { - B_stride *= *(second_r_itr + i); - Y_stride *= *(output_r_itr + i); - if (i >= 2) { - num_sub_batches *= *(second_r_itr + i); + if (trans_b) { + N = dims_B[ndims_B - 2]; + CAFFE_ENFORCE_EQ( + K, + dims_B[ndims_B - 1], + dimMismatchErrorString( + K_dim, K, ndims_B - 1, dims_B[ndims_B - 1], trans_a, trans_b)); + } else { + N = dims_B[ndims_B - 1]; + CAFFE_ENFORCE_EQ( + K, + dims_B[ndims_B - 2], + dimMismatchErrorString( + K_dim, K, ndims_B - 2, dims_B[ndims_B - 2], trans_a, trans_b)); + } + + // Calculate output tensor shapes [B..., (M), (N)] + // Batch dimensions will be broadcasted out to those of the longer tensor + // A or B. Either M or N are optional if A or B, respectively are 1-D. + std::vector new_dims; + if (ndims_A >= ndims_B) { + new_dims.assign(dims_A.begin(), dims_A.end() - 2); + } else { + new_dims.assign(dims_B.begin(), dims_B.end() - 2); + } + if (!A_broadcasted) { + new_dims.push_back(M); + } else { + new_dims.push_back(1); + } + if (!B_broadcasted) { + new_dims.push_back(N); + } else { + new_dims.push_back(1); + } + + // Calculate strides. Continuing our example above, + // [4, M, K] * [2, 3, 4, K, N] = [2, 3, 4, M, N] + // We calculate this as follows: + // 1) Treat the outer batch dimensions as flattened, i.e. view the B + // tensor here as [6, 4, K, N] and Y as [6, 4, M, N]. The same rea- + // soning is analogous for the case where # dims A >= # dims B. + // 2) Perform this operation: + // for i in range(6): + // Y[i, :, :, :] = BatchMatMul(A, B[i, :, :, :]) + size_t A_stride = 1; // How far to increment A pointer each itr + size_t B_stride = 1; // How far to increment B pointer each itr + size_t Y_stride = 1; // How far to increment Y pointer each itr + // How many "inner batches" we have. That is, the product of sizes for + // the slices excluding M, K, and N, for their respective matrices. + size_t num_sub_batches = 1; + if (ndims_A >= ndims_B) { + auto first_r_itr = dims_A.rbegin(); + auto output_r_itr = new_dims.rbegin(); + for (size_t i = 0; i < num_inner_dims; ++i) { + A_stride *= *(first_r_itr + i); + Y_stride *= *(output_r_itr + i); + if (i >= 2) { + num_sub_batches *= *(first_r_itr + i); + } + } + B_stride = 0; + } else { + A_stride = 0; + auto second_r_itr = dims_B.rbegin(); + auto output_r_itr = new_dims.rbegin(); + for (size_t i = 0; i < num_inner_dims; ++i) { + B_stride *= *(second_r_itr + i); + Y_stride *= *(output_r_itr + i); + if (i >= 2) { + num_sub_batches *= *(second_r_itr + i); + } } } - } - size_t num_outer_batches = 1; - for (size_t i = 0; i < num_outer_dims; ++i) { - num_outer_batches *= new_dims[i]; - } + size_t num_outer_batches = 1; + for (size_t i = 0; i < num_outer_dims; ++i) { + num_outer_batches *= new_dims[i]; + } - // Mutually exclusive since otherwise we would've taken the vector-vector - // path above - if (A_broadcasted) { - new_dims.erase(new_dims.end() - 2); - } else if (B_broadcasted) { - new_dims.erase(new_dims.end() - 1); - } + // Mutually exclusive since otherwise we would've taken the vector-vector + // path above + if (A_broadcasted) { + new_dims.erase(new_dims.end() - 2); + } else if (B_broadcasted) { + new_dims.erase(new_dims.end() - 1); + } - // Allocate output tensor - Y.Resize(new_dims); - auto* Y_data = Y.template mutable_data(); + // Allocate output tensor + Y.Resize(new_dims); + auto* Y_data = Y.template mutable_data(); - // Zero batch dimension indicates no elements - if (num_sub_batches == 0 || num_outer_batches == 0) { - return; - } + // Zero batch dimension indicates no elements + if (num_sub_batches == 0 || num_outer_batches == 0) { + return; + } - // TODO(T23893772): doing this in a loop is likely going to be slow on GPU - for (size_t p = 0; p < num_outer_batches; ++p) { - math::GemmStridedBatched( - trans_a ? CblasTrans : CblasNoTrans, - trans_b ? CblasTrans : CblasNoTrans, - num_sub_batches, - M, - N, - K, - 1.0f, - data_A + p * A_stride, - M * K, - data_B + p * B_stride, - K * N, - 0.0f, - Y_data + p * Y_stride, - M * N, - static_cast(&context)); + // TODO(T23893772): doing this in a loop is likely going to be slow on GPU + for (size_t p = 0; p < num_outer_batches; ++p) { + math::GemmStridedBatched( + trans_a ? CblasTrans : CblasNoTrans, + trans_b ? CblasTrans : CblasNoTrans, + num_sub_batches, + M, + N, + K, + 1.0f, + data_A + p * A_stride, + M * K, + data_B + p * B_stride, + K * N, + 0.0f, + Y_data + p * Y_stride, + M * N, + static_cast(&context)); + } } } -} + + private: + at::Tensor scratch = at::Tensor(C10Tensor(empty({}, CPU))); +}; + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::BatchMatmul", + "", + (std::vector{ + c10::Argument("A"), + c10::Argument("B"), + c10::Argument("output"), + c10::Argument("trans_a", IntType::get()), + c10::Argument("trans_b", IntType::get()), + c10::Argument("broadcast", IntType::get())}), + (std::vector{})), + c10::kernel>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::BatchMatmul) - .withCache() - .kernel), &caffe2::batch_matmul_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::BatchMatmul", + C10BatchMatMul_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc index 66ffcf5..1fa560e 100644 --- a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/cast.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::CPUContext; using caffe2::Tensor; @@ -85,11 +85,24 @@ void cast_op_cpu( default: throw std::runtime_error(string() + "Unsupported scalar type " + toString(input.scalar_type())); } } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Cast", + "", + (std::vector{ + c10::Argument("input"), + c10::Argument("output"), + c10::Argument("to_dtype", IntType::get()), + }), + (std::vector{})), + c10::kernel(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Cast) - .kernel() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Cast", + C10Cast_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc index 5a87d20..c23aff3 100644 --- a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/concat.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::BaseContext; using caffe2::CPUContext; @@ -103,11 +103,27 @@ void concat_op_cpu_impl( output_offset += axis_dim * after * input.itemsize(); } } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Concat", + "", + (std::vector{ + c10::Argument("inputs", ListType::ofTensors()), + c10::Argument("output"), + c10::Argument("split_info", FloatType::get()), + c10::Argument("add", IntType::get()), + c10::Argument("add_axis", IntType::get())}), + (std::vector{})), + c10::kernel< + decltype(concat_op_cpu_impl), + &concat_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Concat) - .kernel), &caffe2::concat_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Concat", + C10Concat_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc b/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc index 2e9f608..d833a67 100644 --- a/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/enforce_finite.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::CPUContext; using caffe2::Tensor; @@ -23,11 +23,22 @@ void enforce_finite_op_impl_cpu(const at::Tensor& input_) { input_data[i]); } } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::EnforceFinite", + "", + (std::vector{c10::Argument("input")}), + (std::vector{})), + c10::kernel< + decltype(enforce_finite_op_impl_cpu), + &enforce_finite_op_impl_cpu>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::EnforceFinite) - .kernel), &caffe2::enforce_finite_op_impl_cpu>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::EnforceFinite", + C10EnforceFinite_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc b/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc index a719500..81566e7 100644 --- a/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc @@ -1,65 +1,74 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/expand_dims.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::Tensor; namespace caffe2 { namespace { -struct Cache final : public c10::KernelCache { - std::vector dims; - bool initialized = false; -}; - template -void expand_dims_op_cpu_impl( - const at::Tensor& input_, - const at::Tensor& output_, - ArrayRef dims, - Cache* cache) { - Tensor input{C10Tensor(input_)}; - Tensor output{C10Tensor(output_)}; +class expand_dims_cpu final : public c10::OperatorKernel { + public: + void operator()( + const at::Tensor& input_, + const at::Tensor& output_, + ArrayRef dims) { + Tensor input{C10Tensor(input_)}; + Tensor output{C10Tensor(output_)}; - if (!cache->initialized) { - cache->dims = dims.vec(); - auto originalSize = cache->dims.size(); - CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); - std::sort(cache->dims.begin(), cache->dims.end()); - cache->dims.erase( - std::unique(cache->dims.begin(), cache->dims.end()), cache->dims.end()); - if (cache->dims.size() < originalSize) { - LOG(WARNING) << "Parameter `dims` has repeated dimensions."; + if (!initialized_) { + dims_ = dims.vec(); + auto originalSize = dims_.size(); + CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); + std::sort(dims_.begin(), dims_.end()); + dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end()); + if (dims_.size() < originalSize) { + LOG(WARNING) << "Parameter `dims` has repeated dimensions."; + } + CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative."); + initialized_ = true; } - CAFFE_ENFORCE( - cache->dims.front() >= 0, "Dimension ids must be non-negative."); - cache->initialized = true; - } - output.CopyFrom(input); - if (cache->dims.empty()) { - return; - } + output.CopyFrom(input); + if (dims_.empty()) { + return; + } - auto newDims = input.sizes().vec(); - CAFFE_ENFORCE_GE( - input.sizes().size() + cache->dims.size(), - cache->dims.back() + 1, - "Input needs at least ", - (1 + cache->dims.back() - cache->dims.size()), - " dimensions given `dims`."); - for (const auto dim : cache->dims) { - newDims.insert(newDims.begin() + dim, 1); + auto newDims = input.sizes().vec(); + CAFFE_ENFORCE_GE( + input.sizes().size() + dims_.size(), + dims_.back() + 1, + "Input needs at least ", + (1 + dims_.back() - dims_.size()), + " dimensions given `dims`."); + for (const auto dim : dims_) { + newDims.insert(newDims.begin() + dim, 1); + } + output.Reshape(newDims); } - output.Reshape(newDims); -} + + private: + std::vector dims_; + bool initialized_ = false; +}; + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::ExpandDims", + "", + (std::vector{c10::Argument("input"), + c10::Argument("output"), + c10::Argument("dims", ListType::ofInts())}), + (std::vector{})), + c10::kernel>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::ExpandDims) - .withCache() - .kernel), &caffe2::expand_dims_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::ExpandDims", + C10ExpandDims_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/fc_cpu.cc b/caffe2/operators/experimental/c10/cpu/fc_cpu.cc index 99e0545..51d034a 100644 --- a/caffe2/operators/experimental/c10/cpu/fc_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/fc_cpu.cc @@ -1,10 +1,10 @@ +#include #include "caffe2/core/context.h" -#include #include "caffe2/core/operator.h" -#include "caffe2/operators/experimental/c10/schemas/fc.h" +#include "caffe2/core/operator_c10wrapper.h" +#include "caffe2/core/tensor.h" #include "caffe2/utils/conversions.h" #include "caffe2/utils/math.h" -#include "caffe2/core/tensor.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -12,124 +12,140 @@ using caffe2::Tensor; namespace caffe2 { namespace { -struct Cache final : public c10::KernelCache { - vector Y_shape_cache_; - at::Tensor bias_multiplier_ = at::Tensor(C10Tensor(Tensor())); -}; - template -void fc_op_cpu_impl( - const at::Tensor& X_, - const at::Tensor& W_, - const at::Tensor& b_, - const at::Tensor& Y_, - int64_t axis, - int64_t axis_w, - Cache* cache) { - Tensor X{C10Tensor(X_)}; - Tensor W{C10Tensor(W_)}; - Tensor b{C10Tensor(b_)}; - Tensor Y{C10Tensor(Y_)}; - CPUContext context; - - constexpr bool TransposeWeight = true; - - CAFFE_ENFORCE(b.dim() == 1, b.dim()); - // batch size - const auto canonical_axis = X.canonical_axis_index(axis); - const auto M = X.size_to_dim(canonical_axis); - const auto K = X.size_from_dim(canonical_axis); - const auto canonical_axis_w = W.canonical_axis_index(axis_w); - const int N = TransposeWeight ? W.size_to_dim(canonical_axis_w) - : W.size_from_dim(canonical_axis_w); - - auto dimErrorString = [&]() { - return c10::str( - "Dimension mismatch: ", - "X: ", - X.sizes(), - ", W: ", - W.sizes(), - ", b: ", - b.sizes(), - ", axis: ", - axis, - ", M: ", +class fc_op_cpu final : public c10::OperatorKernel { + public: + void operator()( + const at::Tensor& X_, + const at::Tensor& W_, + const at::Tensor& b_, + const at::Tensor& Y_, + int64_t axis, + int64_t axis_w) { + Tensor X{C10Tensor(X_)}; + Tensor W{C10Tensor(W_)}; + Tensor b{C10Tensor(b_)}; + Tensor Y{C10Tensor(Y_)}; + CPUContext context; + + constexpr bool TransposeWeight = true; + + CAFFE_ENFORCE(b.dim() == 1, b.dim()); + // batch size + const auto canonical_axis = X.canonical_axis_index(axis); + const auto M = X.size_to_dim(canonical_axis); + const auto K = X.size_from_dim(canonical_axis); + const auto canonical_axis_w = W.canonical_axis_index(axis_w); + const int N = TransposeWeight ? W.size_to_dim(canonical_axis_w) + : W.size_from_dim(canonical_axis_w); + + auto dimErrorString = [&]() { + return c10::str( + "Dimension mismatch: ", + "X: ", + X.sizes(), + ", W: ", + W.sizes(), + ", b: ", + b.sizes(), + ", axis: ", + axis, + ", M: ", + M, + ", N: ", + N, + ", K: ", + K); + }; + + // Error checking + CAFFE_ENFORCE(M == X.numel() / K, dimErrorString()); + CAFFE_ENFORCE(K == W.numel() / N, dimErrorString()); + CAFFE_ENFORCE(N == b.dim32(0), dimErrorString()); + CAFFE_ENFORCE(N == b.numel(), dimErrorString()); + + Y_shape_cache_ = X.sizes().vec(); + // This is an invariant of canonical_axis, so we can DCHECK. + DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size()); + Y_shape_cache_.resize(canonical_axis + 1); + Y_shape_cache_[canonical_axis] = N; + Y.Resize(Y_shape_cache_); + CAFFE_ENFORCE(M * N == Y.numel(), dimErrorString()); + + if (X.numel() == 0) { + // skip the rest of the computation if X is empty + Y.template mutable_data(); + return; + } + + // default to FLOAT as math.h does. + caffe2::TensorProto::DataType math_type = + caffe2::TensorProto_DataType_FLOAT; + if (caffe2::fp16_type()) { + math_type = caffe2::TensorProto_DataType_FLOAT16; + } + + // W * x + caffe2::math::Gemm( + CblasNoTrans, + TransposeWeight ? CblasTrans : CblasNoTrans, + M, + N, + K, + 1, + X.template data(), + W.template data(), + 0, + Y.template mutable_data(), + static_cast(&context), + math_type); + // Add bias term + Tensor bias_multiplier(bias_multiplier_); + ReinitializeTensor( + &bias_multiplier, {M}, at::dtype().device(CPU)); + caffe2::math::Set( + M, + caffe2::convert::To(1), + bias_multiplier.template mutable_data(), + static_cast(&context)); + caffe2::math::Gemm( + CblasNoTrans, + CblasNoTrans, M, - ", N: ", N, - ", K: ", - K); - }; - - // Error checking - CAFFE_ENFORCE(M == X.numel() / K, dimErrorString()); - CAFFE_ENFORCE(K == W.numel() / N, dimErrorString()); - CAFFE_ENFORCE(N == b.dim32(0), dimErrorString()); - CAFFE_ENFORCE(N == b.numel(), dimErrorString()); - - cache->Y_shape_cache_ = X.sizes().vec(); - // This is an invariant of canonical_axis, so we can DCHECK. - DCHECK_LE(canonical_axis + 1, cache->Y_shape_cache_.size()); - cache->Y_shape_cache_.resize(canonical_axis + 1); - cache->Y_shape_cache_[canonical_axis] = N; - Y.Resize(cache->Y_shape_cache_); - CAFFE_ENFORCE(M * N == Y.numel(), dimErrorString()); - - if (X.numel() == 0) { - // skip the rest of the computation if X is empty - Y.template mutable_data(); - return; + 1, + 1, + bias_multiplier.template data(), + b.template data(), + 1, + Y.template mutable_data(), + static_cast(&context), + math_type); } - // default to FLOAT as math.h does. - caffe2::TensorProto::DataType math_type = caffe2::TensorProto_DataType_FLOAT; - if (caffe2::fp16_type()) { - math_type = caffe2::TensorProto_DataType_FLOAT16; - } + private: + vector Y_shape_cache_; + at::Tensor bias_multiplier_ = at::Tensor(C10Tensor(Tensor())); +}; + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::FullyConnected", + "", + (std::vector{c10::Argument("X"), + c10::Argument("W"), + c10::Argument("b"), + c10::Argument("output"), + c10::Argument("axis", IntType::get()), + c10::Argument("axis_w", IntType::get())}), + (std::vector{})), + c10::kernel>(), + c10::dispatchKey(CPUTensorId())); - // W * x - caffe2::math::Gemm( - CblasNoTrans, - TransposeWeight ? CblasTrans : CblasNoTrans, - M, - N, - K, - 1, - X.template data(), - W.template data(), - 0, - Y.template mutable_data(), - static_cast(&context), - math_type); - // Add bias term - Tensor bias_multiplier(cache->bias_multiplier_); - ReinitializeTensor(&bias_multiplier, {M}, at::dtype().device(CPU)); - caffe2::math::Set( - M, - caffe2::convert::To(1), - bias_multiplier.template mutable_data(), - static_cast(&context)); - caffe2::math::Gemm( - CblasNoTrans, - CblasNoTrans, - M, - N, - 1, - 1, - bias_multiplier.template data(), - b.template data(), - 1, - Y.template mutable_data(), - static_cast(&context), - math_type); -} } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::FullyConnected) - .withCache() - .kernel), &caffe2::fc_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::FullyConnected", + C10FC_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc index 6db6685..55e225d 100644 --- a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc @@ -1,8 +1,8 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/filler.h" -#include "caffe2/utils/math.h" -#include "caffe2/core/tensor.h" +#include #include +#include "caffe2/core/operator_c10wrapper.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::CPUContext; using caffe2::Tensor; @@ -143,27 +143,107 @@ void uniform_fill_op_cpu_impl( output.template mutable_data(), static_cast(&context)); } -} // namespace -} // namespace caffe2 - -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::ConstantFill) - .kernel() - .dispatchKey(CPUTensorId()); -C10_REGISTER_KERNEL(caffe2::ops::UniformFill) - .kernel() - .dispatchKey(CPUTensorId()); +static auto registry = + c10::RegisterOperators() + .op(FunctionSchema( + "_c10_experimental::ConstantFill", + "", + (std::vector{ + c10::Argument("inputs", ListType::ofTensors()), + c10::Argument("output"), + c10::Argument("shape", ListType::ofInts()), + c10::Argument("extra_shape", ListType::ofInts()), + c10::Argument("input_as_shape", BoolType::get()), + c10::Argument("dtype", IntType::get()), + c10::Argument("value", NumberType::get())}), + (std::vector{})), + c10::kernel< + decltype(constant_fill_op_cpu_impl), + &constant_fill_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())) + .op(FunctionSchema( + "_c10_experimental::UniformFill", + "", + (std::vector{ + c10::Argument("inputs", ListType::ofTensors()), + c10::Argument("output"), + c10::Argument("shape", ListType::ofInts()), + c10::Argument("extra_shape", ListType::ofInts()), + c10::Argument("input_as_shape", BoolType::get()), + c10::Argument("min", FloatType::get()), + c10::Argument("max", FloatType::get())}), + (std::vector{})), + c10::kernel< + decltype(uniform_fill_op_cpu_impl), + &uniform_fill_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())) + .op(FunctionSchema( + "_c10_experimental::GivenTensorFill", + "", + (std::vector{ + c10::Argument("inputs", ListType::ofTensors()), + c10::Argument("output"), + c10::Argument("shape", ListType::ofInts()), + c10::Argument("extra_shape", ListType::ofInts()), + c10::Argument("input_as_shape", BoolType::get()), + c10::Argument("values"), + }), + (std::vector{})), + c10::kernel< + decltype(given_tensor_fill_op_cpu_impl), + &given_tensor_fill_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())) + .op(FunctionSchema( + "_c10_experimental::GivenTensorIntFill", + "", + (std::vector{ + c10::Argument("inputs", ListType::ofTensors()), + c10::Argument("output"), + c10::Argument("shape", ListType::ofInts()), + c10::Argument("extra_shape", ListType::ofInts()), + c10::Argument("input_as_shape", BoolType::get()), + c10::Argument("values"), + }), + (std::vector{})), + c10::kernel< + decltype(given_tensor_fill_op_cpu_impl), + &given_tensor_fill_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())) + .op(FunctionSchema( + "_c10_experimental::GivenTensorInt64Fill", + "", + (std::vector{ + c10::Argument("inputs", ListType::ofTensors()), + c10::Argument("output"), + c10::Argument("shape", ListType::ofInts()), + c10::Argument("extra_shape", ListType::ofInts()), + c10::Argument("input_as_shape", BoolType::get()), + c10::Argument("values"), + }), + (std::vector{})), + c10::kernel< + decltype(given_tensor_fill_op_cpu_impl), + &given_tensor_fill_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); -C10_REGISTER_KERNEL(caffe2::ops::GivenTensorFill) - .kernel), &caffe2::given_tensor_fill_op_cpu_impl>() - .dispatchKey(CPUTensorId()); +} // namespace -C10_REGISTER_KERNEL(caffe2::ops::GivenTensorIntFill) - .kernel), &caffe2::given_tensor_fill_op_cpu_impl>() - .dispatchKey(CPUTensorId()); +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::ConstantFill", + C10ConstantFill_DontUseThisOpYet) +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::UniformFill", + C10UniformFill_DontUseThisOpYet) + +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::GivenTensorFill", + C10GivenTensorFill_DontUseThisOpYet) +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::GivenTensorIntFill", + C10GivenTensorIntFill_DontUseThisOpYet) +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::GivenTensorInt64Fill", + C10GivenTensorInt64Fill_DontUseThisOpYet) -C10_REGISTER_KERNEL(caffe2::ops::GivenTensorInt64Fill) - .kernel), &caffe2::given_tensor_fill_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc b/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc index 23bbaf3..6c153de 100644 --- a/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/flatten.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -25,11 +25,24 @@ void flatten_op_cpu_impl( input.raw_data(), output.raw_mutable_data(input.dtype())); } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Flatten", + "", + (std::vector{c10::Argument("input"), + c10::Argument("output"), + c10::Argument("axis", IntType::get())}), + (std::vector{})), + c10::kernel< + decltype(flatten_op_cpu_impl), + &flatten_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Flatten) - .kernel), &caffe2::flatten_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Flatten", + C10Flatten_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/mul_cpu.cc b/caffe2/operators/experimental/c10/cpu/mul_cpu.cc index 247e1bb..ef1e6d9 100644 --- a/caffe2/operators/experimental/c10/cpu/mul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/mul_cpu.cc @@ -1,8 +1,8 @@ -#include +#include +#include "caffe2/core/operator_c10wrapper.h" +#include "caffe2/core/tensor.h" #include "caffe2/operators/elementwise_ops_utils.h" -#include "caffe2/operators/experimental/c10/schemas/mul.h" #include "caffe2/utils/math.h" -#include "caffe2/core/tensor.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -70,11 +70,25 @@ void mul_op_cpu_impl( C.mutable_data(), static_cast(&context)); } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Mul", + "", + (std::vector{ + c10::Argument("input1"), + c10::Argument("input2"), + c10::Argument("output"), + c10::Argument("legacy_broadcast", BoolType::get()), + c10::Argument("axis", IntType::get())}), + (std::vector{})), + c10::kernel), &mul_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Mul) - .kernel), &caffe2::mul_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Mul", + C10Mul_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/relu_cpu.cc b/caffe2/operators/experimental/c10/cpu/relu_cpu.cc index 67c7ee2..c29a677 100644 --- a/caffe2/operators/experimental/c10/cpu/relu_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/relu_cpu.cc @@ -1,8 +1,8 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/relu.h" +#include +#include "caffe2/core/operator_c10wrapper.h" +#include "caffe2/core/tensor.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" -#include "caffe2/core/tensor.h" using caffe2::Tensor; @@ -39,11 +39,21 @@ void relu_op_cpu_impl( } */ } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Relu", + "", + (std::vector{c10::Argument("input"), + c10::Argument("output")}), + (std::vector{})), + c10::kernel), &relu_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Relu) - .kernel), &caffe2::relu_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Relu", + C10Relu_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc index 470d633..17d96a5 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc @@ -1,8 +1,8 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/sigmoid.h" +#include +#include "caffe2/core/operator_c10wrapper.h" +#include "caffe2/core/tensor.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" -#include "caffe2/core/tensor.h" using caffe2::Tensor; @@ -22,11 +22,23 @@ void sigmoid_op_cpu_impl( output.mutable_data(), input.numel()) = 1. / (1. + (-xM).exp()); } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::Sigmoid", + "", + (std::vector{c10::Argument("input"), + c10::Argument("output")}), + (std::vector{})), + c10::kernel< + decltype(sigmoid_op_cpu_impl), + &sigmoid_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::Sigmoid) - .kernel), &caffe2::sigmoid_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::Sigmoid", + C10Sigmoid_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc index 4255e2a..e23f6f8 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::Tensor; @@ -69,11 +69,27 @@ void sigmoid_cross_entropy_with_logits_op_cpu_impl( out_ptr[i] = -value / inner_size; } } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::SigmoidCrossEntropyWithLogits", + "", + (std::vector{ + c10::Argument("input1"), + c10::Argument("input2"), + c10::Argument("output"), + c10::Argument("log_D_trick", BoolType::get()), + c10::Argument("unjoined_lr_loss", BoolType::get())}), + (std::vector{})), + c10::kernel< + decltype(sigmoid_cross_entropy_with_logits_op_cpu_impl), + &sigmoid_cross_entropy_with_logits_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::SigmoidCrossEntropyWithLogits) - .kernel() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::SigmoidCrossEntropyWithLogits", + C10SigmoidCrossEntropyWithLogits_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc index 1d8b180..6bbc107 100644 --- a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc @@ -1,8 +1,8 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.h" +#include +#include "caffe2/core/operator_c10wrapper.h" +#include "caffe2/core/tensor.h" #include "caffe2/perfkernels/embedding_lookup.h" #include "caffe2/utils/math.h" -#include "caffe2/core/tensor.h" using caffe2::Tensor; @@ -81,11 +81,24 @@ void sparse_lengths_sum_op_cpu( } } +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::SparseLengthsSum", + "", + (std::vector{c10::Argument("data"), + c10::Argument("indices"), + c10::Argument("lengths"), + c10::Argument("output")}), + (std::vector{})), + c10::kernel< + decltype(sparse_lengths_sum_op_cpu), + &sparse_lengths_sum_op_cpu>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::SparseLengthsSum) - .kernel() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::SparseLengthsSum", + C10SparseLengthsSum_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc b/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc index 4d3a981..f103719 100644 --- a/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc @@ -1,7 +1,7 @@ -#include -#include "caffe2/operators/experimental/c10/schemas/stop_gradient.h" -#include "caffe2/utils/math.h" +#include +#include "caffe2/core/operator_c10wrapper.h" #include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" using caffe2::BaseContext; using caffe2::Tensor; @@ -18,11 +18,23 @@ void stop_gradient_op_cpu_impl( output.CopyFrom(input); } } + +static auto registry = c10::RegisterOperators().op( + FunctionSchema( + "_c10_experimental::StopGradient", + "", + (std::vector{c10::Argument("input"), + c10::Argument("output")}), + (std::vector{})), + c10::kernel< + decltype(stop_gradient_op_cpu_impl), + &stop_gradient_op_cpu_impl>(), + c10::dispatchKey(CPUTensorId())); + } // namespace -} // namespace caffe2 -namespace c10 { -C10_REGISTER_KERNEL(caffe2::ops::StopGradient) - .kernel), &caffe2::stop_gradient_op_cpu_impl>() - .dispatchKey(CPUTensorId()); -} // namespace c10 +REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( + "_c10_experimental::StopGradient", + C10StopGradient_DontUseThisOpYet) + +} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/add.cc b/caffe2/operators/experimental/c10/schemas/add.cc deleted file mode 100644 index 63ecc97..0000000 --- a/caffe2/operators/experimental/c10/schemas/add.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/add.h" -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Add, - FunctionSchema( - "_c10_experimental::Add", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("legacy_broadcast", BoolType::get()), - c10::Argument("axis", IntType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Add", - C10Add_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/add.h b/caffe2/operators/experimental/c10/schemas/add.h deleted file mode 100644 index 4dfa99a..0000000 --- a/caffe2/operators/experimental/c10/schemas/add.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Add); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/averaged_loss.cc b/caffe2/operators/experimental/c10/schemas/averaged_loss.cc deleted file mode 100644 index dfc41a6..0000000 --- a/caffe2/operators/experimental/c10/schemas/averaged_loss.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/averaged_loss.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - AveragedLoss, - FunctionSchema( - "_c10_experimental::AveragedLoss", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::AveragedLoss", - C10AveragedLoss_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/averaged_loss.h b/caffe2/operators/experimental/c10/schemas/averaged_loss.h deleted file mode 100644 index 548bd07..0000000 --- a/caffe2/operators/experimental/c10/schemas/averaged_loss.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(AveragedLoss); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/batch_gather.cc b/caffe2/operators/experimental/c10/schemas/batch_gather.cc deleted file mode 100644 index 9fb84e5..0000000 --- a/caffe2/operators/experimental/c10/schemas/batch_gather.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/batch_gather.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - BatchGather, - FunctionSchema( - "_c10_experimental::BatchGather", - "", - (std::vector{c10::Argument("data"), - c10::Argument("indices"), - c10::Argument("output")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::BatchGather", - C10BatchGather_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/batch_gather.h b/caffe2/operators/experimental/c10/schemas/batch_gather.h deleted file mode 100644 index 214c67f..0000000 --- a/caffe2/operators/experimental/c10/schemas/batch_gather.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(BatchGather); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/batch_matmul.cc b/caffe2/operators/experimental/c10/schemas/batch_matmul.cc deleted file mode 100644 index addb95e..0000000 --- a/caffe2/operators/experimental/c10/schemas/batch_matmul.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/batch_matmul.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - BatchMatmul, - FunctionSchema( - "_c10_experimental::BatchMatmul", - "", - (std::vector{ - c10::Argument("A"), - c10::Argument("B"), - c10::Argument("output"), - c10::Argument("trans_a", IntType::get()), - c10::Argument("trans_b", IntType::get()), - c10::Argument("broadcast", IntType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::BatchMatmul", - C10BatchMatMul_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/batch_matmul.h b/caffe2/operators/experimental/c10/schemas/batch_matmul.h deleted file mode 100644 index 191e0e6..0000000 --- a/caffe2/operators/experimental/c10/schemas/batch_matmul.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(BatchMatmul); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/cast.cc b/caffe2/operators/experimental/c10/schemas/cast.cc deleted file mode 100644 index c1133ce..0000000 --- a/caffe2/operators/experimental/c10/schemas/cast.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/cast.h" -#include -#include "caffe2/core/operator_c10wrapper.h" -#include "caffe2/utils/cast.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Cast, - FunctionSchema( - "_c10_experimental::Cast", - "", - (std::vector{ - c10::Argument("input"), - c10::Argument("output"), - c10::Argument("to_dtype", IntType::get()), - }), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Cast", - C10Cast_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/cast.h b/caffe2/operators/experimental/c10/schemas/cast.h deleted file mode 100644 index 979637b..0000000 --- a/caffe2/operators/experimental/c10/schemas/cast.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Cast); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/concat.cc b/caffe2/operators/experimental/c10/schemas/concat.cc deleted file mode 100644 index d9a7e33..0000000 --- a/caffe2/operators/experimental/c10/schemas/concat.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/concat.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Concat, - FunctionSchema( - "_c10_experimental::Concat", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("split_info", FloatType::get()), - c10::Argument("add", IntType::get()), - c10::Argument("add_axis", IntType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Concat", - C10Concat_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/concat.h b/caffe2/operators/experimental/c10/schemas/concat.h deleted file mode 100644 index aecaf40..0000000 --- a/caffe2/operators/experimental/c10/schemas/concat.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Concat); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/enforce_finite.cc b/caffe2/operators/experimental/c10/schemas/enforce_finite.cc deleted file mode 100644 index c8d58de..0000000 --- a/caffe2/operators/experimental/c10/schemas/enforce_finite.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/enforce_finite.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - EnforceFinite, - FunctionSchema( - "_c10_experimental::EnforceFinite", - "", - (std::vector{c10::Argument("input")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::EnforceFinite", - C10EnforceFinite_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/enforce_finite.h b/caffe2/operators/experimental/c10/schemas/enforce_finite.h deleted file mode 100644 index 704136c..0000000 --- a/caffe2/operators/experimental/c10/schemas/enforce_finite.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(EnforceFinite); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/expand_dims.cc b/caffe2/operators/experimental/c10/schemas/expand_dims.cc deleted file mode 100644 index e2c4c75..0000000 --- a/caffe2/operators/experimental/c10/schemas/expand_dims.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/expand_dims.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; -using c10::intrusive_ptr; -using c10::ivalue::IntList; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - ExpandDims, - FunctionSchema( - "_c10_experimental::ExpandDims", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output"), - c10::Argument("dims", ListType::ofInts())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::ExpandDims", - C10ExpandDims_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/expand_dims.h b/caffe2/operators/experimental/c10/schemas/expand_dims.h deleted file mode 100644 index fa3ab8f..0000000 --- a/caffe2/operators/experimental/c10/schemas/expand_dims.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(ExpandDims); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/fc.cc b/caffe2/operators/experimental/c10/schemas/fc.cc deleted file mode 100644 index 773964e..0000000 --- a/caffe2/operators/experimental/c10/schemas/fc.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/fc.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - FullyConnected, - FunctionSchema( - "_c10_experimental::FullyConnected", - "", - (std::vector{c10::Argument("X"), - c10::Argument("W"), - c10::Argument("b"), - c10::Argument("output"), - c10::Argument("axis", IntType::get()), - c10::Argument("axis_w", IntType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::FullyConnected", - C10FC_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/fc.h b/caffe2/operators/experimental/c10/schemas/fc.h deleted file mode 100644 index 1aed0eb..0000000 --- a/caffe2/operators/experimental/c10/schemas/fc.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(FullyConnected); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/filler.cc b/caffe2/operators/experimental/c10/schemas/filler.cc deleted file mode 100644 index 8fe8707..0000000 --- a/caffe2/operators/experimental/c10/schemas/filler.cc +++ /dev/null @@ -1,103 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/filler.h" -#include -#include "caffe2/core/operator_c10wrapper.h" -#include "caffe2/utils/cast.h" - -using caffe2::CPUContext; -using c10::C10Tensor; -using c10::ivalue::IntList; -using c10::intrusive_ptr; - -namespace caffe2 { -namespace ops { -// TODO Parse schema strings instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - ConstantFill, - FunctionSchema( - "_c10_experimental::ConstantFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("dtype", IntType::get()), - c10::Argument("value", NumberType::get())}), - (std::vector{}))); -C10_DEFINE_OP_SCHEMA( - UniformFill, - FunctionSchema( - "_c10_experimental::UniformFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("min", FloatType::get()), - c10::Argument("max", FloatType::get())}), - (std::vector{}))); -C10_DEFINE_OP_SCHEMA( - GivenTensorFill, - FunctionSchema( - "_c10_experimental::GivenTensorFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{}))); -C10_DEFINE_OP_SCHEMA( - GivenTensorIntFill, - FunctionSchema( - "_c10_experimental::GivenTensorIntFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{}))); -C10_DEFINE_OP_SCHEMA( - GivenTensorInt64Fill, - FunctionSchema( - "_c10_experimental::GivenTensorInt64Fill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::ConstantFill", - C10ConstantFill_DontUseThisOpYet) -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::UniformFill", - C10UniformFill_DontUseThisOpYet) -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::GivenTensorFill", - C10GivenTensorFill_DontUseThisOpYet) -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::GivenTensorIntFill", - C10GivenTensorIntFill_DontUseThisOpYet) -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::GivenTensorInt64Fill", - C10GivenTensorInt64Fill_DontUseThisOpYet) -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/filler.h b/caffe2/operators/experimental/c10/schemas/filler.h deleted file mode 100644 index 616893d..0000000 --- a/caffe2/operators/experimental/c10/schemas/filler.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(GivenTensorFill); -C10_DECLARE_OP_SCHEMA(GivenTensorIntFill); -C10_DECLARE_OP_SCHEMA(GivenTensorInt64Fill); -C10_DECLARE_OP_SCHEMA(ConstantFill); -C10_DECLARE_OP_SCHEMA(UniformFill); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/flatten.cc b/caffe2/operators/experimental/c10/schemas/flatten.cc deleted file mode 100644 index 42353b2..0000000 --- a/caffe2/operators/experimental/c10/schemas/flatten.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/flatten.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Flatten, - FunctionSchema( - "_c10_experimental::Flatten", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output"), - c10::Argument("axis", IntType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Flatten", - C10Flatten_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/flatten.h b/caffe2/operators/experimental/c10/schemas/flatten.h deleted file mode 100644 index 9c53462..0000000 --- a/caffe2/operators/experimental/c10/schemas/flatten.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Flatten); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/mul.cc b/caffe2/operators/experimental/c10/schemas/mul.cc deleted file mode 100644 index af7a7b7..0000000 --- a/caffe2/operators/experimental/c10/schemas/mul.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/mul.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Mul, - FunctionSchema( - "_c10_experimental::Mul", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("legacy_broadcast", BoolType::get()), - c10::Argument("axis", IntType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Mul", - C10Mul_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/mul.h b/caffe2/operators/experimental/c10/schemas/mul.h deleted file mode 100644 index 54b64f4..0000000 --- a/caffe2/operators/experimental/c10/schemas/mul.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Mul); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/relu.cc b/caffe2/operators/experimental/c10/schemas/relu.cc deleted file mode 100644 index 43528d9..0000000 --- a/caffe2/operators/experimental/c10/schemas/relu.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/relu.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Relu, - FunctionSchema( - "_c10_experimental::Relu", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Relu", - C10Relu_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/relu.h b/caffe2/operators/experimental/c10/schemas/relu.h deleted file mode 100644 index ea0aa89..0000000 --- a/caffe2/operators/experimental/c10/schemas/relu.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Relu); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/sigmoid.cc b/caffe2/operators/experimental/c10/schemas/sigmoid.cc deleted file mode 100644 index 2261a19..0000000 --- a/caffe2/operators/experimental/c10/schemas/sigmoid.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/sigmoid.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - Sigmoid, - FunctionSchema( - "_c10_experimental::Sigmoid", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::Sigmoid", - C10Sigmoid_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/sigmoid.h b/caffe2/operators/experimental/c10/schemas/sigmoid.h deleted file mode 100644 index 5d5ff41..0000000 --- a/caffe2/operators/experimental/c10/schemas/sigmoid.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(Sigmoid); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.cc b/caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.cc deleted file mode 100644 index d1be6b9..0000000 --- a/caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - SigmoidCrossEntropyWithLogits, - FunctionSchema( - "_c10_experimental::SigmoidCrossEntropyWithLogits", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("log_D_trick", BoolType::get()), - c10::Argument("unjoined_lr_loss", BoolType::get())}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::SigmoidCrossEntropyWithLogits", - C10SigmoidCrossEntropyWithLogits_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.h b/caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.h deleted file mode 100644 index 671c2e2..0000000 --- a/caffe2/operators/experimental/c10/schemas/sigmoid_cross_entropy_with_logits.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(SigmoidCrossEntropyWithLogits); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.cc b/caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.cc deleted file mode 100644 index f26abcf..0000000 --- a/caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - SparseLengthsSum, - FunctionSchema( - "_c10_experimental::SparseLengthsSum", - "", - (std::vector{c10::Argument("data"), - c10::Argument("indices"), - c10::Argument("lengths"), - c10::Argument("output")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::SparseLengthsSum", - C10SparseLengthsSum_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.h b/caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.h deleted file mode 100644 index a4054e1..0000000 --- a/caffe2/operators/experimental/c10/schemas/sparse_lengths_sum.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(SparseLengthsSum); - -} // namespace ops -} // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/schemas/stop_gradient.cc b/caffe2/operators/experimental/c10/schemas/stop_gradient.cc deleted file mode 100644 index 3305845..0000000 --- a/caffe2/operators/experimental/c10/schemas/stop_gradient.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "caffe2/operators/experimental/c10/schemas/stop_gradient.h" -#include -#include "caffe2/core/operator_c10wrapper.h" - -using caffe2::CPUContext; - -namespace caffe2 { -namespace ops { -// TODO Parse schema string instead of creating FunctionSchema manually -C10_DEFINE_OP_SCHEMA( - StopGradient, - FunctionSchema( - "_c10_experimental::StopGradient", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{}))); -} -} - -namespace caffe2 { -REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( - "_c10_experimental::StopGradient", - C10StopGradient_DontUseThisOpYet) -} diff --git a/caffe2/operators/experimental/c10/schemas/stop_gradient.h b/caffe2/operators/experimental/c10/schemas/stop_gradient.h deleted file mode 100644 index bb130e2..0000000 --- a/caffe2/operators/experimental/c10/schemas/stop_gradient.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace caffe2 { -namespace ops { - -C10_DECLARE_OP_SCHEMA(StopGradient); - -} // namespace ops -} // namespace caffe2 -- 2.7.4