From 9eaa419dd494f75bbabf48821a1d89da2942b400 Mon Sep 17 00:00:00 2001
From: "Efimov Alexander/AI Tools Lab/./Samsung Electronics"
Date: Fri, 10 Aug 2018 12:06:52 +0300
Subject: [PATCH] Soft backend: eigen library (#841)

Add a library with common support functions for operations

Signed-off-by: Efimov Alexander
---
 .../libs/backend/soft/include/cpp_ops/eigen.def | 28897 +++++++++++++++++++
 .../nnc/libs/backend/soft/src/cpp_generator.cpp |     3 +-
 2 files changed, 28899 insertions(+), 1 deletion(-)
 create mode 100644 contrib/nnc/libs/backend/soft/include/cpp_ops/eigen.def

diff --git a/contrib/nnc/libs/backend/soft/include/cpp_ops/eigen.def b/contrib/nnc/libs/backend/soft/include/cpp_ops/eigen.def
new file mode 100644
index 0000000..24f01de
--- /dev/null
+++ b/contrib/nnc/libs/backend/soft/include/cpp_ops/eigen.def
@@ -0,0 +1,28897 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2007-2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// #include "src/Core/util/DisableStupidWarnings.h"
+#ifndef EIGEN_WARNINGS_DISABLED
+#define EIGEN_WARNINGS_DISABLED
+#ifdef _MSC_VER
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma warning( push )
+  #endif
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+#elif defined __INTEL_COMPILER
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma warning push
+  #endif
+  #pragma warning disable 2196 279 1684 2259
+#elif defined __clang__
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma clang diagnostic push
+  #endif
+  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+#elif defined __GNUC__ && __GNUC__>=6
+  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+    #pragma GCC diagnostic push
+  #endif
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#if defined __NVCC__
+  #pragma diag_suppress code_is_unreachable
+  #pragma diag_suppress initialization_not_reachable
+  #pragma diag_suppress 1222
+  #pragma diag_suppress 2527
+  #pragma diag_suppress 2529
+  #pragma diag_suppress 2651
+  #pragma diag_suppress 2653
+  #pragma diag_suppress 2668
+  #pragma diag_suppress 2669
+  #pragma diag_suppress 2670
+  #pragma diag_suppress 2671
+  #pragma diag_suppress 2735
+  #pragma diag_suppress 2737
+#endif
+#endif
+// end #include "src/Core/util/DisableStupidWarnings.h"
+#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+  #ifdef __CUDACC__
+    #ifndef EIGEN_DONT_VECTORIZE
+      #define EIGEN_DONT_VECTORIZE
+    #endif
+    #define EIGEN_DEVICE_FUNC __host__ __device__
+    #include <cuda_runtime.h>
+  #else
+    #define EIGEN_DEVICE_FUNC
+  #endif
+#else
+  #define EIGEN_DEVICE_FUNC
+#endif
+#if defined(__CUDA_ARCH__) && defined(__NVCC__)
+  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
+  #define EIGEN_EXCEPTIONS
+#endif
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+// #include "src/Core/util/Macros.h"
+#ifndef EIGEN_MACROS_H
+#define 
EIGEN_MACROS_H +#define EIGEN_WORLD_VERSION 3 +#define EIGEN_MAJOR_VERSION 3 +#define EIGEN_MINOR_VERSION 4 +#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ + (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ + EIGEN_MINOR_VERSION>=z)))) +#ifdef __GNUC__ + #define EIGEN_COMP_GNUC 1 +#else + #define EIGEN_COMP_GNUC 0 +#endif +#if defined(__clang__) + #define EIGEN_COMP_CLANG (__clang_major__*100+__clang_minor__) +#else + #define EIGEN_COMP_CLANG 0 +#endif +#if defined(__llvm__) + #define EIGEN_COMP_LLVM 1 +#else + #define EIGEN_COMP_LLVM 0 +#endif +#if defined(__INTEL_COMPILER) + #define EIGEN_COMP_ICC __INTEL_COMPILER +#else + #define EIGEN_COMP_ICC 0 +#endif +#if defined(__MINGW32__) + #define EIGEN_COMP_MINGW 1 +#else + #define EIGEN_COMP_MINGW 0 +#endif +#if defined(__SUNPRO_CC) + #define EIGEN_COMP_SUNCC 1 +#else + #define EIGEN_COMP_SUNCC 0 +#endif +#if defined(_MSC_VER) + #define EIGEN_COMP_MSVC _MSC_VER +#else + #define EIGEN_COMP_MSVC 0 +#endif +#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) + #define EIGEN_COMP_MSVC_STRICT _MSC_VER +#else + #define EIGEN_COMP_MSVC_STRICT 0 +#endif +#if defined(__IBMCPP__) || defined(__xlc__) + #define EIGEN_COMP_IBM 1 +#else + #define EIGEN_COMP_IBM 0 +#endif +#if defined(__PGI) + #define EIGEN_COMP_PGI 1 +#else + #define EIGEN_COMP_PGI 0 +#endif +#if defined(__CC_ARM) || defined(__ARMCC_VERSION) + #define EIGEN_COMP_ARM 1 +#else + #define EIGEN_COMP_ARM 0 +#endif +#if defined(__EMSCRIPTEN__) + #define EIGEN_COMP_EMSCRIPTEN 1 +#else + #define EIGEN_COMP_EMSCRIPTEN 0 +#endif +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN) + #define EIGEN_COMP_GNUC_STRICT 1 +#else + #define EIGEN_COMP_GNUC_STRICT 0 +#endif +#if EIGEN_COMP_GNUC + #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x) + #define EIGEN_GNUC_AT_MOST(x,y) ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) +#define EIGEN_HAS_CXX11 1 +#else +#define EIGEN_HAS_CXX11 0 +#endif +#ifndef EIGEN_HAS_RVALUE_REFERENCES +#if EIGEN_MAX_CPP_VER>=11 && \ + (__has_feature(cxx_rvalue_references) || \ + (defined(__cplusplus) && __cplusplus >= 201103L) || \ + (EIGEN_COMP_MSVC >= 1600)) + #define EIGEN_HAS_RVALUE_REFERENCES 1 +#else + #define EIGEN_HAS_RVALUE_REFERENCES 0 +#endif +#endif +#ifndef EIGEN_HAS_C99_MATH +#if EIGEN_MAX_CPP_VER>=11 && \ + ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ + || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))) + #define EIGEN_HAS_C99_MATH 1 +#else + #define EIGEN_HAS_C99_MATH 0 +#endif +#endif +#ifndef EIGEN_HAS_STD_RESULT_OF +#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))) +#define EIGEN_HAS_STD_RESULT_OF 1 +#else +#define EIGEN_HAS_STD_RESULT_OF 0 +#endif +#endif +#ifndef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ + && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) +#define EIGEN_HAS_VARIADIC_TEMPLATES 1 +#else +#define EIGEN_HAS_VARIADIC_TEMPLATES 0 +#endif +#endif +#ifndef EIGEN_HAS_CONSTEXPR +#ifdef __CUDACC__ +#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && 
(EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) + #define EIGEN_HAS_CONSTEXPR 1 +#endif +#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))) +#define EIGEN_HAS_CONSTEXPR 1 +#endif +#ifndef EIGEN_HAS_CONSTEXPR +#define EIGEN_HAS_CONSTEXPR 0 +#endif +#endif +#ifndef EIGEN_HAS_CXX11_MATH + #if EIGEN_MAX_CPP_VER>=11 && ((__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \ + && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)) + #define EIGEN_HAS_CXX11_MATH 1 + #else + #define EIGEN_HAS_CXX11_MATH 0 + #endif +#endif +#ifndef EIGEN_HAS_CXX11_CONTAINERS + #if EIGEN_MAX_CPP_VER>=11 && \ + ((__cplusplus > 201103L) \ + || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ + || EIGEN_COMP_MSVC >= 1900) + #define EIGEN_HAS_CXX11_CONTAINERS 1 + #else + #define EIGEN_HAS_CXX11_CONTAINERS 0 + #endif +#endif +#ifndef EIGEN_HAS_CXX11_NOEXCEPT + #if EIGEN_MAX_CPP_VER>=11 && \ + (__has_feature(cxx_noexcept) \ + || (__cplusplus > 201103L) \ + || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \ + || EIGEN_COMP_MSVC >= 1900) + #define EIGEN_HAS_CXX11_NOEXCEPT 1 + #else + #define EIGEN_HAS_CXX11_NOEXCEPT 0 + #endif +#endif +#ifndef EIGEN_FAST_MATH +#define EIGEN_FAST_MATH 1 +#endif +#define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl; +#define EIGEN_CAT2(a,b) a ## b +#define EIGEN_CAT(a,b) EIGEN_CAT2(a,b) +#define EIGEN_COMMA , +#define EIGEN_MAKESTRING2(a) #a +#define EIGEN_MAKESTRING(a) EIGEN_MAKESTRING2(a) +#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC +#define EIGEN_STRONG_INLINE __forceinline +#else +#define EIGEN_STRONG_INLINE inline +#endif +#if EIGEN_GNUC_AT_LEAST(4,2) +#define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline +#else +#define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE +#endif +#if EIGEN_COMP_GNUC +#define EIGEN_DONT_INLINE __attribute__((noinline)) +#elif EIGEN_COMP_MSVC +#define EIGEN_DONT_INLINE __declspec(noinline) +#else +#define EIGEN_DONT_INLINE +#endif +#if EIGEN_COMP_GNUC +#define EIGEN_PERMISSIVE_EXPR __extension__ +#else +#define EIGEN_PERMISSIVE_EXPR +#endif +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#ifdef NDEBUG +# ifndef EIGEN_NO_DEBUG +# define EIGEN_NO_DEBUG +# endif +#endif +#ifdef EIGEN_NO_DEBUG + #define eigen_plain_assert(x) +#else + #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO + namespace Eigen { + namespace internal { + inline bool copy_bool(bool b) { return b; } + } + } + #define eigen_plain_assert(x) assert(x) + #else + #include + #include + namespace Eigen { + namespace internal { + namespace { + EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; } + } + inline void assert_fail(const char *condition, const char *function, const char *file, int line) + { + std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl; + abort(); + } + } + } + #define eigen_plain_assert(x) \ + do { \ + if(!Eigen::internal::copy_bool(x)) \ + Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, __FILE__, __LINE__); \ + } while(false) + #endif +#endif +#ifndef eigen_assert +#define eigen_assert(x) eigen_plain_assert(x) +#endif +#ifdef 
EIGEN_INTERNAL_DEBUGGING +#define eigen_internal_assert(x) eigen_assert(x) +#else +#define eigen_internal_assert(x) +#endif +#ifdef EIGEN_NO_DEBUG +#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x) +#else +#define EIGEN_ONLY_USED_FOR_DEBUG(x) +#endif +#ifndef EIGEN_NO_DEPRECATED_WARNING + #if EIGEN_COMP_GNUC + #define EIGEN_DEPRECATED __attribute__((deprecated)) + #elif EIGEN_COMP_MSVC + #define EIGEN_DEPRECATED __declspec(deprecated) + #else + #define EIGEN_DEPRECATED + #endif +#else + #define EIGEN_DEPRECATED +#endif +#if EIGEN_COMP_GNUC +#define EIGEN_UNUSED __attribute__((unused)) +#else +#define EIGEN_UNUSED +#endif +namespace Eigen { + namespace internal { + template EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {} + } +} +#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); +#if !defined(EIGEN_ASM_COMMENT) + #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) + #define EIGEN_ASM_COMMENT(X) __asm__("#" X) + #else + #define EIGEN_ASM_COMMENT(X) + #endif +#endif +#if (defined __CUDACC__) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) +#elif EIGEN_COMP_MSVC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) +#elif EIGEN_COMP_SUNCC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) +#else + #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler +#endif +#if defined(EIGEN_DONT_VECTORIZE) + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 +#elif defined(EIGEN_VECTORIZE_AVX512) + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 +#elif defined(__AVX__) + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 +#endif +#define EIGEN_MIN_ALIGN_BYTES 16 +#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. 
+#endif +#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) + #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES + #undef EIGEN_MAX_STATIC_ALIGN_BYTES + #endif + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 +#endif +#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #else + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + #endif + #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !EIGEN_COMP_SUNCC \ + && !EIGEN_OS_QNX + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 + #else + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 + #endif + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT + #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES + #else + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + #endif +#endif +#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 +#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#else +#define EIGEN_ALIGN_MAX +#endif +#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 +#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. +#endif +#ifdef EIGEN_DONT_ALIGN + #ifdef EIGEN_MAX_ALIGN_BYTES + #undef EIGEN_MAX_ALIGN_BYTES + #endif + #define EIGEN_MAX_ALIGN_BYTES 0 +#elif !defined(EIGEN_MAX_ALIGN_BYTES) + #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#endif +#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#else +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES +#endif +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif +#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD + #define EIGEN_RESTRICT +#endif +#ifndef EIGEN_RESTRICT + #define EIGEN_RESTRICT __restrict +#endif +#ifndef EIGEN_STACK_ALLOCATION_LIMIT +#define EIGEN_STACK_ALLOCATION_LIMIT 131072 +#endif +#ifndef EIGEN_DEFAULT_IO_FORMAT +#ifdef EIGEN_MAKING_DOCS +#define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat(3, 0, " ", "\n", "", "") +#else +#define EIGEN_DEFAULT_IO_FORMAT Eigen::IOFormat() +#endif +#endif +#define EIGEN_EMPTY +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) + #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator =; +#elif EIGEN_COMP_CLANG + #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator =; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other) { Base::operator=(other.derived()); return *this; } +#else + #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator =; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \ + { \ + Base::operator=(other); \ + return *this; \ + } +#endif +#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \ + typedef typename Eigen::internal::traits::Scalar Scalar; \ + typedef typename Eigen::NumTraits::Real RealScalar; \ + typedef typename 
Base::CoeffReturnType CoeffReturnType; \ + typedef typename Eigen::internal::ref_selector::type Nested; \ + typedef typename Eigen::internal::traits::StorageKind StorageKind; \ + typedef typename Eigen::internal::traits::StorageIndex StorageIndex; \ + enum { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ + ColsAtCompileTime = Eigen::internal::traits::ColsAtCompileTime, \ + Flags = Eigen::internal::traits::Flags, \ + SizeAtCompileTime = Base::SizeAtCompileTime, \ + MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \ + IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \ + using Base::derived; \ + using Base::const_cast_derived; +#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \ + EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \ + typedef typename Base::PacketScalar PacketScalar; +#define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b) +#define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b) +#define EIGEN_SIZE_MIN_PREFER_DYNAMIC(a,b) (((int)a == 0 || (int)b == 0) ? 0 \ + : ((int)a == 1 || (int)b == 1) ? 1 \ + : ((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \ + : ((int)a <= (int)b) ? (int)a : (int)b) +#define EIGEN_SIZE_MIN_PREFER_FIXED(a,b) (((int)a == 0 || (int)b == 0) ? 0 \ + : ((int)a == 1 || (int)b == 1) ? 1 \ + : ((int)a == Dynamic && (int)b == Dynamic) ? Dynamic \ + : ((int)a == Dynamic) ? (int)b \ + : ((int)b == Dynamic) ? (int)a \ + : ((int)a <= (int)b) ? (int)a : (int)b) +#define EIGEN_SIZE_MAX(a,b) (((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \ + : ((int)a >= (int)b) ? (int)a : (int)b) +#define EIGEN_LOGICAL_XOR(a,b) (((a) || (b)) && !((a) && (b))) +#define EIGEN_IMPLIES(a,b) (!(a) || (b)) +#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \ + CwiseBinaryOp< \ + EIGEN_CAT(EIGEN_CAT(internal::scalar_,OPNAME),_op)< \ + typename internal::traits::Scalar, \ + typename internal::traits::Scalar \ + >, \ + const LHS, \ + const RHS \ + > +#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,OPNAME) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME) \ + (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const \ + { \ + return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,OPNAME)(derived(), other.derived()); \ + } +#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,TYPEA,TYPEB) \ + (Eigen::internal::has_ReturnType > >::value) +#define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR,SCALAR,OPNAME) \ + CwiseBinaryOp::Scalar,SCALAR>, const EXPR, \ + const typename internal::plain_constant_type::type> +#define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR,EXPR,OPNAME) \ + CwiseBinaryOp::Scalar>, \ + const typename internal::plain_constant_type::type, const EXPR> +#if EIGEN_COMP_MSVC_STRICT<=1600 +#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type +#else +#define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X +#endif +#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ + template EIGEN_DEVICE_FUNC inline \ + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ + (METHOD)(const T& scalar) const { \ + typedef typename internal::promote_scalar_arg::type PromotedT; \ + return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedT,OPNAME)(derived(), \ + typename internal::plain_constant_type::type(derived().rows(), derived().cols(), internal::scalar_constant_op(scalar))); \ + } +#define 
EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ + template EIGEN_DEVICE_FUNC inline friend \ + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ + (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ + typedef typename internal::promote_scalar_arg::type PromotedT; \ + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT,Derived,OPNAME)( \ + typename internal::plain_constant_type::type(matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op(scalar)), matrix.derived()); \ + } +#define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD,OPNAME) \ + EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ + EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) +#ifdef EIGEN_EXCEPTIONS +# define EIGEN_THROW_X(X) throw X +# define EIGEN_THROW throw +# define EIGEN_TRY try +# define EIGEN_CATCH(X) catch (X) +#else +# ifdef __CUDA_ARCH__ +# define EIGEN_THROW_X(X) asm("trap;") +# define EIGEN_THROW asm("trap;") +# else +# define EIGEN_THROW_X(X) std::abort() +# define EIGEN_THROW std::abort() +# endif +# define EIGEN_TRY if (true) +# define EIGEN_CATCH(X) else +#endif +#if EIGEN_HAS_CXX11_NOEXCEPT +# define EIGEN_INCLUDE_TYPE_TRAITS +# define EIGEN_NOEXCEPT noexcept +# define EIGEN_NOEXCEPT_IF(x) noexcept(x) +# define EIGEN_NO_THROW noexcept(true) +# define EIGEN_EXCEPTION_SPEC(X) noexcept(false) +#else +# define EIGEN_NOEXCEPT +# define EIGEN_NOEXCEPT_IF(x) +# define EIGEN_NO_THROW throw() +# define EIGEN_EXCEPTION_SPEC(X) throw(X) +#endif +#endif +// end #include "src/Core/util/Macros.h" +#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) + #pragma GCC optimize ("-fno-ipa-cp-clone") +#endif +#include +#if EIGEN_MAX_ALIGN_BYTES==0 + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif +#if EIGEN_COMP_MSVC + #include + #if (EIGEN_COMP_MSVC >= 1500) + #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 + #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER + #endif + #endif +#else + #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) + #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC + #endif +#endif +#ifndef EIGEN_DONT_VECTORIZE + #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SSE + #define EIGEN_VECTORIZE_SSE2 + #ifdef __SSE3__ + #define EIGEN_VECTORIZE_SSE3 + #endif + #ifdef __SSSE3__ + #define EIGEN_VECTORIZE_SSSE3 + #endif + #ifdef __SSE4_1__ + #define EIGEN_VECTORIZE_SSE4_1 + #endif + #ifdef __SSE4_2__ + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX__ + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX2__ + #define EIGEN_VECTORIZE_AVX2 + #endif + #ifdef __FMA__ + #define EIGEN_VECTORIZE_FMA + #endif + #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512) + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_FMA + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #endif + extern "C" { + #if EIGEN_COMP_ICC >= 1110 + #include + #else + #include + #include + #include + #ifdef EIGEN_VECTORIZE_SSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_1 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_2 + #include + #endif + #if 
defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) + #include + #endif + #endif + } + #elif defined __VSX__ + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_VSX + #include + #undef bool + #undef vector + #undef pixel + #elif defined __ALTIVEC__ + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ALTIVEC + #include + #undef bool + #undef vector + #undef pixel + #elif (defined __ARM_NEON) || (defined __ARM_NEON__) + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include + #elif (defined __s390x__ && defined __VEC__) + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ZVECTOR + #include + #endif +#endif +#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) + #define EIGEN_HAS_FP16_C +#endif +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include + #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #endif +#endif +#if defined EIGEN_HAS_CUDA_FP16 + #include + #include +#endif +#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) + #define EIGEN_HAS_OPENMP +#endif +#ifdef EIGEN_HAS_OPENMP +#include +#endif +#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM +#define EIGEN_HAS_ERRNO +#endif +#ifdef EIGEN_HAS_ERRNO +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef EIGEN_INCLUDE_TYPE_TRAITS +#include +#endif +#ifdef EIGEN_DEBUG_ASSIGN +#include +#endif +#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE + #include +#endif +namespace Eigen { +inline static const char *SimdInstructionSetsInUse(void) { +#if defined(EIGEN_VECTORIZE_AVX512) + return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_1) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; +#elif defined(EIGEN_VECTORIZE_SSSE3) + return "SSE, SSE2, SSE3, SSSE3"; +#elif defined(EIGEN_VECTORIZE_SSE3) + return "SSE, SSE2, SSE3"; +#elif defined(EIGEN_VECTORIZE_SSE2) + return "SSE, SSE2"; +#elif defined(EIGEN_VECTORIZE_ALTIVEC) + return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_VSX) + return "VSX"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; +#elif defined(EIGEN_VECTORIZE_ZVECTOR) + return "S390X ZVECTOR"; +#else + return "None"; +#endif +} +} +#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT +#error Eigen2-support is only available up to version 3.2. 
Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information +#endif +namespace Eigen { +using std::size_t; +using std::ptrdiff_t; +} +// #include "src/Core/util/Constants.h" +#ifndef EIGEN_CONSTANTS_H +#define EIGEN_CONSTANTS_H +namespace Eigen { +const int Dynamic = -1; +const int DynamicIndex = 0xffffff; +const int Infinity = -1; +const int HugeCost = 10000; +const unsigned int RowMajorBit = 0x1; +const unsigned int EvalBeforeNestingBit = 0x2; +EIGEN_DEPRECATED +const unsigned int EvalBeforeAssigningBit = 0x4; +const unsigned int PacketAccessBit = 0x8; +#ifdef EIGEN_VECTORIZE +const unsigned int ActualPacketAccessBit = PacketAccessBit; +#else +const unsigned int ActualPacketAccessBit = 0x0; +#endif +const unsigned int LinearAccessBit = 0x10; +const unsigned int LvalueBit = 0x20; +const unsigned int DirectAccessBit = 0x40; +EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80; +const unsigned int NestByRefBit = 0x100; +const unsigned int NoPreferredStorageOrderBit = 0x200; +const unsigned int CompressedAccessBit = 0x400; +const unsigned int HereditaryBits = RowMajorBit + | EvalBeforeNestingBit; +enum UpLoType { + Lower=0x1, + Upper=0x2, + UnitDiag=0x4, + ZeroDiag=0x8, + UnitLower=UnitDiag|Lower, + UnitUpper=UnitDiag|Upper, + StrictlyLower=ZeroDiag|Lower, + StrictlyUpper=ZeroDiag|Upper, + SelfAdjoint=0x10, + Symmetric=0x20 +}; +enum AlignmentType { + Unaligned=0, + Aligned8=8, + Aligned16=16, + Aligned32=32, + Aligned64=64, + Aligned128=128, + AlignedMask=255, + Aligned=16, +#if EIGEN_MAX_ALIGN_BYTES==128 + AlignedMax = Aligned128 +#elif EIGEN_MAX_ALIGN_BYTES==64 + AlignedMax = Aligned64 +#elif EIGEN_MAX_ALIGN_BYTES==32 + AlignedMax = Aligned32 +#elif EIGEN_MAX_ALIGN_BYTES==16 + AlignedMax = Aligned16 +#elif EIGEN_MAX_ALIGN_BYTES==8 + AlignedMax = Aligned8 +#elif EIGEN_MAX_ALIGN_BYTES==0 + AlignedMax = Unaligned +#else +#error Invalid value for EIGEN_MAX_ALIGN_BYTES +#endif +}; +enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; +enum DirectionType { + Vertical, + Horizontal, + BothDirections +}; +enum TraversalType { + DefaultTraversal, + LinearTraversal, + InnerVectorizedTraversal, + LinearVectorizedTraversal, + SliceVectorizedTraversal, + InvalidTraversal, + AllAtOnceTraversal +}; +enum UnrollingType { + NoUnrolling, + InnerUnrolling, + CompleteUnrolling +}; +enum SpecializedType { + Specialized, + BuiltIn +}; +enum StorageOptions { + ColMajor = 0, + RowMajor = 0x1, + AutoAlign = 0, + DontAlign = 0x2 +}; +enum SideType { + OnTheLeft = 1, + OnTheRight = 2 +}; +enum NoChange_t { NoChange }; +enum Sequential_t { Sequential }; +enum Default_t { Default }; +enum AmbiVectorMode { + IsDense = 0, + IsSparse +}; +enum AccessorLevels { + ReadOnlyAccessors, + WriteAccessors, + DirectAccessors, + DirectWriteAccessors +}; +enum DecompositionOptions { + Pivoting = 0x01, + NoPivoting = 0x02, + ComputeFullU = 0x04, + ComputeThinU = 0x08, + ComputeFullV = 0x10, + ComputeThinV = 0x20, + EigenvaluesOnly = 0x40, + ComputeEigenvectors = 0x80, + EigVecMask = EigenvaluesOnly | ComputeEigenvectors, + Ax_lBx = 0x100, + ABx_lx = 0x200, + BAx_lx = 0x400, + GenEigMask = Ax_lBx | ABx_lx | BAx_lx +}; +enum QRPreconditioners { + NoQRPreconditioner, + HouseholderQRPreconditioner, + ColPivHouseholderQRPreconditioner, + FullPivHouseholderQRPreconditioner +}; +#ifdef Success +#error The preprocessor symbol 'Success' is defined, possibly by the X11 header file X.h +#endif +enum ComputationInfo { + Success = 0, + NumericalIssue = 1, + NoConvergence = 2, + InvalidInput = 3 +}; 
+enum TransformTraits {
+  Isometry = 0x1,
+  Affine = 0x2,
+  AffineCompact = 0x10 | Affine,
+  Projective = 0x20
+};
+namespace Architecture
+{
+  enum Type {
+    Generic = 0x0,
+    SSE = 0x1,
+    AltiVec = 0x2,
+    VSX = 0x3,
+    NEON = 0x4,
+#if defined EIGEN_VECTORIZE_SSE
+    Target = SSE
+#elif defined EIGEN_VECTORIZE_ALTIVEC
+    Target = AltiVec
+#elif defined EIGEN_VECTORIZE_VSX
+    Target = VSX
+#elif defined EIGEN_VECTORIZE_NEON
+    Target = NEON
+#else
+    Target = Generic
+#endif
+  };
+}
+enum ProductImplType
+{ DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+enum Action {GetAction, SetAction};
+struct Dense {};
+struct Sparse {};
+struct SolverStorage {};
+struct PermutationStorage {};
+struct TranspositionsStorage {};
+struct MatrixXpr {};
+struct ArrayXpr {};
+struct DenseShape { static std::string debugName() { return "DenseShape"; } };
+struct SolverShape { static std::string debugName() { return "SolverShape"; } };
+struct HomogeneousShape { static std::string debugName() { return "HomogeneousShape"; } };
+struct DiagonalShape { static std::string debugName() { return "DiagonalShape"; } };
+struct BandShape { static std::string debugName() { return "BandShape"; } };
+struct TriangularShape { static std::string debugName() { return "TriangularShape"; } };
+struct SelfAdjointShape { static std::string debugName() { return "SelfAdjointShape"; } };
+struct PermutationShape { static std::string debugName() { return "PermutationShape"; } };
+struct TranspositionsShape { static std::string debugName() { return "TranspositionsShape"; } };
+struct SparseShape { static std::string debugName() { return "SparseShape"; } };
+namespace internal {
+struct IndexBased {};
+struct IteratorBased {};
+enum ComparisonName {
+  cmp_EQ = 0,
+  cmp_LT = 1,
+  cmp_LE = 2,
+  cmp_UNORD = 3,
+  cmp_NEQ = 4,
+  cmp_GT = 5,
+  cmp_GE = 6
+};
+}
+}
+#endif
+// end #include "src/Core/util/Constants.h"
+// #include "src/Core/util/Meta.h"
+#ifndef EIGEN_META_H
+#define EIGEN_META_H
+#if defined(__CUDA_ARCH__)
+#include <math_constants.h>
+#include <cfloat>
+#endif
+#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
+#include <cstdint>
+#endif
+namespace Eigen {
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
+namespace internal {
+#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
+typedef std::intptr_t IntPtr;
+typedef std::uintptr_t UIntPtr;
+#else
+typedef std::ptrdiff_t IntPtr;
+typedef std::size_t UIntPtr;
+#endif
+struct true_type { enum { value = 1 }; };
+struct false_type { enum { value = 0 }; };
+template<bool Condition, typename Then, typename Else>
+struct conditional { typedef Then type; };
+template<typename Then, typename Else>
+struct conditional <false, Then, Else> { typedef Else type; };
+template<typename T, typename U> struct is_same { enum { value = 0 }; };
+template<typename T> struct is_same<T,T> { enum { value = 1 }; };
+template<typename T> struct remove_reference { typedef T type; };
+template<typename T> struct remove_reference<T&> { typedef T type; };
+template<typename T> struct remove_pointer { typedef T type; };
+template<typename T> struct remove_pointer<T*> { typedef T type; };
+template<typename T> struct remove_pointer<T*const> { typedef T type; };
+template <class T> struct remove_const { typedef T type; };
+template <class T> struct remove_const<const T> { typedef T type; };
+template <class T> struct remove_const<const T[]> { typedef T type[]; };
+template <class T, unsigned int Size> struct remove_const<const T[Size]> { typedef T type[Size]; };
+template<typename T> struct remove_all { typedef T type; };
+template<typename T> struct remove_all<const T> { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T const&> { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T&> { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T const*> { typedef typename remove_all<T>::type type; };
+template<typename T> struct remove_all<T*> { typedef typename remove_all<T>::type type; };
+template<typename T> struct is_arithmetic { enum { value = false }; };
+template<> struct is_arithmetic<float> { enum { value = true }; };
+template<> struct is_arithmetic<double> { enum { value = true }; };
+template<> struct is_arithmetic<long double> { enum { value = true }; };
+template<> struct is_arithmetic<bool> { enum { value = true }; };
+template<> struct is_arithmetic<char> { enum { value = true }; };
+template<> struct is_arithmetic<signed char> { enum { value = true }; };
+template<> struct is_arithmetic<unsigned char> { enum { value = true }; };
+template<> struct is_arithmetic<signed short> { enum { value = true }; };
+template<> struct is_arithmetic<unsigned short>{ enum { value = true }; };
+template<> struct is_arithmetic<signed int> { enum { value = true }; };
+template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
+template<> struct is_arithmetic<signed long> { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
+template<typename T> struct is_integral { enum { value = false }; };
+template<> struct is_integral<bool> { enum { value = true }; };
+template<> struct is_integral<char> { enum { value = true }; };
+template<> struct is_integral<signed char> { enum { value = true }; };
+template<> struct is_integral<unsigned char> { enum { value = true }; };
+template<> struct is_integral<signed short> { enum { value = true }; };
+template<> struct is_integral<unsigned short> { enum { value = true }; };
+template<> struct is_integral<signed int> { enum { value = true }; };
+template<> struct is_integral<unsigned int> { enum { value = true }; };
+template<> struct is_integral<signed long> { enum { value = true }; };
+template<> struct is_integral<unsigned long> { enum { value = true }; };
+template <typename T> struct add_const { typedef const T type; };
+template <typename T> struct add_const<T&> { typedef T& type; };
+template <typename T> struct is_const { enum { value = 0 }; };
+template <typename T> struct is_const<T const> { enum { value = 1 }; };
+template<typename T> struct add_const_on_value_type { typedef const T type; };
+template<typename T> struct add_const_on_value_type<T&> { typedef T const& type; };
+template<typename T> struct add_const_on_value_type<T*> { typedef T const* type; };
+template<typename T> struct add_const_on_value_type<T* const> { typedef T const* const type; };
+template<typename T> struct add_const_on_value_type<T const* const> { typedef T const* const type; };
+template<typename From, typename To>
+struct is_convertible_impl
+{
+private:
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no {int a[2];};
+  static yes test(const To&, int);
+  static no test(any_conversion, ...);
+public:
+  static From ms_from;
+#ifdef __INTEL_COMPILER
+  #pragma warning push
+  #pragma warning ( disable : 2259 )
+#endif
+  enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+#ifdef __INTEL_COMPILER
+  #pragma warning pop
+#endif
+};
+template<typename From, typename To>
+struct is_convertible
+{
+  enum { value = is_convertible_impl<typename remove_all<From>::type,
+                                     typename remove_all<To>::type>::value };
+};
+template<bool Condition, typename T=void> struct enable_if;
+template<typename T> struct enable_if<true,T>
+{ typedef T type; };
+#if defined(__CUDA_ARCH__)
+#if !defined(__FLT_EPSILON__)
+#define __FLT_EPSILON__ FLT_EPSILON
+#define __DBL_EPSILON__ DBL_EPSILON
+#endif
+namespace device {
+template<typename T> struct numeric_limits
+{
+  EIGEN_DEVICE_FUNC
+  static T epsilon() { return 0; }
+  static T (max)() { assert(false && "Highest not supported for this type"); }
+  static T (min)() { assert(false && "Lowest not supported for this type"); }
+  static T infinity() { assert(false && "Infinity not supported for this type"); }
+  static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); }
+};
+template<> 
struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static float epsilon() { return __FLT_EPSILON__; } + EIGEN_DEVICE_FUNC + static float (max)() { return CUDART_MAX_NORMAL_F; } + EIGEN_DEVICE_FUNC + static float (min)() { return FLT_MIN; } + EIGEN_DEVICE_FUNC + static float infinity() { return CUDART_INF_F; } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { return CUDART_NAN_F; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static double epsilon() { return __DBL_EPSILON__; } + EIGEN_DEVICE_FUNC + static double (max)() { return DBL_MAX; } + EIGEN_DEVICE_FUNC + static double (min)() { return DBL_MIN; } + EIGEN_DEVICE_FUNC + static double infinity() { return CUDART_INF; } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { return CUDART_NAN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static int (max)() { return INT_MAX; } + EIGEN_DEVICE_FUNC + static int (min)() { return INT_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned int (max)() { return UINT_MAX; } + EIGEN_DEVICE_FUNC + static unsigned int (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static long (max)() { return LONG_MAX; } + EIGEN_DEVICE_FUNC + static long (min)() { return LONG_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long (max)() { return ULONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static long long (max)() { return LLONG_MAX; } + EIGEN_DEVICE_FUNC + static long long (min)() { return LLONG_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long long (max)() { return ULLONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long long (min)() { return 0; } +}; +} +#endif +class noncopyable +{ + EIGEN_DEVICE_FUNC noncopyable(const noncopyable&); + EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&); +protected: + EIGEN_DEVICE_FUNC noncopyable() {} + EIGEN_DEVICE_FUNC ~noncopyable() {} +}; +#if EIGEN_HAS_STD_RESULT_OF +template struct result_of { + typedef typename std::result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template struct result_of { }; +struct has_none {int a[1];}; +struct has_std_result_type {int a[2];}; +struct has_tr1_result {int a[3];}; +template +struct unary_result_of_select {typedef typename internal::remove_all::type type;}; +template +struct unary_result_of_select {typedef typename Func::result_type type;}; +template +struct unary_result_of_select {typedef typename Func::template result::type type;}; +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename unary_result_of_select::type type; +}; +template +struct binary_result_of_select {typedef typename internal::remove_all::type type;}; +template +struct binary_result_of_select 
+{typedef typename Func::result_type type;}; +template +struct binary_result_of_select +{typedef typename Func::template result::type type;}; +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename binary_result_of_select::type type; +}; +template +struct ternary_result_of_select {typedef typename internal::remove_all::type type;}; +template +struct ternary_result_of_select +{typedef typename Func::result_type type;}; +template +struct ternary_result_of_select +{typedef typename Func::template result::type type;}; +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename ternary_result_of_select::type type; +}; +#endif +struct meta_yes { char a[1]; }; +struct meta_no { char a[2]; }; +template +struct has_ReturnType +{ + template static meta_yes testFunctor(typename C::ReturnType const *); + template static meta_no testFunctor(...); + enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; +}; +template const T* return_ptr(); +template +struct has_nullary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()())>0)>::type * = 0); + static meta_no testFunctor(...); + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; +template +struct has_unary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0)))>0)>::type * = 0); + static meta_no testFunctor(...); + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; +template +struct has_binary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0); + static meta_no testFunctor(...); + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; +template Y))) > +class meta_sqrt +{ + enum { + MidX = (InfX+SupX)/2, + TakeInf = MidX*MidX > Y ? 1 : 0, + NewInf = int(TakeInf) ? InfX : int(MidX), + NewSup = int(TakeInf) ? int(MidX) : SupX + }; + public: + enum { ret = meta_sqrt::ret }; +}; +template +class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? 
SupX : InfX }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; +template +struct meta_least_common_multiple +{ + enum { ret = A*K }; +}; +template struct scalar_product_traits +{ + enum { Defined = 0 }; +}; +} +namespace numext { +#if defined(__CUDA_ARCH__) +template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } +#else +template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } +#endif +#if defined(__CUDA_ARCH__) +using internal::device::numeric_limits; +#else +using std::numeric_limits; +#endif +template +T div_ceil(const T &a, const T &b) +{ + return (a+b-1) / b; +} +} +} +#endif +// end #include "src/Core/util/Meta.h" +// #include "src/Core/util/ForwardDeclarations.h" +#ifndef EIGEN_FORWARDDECLARATIONS_H +#define EIGEN_FORWARDDECLARATIONS_H +namespace Eigen { +namespace internal { +template struct traits; +template struct traits : traits {}; +template struct has_direct_access +{ + enum { ret = (traits::Flags & DirectAccessBit) ? 1 : 0 }; +}; +template struct accessors_level +{ + enum { has_direct_access = (traits::Flags & DirectAccessBit) ? 1 : 0, + has_write_access = (traits::Flags & LvalueBit) ? 1 : 0, + value = has_direct_access ? (has_write_access ? DirectWriteAccessors : DirectAccessors) + : (has_write_access ? WriteAccessors : ReadOnlyAccessors) + }; +}; +template struct evaluator_traits; +template< typename T> struct evaluator; +} +template struct NumTraits; +template struct EigenBase; +template class DenseBase; +template class PlainObjectBase; +template::value > +class DenseCoeffsBase; +template class Matrix; +template class MatrixBase; +template class ArrayBase; +template class Flagged; +template class StorageBase > class NoAlias; +template class NestByValue; +template class ForceAlignedAccess; +template class SwapWrapper; +template class Block; +template class VectorBlock; +template class Transpose; +template class Conjugate; +template class CwiseNullaryOp; +template class CwiseUnaryOp; +template class CwiseUnaryView; +template class CwiseBinaryOp; +template class CwiseTernaryOp; +template class Solve; +template class Inverse; +template class Product; +template class DiagonalBase; +template class DiagonalWrapper; +template class DiagonalMatrix; +template class DiagonalProduct; +template class Diagonal; +template class PermutationMatrix; +template class Transpositions; +template class PermutationBase; +template class TranspositionsBase; +template class PermutationWrapper; +template class TranspositionsWrapper; +template::has_write_access ? 
WriteAccessors : ReadOnlyAccessors +> class MapBase; +template class Stride; +template class InnerStride; +template class OuterStride; +template > class Map; +template class RefBase; +template,OuterStride<> >::type > class Ref; +template class TriangularBase; +template class TriangularView; +template class SelfAdjointView; +template class SparseView; +template class WithFormat; +template struct CommaInitializer; +template class ReturnByValue; +template class ArrayWrapper; +template class SolverBase; +template class InnerIterator; +namespace internal { +template struct kernel_retval_base; +template struct kernel_retval; +template struct image_retval_base; +template struct image_retval; +} +namespace internal { +template class BandMatrix; +} +namespace internal { +template struct product_type; +template struct EnableIf; +template< typename T, + int ProductTag = internal::product_type::ret, + typename LhsShape = typename evaluator_traits::Shape, + typename RhsShape = typename evaluator_traits::Shape, + typename LhsScalar = typename traits::Scalar, + typename RhsScalar = typename traits::Scalar + > struct product_evaluator; +} +template::value> +struct ProductReturnType; +template struct LazyProductReturnType; +namespace internal { +template struct conj_helper; +template struct scalar_sum_op; +template struct scalar_difference_op; +template struct scalar_conj_product_op; +template struct scalar_min_op; +template struct scalar_max_op; +template struct scalar_opposite_op; +template struct scalar_conjugate_op; +template struct scalar_real_op; +template struct scalar_imag_op; +template struct scalar_abs_op; +template struct scalar_abs2_op; +template struct scalar_sqrt_op; +template struct scalar_rsqrt_op; +template struct scalar_exp_op; +template struct scalar_log_op; +template struct scalar_cos_op; +template struct scalar_sin_op; +template struct scalar_acos_op; +template struct scalar_asin_op; +template struct scalar_tan_op; +template struct scalar_inverse_op; +template struct scalar_square_op; +template struct scalar_cube_op; +template struct scalar_cast_op; +template struct scalar_random_op; +template struct scalar_constant_op; +template struct scalar_identity_op; +template struct scalar_sign_op; +template struct scalar_pow_op; +template struct scalar_hypot_op; +template struct scalar_product_op; +template struct scalar_quotient_op; +template struct scalar_lgamma_op; +template struct scalar_digamma_op; +template struct scalar_erf_op; +template struct scalar_erfc_op; +template struct scalar_igamma_op; +template struct scalar_igammac_op; +template struct scalar_zeta_op; +template struct scalar_betainc_op; +} +struct IOFormat; +template class Array; +template class Select; +template class PartialReduxExpr; +template class VectorwiseOp; +template class Replicate; +template class Reverse; +template class FullPivLU; +template class PartialPivLU; +namespace internal { +template struct inverse_impl; +} +template class HouseholderQR; +template class ColPivHouseholderQR; +template class FullPivHouseholderQR; +template class CompleteOrthogonalDecomposition; +template class JacobiSVD; +template class BDCSVD; +template class LLT; +template class LDLT; +template class HouseholderSequence; +template class JacobiRotation; +template class RotationBase; +template class Cross; +template class QuaternionBase; +template class Rotation2D; +template class AngleAxis; +template class Translation; +template class AlignedBox; +template class Quaternion; +template class Transform; +template class ParametrizedLine; 
+template class Hyperplane; +template class UniformScaling; +template class Homogeneous; +template class SparseMatrixBase; +template struct MatrixExponentialReturnValue; +template class MatrixFunctionReturnValue; +template class MatrixSquareRootReturnValue; +template class MatrixLogarithmReturnValue; +template class MatrixPowerReturnValue; +template class MatrixComplexPowerReturnValue; +namespace internal { +template +struct stem_function +{ + typedef std::complex::Real> ComplexScalar; + typedef ComplexScalar type(ComplexScalar, int); +}; +} +} +#endif +// end #include "src/Core/util/ForwardDeclarations.h" +// #include "src/Core/util/StaticAssert.h" +#ifndef EIGEN_STATIC_ASSERT_H +#define EIGEN_STATIC_ASSERT_H +#ifndef EIGEN_NO_STATIC_ASSERT + #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) + #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); + #else + namespace Eigen { + namespace internal { + template + struct static_assertion {}; + template<> + struct static_assertion + { + enum { + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX, + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES, + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES, + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, + OUT_OF_RANGE_ACCESS, + YOU_MADE_A_PROGRAMMING_MISTAKE, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, + EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR, + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR, + UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC, + THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED, + NUMERIC_TYPE_MUST_BE_REAL, + COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED, + WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED, + THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE, + INVALID_MATRIX_PRODUCT, + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS, + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION, + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES, + THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES, + INVALID_MATRIX_TEMPLATE_PARAMETERS, + INVALID_MATRIXBASE_TEMPLATE_PARAMETERS, + BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, + THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, + YOU_ALREADY_SPECIFIED_THIS_STRIDE, + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD, + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1, + THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS, + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES, + YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION, + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY, + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT, + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS, + THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL, + 
THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES, + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED, + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED, + THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE, + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, + OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, + IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, + STORAGE_LAYOUT_DOES_NOT_MATCH, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, + THIS_TYPE_IS_NOT_SUPPORTED, + STORAGE_KIND_MUST_MATCH, + STORAGE_INDEX_MUST_MATCH, + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY + }; + }; + } + } + #if EIGEN_COMP_MSVC + #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \ + {Eigen::internal::static_assertion::MSG;} + #else + #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \ + if (Eigen::internal::static_assertion(CONDITION)>::MSG) {} + #endif + #endif +#else + #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); +#endif +#define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \ + EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \ + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX) +#define EIGEN_STATIC_ASSERT_FIXED_SIZE(TYPE) \ + EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime!=Eigen::Dynamic, \ + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR) +#define EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(TYPE) \ + EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime==Eigen::Dynamic, \ + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR) +#define EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(TYPE, SIZE) \ + EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime && TYPE::SizeAtCompileTime==SIZE, \ + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE) +#define EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(TYPE, ROWS, COLS) \ + EIGEN_STATIC_ASSERT(TYPE::RowsAtCompileTime==ROWS && TYPE::ColsAtCompileTime==COLS, \ + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE) +#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \ + EIGEN_STATIC_ASSERT( \ + (int(TYPE0::SizeAtCompileTime)==Eigen::Dynamic \ + || int(TYPE1::SizeAtCompileTime)==Eigen::Dynamic \ + || int(TYPE0::SizeAtCompileTime)==int(TYPE1::SizeAtCompileTime)),\ + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES) +#define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \ + ( \ + (int(Eigen::internal::size_of_xpr_at_compile_time::ret)==0 && int(Eigen::internal::size_of_xpr_at_compile_time::ret)==0) \ + || (\ + (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \ + || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \ + || int(TYPE0::RowsAtCompileTime)==int(TYPE1::RowsAtCompileTime)) \ + && (int(TYPE0::ColsAtCompileTime)==Eigen::Dynamic \ + || int(TYPE1::ColsAtCompileTime)==Eigen::Dynamic \ + || int(TYPE0::ColsAtCompileTime)==int(TYPE1::ColsAtCompileTime))\ + ) \ + ) +#define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \ + EIGEN_STATIC_ASSERT(!NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) +#define EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(TYPE0,TYPE1) \ + EIGEN_STATIC_ASSERT( \ + EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1),\ + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES) +#define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \ + EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \ + (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \ + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS) +#define EIGEN_STATIC_ASSERT_LVALUE(Derived) \ + 
EIGEN_STATIC_ASSERT(Eigen::internal::is_lvalue::value, \ + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY) +#define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \ + EIGEN_STATIC_ASSERT((Eigen::internal::is_same::XprKind, ArrayXpr>::value), \ + THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES) +#define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \ + EIGEN_STATIC_ASSERT((Eigen::internal::is_same::XprKind, \ + typename Eigen::internal::traits::XprKind \ + >::value), \ + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES) +#define EIGEN_INTERNAL_CHECK_COST_VALUE(C) \ + EIGEN_STATIC_ASSERT((C)>=0 && (C)<=HugeCost*HugeCost, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE); +#endif +// end #include "src/Core/util/StaticAssert.h" +// #include "src/Core/util/XprHelper.h" +#ifndef EIGEN_XPRHELPER_H +#define EIGEN_XPRHELPER_H +#if EIGEN_COMP_GNUC && !EIGEN_GNUC_AT(4,3) + #define EIGEN_EMPTY_STRUCT_CTOR(X) \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X() {} \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X(const X& ) {} +#else + #define EIGEN_EMPTY_STRUCT_CTOR(X) +#endif +namespace Eigen { +namespace internal { +template +EIGEN_DEVICE_FUNC +inline IndexDest convert_index(const IndexSrc& idx) { + eigen_internal_assert(idx <= NumTraits::highest() && "Index value to big for target type"); + return IndexDest(idx); +} +template +struct promote_scalar_arg; +template +struct promote_scalar_arg +{ + typedef T type; +}; +template::value, + bool IsSafe = NumTraits::IsInteger || !NumTraits::IsInteger> +struct promote_scalar_arg_unsupported; +template +struct promote_scalar_arg : promote_scalar_arg_unsupported::Literal> {}; +template +struct promote_scalar_arg_unsupported +{ + typedef PromotedType type; +}; +template +struct promote_scalar_arg_unsupported + : promote_scalar_arg_unsupported +{}; +template +struct promote_scalar_arg_unsupported {}; +template +struct promote_scalar_arg_unsupported {}; +class no_assignment_operator +{ + private: + no_assignment_operator& operator=(const no_assignment_operator&); +}; +template +struct promote_index_type +{ + typedef typename conditional<(sizeof(I1)::type type; +}; +template class variable_if_dynamic +{ + public: + EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} +}; +template class variable_if_dynamic +{ + T m_value; + EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); } + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } +}; +template class variable_if_dynamicindex +{ + public: + EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} +}; +template class variable_if_dynamicindex +{ + T m_value; + EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); } + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {} + EIGEN_DEVICE_FUNC T 
EIGEN_STRONG_INLINE value() const { return m_value; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } +}; +template struct functor_traits +{ + enum + { + Cost = 10, + PacketAccess = false, + IsRepeatable = false + }; +}; +template struct packet_traits; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1 + }; +}; +template::size)==0 || is_same::half>::value> +struct find_best_packet_helper; +template< int Size, typename PacketType> +struct find_best_packet_helper +{ + typedef PacketType type; +}; +template +struct find_best_packet_helper +{ + typedef typename find_best_packet_helper::half>::type type; +}; +template +struct find_best_packet +{ + typedef typename find_best_packet_helper::type>::type type; +}; +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 +template +struct compute_default_alignment_helper +{ + enum { value = 0 }; +}; +template +struct compute_default_alignment_helper +{ + enum { value = AlignmentBytes }; +}; +template +struct compute_default_alignment_helper +{ + enum { value = compute_default_alignment_helper::value }; +}; +#else +template +struct compute_default_alignment_helper +{ + enum { value = 0 }; +}; +#endif +template struct compute_default_alignment { + enum { value = compute_default_alignment_helper::value }; +}; +template struct compute_default_alignment { + enum { value = EIGEN_MAX_ALIGN_BYTES }; +}; +template class make_proper_matrix_type +{ + enum { + IsColVector = _Cols==1 && _Rows!=1, + IsRowVector = _Rows==1 && _Cols!=1, + Options = IsColVector ? (_Options | ColMajor) & ~RowMajor + : IsRowVector ? (_Options | RowMajor) & ~ColMajor + : _Options + }; + public: + typedef Matrix<_Scalar, _Rows, _Cols, Options, _MaxRows, _MaxCols> type; +}; +template +class compute_matrix_flags +{ + enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 }; + public: + enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit }; +}; +template struct size_at_compile_time +{ + enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols }; +}; +template struct size_of_xpr_at_compile_time +{ + enum { ret = size_at_compile_time::RowsAtCompileTime,traits::ColsAtCompileTime>::ret }; +}; +template::StorageKind> struct plain_matrix_type; +template struct plain_matrix_type_dense; +template struct plain_matrix_type +{ + typedef typename plain_matrix_type_dense::XprKind, traits::Flags>::type type; +}; +template struct plain_matrix_type +{ + typedef typename T::PlainObject type; +}; +template struct plain_matrix_type_dense +{ + typedef Matrix::Scalar, + traits::RowsAtCompileTime, + traits::ColsAtCompileTime, + AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor), + traits::MaxRowsAtCompileTime, + traits::MaxColsAtCompileTime + > type; +}; +template struct plain_matrix_type_dense +{ + typedef Array::Scalar, + traits::RowsAtCompileTime, + traits::ColsAtCompileTime, + AutoAlign | (Flags&RowMajorBit ? 
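[Editor's note] A sketch of the storage trick behind variable_if_dynamic above: a dimension known at compile time occupies no storage, while Dynamic falls back to a runtime field. This uses the internal API exactly as defined in this hunk:

#include <Eigen/Core>
#include <cstddef>
#include <iostream>

int main()
{
  // Compile-time size: the object is empty, value() returns the constant.
  Eigen::internal::variable_if_dynamic<std::ptrdiff_t, 3> fixed(3);
  // Runtime size: the object actually stores the value.
  Eigen::internal::variable_if_dynamic<std::ptrdiff_t, Eigen::Dynamic> dyn(7);
  std::cout << sizeof(fixed) << " " << fixed.value() << "\n";  // 1 3
  std::cout << sizeof(dyn)   << " " << dyn.value()   << "\n";  // 8 7 (on LP64)
}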
RowMajor : ColMajor), + traits::MaxRowsAtCompileTime, + traits::MaxColsAtCompileTime + > type; +}; +template::StorageKind> struct eval; +template struct eval +{ + typedef typename plain_matrix_type::type type; +}; +template struct eval +{ + typedef typename plain_matrix_type::type type; +}; +template +struct eval, Dense> +{ + typedef const Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type; +}; +template +struct eval, Dense> +{ + typedef const Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type; +}; +template::StorageKind> struct plain_object_eval; +template +struct plain_object_eval +{ + typedef typename plain_matrix_type_dense::XprKind, evaluator::Flags>::type type; +}; +template struct plain_matrix_type_column_major +{ + enum { Rows = traits::RowsAtCompileTime, + Cols = traits::ColsAtCompileTime, + MaxRows = traits::MaxRowsAtCompileTime, + MaxCols = traits::MaxColsAtCompileTime + }; + typedef Matrix::Scalar, + Rows, + Cols, + (MaxRows==1&&MaxCols!=1) ? RowMajor : ColMajor, + MaxRows, + MaxCols + > type; +}; +template struct plain_matrix_type_row_major +{ + enum { Rows = traits::RowsAtCompileTime, + Cols = traits::ColsAtCompileTime, + MaxRows = traits::MaxRowsAtCompileTime, + MaxCols = traits::MaxColsAtCompileTime + }; + typedef Matrix::Scalar, + Rows, + Cols, + (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor, + MaxRows, + MaxCols + > type; +}; +template +struct ref_selector +{ + typedef typename conditional< + bool(traits::Flags & NestByRefBit), + T const&, + const T + >::type type; + typedef typename conditional< + bool(traits::Flags & NestByRefBit), + T &, + T + >::type non_const_type; +}; +template +struct transfer_constness +{ + typedef typename conditional< + bool(internal::is_const::value), + typename internal::add_const_on_value_type::type, + T2 + >::type type; +}; +template::type> struct nested_eval +{ + enum { + ScalarReadCost = NumTraits::Scalar>::ReadCost, + CoeffReadCost = evaluator::CoeffReadCost, + NAsInteger = n == Dynamic ? 
HugeCost : n, + CostEval = (NAsInteger+1) * ScalarReadCost + CoeffReadCost, + CostNoEval = NAsInteger * CoeffReadCost, + Evaluate = (int(evaluator::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval)) + }; + typedef typename conditional::type>::type type; +}; +template +EIGEN_DEVICE_FUNC +inline T* const_cast_ptr(const T* ptr) +{ + return const_cast(ptr); +} +template::XprKind> +struct dense_xpr_base +{ +}; +template +struct dense_xpr_base +{ + typedef MatrixBase type; +}; +template +struct dense_xpr_base +{ + typedef ArrayBase type; +}; +template::XprKind, typename StorageKind = typename traits::StorageKind> +struct generic_xpr_base; +template +struct generic_xpr_base +{ + typedef typename dense_xpr_base::type type; +}; +template struct cast_return_type +{ + typedef typename XprType::Scalar CurrentScalarType; + typedef typename remove_all::type _CastType; + typedef typename _CastType::Scalar NewScalarType; + typedef typename conditional::value, + const XprType&,CastType>::type type; +}; +template struct promote_storage_type; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct promote_storage_type +{ + typedef A ret; +}; +template struct cwise_promote_storage_type; +template struct cwise_promote_storage_type { typedef A ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Dense ret; }; +template struct cwise_promote_storage_type { typedef Sparse ret; }; +template struct cwise_promote_storage_type { typedef Sparse ret; }; +template struct cwise_promote_storage_order { + enum { value = LhsOrder }; +}; +template struct cwise_promote_storage_order { enum { value = RhsOrder }; }; +template struct cwise_promote_storage_order { enum { value = LhsOrder }; }; +template struct cwise_promote_storage_order { enum { value = Order }; }; +template struct product_promote_storage_type; +template struct product_promote_storage_type { typedef A ret;}; +template struct product_promote_storage_type { typedef Dense ret;}; +template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef A ret; }; +template struct product_promote_storage_type { typedef B ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef A ret; }; +template struct product_promote_storage_type { typedef B ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; +template struct product_promote_storage_type { typedef Dense ret; }; +template +struct plain_row_type +{ + typedef Matrix MatrixRowType; + typedef Array ArrayRowType; + typedef typename conditional< + is_same< typename traits::XprKind, MatrixXpr >::value, + MatrixRowType, + ArrayRowType + >::type type; +}; +template +struct plain_col_type +{ + typedef Matrix MatrixColType; + typedef Array ArrayColType; + typedef typename conditional< + is_same< typename traits::XprKind, MatrixXpr >::value, + MatrixColType, + ArrayColType + >::type type; +}; +template +struct plain_diag_type +{ + enum { diag_size = EIGEN_SIZE_MIN_PREFER_DYNAMIC(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime), + max_diag_size = 
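[Editor's note] nested_eval above is the cost model deciding whether an expression used n times inside a larger expression should be nested by reference or evaluated into a temporary first. A small probe, assuming the cost model behaves as sketched:

#include <Eigen/Core>
#include <type_traits>
#include <iostream>

int main()
{
  typedef Eigen::MatrixXf Mat;
  typedef Eigen::Product<Mat, Mat> Prod;  // the expression type of a * b
  // Products set EvalBeforeNestingBit, so nesting one should evaluate it
  // into a plain temporary matrix first:
  std::cout << std::is_same<Eigen::internal::nested_eval<Prod, 1>::type, Mat>::value
            << "\n";  // expected: 1
}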
EIGEN_SIZE_MIN_PREFER_FIXED(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime) + }; + typedef Matrix MatrixDiagType; + typedef Array ArrayDiagType; + typedef typename conditional< + is_same< typename traits::XprKind, MatrixXpr >::value, + MatrixDiagType, + ArrayDiagType + >::type type; +}; +template +struct plain_constant_type +{ + enum { Options = (traits::Flags&RowMajorBit)?RowMajor:0 }; + typedef Array::RowsAtCompileTime, traits::ColsAtCompileTime, + Options, traits::MaxRowsAtCompileTime,traits::MaxColsAtCompileTime> array_type; + typedef Matrix::RowsAtCompileTime, traits::ColsAtCompileTime, + Options, traits::MaxRowsAtCompileTime,traits::MaxColsAtCompileTime> matrix_type; + typedef CwiseNullaryOp, const typename conditional::XprKind, MatrixXpr >::value, matrix_type, array_type>::type > type; +}; +template +struct is_lvalue +{ + enum { value = (!bool(is_const::value)) && + bool(traits::Flags & LvalueBit) }; +}; +template struct is_diagonal +{ enum { ret = false }; }; +template struct is_diagonal > +{ enum { ret = true }; }; +template struct is_diagonal > +{ enum { ret = true }; }; +template struct is_diagonal > +{ enum { ret = true }; }; +template struct glue_shapes; +template<> struct glue_shapes { typedef TriangularShape type; }; +template +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::ret&&has_direct_access::ret, T1>::type * = 0) +{ + return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); +} +template +bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret&&has_direct_access::ret), T1>::type * = 0) +{ + return false; +} +template +struct scalar_div_cost { + enum { value = 8*NumTraits::MulCost }; +}; +template +struct scalar_div_cost, Vectorized> { + enum { value = 2*scalar_div_cost::value + + 6*NumTraits::MulCost + + 3*NumTraits::AddCost + }; +}; +template +struct scalar_div_cost::type> { enum { value = 24 }; }; +template +struct scalar_div_cost::type> { enum { value = 21 }; }; +#ifdef EIGEN_DEBUG_ASSIGN +std::string demangle_traversal(int t) +{ + if(t==DefaultTraversal) return "DefaultTraversal"; + if(t==LinearTraversal) return "LinearTraversal"; + if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal"; + if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal"; + if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal"; + return "?"; +} +std::string demangle_unrolling(int t) +{ + if(t==NoUnrolling) return "NoUnrolling"; + if(t==InnerUnrolling) return "InnerUnrolling"; + if(t==CompleteUnrolling) return "CompleteUnrolling"; + return "?"; +} +std::string demangle_flags(int f) +{ + std::string res; + if(f&RowMajorBit) res += " | RowMajor"; + if(f&PacketAccessBit) res += " | Packet"; + if(f&LinearAccessBit) res += " | Linear"; + if(f&LvalueBit) res += " | Lvalue"; + if(f&DirectAccessBit) res += " | Direct"; + if(f&NestByRefBit) res += " | NestByRef"; + if(f&NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit"; + return res; +} +#endif +} +template > +struct ScalarBinaryOpTraits +#ifndef EIGEN_PARSED_BY_DOXYGEN + : internal::scalar_product_traits +#endif +{}; +template +struct ScalarBinaryOpTraits +{ + typedef T ReturnType; +}; +template +struct ScalarBinaryOpTraits::IsComplex,T>::type>::Real, BinaryOp> +{ + typedef T ReturnType; +}; +template +struct ScalarBinaryOpTraits::IsComplex,T>::type>::Real, T, BinaryOp> +{ + typedef T ReturnType; +}; +template +struct ScalarBinaryOpTraits +{ + typedef T ReturnType; +}; 
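[Editor's note] ScalarBinaryOpTraits, whose specializations close just above, is the public hook telling Eigen which scalar type a mixed binary operation returns. A hypothetical opt-in (not part of this patch) letting float/double mixes yield double instead of triggering the YOU_MIXED_DIFFERENT_NUMERIC_TYPES error:

#include <Eigen/Core>

namespace Eigen {
  template<typename BinaryOp>
  struct ScalarBinaryOpTraits<float, double, BinaryOp> { typedef double ReturnType; };
  template<typename BinaryOp>
  struct ScalarBinaryOpTraits<double, float, BinaryOp> { typedef double ReturnType; };
}

By default only a real type and its own complex type are allowed to mix, which is what the surrounding specializations encode.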
+template +struct ScalarBinaryOpTraits +{ + typedef T ReturnType; +}; +template +struct ScalarBinaryOpTraits +{ + typedef void ReturnType; +}; +#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \ + EIGEN_STATIC_ASSERT((Eigen::internal::has_ReturnType >::value), \ + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) +} +#endif +// end #include "src/Core/util/XprHelper.h" +// #include "src/Core/util/Memory.h" +#ifndef EIGEN_MEMORY_H +#define EIGEN_MEMORY_H +#ifndef EIGEN_MALLOC_ALREADY_ALIGNED +#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ + && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16) + #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 +#else + #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 +#endif +#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16) + #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 +#else + #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 +#endif +#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16)) \ + || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16)) \ + || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ + || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED + #define EIGEN_MALLOC_ALREADY_ALIGNED 1 +#else + #define EIGEN_MALLOC_ALREADY_ALIGNED 0 +#endif +#endif +namespace Eigen { +namespace internal { +EIGEN_DEVICE_FUNC +inline void throw_std_bad_alloc() +{ + #ifdef EIGEN_EXCEPTIONS + throw std::bad_alloc(); + #else + std::size_t huge = static_cast(-1); + new int[huge]; + #endif +} +inline void* handmade_aligned_malloc(std::size_t size) +{ + void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES); + if (original == 0) return 0; + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); + *(reinterpret_cast(aligned) - 1) = original; + return aligned; +} +inline void handmade_aligned_free(void *ptr) +{ + if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); +} +inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0) +{ + if (ptr == 0) return handmade_aligned_malloc(size); + void *original = *(reinterpret_cast(ptr) - 1); + std::ptrdiff_t previous_offset = static_cast(ptr)-static_cast(original); + original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES); + if (original == 0) return 0; + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); + void *previous_aligned = static_cast(original)+previous_offset; + if(aligned!=previous_aligned) + std::memmove(aligned, previous_aligned, size); + *(reinterpret_cast(aligned) - 1) = original; + return aligned; +} +#ifdef EIGEN_NO_MALLOC +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() +{ + eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)"); +} +#elif defined EIGEN_RUNTIME_NO_MALLOC +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) +{ + static bool value = true; + if (update == 1) + value = new_value; + return value; +} +EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); } +EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); } +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() +{ + eigen_assert(is_malloc_allowed() && "heap allocation is forbidden 
(EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); +} +#else +EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() +{} +#endif +EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) +{ + check_that_malloc_is_allowed(); + void *result; + #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + result = std::malloc(size); + #if EIGEN_DEFAULT_ALIGN_BYTES==16 + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); + #endif + #else + result = handmade_aligned_malloc(size); + #endif + if(!result && size) + throw_std_bad_alloc(); + return result; +} +EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) +{ + #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + std::free(ptr); + #else + handmade_aligned_free(ptr); + #endif +} +inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) +{ + EIGEN_UNUSED_VARIABLE(old_size); + void *result; +#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + result = std::realloc(ptr,new_size); +#else + result = handmade_aligned_realloc(ptr,new_size,old_size); +#endif + if (!result && new_size) + throw_std_bad_alloc(); + return result; +} +template EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size) +{ + return aligned_malloc(size); +} +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size) +{ + check_that_malloc_is_allowed(); + void *result = std::malloc(size); + if(!result && size) + throw_std_bad_alloc(); + return result; +} +template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) +{ + aligned_free(ptr); +} +template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) +{ + std::free(ptr); +} +template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) +{ + return aligned_realloc(ptr, new_size, old_size); +} +template<> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t) +{ + return std::realloc(ptr, new_size); +} +template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size) +{ + if(ptr) + while(size) ptr[--size].~T(); +} +template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) +{ + std::size_t i; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T; + return ptr; + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return NULL; +} +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) +{ + if(size > std::size_t(-1) / sizeof(T)) + throw_std_bad_alloc(); +} +template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) +{ + check_size_for_overflow(size); + T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); + EIGEN_TRY + { + return construct_elements_of_array(result, size); + } + EIGEN_CATCH(...) + { + aligned_free(result); + EIGEN_THROW; + } + return result; +} +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) +{ + check_size_for_overflow(size); + T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + EIGEN_TRY + { + return construct_elements_of_array(result, size); + } + EIGEN_CATCH(...) 
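[Editor's note] The handmade_aligned_malloc/free pair earlier in this hunk uses a classic trick: over-allocate by the alignment, round up, and stash the original pointer one slot before the aligned block. Standalone sketch (assumes align is a power of two and at least sizeof(void*), as EIGEN_DEFAULT_ALIGN_BYTES is):

#include <cstdlib>
#include <cstdint>

static void* sketch_aligned_malloc(std::size_t size, std::size_t align)
{
  void* original = std::malloc(size + align);
  if (original == 0) return 0;
  void* aligned = reinterpret_cast<void*>(
      (reinterpret_cast<std::uintptr_t>(original) & ~(align - 1)) + align);
  *(reinterpret_cast<void**>(aligned) - 1) = original;  // remember the real block
  return aligned;
}

static void sketch_aligned_free(void* ptr)
{
  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));  // recover and free it
}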
+ { + conditional_aligned_free(result); + EIGEN_THROW; + } + return result; +} +template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) +{ + destruct_elements_of_array(ptr, size); + aligned_free(ptr); +} +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size) +{ + destruct_elements_of_array(ptr, size); + conditional_aligned_free(ptr); +} +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size) +{ + check_size_for_overflow(new_size); + check_size_for_overflow(old_size); + if(new_size < old_size) + destruct_elements_of_array(pts+new_size, old_size-new_size); + T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); + if(new_size > old_size) + { + EIGEN_TRY + { + construct_elements_of_array(result+old_size, new_size-old_size); + } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + } + return result; +} +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size) +{ + if(size==0) + return 0; + check_size_for_overflow(size); + T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + if(NumTraits::RequireInitialization) + { + EIGEN_TRY + { + construct_elements_of_array(result, size); + } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + } + return result; +} +template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) +{ + check_size_for_overflow(new_size); + check_size_for_overflow(old_size); + if(NumTraits::RequireInitialization && (new_size < old_size)) + destruct_elements_of_array(pts+new_size, old_size-new_size); + T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); + if(NumTraits::RequireInitialization && (new_size > old_size)) + { + EIGEN_TRY + { + construct_elements_of_array(result+old_size, new_size-old_size); + } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + } + return result; +} +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) +{ + if(NumTraits::RequireInitialization) + destruct_elements_of_array(ptr, size); + conditional_aligned_free(ptr); +} +template +EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) +{ + const Index ScalarSize = sizeof(Scalar); + const Index AlignmentSize = Alignment / ScalarSize; + const Index AlignmentMask = AlignmentSize-1; + if(AlignmentSize<=1) + { + return 0; + } + else if( (UIntPtr(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0) + { + return size; + } + else + { + Index first = (AlignmentSize - (Index((UIntPtr(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask; + return (first < size) ? 
first : size; + } +} +template +EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size) +{ + typedef typename packet_traits::type DefaultPacketType; + return first_aligned::alignment>(array, size); +} +template +inline Index first_multiple(Index size, Index base) +{ + return ((size+base-1)/base)*base; +} +template struct smart_copy_helper; +template EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* end, T* target) +{ + smart_copy_helper::RequireInitialization>::run(start, end, target); +} +template struct smart_copy_helper { + EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target) + { + IntPtr size = IntPtr(end)-IntPtr(start); + if(size==0) return; + eigen_internal_assert(start!=0 && end!=0 && target!=0); + memcpy(target, start, size); + } +}; +template struct smart_copy_helper { + EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target) + { std::copy(start, end, target); } +}; +template struct smart_memmove_helper; +template void smart_memmove(const T* start, const T* end, T* target) +{ + smart_memmove_helper::RequireInitialization>::run(start, end, target); +} +template struct smart_memmove_helper { + static inline void run(const T* start, const T* end, T* target) + { + IntPtr size = IntPtr(end)-IntPtr(start); + if(size==0) return; + eigen_internal_assert(start!=0 && end!=0 && target!=0); + std::memmove(target, start, size); + } +}; +template struct smart_memmove_helper { + static inline void run(const T* start, const T* end, T* target) + { + if (UIntPtr(target) < UIntPtr(start)) + { + std::copy(start, end, target); + } + else + { + std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); + std::copy_backward(start, end, target + count); + } + } +}; +#ifndef EIGEN_ALLOCA + #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) + #define EIGEN_ALLOCA alloca + #elif EIGEN_COMP_MSVC + #define EIGEN_ALLOCA _alloca + #endif +#endif +template class aligned_stack_memory_handler : noncopyable +{ + public: + aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) + : m_ptr(ptr), m_size(size), m_deallocate(dealloc) + { + if(NumTraits::RequireInitialization && m_ptr) + Eigen::internal::construct_elements_of_array(m_ptr, size); + } + ~aligned_stack_memory_handler() + { + if(NumTraits::RequireInitialization && m_ptr) + Eigen::internal::destruct_elements_of_array(m_ptr, m_size); + if(m_deallocate) + Eigen::internal::aligned_free(m_ptr); + } + protected: + T* m_ptr; + std::size_t m_size; + bool m_deallocate; +}; +template class scoped_array : noncopyable +{ + T* m_ptr; +public: + explicit scoped_array(std::ptrdiff_t size) + { + m_ptr = new T[size]; + } + ~scoped_array() + { + delete[] m_ptr; + } + T& operator[](std::ptrdiff_t i) { return m_ptr[i]; } + const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; } + T* &ptr() { return m_ptr; } + const T* ptr() const { return m_ptr; } + operator const T*() const { return m_ptr; } +}; +template void swap(scoped_array &a,scoped_array &b) +{ + std::swap(a.ptr(),b.ptr()); +} +} +#ifdef EIGEN_ALLOCA + #if EIGEN_DEFAULT_ALIGN_BYTES>0 + #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast((internal::UIntPtr(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + #else + #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE) + #endif + #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ + Eigen::internal::check_size_for_overflow(SIZE); \ + 
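[Editor's note] first_aligned above computes how many leading scalars must be handled by scalar/unaligned code before aligned packet loads can start, which is how Eigen peels vectorized loops. A sketch of the same index computation for floats at 16-byte alignment (`sketch_first_aligned16` is a hypothetical name):

#include <cstddef>
#include <cstdint>

static std::ptrdiff_t sketch_first_aligned16(const float* p, std::ptrdiff_t size)
{
  const std::ptrdiff_t packet = 16 / sizeof(float);  // 4 floats per 16-byte packet
  std::ptrdiff_t first =
      (packet - (std::ptrdiff_t(reinterpret_cast<std::uintptr_t>(p) / sizeof(float))
                 & (packet - 1)))
      & (packet - 1);
  return first < size ? first : size;  // clamp to the array size, as Eigen does
}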
TYPE* NAME = (BUFFER)!=0 ? (BUFFER) \ + : reinterpret_cast( \ + (sizeof(TYPE)*SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE)*SIZE) \ + : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ + Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) +#else + #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ + Eigen::internal::check_size_for_overflow(SIZE); \ + TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ + Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true) +#endif +#if EIGEN_MAX_ALIGN_BYTES!=0 + #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ + EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc(size); } \ + EIGEN_CATCH (...) { return 0; } \ + } + #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ + void *operator new(std::size_t size) { \ + return Eigen::internal::conditional_aligned_malloc(size); \ + } \ + void *operator new[](std::size_t size) { \ + return Eigen::internal::conditional_aligned_malloc(size); \ + } \ + void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete(void * ptr, std::size_t ) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + void operator delete[](void * ptr, std::size_t ) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ + \ + \ + \ + static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \ + static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \ + void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ + void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ + \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ + void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \ + Eigen::internal::conditional_aligned_free(ptr); \ + } \ + typedef void eigen_aligned_operator_new_marker_type; +#else + #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) +#endif +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0))) +template +class aligned_allocator : public std::allocator +{ +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + template + struct rebind + { + typedef aligned_allocator other; + }; + aligned_allocator() : std::allocator() {} + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} + template + aligned_allocator(const aligned_allocator& other) : std::allocator(other) {} + ~aligned_allocator() {} + pointer allocate(size_type num, const void* = 0) + { + internal::check_size_for_overflow(num); + return static_cast( 
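[Editor's note] The canonical, documented way to use the alignment machinery defined in this hunk; `Pose` is a hypothetical user type:

#include <Eigen/Core>
#include <vector>

struct Pose  // holds a fixed-size vectorizable member, so heap copies must be aligned
{
  Eigen::Vector4f position;
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW  // makes 'new Pose' return suitably aligned memory
};

// For standard containers, the allocator defined above keeps elements aligned:
typedef std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > PoseList;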
internal::aligned_malloc(num * sizeof(T)) ); + } + void deallocate(pointer p, size_type ) + { + internal::aligned_free(p); + } +}; +#if !defined(EIGEN_NO_CPUID) +# if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64 +# if defined(__PIC__) && EIGEN_ARCH_i386 +# define EIGEN_CPUID(abcd,func,id) \ + __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id)); +# elif defined(__PIC__) && EIGEN_ARCH_x86_64 +# define EIGEN_CPUID(abcd,func,id) \ + __asm__ __volatile__ ("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id)); +# else +# define EIGEN_CPUID(abcd,func,id) \ + __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) ); +# endif +# elif EIGEN_COMP_MSVC +# if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64 +# define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id) +# endif +# endif +#endif +namespace internal { +#ifdef EIGEN_CPUID +inline bool cpuid_is_vendor(int abcd[4], const int vendor[3]) +{ + return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2]; +} +inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) +{ + int abcd[4]; + l1 = l2 = l3 = 0; + int cache_id = 0; + int cache_type = 0; + do { + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + EIGEN_CPUID(abcd,0x4,cache_id); + cache_type = (abcd[0] & 0x0F) >> 0; + if(cache_type==1||cache_type==3) + { + int cache_level = (abcd[0] & 0xE0) >> 5; + int ways = (abcd[1] & 0xFFC00000) >> 22; + int partitions = (abcd[1] & 0x003FF000) >> 12; + int line_size = (abcd[1] & 0x00000FFF) >> 0; + int sets = (abcd[2]); + int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1); + switch(cache_level) + { + case 1: l1 = cache_size; break; + case 2: l2 = cache_size; break; + case 3: l3 = cache_size; break; + default: break; + } + } + cache_id++; + } while(cache_type>0 && cache_id<16); +} +inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3) +{ + int abcd[4]; + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + l1 = l2 = l3 = 0; + EIGEN_CPUID(abcd,0x00000002,0); + unsigned char * bytes = reinterpret_cast(abcd)+2; + bool check_for_p2_core2 = false; + for(int i=0; i<14; ++i) + { + switch(bytes[i]) + { + case 0x0A: l1 = 8; break; + case 0x0C: l1 = 16; break; + case 0x0E: l1 = 24; break; + case 0x10: l1 = 16; break; + case 0x15: l1 = 16; break; + case 0x2C: l1 = 32; break; + case 0x30: l1 = 32; break; + case 0x60: l1 = 16; break; + case 0x66: l1 = 8; break; + case 0x67: l1 = 16; break; + case 0x68: l1 = 32; break; + case 0x1A: l2 = 96; break; + case 0x22: l3 = 512; break; + case 0x23: l3 = 1024; break; + case 0x25: l3 = 2048; break; + case 0x29: l3 = 4096; break; + case 0x39: l2 = 128; break; + case 0x3A: l2 = 192; break; + case 0x3B: l2 = 128; break; + case 0x3C: l2 = 256; break; + case 0x3D: l2 = 384; break; + case 0x3E: l2 = 512; break; + case 0x40: l2 = 0; break; + case 0x41: l2 = 128; break; + case 0x42: l2 = 256; break; + case 0x43: l2 = 512; break; + case 0x44: l2 = 1024; break; + case 0x45: l2 = 2048; break; + case 0x46: l3 = 4096; break; + case 0x47: l3 = 8192; break; + case 0x48: l2 = 3072; break; + case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break; + case 0x4A: l3 = 6144; break; + case 0x4B: l3 = 8192; break; + case 0x4C: l3 = 12288; break; + case 0x4D: l3 = 16384; break; + case 0x4E: l2 = 6144; break; + case 0x78: l2 = 
1024; break; + case 0x79: l2 = 128; break; + case 0x7A: l2 = 256; break; + case 0x7B: l2 = 512; break; + case 0x7C: l2 = 1024; break; + case 0x7D: l2 = 2048; break; + case 0x7E: l2 = 256; break; + case 0x7F: l2 = 512; break; + case 0x80: l2 = 512; break; + case 0x81: l2 = 128; break; + case 0x82: l2 = 256; break; + case 0x83: l2 = 512; break; + case 0x84: l2 = 1024; break; + case 0x85: l2 = 2048; break; + case 0x86: l2 = 512; break; + case 0x87: l2 = 1024; break; + case 0x88: l3 = 2048; break; + case 0x89: l3 = 4096; break; + case 0x8A: l3 = 8192; break; + case 0x8D: l3 = 3072; break; + default: break; + } + } + if(check_for_p2_core2 && l2 == l3) + l3 = 0; + l1 *= 1024; + l2 *= 1024; + l3 *= 1024; +} +inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) +{ + if(max_std_funcs>=4) + queryCacheSizes_intel_direct(l1,l2,l3); + else + queryCacheSizes_intel_codes(l1,l2,l3); +} +inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) +{ + int abcd[4]; + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + EIGEN_CPUID(abcd,0x80000005,0); + l1 = (abcd[2] >> 24) * 1024; + abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; + EIGEN_CPUID(abcd,0x80000006,0); + l2 = (abcd[2] >> 16) * 1024; + l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; +} +#endif +inline void queryCacheSizes(int& l1, int& l2, int& l3) +{ + #ifdef EIGEN_CPUID + int abcd[4]; + const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e}; + const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163}; + const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; + EIGEN_CPUID(abcd,0x0,0); + int max_std_funcs = abcd[1]; + if(cpuid_is_vendor(abcd,GenuineIntel)) + queryCacheSizes_intel(l1,l2,l3,max_std_funcs); + else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_)) + queryCacheSizes_amd(l1,l2,l3); + else + queryCacheSizes_intel(l1,l2,l3,max_std_funcs); + #else + l1 = l2 = l3 = -1; + #endif +} +inline int queryL1CacheSize() +{ + int l1(-1), l2, l3; + queryCacheSizes(l1,l2,l3); + return l1; +} +inline int queryTopLevelCacheSize() +{ + int l1, l2(-1), l3(-1); + queryCacheSizes(l1,l2,l3); + return (std::max)(l2,l3); +} +} +} +#endif +// end #include "src/Core/util/Memory.h" +// #include "src/Core/NumTraits.h" +#ifndef EIGEN_NUMTRAITS_H +#define EIGEN_NUMTRAITS_H +namespace Eigen { +namespace internal { +template< typename T, + bool use_numeric_limits = std::numeric_limits::is_specialized, + bool is_integer = NumTraits::IsInteger> +struct default_digits10_impl +{ + static int run() { return std::numeric_limits::digits10; } +}; +template +struct default_digits10_impl +{ + static int run() { + using std::log10; + using std::ceil; + typedef typename NumTraits::Real Real; + return int(ceil(-log10(NumTraits::epsilon()))); + } +}; +template +struct default_digits10_impl +{ + static int run() { return 0; } +}; +} +template struct GenericNumTraits +{ + enum { + IsInteger = std::numeric_limits::is_integer, + IsSigned = std::numeric_limits::is_signed, + IsComplex = 0, + RequireInitialization = internal::is_arithmetic::value ? 
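[Editor's note] The CPUID-based queries above feed Eigen's cache-blocking heuristics for matrix products. They can also be called directly (internal API, subject to change):

#include <Eigen/Core>
#include <iostream>

int main()
{
  int l1  = Eigen::internal::queryL1CacheSize();        // bytes, or -1 if unknown
  int top = Eigen::internal::queryTopLevelCacheSize();  // max(L2, L3)
  std::cout << "L1: " << l1 << ", last-level: " << top << "\n";
}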
0 : 1, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; + typedef T Real; + typedef typename internal::conditional< + IsInteger, + typename internal::conditional::type, + T + >::type NonInteger; + typedef T Nested; + typedef T Literal; + EIGEN_DEVICE_FUNC + static inline Real epsilon() + { + return numext::numeric_limits::epsilon(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() + { + return internal::default_digits10_impl::run(); + } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() + { + return Real(0); + } + EIGEN_DEVICE_FUNC + static inline T highest() { + return (numext::numeric_limits::max)(); + } + EIGEN_DEVICE_FUNC + static inline T lowest() { + return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); + } + EIGEN_DEVICE_FUNC + static inline T infinity() { + return numext::numeric_limits::infinity(); + } + EIGEN_DEVICE_FUNC + static inline T quiet_NaN() { + return numext::numeric_limits::quiet_NaN(); + } +}; +template struct NumTraits : GenericNumTraits +{}; +template<> struct NumTraits + : GenericNumTraits +{ + EIGEN_DEVICE_FUNC + static inline float dummy_precision() { return 1e-5f; } +}; +template<> struct NumTraits : GenericNumTraits +{ + EIGEN_DEVICE_FUNC + static inline double dummy_precision() { return 1e-12; } +}; +template<> struct NumTraits + : GenericNumTraits +{ + static inline long double dummy_precision() { return 1e-15l; } +}; +template struct NumTraits > + : GenericNumTraits > +{ + typedef _Real Real; + typedef typename NumTraits<_Real>::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits<_Real>::RequireInitialization, + ReadCost = 2 * NumTraits<_Real>::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { return NumTraits::dummy_precision(); } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; +template +struct NumTraits > +{ + typedef Array ArrayType; + typedef typename NumTraits::Real RealScalar; + typedef Array Real; + typedef typename NumTraits::NonInteger NonIntegerScalar; + typedef Array NonInteger; + typedef ArrayType & Nested; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = NumTraits::IsComplex, + IsInteger = NumTraits::IsInteger, + IsSigned = NumTraits::IsSigned, + RequireInitialization = 1, + ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::ReadCost, + AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, + MulCost = ArrayType::SizeAtCompileTime==Dynamic ? 
HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost + }; + EIGEN_DEVICE_FUNC + static inline RealScalar epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + static inline int digits10() { return NumTraits::digits10(); } +}; +template<> struct NumTraits + : GenericNumTraits +{ + enum { + RequireInitialization = 1, + ReadCost = HugeCost, + AddCost = HugeCost, + MulCost = HugeCost + }; + static inline int digits10() { return 0; } +private: + static inline std::string epsilon(); + static inline std::string dummy_precision(); + static inline std::string lowest(); + static inline std::string highest(); + static inline std::string infinity(); + static inline std::string quiet_NaN(); +}; +template<> struct NumTraits {}; +} +#endif +// end #include "src/Core/NumTraits.h" +// #include "src/Core/MathFunctions.h" +#ifndef EIGEN_MATHFUNCTIONS_H +#define EIGEN_MATHFUNCTIONS_H +#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L +namespace Eigen { +#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500 +long abs(long x) { return (labs(x)); } +double abs(double x) { return (fabs(x)); } +float abs(float x) { return (fabsf(x)); } +long double abs(long double x) { return (fabsl(x)); } +#endif +namespace internal { +template +struct global_math_functions_filtering_base +{ + typedef T type; +}; +template struct always_void { typedef void type; }; +template +struct global_math_functions_filtering_base + ::type + > +{ + typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type; +}; +#define EIGEN_MATHFUNC_IMPL(func, scalar) Eigen::internal::func##_impl::type> +#define EIGEN_MATHFUNC_RETVAL(func, scalar) typename Eigen::internal::func##_retval::type>::type +template::IsComplex> +struct real_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return x; + } +}; +template +struct real_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + using std::real; + return real(x); + } +}; +template struct real_impl : real_default_impl {}; +#ifdef __CUDA_ARCH__ +template +struct real_impl > +{ + typedef T RealScalar; + EIGEN_DEVICE_FUNC + static inline T run(const std::complex& x) + { + return x.real(); + } +}; +#endif +template +struct real_retval +{ + typedef typename NumTraits::Real type; +}; +template::IsComplex> +struct imag_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar&) + { + return RealScalar(0); + } +}; +template +struct imag_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + using std::imag; + return imag(x); + } +}; +template struct imag_impl : imag_default_impl {}; +#ifdef __CUDA_ARCH__ +template +struct imag_impl > +{ + typedef T RealScalar; + EIGEN_DEVICE_FUNC + static inline T run(const std::complex& x) + { + return x.imag(); + } +}; +#endif +template +struct imag_retval +{ + typedef typename NumTraits::Real type; +}; +template +struct real_ref_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar& run(Scalar& x) + { + return reinterpret_cast(&x)[0]; + } + EIGEN_DEVICE_FUNC + static inline const RealScalar& run(const Scalar& x) + { + return 
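[Editor's note] NumTraits above is the documented extension point for teaching Eigen about a custom scalar. A minimal sketch for a hypothetical fixed-point type `Fix16` (the type and its constants are illustrative, not part of this patch):

#include <Eigen/Core>

struct Fix16 {
  int raw;  // hypothetical Q16.16 payload
  Fix16() : raw(0) {}
  explicit Fix16(int r) : raw(r) {}
};

namespace Eigen {
  template<> struct NumTraits<Fix16> : GenericNumTraits<Fix16>
  {
    typedef Fix16 Real;
    typedef Fix16 NonInteger;
    typedef Fix16 Nested;
    enum { IsComplex = 0, IsInteger = 0, IsSigned = 1,
           RequireInitialization = 1, ReadCost = 1, AddCost = 1, MulCost = 2 };
    static inline Fix16 epsilon()         { return Fix16(1); }  // one ULP of the format
    static inline Fix16 dummy_precision() { return Fix16(8); }
    static inline int digits10()          { return 4; }
  };
}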
reinterpret_cast(&x)[0]; + } +}; +template +struct real_ref_retval +{ + typedef typename NumTraits::Real & type; +}; +template +struct imag_ref_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar& run(Scalar& x) + { + return reinterpret_cast(&x)[1]; + } + EIGEN_DEVICE_FUNC + static inline const RealScalar& run(const Scalar& x) + { + return reinterpret_cast(&x)[1]; + } +}; +template +struct imag_ref_default_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(Scalar&) + { + return Scalar(0); + } + EIGEN_DEVICE_FUNC + static inline const Scalar run(const Scalar&) + { + return Scalar(0); + } +}; +template +struct imag_ref_impl : imag_ref_default_impl::IsComplex> {}; +template +struct imag_ref_retval +{ + typedef typename NumTraits::Real & type; +}; +template::IsComplex> +struct conj_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + return x; + } +}; +template +struct conj_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + using std::conj; + return conj(x); + } +}; +template +struct conj_retval +{ + typedef Scalar type; +}; +template +struct abs2_impl_default +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return x*x; + } +}; +template +struct abs2_impl_default +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return real(x)*real(x) + imag(x)*imag(x); + } +}; +template +struct abs2_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return abs2_impl_default::IsComplex>::run(x); + } +}; +template +struct abs2_retval +{ + typedef typename NumTraits::Real type; +}; +template +struct norm1_default_impl +{ + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + EIGEN_USING_STD_MATH(abs); + return abs(real(x)) + abs(imag(x)); + } +}; +template +struct norm1_default_impl +{ + EIGEN_DEVICE_FUNC + static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD_MATH(abs); + return abs(x); + } +}; +template +struct norm1_impl : norm1_default_impl::IsComplex> {}; +template +struct norm1_retval +{ + typedef typename NumTraits::Real type; +}; +template +struct hypot_impl +{ + typedef typename NumTraits::Real RealScalar; + static inline RealScalar run(const Scalar& x, const Scalar& y) + { + EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD_MATH(sqrt); + RealScalar _x = abs(x); + RealScalar _y = abs(y); + Scalar p, qp; + if(_x>_y) + { + p = _x; + qp = _y / p; + } + else + { + p = _y; + qp = _x / p; + } + if(p==RealScalar(0)) return RealScalar(0); + return p * sqrt(RealScalar(1) + qp*qp); + } +}; +template +struct hypot_retval +{ + typedef typename NumTraits::Real type; +}; +template +struct cast_impl +{ + EIGEN_DEVICE_FUNC + static inline NewType run(const OldType& x) + { + return static_cast(x); + } +}; +template +EIGEN_DEVICE_FUNC +inline NewType cast(const OldType& x) +{ + return cast_impl::run(x); +} +#if EIGEN_HAS_CXX11_MATH + template + struct round_impl { + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + using std::round; + return round(x); + } + }; +#else + template + struct round_impl + { + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) + 
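[Editor's note] hypot_impl earlier in this hunk avoids overflow and underflow by factoring out the larger magnitude: |(x,y)| = p * sqrt(1 + (q/p)^2) with p = max(|x|,|y|). Standalone sketch of the same technique:

#include <cmath>
#include <algorithm>

static double sketch_hypot(double x, double y)
{
  double a = std::abs(x), b = std::abs(y);
  double p = std::max(a, b);
  if (p == 0.0) return 0.0;
  double qp = std::min(a, b) / p;       // in [0,1], so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp * qp);  // naive sqrt(x*x + y*y) would overflow first
}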
EIGEN_USING_STD_MATH(floor); + EIGEN_USING_STD_MATH(ceil); + return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5)); + } + }; +#endif +template +struct round_retval +{ + typedef Scalar type; +}; +#if EIGEN_HAS_CXX11_MATH + template + struct arg_impl { + static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD_MATH(arg); + return arg(x); + } + }; +#else + template::IsComplex> + struct arg_default_impl + { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); } + }; + template + struct arg_default_impl + { + typedef typename NumTraits::Real RealScalar; + EIGEN_DEVICE_FUNC + static inline RealScalar run(const Scalar& x) + { + EIGEN_USING_STD_MATH(arg); + return arg(x); + } + }; + template struct arg_impl : arg_default_impl {}; +#endif +template +struct arg_retval +{ + typedef typename NumTraits::Real type; +}; +namespace std_fallback { + template + EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + typedef typename NumTraits::Real RealScalar; + EIGEN_USING_STD_MATH(log); + Scalar x1p = RealScalar(1) + x; + return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + } +} +template +struct log1p_impl { + static inline Scalar run(const Scalar& x) + { + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + #if EIGEN_HAS_CXX11_MATH + using std::log1p; + #endif + using std_fallback::log1p; + return log1p(x); + } +}; +template +struct log1p_retval +{ + typedef Scalar type; +}; +template::IsInteger&&NumTraits::IsInteger> +struct pow_impl +{ + typedef typename ScalarBinaryOpTraits >::ReturnType result_type; + static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y) + { + EIGEN_USING_STD_MATH(pow); + return pow(x, y); + } +}; +template +struct pow_impl +{ + typedef ScalarX result_type; + static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y) + { + ScalarX res(1); + eigen_assert(!NumTraits::IsSigned || y >= 0); + if(y & 1) res *= x; + y >>= 1; + while(y) + { + x *= x; + if(y&1) res *= x; + y >>= 1; + } + return res; + } +}; +template +struct random_default_impl {}; +template +struct random_impl : random_default_impl::IsComplex, NumTraits::IsInteger> {}; +template +struct random_retval +{ + typedef Scalar type; +}; +template inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y); +template inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(); +template +struct random_default_impl +{ + static inline Scalar run(const Scalar& x, const Scalar& y) + { + return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX); + } + static inline Scalar run() + { + return run(Scalar(NumTraits::IsSigned ? -1 : 0), Scalar(1)); + } +}; +enum { + meta_floor_log2_terminate, + meta_floor_log2_move_up, + meta_floor_log2_move_down, + meta_floor_log2_bogus +}; +template struct meta_floor_log2_selector +{ + enum { middle = (lower + upper) / 2, + value = (upper <= lower + 1) ? int(meta_floor_log2_terminate) + : (n < (1 << middle)) ? int(meta_floor_log2_move_down) + : (n==0) ? 
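[Editor's note] The integer pow_impl above is exponentiation by squaring, O(log y) multiplies instead of O(y). The same loop, extracted as a standalone sketch:

static long sketch_ipow(long x, unsigned y)
{
  long res = 1;
  if (y & 1) res *= x;
  y >>= 1;
  while (y) {
    x *= x;               // x^2, x^4, x^8, ...
    if (y & 1) res *= x;  // multiply in only the powers whose bit is set in y
    y >>= 1;
  }
  return res;             // y == 0 yields 1, matching pow_impl
}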
int(meta_floor_log2_bogus) + : int(meta_floor_log2_move_up) + }; +}; +template::value> +struct meta_floor_log2 {}; +template +struct meta_floor_log2 +{ + enum { value = meta_floor_log2::middle>::value }; +}; +template +struct meta_floor_log2 +{ + enum { value = meta_floor_log2::middle, upper>::value }; +}; +template +struct meta_floor_log2 +{ + enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower }; +}; +template +struct meta_floor_log2 +{ +}; +template +struct random_default_impl +{ + static inline Scalar run(const Scalar& x, const Scalar& y) + { + typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; + if(y range); + return Scalar(ScalarX(x) + offset); + } + static inline Scalar run() + { +#ifdef EIGEN_MAKING_DOCS + return run(Scalar(NumTraits::IsSigned ? -10 : 0), Scalar(10)); +#else + enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value, + scalar_bits = sizeof(Scalar) * CHAR_BIT, + shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)), + offset = NumTraits::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0 + }; + return Scalar((std::rand() >> shift) - offset); +#endif + } +}; +template +struct random_default_impl +{ + static inline Scalar run(const Scalar& x, const Scalar& y) + { + return Scalar(random(real(x), real(y)), + random(imag(x), imag(y))); + } + static inline Scalar run() + { + typedef typename NumTraits::Real RealScalar; + return Scalar(random(), random()); + } +}; +template +inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y) +{ + return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y); +} +template +inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() +{ + return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); +} +#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) +#define EIGEN_USE_STD_FPCLASSIFY 1 +#else +#define EIGEN_USE_STD_FPCLASSIFY 0 +#endif +template +EIGEN_DEVICE_FUNC +typename internal::enable_if::value,bool>::type +isnan_impl(const T&) { return false; } +template +EIGEN_DEVICE_FUNC +typename internal::enable_if::value,bool>::type +isinf_impl(const T&) { return false; } +template +EIGEN_DEVICE_FUNC +typename internal::enable_if::value,bool>::type +isfinite_impl(const T&) { return true; } +template +EIGEN_DEVICE_FUNC +typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type +isfinite_impl(const T& x) +{ + #ifdef __CUDA_ARCH__ + return (::isfinite)(x); + #elif EIGEN_USE_STD_FPCLASSIFY + using std::isfinite; + return isfinite EIGEN_NOT_A_MACRO (x); + #else + return x<=NumTraits::highest() && x>=NumTraits::lowest(); + #endif +} +template +EIGEN_DEVICE_FUNC +typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type +isinf_impl(const T& x) +{ + #ifdef __CUDA_ARCH__ + return (::isinf)(x); + #elif EIGEN_USE_STD_FPCLASSIFY + using std::isinf; + return isinf EIGEN_NOT_A_MACRO (x); + #else + return x>NumTraits::highest() || x::lowest(); + #endif +} +template +EIGEN_DEVICE_FUNC +typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type +isnan_impl(const T& x) +{ + #ifdef __CUDA_ARCH__ + return (::isnan)(x); + #elif EIGEN_USE_STD_FPCLASSIFY + using std::isnan; + return isnan EIGEN_NOT_A_MACRO (x); + #else + return x != x; + #endif +} +#if (!EIGEN_USE_STD_FPCLASSIFY) +#if EIGEN_COMP_MSVC +template EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) +{ + return 
_fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF; +} +EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; } +EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); } +EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); } +EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_msvc_helper(x); } +#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC) +#if EIGEN_GNUC_AT_LEAST(5,0) + #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only"))) +#else + #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only"))) +#endif +template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); } +template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); } +template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); } +template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); } +template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); } +template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); } +#undef EIGEN_TMP_NOOPT_ATTRIB +#endif +#endif +template EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex& x); +template EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x); +template EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); +template T generic_fast_tanh_float(const T& a_x); +} +namespace numext { +#ifndef __CUDA_ARCH__ +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) +{ + EIGEN_USING_STD_MATH(min); + return min EIGEN_NOT_A_MACRO (x,y); +} +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) +{ + EIGEN_USING_STD_MATH(max); + return max EIGEN_NOT_A_MACRO (x,y); +} +#else +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) +{ + return y < x ? y : x; +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) +{ + return fminf(x, y); +} +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) +{ + return x < y ? 
y : x; +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) +{ + return fmaxf(x, y); +} +#endif +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x) +{ + return internal::real_ref_impl::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x) +{ + return internal::imag_ref_impl::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y) +{ + return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); +} +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); +} +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float log1p(const float &x) { return ::log1pf(x); } +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double log1p(const double &x) { return ::log1p(x); } +#endif +template +EIGEN_DEVICE_FUNC +inline typename internal::pow_impl::result_type pow(const ScalarX& x, const ScalarY& y) +{ + return internal::pow_impl::run(x, y); +} +template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } +template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } +template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } +template +EIGEN_DEVICE_FUNC +inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) +{ + return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x); +} +template +EIGEN_DEVICE_FUNC +T (floor)(const T& x) +{ + EIGEN_USING_STD_MATH(floor); + return floor(x); +} +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float floor(const float &x) { return ::floorf(x); } +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double floor(const double &x) { return ::floor(x); } +#endif +template +EIGEN_DEVICE_FUNC +T (ceil)(const T& x) +{ + EIGEN_USING_STD_MATH(ceil); + return ceil(x); +} +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float ceil(const float &x) { return ::ceilf(x); } +template<> EIGEN_DEVICE_FUNC 
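[Editor's note] The numext:: wrappers above dispatch to std:: on the host and to the fminf/fmaxf-style builtins in CUDA device code, so generic kernels can be written once. A small sketch (`sketch_clamp` is a hypothetical helper):

#include <Eigen/Core>
#include <iostream>

template <typename T>
EIGEN_DEVICE_FUNC T sketch_clamp(T x, T lo, T hi)
{
  // compiles for both host and __device__ code paths
  return Eigen::numext::mini(Eigen::numext::maxi(x, lo), hi);
}

int main() { std::cout << sketch_clamp(5.0f, 0.0f, 1.0f) << "\n"; }  // prints 1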
+inline int log2(int x)
+{
+  eigen_assert(x>=0);
+  unsigned int v(x);
+  static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return table[(v * 0x07C4ACDDU) >> 27];
+}
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sqrt(const T &x)
+{
+  EIGEN_USING_STD_MATH(sqrt);
+  return sqrt(x);
+}
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T log(const T &x) {
+  EIGEN_USING_STD_MATH(log);
+  return log(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float log(const float &x) { return ::logf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double log(const double &x) { return ::log(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  EIGEN_USING_STD_MATH(abs);
+  return abs(x);
+}
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
+abs(const T &x) {
+  return x;
+}
+#if defined(__SYCL_DEVICE_ONLY__)
+EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
+EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
+#endif
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const float &x) { return ::fabsf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const double &x) { return ::fabs(x); }
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float abs(const std::complex<float>& x) {
+  return ::hypotf(x.real(), x.imag());
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double abs(const std::complex<double>& x) {
+  return ::hypot(x.real(), x.imag());
+}
+#endif
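+// Note: the two abs overloads above are made disjoint via enable_if: signed and
+// complex types go through std::abs, while unsigned integer types are returned
+// unchanged (their absolute value is the value itself). The CUDA specializations
+// route std::complex through ::hypot/::hypotf, which avoids overflow in the
+// intermediate re*re + im*im.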
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T exp(const T &x) {
+  EIGEN_USING_STD_MATH(exp);
+  return exp(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float exp(const float &x) { return ::expf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double exp(const double &x) { return ::exp(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cos(const T &x) {
+  EIGEN_USING_STD_MATH(cos);
+  return cos(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cos(const float &x) { return ::cosf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cos(const double &x) { return ::cos(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sin(const T &x) {
+  EIGEN_USING_STD_MATH(sin);
+  return sin(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sin(const float &x) { return ::sinf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sin(const double &x) { return ::sin(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tan(const T &x) {
+  EIGEN_USING_STD_MATH(tan);
+  return tan(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tan(const float &x) { return ::tanf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tan(const double &x) { return ::tan(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T acos(const T &x) {
+  EIGEN_USING_STD_MATH(acos);
+  return acos(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float acos(const float &x) { return ::acosf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double acos(const double &x) { return ::acos(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T asin(const T &x) {
+  EIGEN_USING_STD_MATH(asin);
+  return asin(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float asin(const float &x) { return ::asinf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double asin(const double &x) { return ::asin(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T atan(const T &x) {
+  EIGEN_USING_STD_MATH(atan);
+  return atan(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float atan(const float &x) { return ::atanf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double atan(const double &x) { return ::atan(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T cosh(const T &x) {
+  EIGEN_USING_STD_MATH(cosh);
+  return cosh(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float cosh(const float &x) { return ::coshf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double cosh(const double &x) { return ::cosh(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T sinh(const T &x) {
+  EIGEN_USING_STD_MATH(sinh);
+  return sinh(x);
+}
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sinh(const float &x) { return ::sinhf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sinh(const double &x) { return ::sinh(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T tanh(const T &x) {
+  EIGEN_USING_STD_MATH(tanh);
+  return tanh(x);
+}
+#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(float x) { return internal::generic_fast_tanh_float(x); }
+#endif
+#ifdef __CUDACC__
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float tanh(const float &x) { return ::tanhf(x); }
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double tanh(const double &x) { return ::tanh(x); }
+#endif
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T fmod(const T& a, const T& b) {
+  EIGEN_USING_STD_MATH(fmod);
+  return fmod(a, b);
+}
+#ifdef __CUDACC__
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float fmod(const float& a, const float& b) {
+  return ::fmodf(a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double fmod(const double& a, const double& b) {
+  return ::fmod(a, b);
+}
+#endif
+}
+namespace internal {
+template<typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x)
+{
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+template<typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
+{
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+template<typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x)
+{
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
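+// Note: as implemented above, a std::complex value is finite iff both parts are
+// finite and NaN if either part is NaN; isinf_impl additionally excludes NaN, so
+// a value with one infinite and one NaN component reports NaN rather than inf.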
+template<typename Scalar, bool IsComplex, bool IsInteger>
+struct scalar_fuzzy_default_impl {};
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
+  {
+    return numext::abs(x) <= numext::abs(y) * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return x <= y || isApprox(x, y, prec);
+  }
+};
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, true>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
+  {
+    return x == Scalar(0);
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
+  {
+    return x == y;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
+  {
+    return x <= y;
+  }
+};
+template<typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, true, false>
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
+  {
+    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
+  {
+    return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
+  }
+};
+template<typename Scalar>
+struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
+inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
+                              const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
+}
+template<typename Scalar> EIGEN_DEVICE_FUNC
+inline bool isApprox(const Scalar& x, const Scalar& y,
+                     const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
+}
+template<typename Scalar> EIGEN_DEVICE_FUNC
+inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
+                               const typename NumTraits<Scalar>::Real &precision = NumTraits<Scalar>::dummy_precision())
+{
+  return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
+}
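+// Worked example (illustrative values): with precision 1e-3, isApprox(1.0, 1.0005)
+// holds because |1.0-1.0005| = 5e-4 <= min(1.0,1.0005)*1e-3, and
+// isMuchSmallerThan(x,y) holds whenever |x| <= |y|*prec. The integer
+// specialization above compares exactly and ignores the precision argument.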
+template<> struct random_impl<bool>
+{
+  static inline bool run()
+  {
+    return random<int>(0,1)==0 ? false : true;
+  }
+};
+template<> struct scalar_fuzzy_impl<bool>
+{
+  typedef bool RealScalar;
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
+  static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
+  {
+    return !x;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(bool x, bool y, bool)
+  {
+    return x == y;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
+  {
+    return (!x) || y;
+  }
+};
+}
+}
+#endif
+// end #include "src/Core/MathFunctions.h"
+// #include "src/Core/GenericPacketMath.h"
+#ifndef EIGEN_GENERIC_PACKET_MATH_H
+#define EIGEN_GENERIC_PACKET_MATH_H
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_DEBUG_ALIGNED_LOAD
+#define EIGEN_DEBUG_ALIGNED_LOAD
+#endif
+#ifndef EIGEN_DEBUG_UNALIGNED_LOAD
+#define EIGEN_DEBUG_UNALIGNED_LOAD
+#endif
+#ifndef EIGEN_DEBUG_ALIGNED_STORE
+#define EIGEN_DEBUG_ALIGNED_STORE
+#endif
+#ifndef EIGEN_DEBUG_UNALIGNED_STORE
+#define EIGEN_DEBUG_UNALIGNED_STORE
+#endif
+struct default_packet_traits
+{
+  enum {
+    HasHalfPacket = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasLog1p = 0,
+    HasLog10 = 0,
+    HasPow = 0,
+    HasSin = 0,
+    HasCos = 0,
+    HasTan = 0,
+    HasASin = 0,
+    HasACos = 0,
+    HasATan = 0,
+    HasSinh = 0,
+    HasCosh = 0,
+    HasTanh = 0,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasIGamma = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,
+    HasRound = 0,
+    HasFloor = 0,
+    HasCeil = 0,
+    HasSign = 0
+  };
+};
+template<typename T> struct packet_traits : default_packet_traits
+{
+  typedef T type;
+  typedef T half;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
+  };
+  enum {
+    HasAdd = 0,
+    HasSub = 0,
+    HasMul = 0,
+    HasNegate = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
+    HasSetLinear = 0
+  };
+};
+template<typename T> struct packet_traits<const T> : packet_traits<T> { };
+template <typename SrcScalar, typename TgtScalar> struct type_casting_traits {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& ) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& , const SrcPacket& , const SrcPacket& ) {
+  return static_cast<TgtPacket>(a);
+}
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+padd(const Packet& a,
+     const Packet& b) { return a+b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+psub(const Packet& a,
+     const Packet& b) { return a-b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pnegate(const Packet& a) { return -a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pconj(const Packet& a) { return numext::conj(a); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmul(const Packet& a,
+     const Packet& b) { return a*b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pdiv(const Packet& a,
+     const Packet& b) { return a/b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmin(const Packet& a,
+     const Packet& b) { return numext::mini(a, b); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmax(const Packet& a,
+     const Packet& b) { return numext::maxi(a, b); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pabs(const Packet& a) { using std::abs; return abs(a); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pand(const Packet& a, const Packet& b) { return a & b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+por(const Packet& a, const Packet& b) { return a | b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pxor(const Packet& a, const Packet& b) { return a ^ b; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
+ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
+plset(const typename unpacket_traits<Packet>::type& a) { return a; }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
+{ (*to) = from; }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
+{ (*to) = from; }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index )
+{ return ploadu<Packet>(from); }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index )
+{ pstore(to, from); }
+template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
+{
+#ifdef __CUDA_ARCH__
+#if defined(__LP64__)
+  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+#else
+  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
+#endif
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
+  __builtin_prefetch(addr);
+#endif
+}
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+preduxp(const Packet* vecs) { return vecs[0]; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux_downto4(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
+{ return a; }
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
+{
+  return Packet(imag(a),real(a));
+}
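+// Note: every p* primitive above has a trivial scalar fallback (a "packet" of
+// size 1), so kernels can be written once against this API and the SSE/AVX
+// headers below override it per packet type. Illustrative use, assuming
+// hypothetical float arrays in/out of at least 4 elements:
+//   Packet4f p = ploadu<Packet4f>(in);            // unaligned load of 4 floats
+//   pstoreu(out, padd(p, pset1<Packet4f>(1.f)));  // out[i] = in[i] + 1.f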
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psin(const Packet& a) { using std::sin; return sin(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcos(const Packet& a) { using std::cos; return cos(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptan(const Packet& a) { using std::tan; return tan(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pasin(const Packet& a) { using std::asin; return asin(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pacos(const Packet& a) { using std::acos; return acos(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pexp(const Packet& a) { using std::exp; return exp(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog(const Packet& a) { using std::log; return log(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog1p(const Packet& a) { return numext::log1p(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog10(const Packet& a) { using std::log10; return log10(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+  return pdiv(pset1<Packet>(1), psqrt(a));
+}
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pround(const Packet& a) { using numext::round; return round(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
+template<typename Packet>
+inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
+{
+  pstore(to, pset1<Packet>(a));
+}
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pmadd(const Packet& a,
+      const Packet& b,
+      const Packet& c)
+{ return padd(pmul(a, b),c); }
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    return pload<Packet>(from);
+  else
+    return ploadu<Packet>(from);
+}
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
+{
+  if(Alignment >= unpacket_traits<Packet>::alignment)
+    pstore(to, from);
+  else
+    pstoreu(to, from);
+}
+template<typename Packet, int LoadMode>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+template<int Offset,typename PacketType>
+struct palign_impl
+{
+  static inline void run(PacketType&, const PacketType&) {}
+};
+template<int Offset,typename PacketType>
+inline void palign(PacketType& first, const PacketType& second)
+{
+  palign_impl<Offset,PacketType>::run(first,second);
+}
+#ifndef __CUDACC__
+template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
+{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
+{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+#endif
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& ) {
+}
+template <size_t N> struct Selector {
+  bool select[N];
+};
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  return ifPacket.select[0] ? thenPacket : elsePacket;
+}
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  Selector<unpacket_traits<Packet>::size> mask;
+  mask.select[0] = true;
+  for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
+    mask.select[i] = false;
+  return pblend(mask, pset1<Packet>(b), a);
+}
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  Selector<unpacket_traits<Packet>::size> mask;
+  for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
+    mask.select[i] = false;
+  mask.select[unpacket_traits<Packet>::size-1] = true;
+  return pblend(mask, pset1<Packet>(b), a);
+}
+}
+}
+#endif
+// end #include "src/Core/GenericPacketMath.h"
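+// Note: pblend selects per lane (true entries of the Selector take thenPacket),
+// and pinsertfirst/pinsertlast are built on it by masking a single lane; the
+// scalar fallback above only ever consults select[0]. The arch-specific
+// overloads that follow are compiled in only when the matching
+// EIGEN_VECTORIZE_* macro is defined.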
+#if defined EIGEN_VECTORIZE_AVX512
+// #include "src/Core/arch/SSE/PacketMath.h"
+#ifndef EIGEN_PACKET_MATH_SSE_H
+#define EIGEN_PACKET_MATH_SSE_H
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#endif
+#endif
+#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128> Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
+typedef __m128 Packet4f;
+typedef __m128i Packet4i;
+typedef __m128d Packet2d;
+#endif
+template<> struct is_arithmetic<__m128> { enum { value = true }; };
+template<> struct is_arithmetic<__m128i> { enum { value = true }; };
+template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+#define vec4f_swizzle1(v,p,q,r,s) \
+  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+#define vec4i_swizzle1(v,p,q,r,s) \
+  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+#define vec2d_swizzle1(v,p,q) \
+  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+#define vec4f_swizzle2(a,b,p,q,r,s) \
+  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+#define vec4i_swizzle2(a,b,p,q,r,s) \
+  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasBlend = 1
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
+  };
+};
+#endif
+template<> struct packet_traits<int> : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasBlend = 1
+  };
+};
+template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct scalar_div_cost<float,false> { enum { value = 7 }; };
+template<> struct scalar_div_cost<double,false> { enum { value = 8 }; };
+#endif
+#if EIGEN_COMP_MSVC==1500
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
+#endif
+#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return _mm_xor_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
+  return _mm_xor_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
+{
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
+}
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_mullo_epi32(a,b);
+#else
+  return vec4i_swizzle1(
+            vec4i_swizzle2(
+              _mm_mul_epu32(a,b),
+              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
+                            vec4i_swizzle1(b,1,0,3,2)),
+              0,2,0,2),
+            0,2,1,3);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_min_epi32(a,b);
+#else
+  Packet4i mask = _mm_cmplt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_max_epi32(a,b);
+#else
+  Packet4i mask = _mm_cmpgt_epi32(a,b);
+  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+#endif
+}
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+#endif
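+// Note: before SSE4.1 there is no epi32 min/max instruction, so the pmin/pmax
+// fallbacks above build a compare mask and blend with (mask & a) | (~mask & b);
+// the same mask-and-blend idiom reappears in pblend below. pround/pceil/pfloor
+// are only provided under SSE4.1, which introduced _mm_round_ps and friends.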
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+#if EIGEN_COMP_MSVC
+  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+    EIGEN_DEBUG_UNALIGNED_LOAD
+    #if (EIGEN_COMP_MSVC==1600)
+    __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
+    res = _mm_loadh_pi(res, (const __m64*)(from+2));
+    return res;
+    #else
+    return _mm_loadu_ps(from);
+    #endif
+  }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_ps(from);
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_pd(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
+{ return pset1<Packet2d>(from[0]); }
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
+{
+  Packet4i tmp;
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
+  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+}
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
+{
+  Packet4f pa = _mm_set_ss(a);
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
+{
+  Packet2d pa = _mm_set_sd(a);
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
+}
+#ifndef EIGEN_VECTORIZE_AVX
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#elif EIGEN_COMP_MSVC_STRICT
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+#else
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+#endif
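+// Note: SSE has no gather/scatter instructions, so pgather/pscatter above spell
+// out the strided accesses with _mm_set_* and per-lane extraction. The
+// MSVC-specific pfirst variants work around compiler issues with
+// _mm_cvtss_f32/_mm_cvtsd_f64 on Win64 by reading m128_f32[0]/m128d_f64[0]
+// directly.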
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{ return _mm_shuffle_ps(a,a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{ return _mm_shuffle_pd(a,a,0x1); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{ return _mm_shuffle_epi32(a,0x1B); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
+{
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
+{
+  #ifdef EIGEN_VECTORIZE_SSSE3
+  return _mm_abs_epi32(a);
+  #else
+  Packet4i aux = _mm_srai_epi32(a,31);
+  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  #endif
+}
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
+{
+  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
+  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
+  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
+  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
+#ifdef EIGEN_VECTORIZE_SSE3
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_hadd_pd(vecs[0], vecs[1]);
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
+  tmp0 = _mm_add_ps(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
+  tmp1 = _mm_add_ps(tmp1, tmp2);
+  tmp2 = _mm_movehl_ps(tmp1, tmp0);
+  tmp0 = _mm_movelh_ps(tmp0, tmp1);
+  return _mm_add_ps(tmp0, tmp2);
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
+}
+#endif
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+}
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);
+  return pfirst(_mm_hadd_epi32(tmp0,tmp0));
+}
+#else
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+#endif
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
+}
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+#endif
+}
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+#endif
+}
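+// Worked example (illustrative): for a = {1,2,3,4}, predux above folds halves:
+// movehl yields {3,4,..}, the add gives {4,6,..}, and the final lane-1 shuffle
+// adds 6 to 4, returning 10. predux_mul/min/max follow the same halving pattern,
+// and the integer fallbacks spill to an aligned buffer where no epi32 min/max
+// instruction exists.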
+#if EIGEN_COMP_GNUC
+#endif
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset!=0)
+      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset!=0)
+      first = _mm_alignr_epi8(second,first, Offset*4);
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
+  }
+};
+#else
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+    }
+    else if (Offset==2)
+    {
+      first = _mm_movehl_ps(first,first);
+      first = _mm_movelh_ps(first,second);
+    }
+    else if (Offset==3)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_shuffle_ps(first,second,0x93);
+    }
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_shuffle_epi32(first,0x39);
+    }
+    else if (Offset==2)
+    {
+      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+    }
+    else if (Offset==3)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+    }
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
+      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
+    }
+  }
+};
+#endif
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  __m128 false_mask = _mm_cmpeq_ps(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  const __m128d zero = _mm_setzero_pd();
+  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
+  __m128d false_mask = _mm_cmpeq_pd(select, zero);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
+#else
+  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_ps(a,pset1<Packet4f>(b),1);
+#else
+  return _mm_move_ss(a, _mm_load_ss(&b));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_pd(a,pset1<Packet2d>(b),1);
+#else
+  return _mm_move_sd(a, _mm_load_sd(&b));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
+  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
+  return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
+#endif
+}
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a,b,c);
+}
+template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a,b,c);
+}
+#endif
+}
+}
+#endif
+// end #include "src/Core/arch/SSE/PacketMath.h"
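+// Note: the AVX header that follows mirrors the SSE one with 256-bit packets
+// (Packet8f/Packet4d/Packet8i); its unpacket_traits declare the SSE types as
+// their "half" packets, which the half-packet machinery and predux_downto4
+// rely on.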
+// #include "src/Core/arch/AVX/PacketMath.h"
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+typedef __m256 Packet8f;
+typedef __m256i Packet8i;
+typedef __m256d Packet4d;
+template<> struct is_arithmetic<__m256> { enum { value = true }; };
+template<> struct is_arithmetic<__m256i> { enum { value = true }; };
+template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
+  const Packet8f p8f_##NAME = pset1<Packet8f>(X)
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
+  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
+#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
+  const Packet8i p8i_##NAME = pset1<Packet8i>(X)
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet8f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = 0,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet4d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 1,
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+#endif
+template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{
+  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{
+  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& , const Packet8i& )
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+  Packet8f res = c;
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+  Packet4d res = c;
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_pd(a,b,c);
+#endif
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
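+// Note: under __FMA__ the pmadd overloads above pin the accumulator with a
+// "+x" inline-asm constraint so affected gcc/clang releases emit a single
+// vfmadd231 into c instead of splitting the fused multiply-add; other
+// compilers take the _mm256_fmadd_* intrinsics directly. Packet8i pdiv only
+// asserts, since AVX provides no integer division.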
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*    from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
+{
+  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
+{
+  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  return _mm256_permute_pd(tmp, 3<<2);
+}
+template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
+{
+  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+}
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
+{
+  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
+                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
+{
+  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
+{
+  __m128 low = _mm256_extractf128_ps(from, 0);
+  to[stride*0] = _mm_cvtss_f32(low);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+  __m128 high = _mm256_extractf128_ps(from, 1);
+  to[stride*4] = _mm_cvtss_f32(high);
+  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
+{
+  __m128d low = _mm256_extractf128_pd(from, 0);
+  to[stride*0] = _mm_cvtsd_f64(low);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+  __m128d high = _mm256_extractf128_pd(from, 1);
+  to[stride*2] = _mm_cvtsd_f64(high);
+  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+}
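+// Illustration (not part of the upstream Eigen sources): the pgather/pscatter
+// specializations above provide strided access, reading or writing lane i at
+// from[i*stride] / to[i*stride]. A small sketch that loads one column of a
+// row-major 8x8 float matrix (the matrix name is hypothetical):
+//
+//   float mat[64];                                     // row-major 8x8
+//   Packet8f col0 = pgather<float, Packet8f>(mat, 8);  // mat[0], mat[8], ...
+//   pscatter<float, Packet8f>(mat, col0, 8);           // write it back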
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
+{
+  Packet8f pa = pset1<Packet8f>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
+{
+  Packet4d pa = pset1<Packet4d>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
+{
+  Packet8i pa = pset1<Packet8i>(a);
+  pstore(to, pa);
+}
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif
+template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+  return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+  __m256d tmp = _mm256_shuffle_pd(a,a,5);
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+}
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
+  return _mm256_and_ps(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm256_and_pd(a,mask);
+}
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
+{
+  __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
+  __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
+  __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
+  __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
+  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+  __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
+  return final;
+}
+template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
+{
+  Packet4d tmp0, tmp1;
+  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  return _mm256_blend_pd(tmp0, tmp1, 0xC);
+}
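+// Illustration (not part of the upstream Eigen sources): the predux* family
+// below reduces one packet to a scalar. The AVX versions fold the 256-bit
+// register onto its 128-bit halves and finish with shuffles. A minimal sketch:
+//
+//   Packet8f v = plset<Packet8f>(1.0f);   // {1, 2, 3, 4, 5, 6, 7, 8}
+//   float s = predux(v);                  // sum of all lanes: 36.0f
+//   float m = predux_max(v);              // maximum lane: 8.0f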
+template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
+{
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
+{
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
+}
+template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
+{
+  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+}
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp;
+  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp;
+  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
+{
+  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
+  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
+{
+  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
+}
+template<int Offset>
+struct palign_impl<Offset,Packet8f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_ps(first, second, 1);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_ps(first, second, 3);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_ps(first, second, 7);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
+    }
+    else if (Offset==4)
+    {
+      first = _mm256_blend_ps(first, second, 15);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
+    }
+    else if (Offset==5)
+    {
+      first = _mm256_blend_ps(first, second, 31);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0x88);
+    }
+    else if (Offset==6)
+    {
+      first = _mm256_blend_ps(first, second, 63);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first =
_mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 
15); + __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); + kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); + kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); + kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); +} +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) +{ + return _mm256_blend_ps(a,pset1(b),1); +} +template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1(b),1); +} +template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) +{ + return _mm256_blend_ps(a,pset1(b),(1<<7)); +} +template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1(b),(1<<3)); +} +} +} +#endif +// end #include "src/Core/arch/AVX/PacketMath.h" +// #include "src/Core/arch/AVX512/PacketMath.h" +#ifndef EIGEN_PACKET_MATH_AVX512_H +#define EIGEN_PACKET_MATH_AVX512_H +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif +#endif +typedef __m512 Packet16f; +typedef __m512i Packet16i; +typedef __m512d Packet8d; +template <> +struct is_arithmetic<__m512> { + enum { value = true }; +}; +template <> +struct is_arithmetic<__m512i> { + enum { value = true }; +}; +template <> +struct is_arithmetic<__m512d> { + enum { value = true }; +}; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet16f type; + typedef Packet8f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) +#ifdef EIGEN_VECTORIZE_AVX512DQ + HasLog = 1, +#endif + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, +#endif + HasDiv = 1 + }; + }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet8d type; + typedef Packet4d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) + HasSqrt = 1, + HasRsqrt = EIGEN_FAST_MATH, +#endif + HasDiv = 1 + }; +}; +template <> +struct unpacket_traits { + typedef float type; + typedef Packet8f half; + enum { size = 16, 
alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef double type; + typedef Packet4d half; + enum { size = 8, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits { + typedef int type; + typedef Packet8i half; + enum { size = 16, alignment=Aligned64 }; +}; +template <> +EIGEN_STRONG_INLINE Packet16f pset1(const float& from) { + return _mm512_set1_ps(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d pset1(const double& from) { + return _mm512_set1_pd(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i pset1(const int& from) { + return _mm512_set1_epi32(from); +} +template <> +EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { + return _mm512_broadcastss_ps(_mm_load_ps1(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { + return _mm512_broadcastsd_pd(_mm_load_pd1(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16f plset(const float& a) { + return _mm512_add_ps( + _mm512_set1_ps(a), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, + 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +} +template <> +EIGEN_STRONG_INLINE Packet8d plset(const double& a) { + return _mm512_add_pd(_mm512_set1_pd(a), + _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); +} +template <> +EIGEN_STRONG_INLINE Packet16f padd(const Packet16f& a, + const Packet16f& b) { + return _mm512_add_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, + const Packet8d& b) { + return _mm512_add_pd(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, + const Packet16f& b) { + return _mm512_sub_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d psub(const Packet8d& a, + const Packet8d& b) { + return _mm512_sub_pd(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { + return _mm512_sub_ps(_mm512_set1_ps(0.0), a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { + return _mm512_sub_pd(_mm512_set1_pd(0.0), a); +} +template <> +EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet16f pmul(const Packet16f& a, + const Packet16f& b) { + return _mm512_mul_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, + const Packet8d& b) { + return _mm512_mul_pd(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, + const Packet16f& b) { + return _mm512_div_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, + const Packet8d& b) { + return _mm512_div_pd(a, b); +} +#ifdef __FMA__ +template <> +EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, + const Packet16f& c) { + return _mm512_fmadd_ps(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, + const Packet8d& c) { + return _mm512_fmadd_pd(a, b, c); +} +#endif +template <> +EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, + const Packet16f& b) { + return _mm512_min_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, + const Packet8d& b) { + return _mm512_min_pd(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, + const Packet16f& b) { + return _mm512_max_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, + const 
Packet8d& b) { + return _mm512_max_pd(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_and_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d pand(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_and_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0); + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_or_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d por(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_or_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_xor_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1); + Packet4f lane2_a = 
_mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_xor_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_andnot_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0); + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1); + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2); + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_andnot_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet16f pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i pload(const int* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16f ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { + Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from); + lane0 = _mm256_blend_ps( + lane0, _mm256_castps128_ps256(_mm_permute_ps( + _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))), + 15); + lane0 = 
_mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
+  Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
+  lane1 = _mm256_blend_ps(
+      lane1, _mm256_castps128_ps256(_mm_permute_ps(
+                 _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
+      15);
+  lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  Packet16f res = _mm512_undefined_ps();
+  res = _mm512_insertf32x8(res, lane0, 0);
+  res = _mm512_insertf32x8(res, lane1, 1);
+  return res;
+#else
+  Packet16f res = _mm512_undefined_ps();
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
+  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
+  return res;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+  Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  lane0 = _mm256_permute_pd(lane0, 3 << 2);
+  Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
+  lane1 = _mm256_permute_pd(lane1, 3 << 2);
+  Packet8d res = _mm512_undefined_pd();
+  res = _mm512_insertf64x4(res, lane0, 0);
+  return _mm512_insertf64x4(res, lane1, 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
+  Packet16f tmp = _mm512_undefined_ps();
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
+  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
+  return tmp;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
+  Packet8d tmp = _mm512_undefined_pd();
+  Packet2d tmp0 = _mm_load_pd1(from);
+  Packet2d tmp1 = _mm_load_pd1(from + 1);
+  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
+  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
+  tmp = _mm512_insertf64x4(tmp, lane0, 0);
+  return _mm512_insertf64x4(tmp, lane1, 1);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
+                                                from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
+      reinterpret_cast<__m512i*>(to), from);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
+                                                             Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  return _mm512_i32gather_ps(indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
+                                                            Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  return _mm512_i32gather_pd(indices, from, 8);
+}
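+// Illustration (not part of the upstream Eigen sources): unlike the AVX
+// versions earlier in this file, which assemble gathers lane by lane, the
+// AVX-512 gathers above and scatters below build an index vector
+// {0, stride, ..., 15*stride} and issue one hardware gather/scatter. The
+// scalar equivalent of the float scatter is roughly:
+//
+//   for (int i = 0; i < 16; ++i)
+//     to[i * stride] = lane_i_of(from);   // lane_i_of is pseudocode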
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
+                                                         const Packet16f& from,
+                                                         Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_multiplier =
+      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_ps(to, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
+                                                         const Packet8d& from,
+                                                         Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_pd(to, indices, from, 8);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
+  Packet16f pa = pset1<Packet16f>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
+  Packet8d pa = pset1<Packet8d>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
+  Packet16i pa = pset1<Packet16i>(a);
+  pstore(to, pa);
+}
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
+  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
+  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
+  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
+}
+template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
+{
+  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
+}
+template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
+{
+  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
+}
+template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
+{
+  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
+  return (__m512d)_mm512_and_si512((__m512i)a,
+                                   _mm512_set1_epi64(0x7fffffffffffffff));
+}
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)         \
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);  \
+  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1);
+#else
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
+  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
+      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
+      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
+  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
+      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
+      _mm512_extractf32x4_ps(INPUT, 3), 1);
+#endif
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
+  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
+  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+#else
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
+  OUTPUT =
_mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); +#endif +template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* +vecs) +{ + EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); + EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); + EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2); + EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3); + EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4); + EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5); + EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6); + EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7); + EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8); + EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9); + EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10); + EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11); + EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12); + EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13); + EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14); + EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15); + __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0); + __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0); + __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0); + __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0); + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1); + hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1); + hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1); + hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1); + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0)); + hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0); + hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0); + hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0); + hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0); + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 
= _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
+  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
+  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
+  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
+  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
+  __m512 final_output;
+  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
+  return final_output;
+}
+template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
+{
+  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
+  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
+  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
+  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
+  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
+  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
+  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
+  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
+  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
+  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
+  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
+  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
+  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
+  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
+  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
+  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
+  Packet4d tmp0, tmp1;
+  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
+  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
+  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
+  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
+  __m512d final_output = _mm512_undefined_pd();
+  final_output = _mm512_insertf64x4(final_output, final_0, 0);
+  return _mm512_insertf64x4(final_output, final_1, 1);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const
Packet16f& a) { +#if 0 + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f sum = padd(lane0, lane1); + Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1)); + tmp0 = _mm256_hadd_ps(tmp0, tmp0); + return pfirst(_mm256_hadd_ps(tmp0, tmp0)); +#else + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3)); + sum = _mm_hadd_ps(sum, sum); + sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); + return pfirst(sum); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d sum = padd(lane0, lane1); + Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); + return pfirst(_mm256_hadd_pd(tmp0, tmp0)); +} +template <> +EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + return padd(lane0, lane1); +#else + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f sum0 = padd(lane0, lane2); + Packet4f sum1 = padd(lane1, lane3); + return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = padd(lane0, lane1); + return res; +} +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { +#if 0 + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#else + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); +} +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); + res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE double 
predux_min(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = _mm256_min_pd(lane0, lane1); + res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); +} +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); + res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = _mm256_max_pd(lane0, lane1); + res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); +} +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet16f& first, + const Packet16f& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, + Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, + Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); + __m512i second_idx = + _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, + Offset - 5, Offset - 6, Offset - 7, Offset - 8, + Offset - 9, Offset - 10, Offset - 11, Offset - 12, + Offset - 13, Offset - 14, Offset - 15, Offset - 16); + unsigned short mask = 0xFFFF; + mask <<= (16 - Offset); + first = _mm512_permutexvar_ps(first_idx, first); + Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); + first = _mm512_mask_blend_ps(mask, first, tmp); + } + } +}; +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, + Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); + __m512i second_idx = _mm512_set_epi32( + 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, + Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); + unsigned char mask = 0xFF; + mask <<= (8 - Offset); + first = _mm512_permutexvar_pd(first_idx, first); + Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); + first = _mm512_mask_blend_pd(mask, first, tmp); + } + } +}; +#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ + EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]); + __m512 
T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]); + __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]); + __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]); + __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]); + __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]); + __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]); + __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]); + __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2)); + EIGEN_EXTRACT_8f_FROM_16f(S0, S0); + EIGEN_EXTRACT_8f_FROM_16f(S1, S1); + EIGEN_EXTRACT_8f_FROM_16f(S2, S2); + EIGEN_EXTRACT_8f_FROM_16f(S3, S3); + EIGEN_EXTRACT_8f_FROM_16f(S4, S4); + EIGEN_EXTRACT_8f_FROM_16f(S5, S5); + EIGEN_EXTRACT_8f_FROM_16f(S6, S6); + EIGEN_EXTRACT_8f_FROM_16f(S7, S7); + EIGEN_EXTRACT_8f_FROM_16f(S8, S8); + EIGEN_EXTRACT_8f_FROM_16f(S9, S9); + EIGEN_EXTRACT_8f_FROM_16f(S10, S10); + EIGEN_EXTRACT_8f_FROM_16f(S11, S11); + EIGEN_EXTRACT_8f_FROM_16f(S12, S12); + EIGEN_EXTRACT_8f_FROM_16f(S13, S13); + EIGEN_EXTRACT_8f_FROM_16f(S14, S14); + EIGEN_EXTRACT_8f_FROM_16f(S15, S15); + PacketBlock tmp; + tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20); + tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20); + tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20); + tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20); + tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31); + tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31); + tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31); + tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31); + tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20); + tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20); + tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20); + tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20); + tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31); + tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31); + tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31); + tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31); + tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20); + tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20); + tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20); + tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20); + tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31); + tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31); + tmp.packet[22] = 
_mm256_permute2f128_ps(S10_0, S14_0, 0x31); + tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31); + tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20); + tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20); + tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20); + tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20); + tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31); + tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31); + tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31); + tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31); + PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16); +} +#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \ + EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \ + INPUT[2 * INDEX + STRIDE]); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); + EIGEN_EXTRACT_8f_FROM_16f(S0, S0); + EIGEN_EXTRACT_8f_FROM_16f(S1, S1); + EIGEN_EXTRACT_8f_FROM_16f(S2, S2); + EIGEN_EXTRACT_8f_FROM_16f(S3, S3); + PacketBlock tmp; + tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20); + tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20); + tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31); + tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31); + tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20); + tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20); + tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31); + tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31); + PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1); + PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1); + PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1); + PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1); +} +#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \ + OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \ + OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1); +#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \ + OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \ + OUTPUT[INDEX] = \ + _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], 
kernel.packet[1], 0); + __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff); + __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff); + PacketBlock tmp; + tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x20); + tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x20); + tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x31); + tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x31); + tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x20); + tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x20); + tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x31); + tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x31); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1); +} +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]); + __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]); + __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]); + __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]); + __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]); + __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]); + PacketBlock tmp; + tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x20); + tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x20); + tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x31); + tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x31); + tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x20); + tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x20); + tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x31); + tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x31); + tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), + _mm512_extractf64x4_pd(T6, 0), 0x20); + tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), + _mm512_extractf64x4_pd(T7, 0), 0x20); + tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), + _mm512_extractf64x4_pd(T6, 0), 0x31); + tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), + _mm512_extractf64x4_pd(T7, 0), 0x31); + tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), + _mm512_extractf64x4_pd(T6, 1), 0x20); + tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), + 
_mm512_extractf64x4_pd(T7, 1), 0x20); + tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), + _mm512_extractf64x4_pd(T6, 1), 0x31); + tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), + _mm512_extractf64x4_pd(T7, 1), 0x31); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8); +} +template <> +EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& , + const Packet16f& , + const Packet16f& ) { + assert(false && "To be implemented"); + return Packet16f(); +} +template <> +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& , + const Packet8d& , + const Packet8d& ) { + assert(false && "To be implemented"); + return Packet8d(); +} +} +} +#endif +// end #include "src/Core/arch/AVX512/PacketMath.h" +// #include "src/Core/arch/AVX512/MathFunctions.h" +#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ +#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ +namespace Eigen { +namespace internal { +#if EIGEN_GNUC_AT_LEAST(5, 3) +#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ + const Packet16f p16f_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ + const Packet16f p16f_##NAME = (__m512)pset1(X) +#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ + const Packet8d p8d_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ + const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) +#if defined(EIGEN_VECTORIZE_AVX512DQ) +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +plog(const Packet16f& _x) { + Packet16f x = _x; + _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); + __mmask16 invalid_mask = + _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); + __mmask16 iszero_mask = + _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ); + x = pmax(x, p16f_min_norm_pos); + Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23)); + Packet16f e = _mm512_sub_ps(emm0, p16f_126f); + x = _mm512_and_ps(x, 
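+// Classic Cephes range reduction: write x = 2^e * m with m in [0.5, 1), so
+// log(x) = e*log(2) + log(m). The srli/sub above pull e out of the IEEE-754
+// exponent bits (biased by 126 so that m lands in [0.5, 1)); the and/or
+// completing here clears the exponent field of x and replaces it with 0.5's,
+// leaving just the mantissa m. If m < sqrt(1/2), e is decremented and the
+// polynomial argument becomes 2m - 1, keeping it near 0. Worked example
+// (illustrative): x = 8 gives e = 4, m = 0.5; since m < sqrt(1/2), e -> 3
+// and the polynomial is evaluated at 0, i.e. log(8) = 3 * log(2) exactly.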
p16f_inv_mant_mask); + x = _mm512_or_ps(x, p16f_half); + __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); + Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); + x = psub(x, p16f_1); + e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); + x = padd(x, tmp); + Packet16f x2 = pmul(x, x); + Packet16f x3 = pmul(x2, x); + Packet16f y, y1, y2; + y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1); + y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4); + y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7); + y = pmadd(y, x, p16f_cephes_log_p2); + y1 = pmadd(y1, x, p16f_cephes_log_p5); + y2 = pmadd(y2, x, p16f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + y1 = pmul(e, p16f_cephes_log_q1); + tmp = pmul(x2, p16f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p16f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, + _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); +} +#endif +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +pexp(const Packet16f& _x) { + _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f); + _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f); + Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo); + Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half)); + _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f); + Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x); + Packet16f r2 = pmul(r, r); + Packet16f y = p16f_cephes_exp_p0; + y = pmadd(y, r, p16f_cephes_exp_p1); + y = pmadd(y, r, p16f_cephes_exp_p2); + y = pmadd(y, r, p16f_cephes_exp_p3); + y = pmadd(y, r, p16f_cephes_exp_p4); + y = pmadd(y, r, p16f_cephes_exp_p5); + y = pmadd(y, r2, r); + y = padd(y, p16f_1); + Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127)); + emm0 = _mm512_slli_epi32(emm0, 23); + return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); +} +#if EIGEN_FAST_MATH +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +psqrt(const Packet16f& _x) { + _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + Packet16f neg_half = pmul(_x, p16f_minus_half); + __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); + Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), + _mm512_setzero_ps()); + x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + return pmul(_x, x); +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +psqrt(const Packet8d& _x) { + _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); + _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + Packet8d neg_half 
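+// Fast-math psqrt computes sqrt(x) as x * rsqrt(x): it seeds with the
+// 14-bit-accurate _mm512_rsqrt14_* estimate and refines it with Newton's
+// method, y' = y * (1.5 - 0.5 * x * y * y), which roughly doubles the number
+// of correct bits per step -- one step for float above, two for double
+// below. Note: _mm512_mask_blend_ps(k, a, b) returns b where the mask bit is
+// set, a semantics detail that matters for every masked blend in this
+// section. Scalar form of one refinement step (illustrative only):
+//
+//   double refine(double y, double x) { return y * (1.5 - 0.5 * x * y * y); }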
= pmul(_x, p8d_minus_half); + __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); + Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), + _mm512_setzero_pd()); + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + return pmul(_x, x); +} +#else +template <> +EIGEN_STRONG_INLINE Packet16f psqrt(const Packet16f& x) { + return _mm512_sqrt_ps(x); +} +template <> +EIGEN_STRONG_INLINE Packet8d psqrt(const Packet8d& x) { + return _mm512_sqrt_pd(x); +} +#endif +#ifdef EIGEN_FAST_MATH +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +prsqrt(const Packet16f& _x) { + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + Packet16f neg_half = pmul(_x, p16f_minus_half); + __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), + _mm512_rsqrt14_ps(_x)); + __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); + Packet16f infs_and_nans = _mm512_mask_blend_ps( + neg_mask, p16f_nan, + _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); + x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +prsqrt(const Packet8d& _x) { + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); + _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + Packet8d neg_half = pmul(_x, p8d_minus_half); + __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), + _mm512_rsqrt14_pd(_x)); + __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); + Packet8d infs_and_nans = _mm512_mask_blend_pd( + neg_mask, p8d_nan, + _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); +} +#else +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + return _mm512_rsqrt28_ps(x); +} +#endif +#endif +} +} +#endif +// end #include "src/Core/arch/AVX512/MathFunctions.h" +#elif defined EIGEN_VECTORIZE_AVX +// #include "src/Core/arch/SSE/PacketMath.h" +#ifndef EIGEN_PACKET_MATH_SSE_H +#define EIGEN_PACKET_MATH_SSE_H +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#endif +#endif +#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004) +template +struct eigen_packet_wrapper +{ + 
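+  // On the affected GCC/MinGW toolchains (__GXX_ABI_VERSION < 1004) the raw
+  // __m128 and __m256 types are not mangled as distinct types, so overloads
+  // and template specializations cannot tell the packet types apart;
+  // wrapping each intrinsic type in this struct restores distinct types,
+  // while the conversion operators below keep it a drop-in replacement.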
EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + T m_val; +}; +typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; +typedef eigen_packet_wrapper<__m128d> Packet2d; +#else +typedef __m128 Packet4f; +typedef __m128i Packet4i; +typedef __m128d Packet2d; +#endif +template<> struct is_arithmetic<__m128> { enum { value = true }; }; +template<> struct is_arithmetic<__m128i> { enum { value = true }; }; +template<> struct is_arithmetic<__m128d> { enum { value = true }; }; +#define vec4f_swizzle1(v,p,q,r,s) \ + (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) +#define vec4i_swizzle1(v,p,q,r,s) \ + (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) +#define vec2d_swizzle1(v,p,q) \ + (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) +#define vec4f_swizzle2(a,b,p,q,r,s) \ + (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) +#define vec4i_swizzle2(a,b,p,q,r,s) \ + (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + const Packet4f p4f_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + const Packet2d p2d_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + const Packet4i p4i_##NAME = pset1(X) +#ifndef EIGEN_VECTORIZE_AVX +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasBlend = 1 +#ifdef EIGEN_VECTORIZE_SSE4_1 + , + HasRound = 1, + HasFloor = 1, + HasCeil = 1 +#endif + }; +}; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasBlend = 1 +#ifdef EIGEN_VECTORIZE_SSE4_1 + , + HasRound = 1, + HasFloor = 1, + HasCeil = 1 +#endif + }; +}; +#endif +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet4i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasBlend = 1 + }; +}; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +#ifndef EIGEN_VECTORIZE_AVX +template<> struct scalar_div_cost { enum { value = 7 }; }; +template<> struct scalar_div_cost { enum { value = 8 }; }; +#endif +#if EIGEN_COMP_MSVC==1500 +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps(from,from,from,from); } +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& 
from) { return _mm_set_pd(from,from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } +#else +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } +#endif +#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); + return _mm_xor_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000)); + return _mm_xor_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); +} +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_mullo_epi32(a,b); +#else + return vec4i_swizzle1( + vec4i_swizzle2( + _mm_mul_epu32(a,b), + _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2), + vec4i_swizzle1(b,1,0,3,2)), + 0,2,0,2), + 0,2,1,3); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const 
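+// Note on pmul<Packet4i> above: SSE2 has no 32-bit lane-wise multiply
+// (_mm_mullo_epi32 arrived with SSE4.1), so it is emulated with two
+// _mm_mul_epu32, which multiply the even-indexed 32-bit lanes into 64-bit
+// products; the surrounding swizzles feed the odd lanes through the same
+// path and re-interleave the four low 32-bit halves. Scalar equivalent
+// (illustrative only):
+//
+//   for (int i = 0; i < 4; ++i)
+//     r[i] = (int)((int64_t)a[i] * (int64_t)b[i]);  // keep low 32 bits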
Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +#endif +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_min_epi32(a,b); +#else + Packet4i mask = _mm_cmplt_epi32(a,b); + return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_max_epi32(a,b); +#else + Packet4i mask = _mm_cmpgt_epi32(a,b); + return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); +#endif +} +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } +#endif +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +#if EIGEN_COMP_MSVC + template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + 
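+    // pload maps to the aligned _mm_load_* forms and requires 16-byte
+    // aligned addresses; ploadu tolerates any address. The MSVC 2010
+    // (_MSC_VER == 1600) special case below assembles the packet from two
+    // 64-bit halves with _mm_loadl_pi/_mm_loadh_pi because that compiler
+    // was known to generate bad code for _mm_loadu_ps.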
EIGEN_DEBUG_UNALIGNED_LOAD + #if (EIGEN_COMP_MSVC==1600) + __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); + res = _mm_loadh_pi(res, (const __m64*)(from+2)); + return res; + #else + return _mm_loadu_ps(from); + #endif + } +#else +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_ps(from); +} +#endif +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_pd(from); +} +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_si128(reinterpret_cast(from)); +} +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast(from))), 0, 0, 1, 1); +} +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ return pset1(from[0]); } +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +{ + Packet4i tmp; + tmp = _mm_loadl_epi64(reinterpret_cast(from)); + return vec4i_swizzle1(tmp, 0, 0, 1, 1); +} +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +{ + return _mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} +template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) +{ + Packet4f pa = _mm_set_ss(a); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); +} +template<> EIGEN_STRONG_INLINE void 
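+// pgather/pscatter above emulate strided vector access with scalar moves,
+// since SSE offers no gather/scatter instructions. pgather<float, Packet4f>
+// packs from[0], from[stride], from[2*stride], from[3*stride] into one
+// packet, which is how a column of a row-major matrix becomes a packet
+// (illustrative only; mat/ROWS/COLS are hypothetical names):
+//
+//   // float mat[ROWS][COLS];  column j, stride = COLS:
+//   Packet4f col = pgather<float, Packet4f>(&mat[0][j], COLS);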
pstore1(double* to, const double& a) +{ + Packet2d pa = _mm_set_sd(a); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); +} +#ifndef EIGEN_VECTORIZE_AVX +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif +#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return a.m128_f32[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return a.m128d_f64[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } +#elif EIGEN_COMP_MSVC_STRICT +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } +#else +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return _mm_cvtss_f32(a); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } +#endif +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ return _mm_shuffle_ps(a,a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ return _mm_shuffle_pd(a,a,0x1); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm_and_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) +{ + #ifdef EIGEN_VECTORIZE_SSSE3 + return _mm_abs_epi32(a); + #else + Packet4i aux = _mm_srai_epi32(a,31); + return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); + #endif +} +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif +EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) +{ + vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); + vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA)); + vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF)); + vecs[0] = 
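+// pabs above clears the IEEE-754 sign bit with a bitwise AND (0x7FFFFFFF
+// mask per float lane), and pnegate earlier flips it with XOR 0x80000000 --
+// branch-free sign manipulation, possible because the sign is a single bit
+// in the encoding. For Packet2d the 64-bit masks are built from 32-bit
+// halves, which is why the _mm_setr_epi32 pattern alternates: the sign bit
+// of each double sits in the upper 32-bit half of its lane.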
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
+#ifdef EIGEN_VECTORIZE_SSE3
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_hadd_pd(vecs[0], vecs[1]);
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
+  tmp0 = _mm_add_ps(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
+  tmp1 = _mm_add_ps(tmp1, tmp2);
+  tmp2 = _mm_movehl_ps(tmp1, tmp0);
+  tmp0 = _mm_movelh_ps(tmp0, tmp1);
+  return _mm_add_ps(tmp0, tmp2);
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
+}
+#endif
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+}
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);
+  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
+}
+#else
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+#endif
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
+}
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+#endif
+}
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp =
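+// These horizontal reductions (predux, predux_mul, predux_min, predux_max)
+// share one log2(N)-step folding tree: _mm_movehl_ps folds the high pair
+// onto the low pair, then a shuffle folds lane 1 onto lane 0. For
+// predux<Packet4f> on {a,b,c,d}: step 1 yields {a+c, b+d, ..}, step 2 adds
+// lane 1 into lane 0, and pfirst extracts a+c+b+d.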
_mm_max_ps(a, _mm_movehl_ps(a,a)); + return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); +} +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ + return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); +} +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); + return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); +#else + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; + int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; + return aux0>aux2 ? aux0 : aux2; +#endif +} +#if EIGEN_COMP_GNUC +#endif +#ifdef EIGEN_VECTORIZE_SSSE3 +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + if (Offset!=0) + first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + if (Offset!=0) + first = _mm_alignr_epi8(second,first, Offset*4); + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset==1) + first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); + } +}; +#else +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + if (Offset==1) + { + first = _mm_move_ss(first,second); + first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); + } + else if (Offset==2) + { + first = _mm_movehl_ps(first,first); + first = _mm_movelh_ps(first,second); + } + else if (Offset==3) + { + first = _mm_move_ss(first,second); + first = _mm_shuffle_ps(first,second,0x93); + } + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + if (Offset==1) + { + first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); + first = _mm_shuffle_epi32(first,0x39); + } + else if (Offset==2) + { + first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); + first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); + } + else if (Offset==3) + { + first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); + first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); + } + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset==1) + { + first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); + first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); + } + } +}; +#endif +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m128i T2 = 
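+// palign_impl<Offset> above concatenates two neighbouring packets and shifts
+// by Offset scalars: with SSSE3 that is one _mm_alignr_epi8 with a byte
+// offset of Offset*4, otherwise a handful of movss/movehl/shufps moves. It
+// lets Eigen turn a misaligned vector read into two aligned loads plus a
+// realignment, e.g. Offset 1 on first={a0,a1,a2,a3}, second={a4,a5,a6,a7}
+// rewrites first to {a1,a2,a3,a4}.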
_mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); + kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); + kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); + kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); +} +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),1); +#else + return _mm_move_ss(a, _mm_load_ss(&b)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),1); +#else + return _mm_move_sd(a, _mm_load_sd(&b)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),(1<<3)); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); + return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),(1<<1)); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); + return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); +#endif +} +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { + return ::fmaf(a,b,c); +} +template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { + return ::fma(a,b,c); +} +#endif +} +} +#endif +// end #include "src/Core/arch/SSE/PacketMath.h" +// #include "src/Core/arch/SSE/Complex.h" +#ifndef EIGEN_COMPLEX_SSE_H +#define EIGEN_COMPLEX_SSE_H +namespace Eigen { +namespace internal { +struct Packet2cf +{ + 
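+  // Packet2cf holds two std::complex<float> interleaved in one __m128 as
+  // [re0, im0, re1, im1]. With this layout the SSE3 path of pmul below
+  // evaluates (a+bi)(c+di) = (ac - bd) + (ad + bc)i by broadcasting the
+  // real/imaginary parts (moveldup/movehdup), multiplying against b and a
+  // re/im-swapped b, and letting addsub apply the -/+ sign pattern.
+  // Scalar reference (illustrative only):
+  //
+  //   re = a.re * b.re - a.im * b.im;
+  //   im = a.re * b.im + a.im * b.re;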
EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} + __m128 v; +}; +#ifndef EIGEN_VECTORIZE_AVX +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasBlend = 1 + }; +}; +#endif +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) +{ + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); + return Packet2cf(_mm_xor_ps(a.v,mask)); +} +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_xor_ps(a.v,mask)); +} +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + #ifdef EIGEN_VECTORIZE_SSE3 + return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), + _mm_mul_ps(_mm_movehdup_ps(a.v), + vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); + #endif +} +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; +#if EIGEN_GNUC_AT_MOST(4,2) + res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast(&from)); +#elif EIGEN_GNUC_AT_LEAST(4,6) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wuninitialized" + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + #pragma GCC diagnostic pop +#else + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); +#endif + return Packet2cf(_mm_movelh_ps(res.v,res.v)); +} +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> 
EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + #if EIGEN_GNUC_AT_MOST(4,3) + EIGEN_ALIGN16 std::complex res[2]; + _mm_store_ps((float*)res, a.v); + return res[0]; + #else + std::complex res; + _mm_storel_pi((__m64*)&res, a.v); + return res; + #endif +} +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); +} +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = _mm_movehl_ps(first.v, first.v); + first.v = _mm_movelh_ps(first.v, second.v); + } + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(a, pconj(b)); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(pconj(a), b); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf 
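+// Each conj_helper specialization folds conjugation into the product so that
+// expressions like a.conjugate() * b cost no extra pass over the data: one
+// variant computes a * conj(b), its mirror computes conj(a) * b, and the
+// variant here handles conj(a) * conj(b) = conj(a * b), which is why its
+// SSE3 path simply wraps the plain pmul in pconj.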
pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return pconj(internal::pmul(a, b)); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const + { return Packet2cf(Eigen::internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const + { return Packet2cf(Eigen::internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res = conj_helper().pmul(a,b); + __m128 s = _mm_mul_ps(b.v,b.v); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); +} +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} + __m128d v; +}; +#ifndef EIGEN_VECTORIZE_AVX +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; +#endif +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) +{ + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_xor_pd(a.v,mask)); +} +template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) +{ + #ifdef EIGEN_VECTORIZE_SSE3 + return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v), + _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)))); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)), mask))); + #endif +} +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, 
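+// Complex division (pdiv for Packet2cf above, and for Packet1cd below) uses
+// the textbook identity a/b = a*conj(b) / |b|^2: the conj_helper supplies
+// a*conj(b), b*b squares both components, and adding the re/im-swapped copy
+// of that square broadcasts |b|^2 = re^2 + im^2 across each complex slot
+// before the divide.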
const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, a.v); + return std::complex(res[0],res[1]); +} +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) +{ + return pfirst(a); +} +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) +{ + return pfirst(a); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& , const Packet1cd& ) + { + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(a, pconj(b)); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(pconj(a), b); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)), mask))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return pconj(internal::pmul(a, b)); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), 
+ vec2d_swizzle1(b.v, 1, 0)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const + { return Packet1cd(Eigen::internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(Eigen::internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + Packet1cd res = conj_helper().pmul(a,b); + __m128d s = _mm_mul_pd(b.v,b.v); + return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); +} +EIGEN_STRONG_INLINE Packet1cd pcplxflip(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128d w1 = _mm_castps_pd(kernel.packet[0].v); + __m128d w2 = _mm_castps_pd(kernel.packet[1].v); + __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); + kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); + kernel.packet[1].v = tmp; +} +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} +template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) +{ + return pset1(b); +} +template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) +{ + return pset1(b); +} +} +} +#endif +// end #include "src/Core/arch/SSE/Complex.h" +// #include "src/Core/arch/SSE/MathFunctions.h" +#ifndef EIGEN_MATH_FUNCTIONS_SSE_H +#define EIGEN_MATH_FUNCTIONS_SSE_H +namespace Eigen { +namespace internal { +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); + 
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + Packet4i emm0; + Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); + Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); + x = pmax(x, p4f_min_norm_pos); + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); + x = _mm_and_ps(x, p4f_inv_mant_mask); + x = _mm_or_ps(x, p4f_half); + emm0 = _mm_sub_epi32(emm0, p4i_0x7f); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); + Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); + Packet4f tmp = pand(x, mask); + x = psub(x, p4f_1); + e = psub(e, pand(p4f_1, mask)); + x = padd(x, tmp); + Packet4f x2 = pmul(x,x); + Packet4f x3 = pmul(x2,x); + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y , x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + y1 = pmul(e, p4f_cephes_log_q1); + tmp = pmul(x2, p4f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), + _mm_and_ps(iszero_mask, p4f_minus_inf)); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + Packet4f tmp, fx; + Packet4i emm0; + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); +#ifdef EIGEN_VECTORIZE_SSE4_1 + fx = _mm_floor_ps(fx); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + Packet4f mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, p4f_1); + fx = psub(tmp, mask); +#endif + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + z = pmul(x,x); + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& _x) +{ + Packet2d x = _x; + _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); + _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); + 
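+// Unlike the single-precision pexp above, the double variant evaluates a
+// Pade-style rational on the reduced argument r: with px = P(r^2)*r and
+// qx = Q(r^2), exp(r) ~= 1 + 2*px/(qx - px), which reaches double accuracy
+// with far fewer coefficients than a plain polynomial would need. The
+// p2d_1/p2d_2 constants above are the 1 and 2 of that formula, applied via
+// pmadd after the pdiv.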
_EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); + Packet2d tmp, fx; + Packet4i emm0; + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); +#ifdef EIGEN_VECTORIZE_SSE4_1 + fx = _mm_floor_pd(fx); +#else + emm0 = _mm_cvttpd_epi32(fx); + tmp = _mm_cvtepi32_pd(emm0); + Packet2d mask = _mm_cmpgt_pd(tmp, fx); + mask = _mm_and_pd(mask, p2d_1); + fx = psub(tmp, mask); +#endif + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + Packet2d x2 = pmul(x,x); + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + emm0 = _mm_cvttpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023_0); + emm0 = _mm_slli_epi32(emm0, 20); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); + return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psin(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(1, 1); + _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); + _EIGEN_DECLARE_CONST_Packet4i(2, 2); + _EIGEN_DECLARE_CONST_Packet4i(4, 4); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); + Packet4f xmm1, xmm2, xmm3, sign_bit, y; + Packet4i emm0, emm2; + sign_bit = x; + x = pabs(x); + sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); + y = pmul(x, p4f_cephes_FOPI); + emm2 = _mm_cvttps_epi32(y); + emm2 = _mm_add_epi32(emm2, p4i_1); + emm2 = _mm_and_si128(emm2, p4i_not1); + y = _mm_cvtepi32_ps(emm2); + emm0 = _mm_and_si128(emm2, p4i_4); + emm0 = _mm_slli_epi32(emm0, 
29); + emm2 = _mm_and_si128(emm2, p4i_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + Packet4f swap_sign_bit = _mm_castsi128_ps(emm0); + Packet4f poly_mask = _mm_castsi128_ps(emm2); + sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); + xmm1 = pmul(y, p4f_minus_cephes_DP1); + xmm2 = pmul(y, p4f_minus_cephes_DP2); + xmm3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, xmm1); + x = padd(x, xmm2); + x = padd(x, xmm3); + y = p4f_coscof_p0; + Packet4f z = _mm_mul_ps(x,x); + y = pmadd(y, z, p4f_coscof_p1); + y = pmadd(y, z, p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + Packet4f tmp = pmul(z, p4f_half); + y = psub(y, tmp); + y = padd(y, p4f_1); + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmul(y2, x); + y2 = padd(y2, x); + y2 = _mm_and_ps(poly_mask, y2); + y = _mm_andnot_ps(poly_mask, y); + y = _mm_or_ps(y,y2); + return _mm_xor_ps(y, sign_bit); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pcos(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(1, 1); + _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); + _EIGEN_DECLARE_CONST_Packet4i(2, 2); + _EIGEN_DECLARE_CONST_Packet4i(4, 4); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); + Packet4f xmm1, xmm2, xmm3, y; + Packet4i emm0, emm2; + x = pabs(x); + y = pmul(x, p4f_cephes_FOPI); + emm2 = _mm_cvttps_epi32(y); + emm2 = _mm_add_epi32(emm2, p4i_1); + emm2 = _mm_and_si128(emm2, p4i_not1); + y = _mm_cvtepi32_ps(emm2); + emm2 = _mm_sub_epi32(emm2, p4i_2); + emm0 = _mm_andnot_si128(emm2, p4i_4); + emm0 = _mm_slli_epi32(emm0, 29); + emm2 = _mm_and_si128(emm2, p4i_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + Packet4f sign_bit = _mm_castsi128_ps(emm0); + Packet4f poly_mask = _mm_castsi128_ps(emm2); + xmm1 = pmul(y, p4f_minus_cephes_DP1); + xmm2 = pmul(y, p4f_minus_cephes_DP2); + xmm3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, xmm1); + x = padd(x, xmm2); + x = padd(x, xmm3); + y = p4f_coscof_p0; + Packet4f z = pmul(x,x); + y = pmadd(y,z,p4f_coscof_p1); + y = pmadd(y,z,p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + Packet4f tmp = _mm_mul_ps(z, p4f_half); + y = psub(y, tmp); + y = padd(y, p4f_1); + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmadd(y2, x, x); + y2 = _mm_and_ps(poly_mask, y2); + y = _mm_andnot_ps(poly_mask, y); + y = _mm_or_ps(y,y2); + return _mm_xor_ps(y, sign_bit); +} +#if EIGEN_FAST_MATH +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& _x) +{ + Packet4f half = pmul(_x, pset1(.5f)); + Packet4f denormal_mask = _mm_and_ps( + _mm_cmpge_ps(_x, _mm_setzero_ps()), + _mm_cmplt_ps(_x, pset1((std::numeric_limits::min)()))); + 
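+  // One Newton-Raphson step refines the hardware rsqrt estimate:
+  // x <- x*(1.5 - 0.5*_x*x^2); multiplying by _x then yields sqrt(_x).
+  // denormal_mask (0 <= _x < FLT_MIN) zeroes results the iteration cannot
+  // produce accurately.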
Packet4f x = _mm_rsqrt_ps(_x); + x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + return _mm_andnot_ps(denormal_mask, pmul(_x,x)); +} +#else +template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& x) { return _mm_sqrt_ps(x); } +#endif +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } +#if EIGEN_FAST_MATH +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& _x) { + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + Packet4f neg_half = pmul(_x, p4f_minus_half); + Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); + Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); + Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); + Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); + Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), + _mm_and_ps(zero_mask, p4f_inf)); + x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); + return _mm_or_ps(x, infs_and_nans); +} +#else +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) { + return _mm_div_ps(pset1(1.0f), _mm_sqrt_ps(x)); +} +#endif +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt(const Packet2d& x) { + return _mm_div_pd(pset1(1.0), _mm_sqrt_pd(x)); +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& x) { + return internal::generic_fast_tanh_float(x); +} +} +namespace numext { +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sqrt(const float &x) +{ + return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sqrt(const double &x) +{ +#if EIGEN_COMP_GNUC_STRICT + return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); +#else + return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); +#endif +} +} +} +#endif +// end #include "src/Core/arch/SSE/MathFunctions.h" +// #include "src/Core/arch/AVX/PacketMath.h" +#ifndef EIGEN_PACKET_MATH_AVX_H +#define EIGEN_PACKET_MATH_AVX_H +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif +#endif +typedef __m256 Packet8f; +typedef __m256i Packet8i; +typedef __m256d Packet4d; +template<> struct is_arithmetic<__m256> { enum { value = true }; }; +template<> struct is_arithmetic<__m256i> { enum { value = true }; }; +template<> struct is_arithmetic<__m256d> { enum { value = true }; }; +#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ + const Packet8f p8f_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ + const Packet4d p4d_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ + const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1(X)) +#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ + const 
Packet8i p8i_##NAME = pset1<Packet8i>(X)
+#ifndef EIGEN_VECTORIZE_AVX512
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet8f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = 0,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet4d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 1,
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+#endif
+template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
+template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{
+  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{
+  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+}
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& , const Packet8i& )
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+  Packet8f res = c;
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+  Packet4d res = c;
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_pd(a,b,c);
+#endif
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<>
EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } +template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) +{ + Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); +} +template<> EIGEN_STRONG_INLINE Packet4d ploaddup(const double* from) +{ + Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); + return _mm256_permute_pd(tmp, 3<<2); +} +template<> EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) +{ + Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); + return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); +} +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } +template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, Index stride) +{ + return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4d pgather(const double* from, Index stride) +{ + return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet8f& from, Index stride) +{ + __m128 low = _mm256_extractf128_ps(from, 0); + to[stride*0] = _mm_cvtss_f32(low); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); + __m128 high = _mm256_extractf128_ps(from, 1); + to[stride*4] = _mm_cvtss_f32(high); + to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); + to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); + to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet4d& from, Index stride) +{ + __m128d low = _mm256_extractf128_pd(from, 0); + to[stride*0] = _mm_cvtsd_f64(low); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); + __m128d high = _mm256_extractf128_pd(from, 1); + to[stride*2] = _mm_cvtsd_f64(high); + to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); +} +template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) +{ + Packet8f pa = pset1(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) +{ + Packet4d pa = pset1(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) +{ + Packet8i pa = pset1(a); + 
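+  // pstore1 = broadcast the scalar into a full packet, then one aligned
+  // store (same pattern as the float/double overloads above).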
pstore(to, pa); +} +#ifndef EIGEN_VECTORIZE_AVX512 +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif +template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { + return _mm_cvtss_f32(_mm256_castps256_ps128(a)); +} +template<> EIGEN_STRONG_INLINE double pfirst(const Packet4d& a) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); +} +template<> EIGEN_STRONG_INLINE int pfirst(const Packet8i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} +template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) +{ + __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); + return _mm256_permute2f128_ps(tmp, tmp, 1); +} +template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) +{ + __m256d tmp = _mm256_shuffle_pd(a,a,5); + return _mm256_permute2f128_pd(tmp, tmp, 1); + __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); + return _mm256_permute_pd(swap_halves,5); +} +template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) +{ + const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm256_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) +{ + const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm256_and_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + return _mm256_blend_pd(tmp0, tmp1, 0xC); +} +template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) +{ + return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)))); +} +template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) +{ + return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); +} +template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) +{ + 
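+  // Narrows 8 floats to 4 by adding the low and high 128-bit halves
+  // element-wise.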
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); +} +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) +{ + Packet8f tmp; + tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) +{ + Packet4d tmp; + tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) +{ + Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) +{ + Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} +template<> EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) +{ + Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) +{ + Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; +template 
+struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); + __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); + kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); + 
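+  // _mm256_permute2f128_pd selector 32 (0x20) concatenates the two low
+  // 128-bit halves and 49 (0x31) the two high halves, completing the 4x4
+  // double transpose.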
kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); + kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); +} +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) +{ + return _mm256_blend_ps(a,pset1(b),1); +} +template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1(b),1); +} +template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) +{ + return _mm256_blend_ps(a,pset1(b),(1<<7)); +} +template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1(b),(1<<3)); +} +} +} +#endif +// end #include "src/Core/arch/AVX/PacketMath.h" +// #include "src/Core/arch/AVX/MathFunctions.h" +#ifndef EIGEN_MATH_FUNCTIONS_AVX_H +#define EIGEN_MATH_FUNCTIONS_AVX_H +namespace Eigen { +namespace internal { +inline Packet8i pshiftleft(Packet8i v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(v, n); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} +inline Packet8f pshiftright(Packet8f v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); + return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +psin(const Packet8f& _x) { + Packet8f x = _x; + _EIGEN_DECLARE_CONST_Packet8i(one, 1); + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); + _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); + _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); + _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); + Packet8f z = pmul(x, p8f_one_over_pi); + Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); + x = pmadd(shift, p8f_neg_pi_first, x); + x = pmadd(shift, p8f_neg_pi_second, x); + x = pmadd(shift, p8f_neg_pi_third, x); + z = pmul(x, p8f_four_over_pi); + Packet8i shift_ints = 
_mm256_cvtps_epi32(shift); + Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); + Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); + Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); + Packet8f z_minus_two = psub(z, p8f_two); + Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); + Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); + right = pmadd(right, z_minus_two2, p8f_coeff_right_2); + right = pmadd(right, z_minus_two2, p8f_coeff_right_0); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); + Packet8f z2 = pmul(z, z); + Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); + left = pmadd(left, z2, p8f_coeff_left_3); + left = pmadd(left, z2, p8f_coeff_left_1); + left = pmul(left, z); + left = _mm256_andnot_ps(ival_mask, left); + right = _mm256_and_ps(ival_mask, right); + Packet8f res = _mm256_or_ps(left, right); + res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); + return res; +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +plog(const Packet8f& _x) { + Packet8f x = _x; + _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); + Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); + Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); + x = pmax(x, p8f_min_norm_pos); + Packet8f emm0 = pshiftright(x,23); + Packet8f e = _mm256_sub_ps(emm0, p8f_126f); + x = _mm256_and_ps(x, p8f_inv_mant_mask); + x = _mm256_or_ps(x, p8f_half); + Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); + Packet8f tmp = _mm256_and_ps(x, mask); + x = psub(x, p8f_1); + e = psub(e, _mm256_and_ps(p8f_1, mask)); + x = padd(x, tmp); + Packet8f x2 = pmul(x, x); + Packet8f x3 = pmul(x2, x); + Packet8f y, y1, y2; + y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); + y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); 
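+  // The degree-8 log polynomial is evaluated as three interleaved Horner
+  // strands (y, y1, y2) that are merged with x^3 steps below; this shortens
+  // the dependency chain compared to a single Horner chain.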
+ y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); + y = pmadd(y, x, p8f_cephes_log_p2); + y1 = pmadd(y1, x, p8f_cephes_log_p5); + y2 = pmadd(y2, x, p8f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + y1 = pmul(e, p8f_cephes_log_q1); + tmp = pmul(x2, p8f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p8f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + return _mm256_or_ps( + _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), + _mm256_and_ps(iszero_mask, p8f_minus_inf)); +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +pexp(const Packet8f& _x) { + _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); + _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); + Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); + Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); +#ifdef EIGEN_VECTORIZE_FMA + _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); + Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); +#else + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); + Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); + r = psub(r, pmul(m, p8f_cephes_exp_C2)); +#endif + Packet8f r2 = pmul(r, r); + Packet8f y = p8f_cephes_exp_p0; + y = pmadd(y, r, p8f_cephes_exp_p1); + y = pmadd(y, r, p8f_cephes_exp_p2); + y = pmadd(y, r, p8f_cephes_exp_p3); + y = pmadd(y, r, p8f_cephes_exp_p4); + y = pmadd(y, r, p8f_cephes_exp_p5); + y = pmadd(y, r2, r); + y = padd(y, p8f_1); + Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); + emm0 = pshiftleft(emm0, 23); + return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +ptanh(const Packet8f& x) { + return internal::generic_fast_tanh_float(x); +} +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +pexp(const Packet4d& _x) { + Packet4d x = _x; + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); + _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); + 
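+  // cephes_exp_C1 + cephes_exp_C2 is ln(2) split into a high and a low part,
+  // so the range reduction x - fx*C1 - fx*C2 below loses no precision.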
_EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + Packet4d tmp, fx; + x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); + fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); + fx = _mm256_floor_pd(fx); + tmp = pmul(fx, p4d_cephes_exp_C1); + Packet4d z = pmul(fx, p4d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + Packet4d x2 = pmul(x, x); + Packet4d px = p4d_cephes_exp_p0; + px = pmadd(px, x2, p4d_cephes_exp_p1); + px = pmadd(px, x2, p4d_cephes_exp_p2); + px = pmul(px, x); + Packet4d qx = p4d_cephes_exp_q0; + qx = pmadd(qx, x2, p4d_cephes_exp_q1); + qx = pmadd(qx, x2, p4d_cephes_exp_q2); + qx = pmadd(qx, x2, p4d_cephes_exp_q3); + x = _mm256_div_pd(px, psub(qx, px)); + x = pmadd(p4d_2, x, p4d_1); + __m128i emm0 = _mm256_cvtpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i lo = _mm_slli_epi64(emm0, 52); + __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); + __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); + e = _mm256_insertf128_si256(e, hi, 1); + return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); +} +#if EIGEN_FAST_MATH +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +psqrt(const Packet8f& _x) { + Packet8f half = pmul(_x, pset1(.5f)); + Packet8f denormal_mask = _mm256_and_ps( + _mm256_cmp_ps(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ)); + Packet8f x = _mm256_rsqrt_ps(_x); + x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + return _mm256_andnot_ps(denormal_mask, pmul(_x,x)); +} +#else +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt(const Packet8f& x) { + return _mm256_sqrt_ps(x); +} +#endif +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d psqrt(const Packet4d& x) { + return _mm256_sqrt_pd(x); +} +#if EIGEN_FAST_MATH +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f prsqrt(const Packet8f& _x) { + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); + Packet8f neg_half = pmul(_x, p8f_minus_half); + Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); + Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); + Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); + Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), + _mm256_and_ps(zero_mask, p8f_inf)); + x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); + return _mm256_or_ps(x, infs_and_nans); +} +#else +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f prsqrt(const Packet8f& x) { + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); +} +#endif +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d prsqrt(const Packet4d& x) { + _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); + return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); +} +} +} +#endif +// end #include "src/Core/arch/AVX/MathFunctions.h" +// #include "src/Core/arch/AVX/Complex.h" +#ifndef EIGEN_COMPLEX_AVX_H +#define 
EIGEN_COMPLEX_AVX_H
+namespace Eigen {
+namespace internal {
+struct Packet4cf
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256 v;
+};
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+  return Packet4cf(result);
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from+1);
+  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+template<> EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
std::imag(from[0*stride]), std::real(from[0*stride]))); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cf>(std::complex* to, const Packet4cf& from, Index stride) +{ + __m128 low = _mm256_extractf128_ps(from.v, 0); + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); + __m128 high = _mm256_extractf128_ps(from.v, 1); + to[stride*2] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); + to[stride*3] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cf& a) +{ + return pfirst(Packet2cf(_mm256_castps256_ps128(a.v))); +} +template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m128 low = _mm256_extractf128_ps(a.v, 0); + __m128 high = _mm256_extractf128_ps(a.v, 1); + __m128d lowd = _mm_castps_pd(low); + __m128d highd = _mm_castps_pd(high); + low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); + high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); + __m256 result = _mm256_setzero_ps(); + result = _mm256_insertf128_ps(result, low, 1); + result = _mm256_insertf128_ps(result, high, 0); + return Packet4cf(result); +} +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cf& a) +{ + return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)), + Packet2cf(_mm256_extractf128_ps(a.v,1)))); +} +template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) +{ + Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t0 = _mm256_hadd_ps(t0,t1); + Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t2 = _mm256_hadd_ps(t2,t3); + t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); + t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); + return Packet4cf(_mm256_add_ps(t1,t3)); +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) +{ + return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), + Packet2cf(_mm256_extractf128_ps(a.v, 1)))); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> struct conj_helper +{ + 
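+  // Mixed real*complex helper: the real packet is expected to hold each real
+  // coefficient duplicated across the corresponding real/imag lanes, so a
+  // plain componentwise pmul implements the scaling and no conjugation logic
+  // is needed.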
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const + { return Packet4cf(Eigen::internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const + { return Packet4cf(Eigen::internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) +{ + Packet4cf num = pmul(a, pconj(b)); + __m256 tmp = _mm256_mul_ps(b.v, b.v); + __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); + __m256 denom = _mm256_add_ps(tmp, tmp2); + return Packet4cf(_mm256_div_ps(num.v, denom)); +} +template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) +{ + return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} +struct Packet2cd +{ + EIGEN_STRONG_INLINE Packet2cd() {} + EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} + __m256d v; +}; +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 2, + HasHalfPacket = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; +template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) +{ + const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet2cd(_mm256_xor_pd(a.v,mask)); +} +template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) +{ + __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0); + __m256d even = _mm256_mul_pd(tmp1, b.v); + __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF); + __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5); + __m256d odd = _mm256_mul_pd(tmp2, tmp3); + return Packet2cd(_mm256_addsub_pd(even, odd)); +} +template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cd pset1(const 
std::complex<double>& from)
+{
+  return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+}
+template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
+template<> EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
+{
+  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
+{
+  __m128d low = _mm256_extractf128_pd(from.v, 0);
+  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+  __m128d high = _mm256_extractf128_pd(from.v, 1);
+  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
+{
+  __m128d low = _mm256_extractf128_pd(a.v, 0);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0],res[1]);
+}
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
+  return Packet2cd(result);
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
+{
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
+{
+  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
+  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
+  return Packet2cd(_mm256_add_pd(t0,t1));
+}
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
+{
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+template<int Offset>
+struct palign_impl<Offset,Packet2cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
+  }
+};
+template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+template<> struct conj_helper<Packet4d, Packet2cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
+  { return Packet2cd(Eigen::internal::pmul<Packet4d>(x, y.v)); }
+};
+template<> struct conj_helper<Packet2cd, Packet4d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
+  { return Packet2cd(Eigen::internal::pmul<Packet4d>(x.v, y)); }
+};
+template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{
+  Packet2cd num = pmul(a, pconj(b));
+  __m256d tmp = _mm256_mul_pd(b.v, b.v);
+  __m256d denom = _mm256_hadd_pd(tmp, tmp);
+  return Packet2cd(_mm256_div_pd(num.v, denom));
+}
+template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
+{
+  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cf,4>& kernel) {
+  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cd,2>& kernel) {
+  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
+  kernel.packet[0].v = tmp;
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
+{
+  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
+}
+template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
+{
+  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
+{
+  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
+}
+template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
+{
+  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
+}
+}
+}
+#endif
+// end #include "src/Core/arch/AVX/Complex.h"
+// #include "src/Core/arch/AVX/TypeCasting.h"
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+namespace Eigen {
+namespace internal {
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvtps_epi32(a);
+}
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);
+}
+}
+}
+#endif
+// end #include "src/Core/arch/AVX/TypeCasting.h"
+#elif defined EIGEN_VECTORIZE_SSE
+// #include "src/Core/arch/SSE/PacketMath.h"
+#ifndef EIGEN_PACKET_MATH_SSE_H
+#define EIGEN_PACKET_MATH_SSE_H
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#endif
+#endif
+#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
+typedef __m128  Packet4f;
+typedef __m128i Packet4i;
+typedef __m128d Packet2d;
+#endif
+template<> struct is_arithmetic<__m128>  { enum { value = true }; };
+template<> struct is_arithmetic<__m128i> { enum { value = true }; };
+template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+#define vec4f_swizzle1(v,p,q,r,s) \
+  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+#define vec4i_swizzle1(v,p,q,r,s) \
+  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+#define vec2d_swizzle1(v,p,q) \
+  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+#define vec4f_swizzle2(a,b,p,q,r,s) \
+  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+#define vec4i_swizzle2(a,b,p,q,r,s) \
+  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasBlend = 1
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
+  };
+};
+#endif
+template<> struct packet_traits<int> : default_packet_traits
+{
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasBlend = 1
+  };
+};
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+#ifndef
EIGEN_VECTORIZE_AVX +template<> struct scalar_div_cost { enum { value = 7 }; }; +template<> struct scalar_div_cost { enum { value = 8 }; }; +#endif +#if EIGEN_COMP_MSVC==1500 +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps(from,from,from,from); } +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set_pd(from,from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set_epi32(from,from,from,from); } +#else +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return _mm_set_ps1(from); } +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } +#endif +#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); + return _mm_xor_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000)); + return _mm_xor_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); +} +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_mullo_epi32(a,b); +#else + return vec4i_swizzle1( + vec4i_swizzle2( + _mm_mul_epu32(a,b), + _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2), + vec4i_swizzle1(b,1,0,3,2)), + 0,2,0,2), + 0,2,1,3); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } 
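+// Usage sketch (a hypothetical caller-side snippet, not upstream Eigen code):
+// the wrappers above act lane-wise, so scalar identities carry over per lane.
+// For example, (a - b) * a over the four float lanes:
+//   Packet4f a = pset1<Packet4f>(2.0f);   // {2, 2, 2, 2}
+//   Packet4f b = plset<Packet4f>(0.0f);   // {0, 1, 2, 3}
+//   Packet4f r = pmul(psub(a, b), a);     // {4, 2, 0, -2}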
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +#endif +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_min_epi32(a,b); +#else + Packet4i mask = _mm_cmplt_epi32(a,b); + return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_max_epi32(a,b); +#else + Packet4i mask = _mm_cmpgt_epi32(a,b); + return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b)); +#endif +} +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } +#endif +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } +template<> 
EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } +#if EIGEN_COMP_MSVC + template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD + #if (EIGEN_COMP_MSVC==1600) + __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); + res = _mm_loadh_pi(res, (const __m64*)(from+2)); + return res; + #else + return _mm_loadu_ps(from); + #endif + } +#else +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_ps(from); +} +#endif +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_pd(from); +} +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return _mm_loadu_si128(reinterpret_cast(from)); +} +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast(from))), 0, 0, 1, 1); +} +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ return pset1(from[0]); } +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +{ + Packet4i tmp; + tmp = _mm_loadl_epi64(reinterpret_cast(from)); + return vec4i_swizzle1(tmp, 0, 0, 1, 1); +} +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +{ + return _mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 
1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); +} +template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) +{ + Packet4f pa = _mm_set_ss(a); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); +} +template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) +{ + Packet2d pa = _mm_set_sd(a); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); +} +#ifndef EIGEN_VECTORIZE_AVX +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif +#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return a.m128_f32[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return a.m128d_f64[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } +#elif EIGEN_COMP_MSVC_STRICT +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } +#else +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return _mm_cvtss_f32(a); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } +#endif +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ return _mm_shuffle_ps(a,a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ return _mm_shuffle_pd(a,a,0x1); } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ return _mm_shuffle_epi32(a,0x1B); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) +{ + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm_and_pd(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) +{ + #ifdef EIGEN_VECTORIZE_SSSE3 + return _mm_abs_epi32(a); + #else + Packet4i aux = _mm_srai_epi32(a,31); + return _mm_sub_epi32(_mm_xor_si128(a,aux),aux); + #endif +} +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif 
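+// Usage sketch (a hypothetical caller-side snippet; `m` is assumed to point at
+// a dense row-major 4x4 float matrix): the strided gather/scatter wrappers
+// above give packet access to a column, e.g. halving column 0 in place:
+//   Packet4f col = pgather<float, Packet4f>(m, 4);   // m[0], m[4], m[8], m[12]
+//   pscatter<float, Packet4f>(m, pmul(col, pset1<Packet4f>(0.5f)), 4);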
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
+{
+  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
+  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
+  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
+  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
+#ifdef EIGEN_VECTORIZE_SSE3
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_hadd_pd(vecs[0], vecs[1]);
+}
+#else
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
+  tmp0 = _mm_add_ps(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
+  tmp1 = _mm_add_ps(tmp1, tmp2);
+  tmp2 = _mm_movehl_ps(tmp1, tmp0);
+  tmp0 = _mm_movelh_ps(tmp0, tmp1);
+  return _mm_add_ps(tmp0, tmp2);
+}
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
+}
+#endif
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+}
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+}
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);
+  return pfirst(_mm_hadd_epi32(tmp0,tmp0));
+}
+#else
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+}
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
+}
+#endif
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
+}
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+#endif
+}
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+#endif
+}
+#if EIGEN_COMP_GNUC
+#endif
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset!=0)
+      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset!=0)
+      first = _mm_alignr_epi8(second,first, Offset*4);
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
+  }
+};
+#else
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+    }
+    else if (Offset==2)
+    {
+      first = _mm_movehl_ps(first,first);
+      first = _mm_movelh_ps(first,second);
+    }
+    else if (Offset==3)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_shuffle_ps(first,second,0x93);
+    }
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_shuffle_epi32(first,0x39);
+    }
+    else if (Offset==2)
+    {
+      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+    }
+    else if (Offset==3)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+    }
+  }
+};
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
+      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
+    }
+  }
+};
+#endif
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+
kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); + kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); + kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); + kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); +} +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),1); +#else + return _mm_move_ss(a, _mm_load_ss(&b)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),1); +#else + return _mm_move_sd(a, _mm_load_sd(&b)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),(1<<3)); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); + return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),(1<<1)); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); + return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); +#endif +} +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { + return ::fmaf(a,b,c); +} +template<> EIGEN_STRONG_INLINE double pmadd(const 
double& a, const double& b, const double& c) { + return ::fma(a,b,c); +} +#endif +} +} +#endif +// end #include "src/Core/arch/SSE/PacketMath.h" +// #include "src/Core/arch/SSE/MathFunctions.h" +#ifndef EIGEN_MATH_FUNCTIONS_SSE_H +#define EIGEN_MATH_FUNCTIONS_SSE_H +namespace Eigen { +namespace internal { +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + Packet4i emm0; + Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); + Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); + x = pmax(x, p4f_min_norm_pos); + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); + x = _mm_and_ps(x, p4f_inv_mant_mask); + x = _mm_or_ps(x, p4f_half); + emm0 = _mm_sub_epi32(emm0, p4i_0x7f); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); + Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); + Packet4f tmp = pand(x, mask); + x = psub(x, p4f_1); + e = psub(e, pand(p4f_1, mask)); + x = padd(x, tmp); + Packet4f x2 = pmul(x,x); + Packet4f x3 = pmul(x2,x); + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y , x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + y1 = pmul(e, p4f_cephes_log_q1); + tmp = pmul(x2, p4f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), + _mm_and_ps(iszero_mask, p4f_minus_inf)); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + 
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + Packet4f tmp, fx; + Packet4i emm0; + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); +#ifdef EIGEN_VECTORIZE_SSE4_1 + fx = _mm_floor_ps(fx); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + Packet4f mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, p4f_1); + fx = psub(tmp, mask); +#endif + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + z = pmul(x,x); + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& _x) +{ + Packet2d x = _x; + _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); + _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); + _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); + Packet2d tmp, fx; + Packet4i emm0; + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); +#ifdef EIGEN_VECTORIZE_SSE4_1 + fx = _mm_floor_pd(fx); +#else + emm0 = _mm_cvttpd_epi32(fx); + tmp = _mm_cvtepi32_pd(emm0); + Packet2d mask = _mm_cmpgt_pd(tmp, fx); + mask = _mm_and_pd(mask, p2d_1); + fx = psub(tmp, mask); +#endif + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + Packet2d x2 = pmul(x,x); + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + emm0 = _mm_cvttpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023_0); + emm0 = _mm_slli_epi32(emm0, 20); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); + return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); +} +template<> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psin(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(1, 1); + _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); + _EIGEN_DECLARE_CONST_Packet4i(2, 2); + _EIGEN_DECLARE_CONST_Packet4i(4, 4); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); + Packet4f xmm1, xmm2, xmm3, sign_bit, y; + Packet4i emm0, emm2; + sign_bit = x; + x = pabs(x); + sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); + y = pmul(x, p4f_cephes_FOPI); + emm2 = _mm_cvttps_epi32(y); + emm2 = _mm_add_epi32(emm2, p4i_1); + emm2 = _mm_and_si128(emm2, p4i_not1); + y = _mm_cvtepi32_ps(emm2); + emm0 = _mm_and_si128(emm2, p4i_4); + emm0 = _mm_slli_epi32(emm0, 29); + emm2 = _mm_and_si128(emm2, p4i_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + Packet4f swap_sign_bit = _mm_castsi128_ps(emm0); + Packet4f poly_mask = _mm_castsi128_ps(emm2); + sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); + xmm1 = pmul(y, p4f_minus_cephes_DP1); + xmm2 = pmul(y, p4f_minus_cephes_DP2); + xmm3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, xmm1); + x = padd(x, xmm2); + x = padd(x, xmm3); + y = p4f_coscof_p0; + Packet4f z = _mm_mul_ps(x,x); + y = pmadd(y, z, p4f_coscof_p1); + y = pmadd(y, z, p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + Packet4f tmp = pmul(z, p4f_half); + y = psub(y, tmp); + y = padd(y, p4f_1); + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmul(y2, x); + y2 = padd(y2, x); + y2 = _mm_and_ps(poly_mask, y2); + y = _mm_andnot_ps(poly_mask, y); + y = _mm_or_ps(y,y2); + return _mm_xor_ps(y, sign_bit); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pcos(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(1, 1); + _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); + _EIGEN_DECLARE_CONST_Packet4i(2, 2); + _EIGEN_DECLARE_CONST_Packet4i(4, 4); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); + Packet4f xmm1, xmm2, xmm3, y; + 
Packet4i emm0, emm2;
+  x = pabs(x);
+  y = pmul(x, p4f_cephes_FOPI);
+  emm2 = _mm_cvttps_epi32(y);
+  emm2 = _mm_add_epi32(emm2, p4i_1);
+  emm2 = _mm_and_si128(emm2, p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+  emm2 = _mm_sub_epi32(emm2, p4i_2);
+  emm0 = _mm_andnot_si128(emm2, p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  emm2 = _mm_and_si128(emm2, p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  Packet4f sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+  xmm1 = pmul(y, p4f_minus_cephes_DP1);
+  xmm2 = pmul(y, p4f_minus_cephes_DP2);
+  xmm3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, xmm1);
+  x = padd(x, xmm2);
+  x = padd(x, xmm3);
+  y = p4f_coscof_p0;
+  Packet4f z = pmul(x,x);
+  y = pmadd(y,z,p4f_coscof_p1);
+  y = pmadd(y,z,p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  Packet4f tmp = _mm_mul_ps(z, p4f_half);
+  y = psub(y, tmp);
+  y = padd(y, p4f_1);
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmadd(y2, x, x);
+  y2 = _mm_and_ps(poly_mask, y2);
+  y = _mm_andnot_ps(poly_mask, y);
+  y = _mm_or_ps(y,y2);
+  return _mm_xor_ps(y, sign_bit);
+}
+#if EIGEN_FAST_MATH
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& _x)
+{
+  Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
+  Packet4f denormal_mask = _mm_and_ps(
+      _mm_cmpge_ps(_x, _mm_setzero_ps()),
+      _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())));
+  Packet4f x = _mm_rsqrt_ps(_x);
+  x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
+  return _mm_andnot_ps(denormal_mask, pmul(_x,x));
+}
+#else
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+#endif
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+#if EIGEN_FAST_MATH
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+  Packet4f neg_half = pmul(_x, p4f_minus_half);
+  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+                                     _mm_and_ps(zero_mask, p4f_inf));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+  return _mm_or_ps(x, infs_and_nans);
+}
+#else
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+#endif
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+ptanh<Packet4f>(const Packet4f& x) {
+  return internal::generic_fast_tanh_float(x);
+}
+}
+namespace numext {
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float sqrt(const float &x)
+{
+  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double sqrt(const double &x)
+{
+#if EIGEN_COMP_GNUC_STRICT
+  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
+#else
+  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
+#endif
+}
+}
+}
+#endif
+// end #include "src/Core/arch/SSE/MathFunctions.h"
+// #include "src/Core/arch/SSE/Complex.h"
+#ifndef EIGEN_COMPLEX_SSE_H
+#define EIGEN_COMPLEX_SSE_H
+namespace Eigen {
+namespace internal {
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
+  __m128 v;
+};
+#ifndef EIGEN_VECTORIZE_AVX
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0,
+    HasBlend = 1
+  };
+};
+#endif
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
+                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
+                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  #else
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
+  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+  #endif
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  Packet2cf res;
+#if EIGEN_GNUC_AT_MOST(4,2)
+  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
+#elif EIGEN_GNUC_AT_LEAST(4,6)
+  #pragma GCC diagnostic push
+  #pragma GCC
diagnostic ignored "-Wuninitialized" + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); + #pragma GCC diagnostic pop +#else + res.v = _mm_loadl_pi(res.v, (const __m64*)&from); +#endif + return Packet2cf(_mm_movelh_ps(res.v,res.v)); +} +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + to[stride*0] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + #if EIGEN_GNUC_AT_MOST(4,3) + EIGEN_ALIGN16 std::complex res[2]; + _mm_store_ps((float*)res, a.v); + return res[0]; + #else + std::complex res; + _mm_storel_pi((__m64*)&res, a.v); + return res; + #endif +} +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); +} +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = _mm_movehl_ps(first.v, first.v); + first.v = _mm_movelh_ps(first.v, second.v); + } + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(a, pconj(b)); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(pconj(a), b); 
+ #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return pconj(internal::pmul(a, b)); + #else + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), + vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const + { return Packet2cf(Eigen::internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const + { return Packet2cf(Eigen::internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res = conj_helper().pmul(a,b); + __m128 s = _mm_mul_ps(b.v,b.v); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); +} +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} + __m128d v; +}; +#ifndef EIGEN_VECTORIZE_AVX +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; +#endif +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) +{ + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_xor_pd(a.v,mask)); +} +template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) +{ + #ifdef EIGEN_VECTORIZE_SSE3 + return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v), + _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)))); + #else + const __m128d mask = 
_mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)), mask))); + #endif +} +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, a.v); + return std::complex(res[0],res[1]); +} +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) +{ + return pfirst(a); +} +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) +{ + return pfirst(a); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& , const Packet1cd& ) + { + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(a, pconj(b)); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return internal::pmul(pconj(a), b); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), + _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)), mask))); 
+ #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + #ifdef EIGEN_VECTORIZE_SSE3 + return pconj(internal::pmul(a, b)); + #else + const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); + return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), + _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), + vec2d_swizzle1(b.v, 1, 0)))); + #endif + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const + { return Packet1cd(Eigen::internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(Eigen::internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + Packet1cd res = conj_helper().pmul(a,b); + __m128d s = _mm_mul_pd(b.v,b.v); + return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); +} +EIGEN_STRONG_INLINE Packet1cd pcplxflip(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + __m128d w1 = _mm_castps_pd(kernel.packet[0].v); + __m128d w2 = _mm_castps_pd(kernel.packet[1].v); + __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); + kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); + kernel.packet[1].v = tmp; +} +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} +template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) +{ + return pset1(b); +} +template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) +{ + return pset1(b); +} +} +} +#endif +// end #include "src/Core/arch/SSE/Complex.h" +// #include "src/Core/arch/SSE/TypeCasting.h" +#ifndef EIGEN_TYPE_CASTING_SSE_H +#define EIGEN_TYPE_CASTING_SSE_H +namespace Eigen { +namespace internal { +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; +template<> EIGEN_STRONG_INLINE 
Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+template <>
+struct type_casting_traits<float, double> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2
+  };
+};
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  return _mm_cvtps_pd(a);
+}
+}
+}
+#endif
+// end #include "src/Core/arch/SSE/TypeCasting.h"
+#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+// #include "src/Core/arch/AltiVec/PacketMath.h"
+#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
+#define EIGEN_PACKET_MATH_ALTIVEC_H
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
+#endif
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+typedef __vector float          Packet4f;
+typedef __vector int            Packet4i;
+typedef __vector unsigned int   Packet4ui;
+typedef __vector __bool int     Packet4bi;
+typedef __vector short int      Packet8i;
+typedef __vector unsigned char  Packet16uc;
+#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = vec_splat_s32(X)
+#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
+  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
+  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+#define DST_CHAN 1
+#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
+static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
+#ifndef __VSX__
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
+#endif
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+#ifdef __PPC64__
+#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
+#else
+#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
+#endif
+#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
+#ifdef __VSX__
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif
+static Packet16uc p16uc_PSET32_WODD  = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);
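+// (Illustrative note, not part of upstream Eigen.)  The p16uc_* constants are
+// byte-selection masks for vec_perm: byte value n in a mask picks byte n of
+// the 32-byte concatenation of the two source operands.  Big- and
+// little-endian targets store lane 0 at opposite ends of the register, so the
+// two preprocessor branches build mirrored masks; higher-level kernels
+// (preverse, ptranspose, the complex shuffles) then stay endian-agnostic by
+// always shuffling through these constants.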
+#else +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8); +static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8); +static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); +#endif +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); +static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); +static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; +static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); +#ifdef _BIG_ENDIAN +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); +#else +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); +#endif +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#else + #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#endif +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, +#ifdef __VSX__ + HasSqrt = 1, +#if !EIGEN_COMP_CLANG + HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif +#else + HasSqrt = 0, + HasRsqrt = 0, +#endif + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet4i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 + }; +}; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) +{ + union { + Packet16uc v; + unsigned char n[16]; + } vt; + vt.v = v; + for (int i=0; i< 16; i++) + s << (int)vt.n[i] << ", "; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) +{ + union { + Packet4f v; + float n[4]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) +{ + union { + Packet4i v; + int n[4]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) +{ + union { + Packet4ui v; + unsigned int n[4]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; + return s; +} +template<> EIGEN_STRONG_INLINE Packet4f pload(const 
float* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
+{
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef __VSX__
+  return vec_vsx_ld(0, from);
+#else
+  return vec_ld(0, from);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
+{
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef __VSX__
+  vec_vsx_st(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  Packet4f v = {from, from, from, from};
+  return v;
+}
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+  Packet4i v = {from, from, from, from};
+  return v;
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  af[2] = from[2*stride];
+  af[3] = from[3*stride];
+  return pload<Packet4f>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+  return pload<Packet4i>(ai);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 af[4];
+  pstore<float>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+  to[2*stride] = af[2];
+  to[3*stride] = af[3];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const
Packet4i& a, const Packet4i& b) { return a * b; } +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ +#ifndef __VSX__ + Packet4f t, y_0, y_1; + y_0 = vec_re(b); + t = vec_nmsub(y_0, b, p4f_ONE); + y_1 = vec_madd(y_0, t, y_0); + return vec_madd(a, y_1, p4f_MZERO); +#else + return vec_div(a, b); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& , const Packet4i& ) +{ eigen_assert(false && "packet integer division are not supported by AltiVec"); + return pset1(0); +} +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } +#ifdef _BIG_ENDIAN +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + Packet16uc MSQ, LSQ; + Packet16uc mask; + MSQ = vec_ld(0, (unsigned char *)from); + LSQ = vec_ld(15, (unsigned char *)from); + mask = vec_lvsl(0, from); + return (Packet4f) vec_perm(MSQ, LSQ, mask); +} +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + Packet16uc MSQ, LSQ; + Packet16uc mask; + MSQ = vec_ld(0, (unsigned char *)from); + LSQ = vec_ld(15, (unsigned char *)from); + mask = vec_lvsl(0, from); + return (Packet4i) vec_perm(MSQ, LSQ, mask); +} +#else +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); +} +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); +} +#endif +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p; + if((std::ptrdiff_t(from) % 16) == 0) p = 
pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +{ + Packet4i p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} +#ifdef _BIG_ENDIAN +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE + Packet16uc MSQ, LSQ, edges; + Packet16uc edgeAlign, align; + MSQ = vec_ld(0, (unsigned char *)to); + LSQ = vec_ld(15, (unsigned char *)to); + edgeAlign = vec_lvsl(0, to); + edges=vec_perm(LSQ,MSQ,edgeAlign); + align = vec_lvsr( 0, to ); + MSQ = vec_perm(edges,(Packet16uc)from,align); + LSQ = vec_perm((Packet16uc)from,edges,align); + vec_st( LSQ, 15, (unsigned char *)to ); + vec_st( MSQ, 0, (unsigned char *)to ); +} +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_UNALIGNED_STORE + Packet16uc MSQ, LSQ, edges; + Packet16uc edgeAlign, align; + MSQ = vec_ld(0, (unsigned char *)to); + LSQ = vec_ld(15, (unsigned char *)to); + edgeAlign = vec_lvsl(0, to); + edges=vec_perm(LSQ, MSQ, edgeAlign); + align = vec_lvsr( 0, to ); + MSQ = vec_perm(edges, (Packet16uc) from, align); + LSQ = vec_perm((Packet16uc) from, edges, align); + vec_st( LSQ, 15, (unsigned char *)to ); + vec_st( MSQ, 0, (unsigned char *)to ); +} +#else +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); +} +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); +} +#endif +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet4f b, sum; + b = vec_sld(a, a, 8); + sum = a + b; + b = vec_sld(sum, sum, 4); + sum += b; + return pfirst(sum); +} +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + Packet4f v[4], sum[4]; + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + sum[0] = sum[0] + sum[1]; + sum[1] = sum[2] + sum[3]; + sum[0] = sum[0] + sum[1]; + return sum[0]; +} +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4i sum; + sum = vec_sums(a, p4i_ZERO); +#ifdef _BIG_ENDIAN + sum = vec_sld(sum, 
p4i_ZERO, 12); +#else + sum = vec_sld(p4i_ZERO, sum, 4); +#endif + return pfirst(sum); +} +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + Packet4i v[4], sum[4]; + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + sum[0] = sum[0] + sum[1]; + sum[1] = sum[2] + sum[3]; + sum[0] = sum[0] + sum[1]; + return sum[0]; +} +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); +} +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = vec_min(a, vec_sld(a, a, 8)); + res = vec_min(b, vec_sld(b, b, 4)); + return pfirst(res); +} +template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +{ + Packet4i b, res; + b = vec_min(a, vec_sld(a, a, 8)); + res = vec_min(b, vec_sld(b, b, 4)); + return pfirst(res); +} +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = vec_max(a, vec_sld(a, a, 8)); + res = vec_max(b, vec_sld(b, b, 4)); + return pfirst(res); +} +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ + Packet4i b, res; + b = vec_max(a, vec_sld(a, a, 8)); + res = vec_max(b, vec_sld(b, b, 4)); + return pfirst(res); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif + } +}; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4i t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], 
kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); +} +#ifdef __VSX__ +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; +#if EIGEN_COMP_CLANG +typedef Packet2ul Packet2bl; +#else +typedef __vector __bool long Packet2bl; +#endif +static Packet2l p2l_ONE = { 1, 1 }; +static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); +static Packet2d p2d_MZERO = { -0.0, -0.0 }; +#ifdef _BIG_ENDIAN +static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); +#else +static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ONE), reinterpret_cast(p2d_ZERO), 8)); +#endif +template Packet2d vec_splat_dbl(Packet2d& a); +template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) +{ + return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_HI)); +} +template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) +{ + return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_LO)); +} +template<> struct packet_traits : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) +{ + union { + Packet2l v; + int64_t n[2]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1]; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + union { + Packet2d v; + double n[2]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1]; + return s; +} +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD +#ifdef __VSX__ + return vec_vsx_ld(0, from); +#else + return vec_ld(0, from); +#endif +} +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_ALIGNED_STORE +#ifdef __VSX__ + vec_vsx_st(from, 0, to); +#else + vec_st(from, 0, to); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + Packet2d v = {from, from}; + return v; +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double 
*a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(a); + a0 = vec_splat_dbl<0>(a1); + a1 = vec_splat_dbl<1>(a1); + a3 = pload(a+2); + a2 = vec_splat_dbl<0>(a3); + a3 = vec_splat_dbl<1>(a3); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return pset1(a) + p2d_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); +} +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ + Packet2d p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_splat_dbl<0>(p); +} +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); +} +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ + Packet2d b, 
sum; + b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); + sum = a + b; + return pfirst(sum); +} +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); + v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); +#ifdef _BIG_ENDIAN + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); +#else + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[1]), reinterpret_cast(v[0]), 8)); +#endif + return sum; +} +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ + return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ + return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ + return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) +#ifdef _BIG_ENDIAN + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); +#else + first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); +#endif + } +}; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet2d t0, t1; + t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2bl mask = vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} +#endif +} +} +#endif +// end #include "src/Core/arch/AltiVec/PacketMath.h" +// #include "src/Core/arch/AltiVec/MathFunctions.h" +#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H +namespace Eigen { +namespace internal { +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); 
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +#ifdef __VSX__ +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); +#ifdef __POWER8_VECTOR__ +static Packet2l p2l_1023 = { 1023, 1023 }; +static Packet2ul p2ul_52 = { 52, 52 }; +#endif +#endif +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4i emm0; + Packet4ui isvalid_mask = reinterpret_cast(vec_cmpge(x, p4f_ZERO)); + Packet4ui iszero_mask = reinterpret_cast(vec_cmpeq(x, p4f_ZERO)); + x = pmax(x, p4f_min_norm_pos); + emm0 = vec_sr(reinterpret_cast(x), + reinterpret_cast(p4i_23)); + x = pand(x, p4f_inv_mant_mask); + x = por(x, p4f_half); + emm0 = psub(emm0, p4i_0x7f); + Packet4f e = padd(vec_ctf(emm0, 0), p4f_1); + Packet4f mask = reinterpret_cast(vec_cmplt(x, p4f_cephes_SQRTHF)); + Packet4f tmp = pand(x, mask); + x = psub(x, p4f_1); + e = psub(e, pand(p4f_1, mask)); + x = padd(x, tmp); + Packet4f x2 = pmul(x,x); + Packet4f x3 = pmul(x2,x); + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y , x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + y1 = pmul(e, p4f_cephes_log_q1); + tmp = pmul(x2, p4f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + x = vec_sel(x, p4f_minus_inf, iszero_mask); + x = 
vec_sel(p4f_minus_nan, x, isvalid_mask); + return x; +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + Packet4i emm0; + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + fx = pfloor(fx); + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + z = pmul(x,x); + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + emm0 = vec_cts(fx, 0); + emm0 = vec_add(emm0, p4i_0x7f); + emm0 = vec_sl(emm0, reinterpret_cast(p4i_23)); + Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), + isnumber_mask); +} +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) +{ + return vec_rsqrt(x); +} +#endif +#ifdef __VSX__ +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt(const Packet2d& x) +{ + return vec_rsqrt(x); +} +#endif +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& x) +{ + return vec_sqrt(x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt(const Packet2d& x) +{ + return vec_sqrt(x); +} +static inline Packet2l ConvertToPacket2l(const Packet2d& x) { +#if EIGEN_GNUC_AT_LEAST(5, 4) || \ + (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) + return vec_cts(x, 0); +#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return l; +#endif +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& _x) +{ + Packet2d x = _x; + Packet2d tmp, fx; + Packet2l emm0; + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half); + fx = pfloor(fx); + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + Packet2d x2 = pmul(x,x); + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + emm0 = ConvertToPacket2l(fx); +#ifdef __POWER8_VECTOR__ + emm0 = vec_add(emm0, p2l_1023); + emm0 = vec_sl(emm0, p2ul_52); +#else + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + _EIGEN_DECLARE_CONST_Packet4i(20, 20); + Packet4i emm04i = reinterpret_cast(emm0); + emm04i = vec_add(emm04i, p4i_1023); + emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); + static const Packet16uc perm = { + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; +#ifdef _BIG_ENDIAN + emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); +#else + emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); +#endif +#endif + Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), + isnumber_mask); +} +#endif 
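+// (Illustrative summary, not part of upstream Eigen.)  plog/pexp above follow
+// the classic Cephes recipe.  plog splits x = m * 2^e via the IEEE-754
+// exponent field, renormalises m into [sqrt(1/2), sqrt(2)), and evaluates
+//   log(x) ~= t - t^2/2 + t^3*P(t) + e*log(2),  with t = m - 1,
+// applying log(2) as the split pair q2 + q1 for extra precision.  The
+// double-precision pexp uses n = round(x/log(2)), r = x - n*C1 - n*C2 and the
+// Pade form e^r ~= 1 + 2*P(r)/(Q(r) - P(r)), then rebuilds e^x = e^r * 2^n by
+// adding n + 1023 into the exponent bits (the shift by 52 on the POWER8
+// path).  NaNs pass through via the vec_cmpeq(_x, _x) self-equality mask.
+// Hypothetical usage sketch:
+//   Packet4f v = pset1<Packet4f>(1.0f);
+//   Packet4f e = pexp(v);   // ~2.718282f in every lane
+//   Packet4f l = plog(e);   // ~1.0f in every lane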
+}
+}
+#endif
+// end #include "src/Core/arch/AltiVec/MathFunctions.h"
+// #include "src/Core/arch/AltiVec/Complex.h"
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
+namespace Eigen {
+namespace internal {
+static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+#ifdef __VSX__
+#if defined(_BIG_ENDIAN)
+static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);
+static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);
+#else
+static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);
+static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);
+#endif
+#endif
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f v;
+};
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+#ifdef __VSX__
+    HasBlend  = 1,
+#endif
+    HasSetLinear = 0
+  };
+};
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  Packet2cf res;
+  if((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float *)&from);
+  else
+    res.v = ploadu<Packet4f>((const float *)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+  v1 = vec_madd(v1, b.v,
p4f_ZERO); + v2 = vec_madd(v2, b.v, p4f_ZERO); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); + return Packet2cf(padd(v1, v2)); +} +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore((float *)&res, a.v); + return res[0]; +} +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet4f rev_a; + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + return Packet2cf(rev_a); +} +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + Packet4f b; + b = vec_sld(a.v, a.v, 8); + b = padd(a.v, b); + return pfirst(Packet2cf(b)); +} +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + Packet4f b1, b2; +#ifdef _BIG_ENDIAN + b1 = vec_sld(vecs[0].v, vecs[1].v, 8); + b2 = vec_sld(vecs[1].v, vecs[0].v, 8); +#else + b1 = vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = vec_sld(vecs[0].v, vecs[1].v, 8); +#endif + b2 = vec_sld(b2, b2, 8); + b2 = padd(b1, b2); + return Packet2cf(b2); +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + Packet4f b; + Packet2cf prod; + b = vec_sld(a.v, a.v, 8); + prod = pmul(a, Packet2cf(b)); + return pfirst(prod); +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { +#ifdef _BIG_ENDIAN + first.v = vec_sld(first.v, second.v, 8); +#else + first.v = vec_sld(second.v, first.v, 8); +#endif + } + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const + { return Packet2cf(internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + 
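+  // (Illustrative note, not part of upstream Eigen.)  In the mixed
+  // real * complex specialisations nothing needs conjugating, so pmul below
+  // lowers to the plain Packet4f product on the raw register; callers are
+  // expected to supply the real factor already duplicated across each
+  // (re, im) pair (e.g. a pset1 broadcast), which makes the elementwise
+  // product a correct complex-by-real scale.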
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const + { return Packet2cf(internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res = conj_helper().pmul(a, b); + Packet4f s = pmul(b.v, b.v); + return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +#ifdef __VSX__ +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); + return result; +} +#endif +#ifdef __VSX__ +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ return ploadu(&from); } +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast(p2ul_CONJ_XOR2))); } +template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d a_re, a_im, v1, v2; + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + v1 = vec_madd(a_re, b.v, p2d_ZERO); + v2 = 
vec_madd(a_im, b.v, p2d_ZERO); + v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); + v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); + return Packet1cd(padd(v1, v2)); +} +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + return res[0]; +} +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& , const Packet1cd& ) + { + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const + { return Packet1cd(internal::pmul(x, y.v)); } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(internal::pmul(x.v, y)); } +}; +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = pmul(b.v, b.v); + return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); +} +EIGEN_STRONG_INLINE Packet1cd pcplxflip(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, 
p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +#endif +} +} +#endif +// end #include "src/Core/arch/AltiVec/Complex.h" +#elif defined EIGEN_VECTORIZE_NEON +// #include "src/Core/arch/NEON/PacketMath.h" +#ifndef EIGEN_PACKET_MATH_NEON_H +#define EIGEN_PACKET_MATH_NEON_H +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if EIGEN_ARCH_ARM64 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#else +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#endif +#endif +typedef float32x2_t Packet2f; +typedef float32x4_t Packet4f; +typedef int32x4_t Packet4i; +typedef int32x2_t Packet2i; +typedef uint32x4_t Packet4ui; +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + const Packet4f p4f_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + const Packet4i p4i_##NAME = pset1(X) +#if EIGEN_ARCH_ARM64 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); +#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#elif defined __pld + #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) +#elif EIGEN_ARCH_ARM32 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); +#else + #define EIGEN_ARM_PREFETCH(ADDR) +#endif +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket=0, + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 0 + }; +}; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4i type; + typedef Packet4i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket=0 + }; +}; +#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM +EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } +EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } +#endif +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) +{ + const float f[] = {0, 1, 2, 3}; + Packet4f countdown = vld1q_f32(f); + return vaddq_f32(pset1(a), countdown); +} +template<> 
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
+{
+  const int32_t i[] = {0, 1, 2, 3};
+  Packet4i countdown = vld1q_s32(i);
+  return vaddq_s32(pset1<Packet4i>(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
+  // ARMv7 NEON has no float division: refine the vrecpeq_f32 reciprocal
+  // estimate with one Newton-Raphson step (vrecpsq_f32(b,inv) = 2 - b*inv),
+  // then multiply by the numerator.
+  Packet4f inv, restep, div;
+  inv = vrecpeq_f32(b);
+  restep = vrecpsq_f32(b, inv);
+  inv = vmulq_f32(restep, inv);
+  div = vmulq_f32(a, inv);
+  return div;
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& , const Packet4i& )
+{ eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4i>(0);
+}
+#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
+  // 32-bit clang does not reliably fuse vmlaq_f32 into a single vmla, so
+  // emit the multiply-accumulate instruction directly.
+  Packet4f r = c;
+  asm volatile(
+    "vmla.f32 %q[r], %q[a], %q[b]"
+    : [r] "+w" (r)
+    : [a] "w" (a),
+      [b] "w" (b)
+    : );
+  return r;
+#else
+  return vmlaq_f32(c,a,b);
+#endif
+}
+#endif
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+// Bitwise ops on floats go through u32 reinterpretation, since NEON has no
+// float-typed logical instructions.
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return
vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+  float32x2_t lo, hi;
+  lo = vld1_dup_f32(from);
+  hi = vld1_dup_f32(from+1);
+  return vcombine_f32(lo, hi);
+}
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
+{
+  int32x2_t lo, hi;
+  lo = vld1_dup_s32(from);
+  hi = vld1_dup_s32(from+1);
+  return vcombine_s32(lo, hi);
+}
+template<> EIGEN_STRONG_INLINE void pstore<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(from[0*stride], res, 0);
+  res = vsetq_lane_f32(from[1*stride], res, 1);
+  res = vsetq_lane_f32(from[2*stride], res, 2);
+  res = vsetq_lane_f32(from[3*stride], res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
+{
+  Packet4i res = pset1<Packet4i>(0);
+  res = vsetq_lane_s32(from[0*stride], res, 0);
+  res = vsetq_lane_s32(from[1*stride], res, 1);
+  res = vsetq_lane_s32(from[2*stride], res, 2);
+  res = vsetq_lane_s32(from[3*stride], res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f32(from, 0);
+  to[stride*1] = vgetq_lane_f32(from, 1);
+  to[stride*2] = vgetq_lane_f32(from, 2);
+  to[stride*3] = vgetq_lane_f32(from, 3);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_s32(from, 0);
+  to[stride*1] = vgetq_lane_s32(from, 1);
+  to[stride*2] = vgetq_lane_s32(from, 2);
+  to[stride*3] = vgetq_lane_s32(from, 3);
+}
+template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*   addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template<>
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r64;
+  a_r64 = vrev64q_f32(a);
+  a_lo = vget_low_f32(a_r64);
+  a_hi = vget_high_f32(a_r64);
+  return vcombine_f32(a_hi, a_lo);
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  int32x2_t a_lo, a_hi;
+  Packet4i a_r64;
+  a_r64 = vrev64q_s32(a);
+  a_lo = vget_low_s32(a_r64);
+  a_hi = vget_high_s32(a_r64);
+  return vcombine_s32(a_hi, a_lo);
+}
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
+template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, sum;
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  sum = vpadd_f32(a_lo, a_hi);
+  sum = vpadd_f32(sum, sum);
+  return vget_lane_f32(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  float32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4f sum1, sum2, sum;
+  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
+  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
+  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
+  sum = vaddq_f32(sum1, sum2);
+  return sum;
+}
+template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, sum;
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  sum = vpadd_s32(a_lo, a_hi);
+  sum = vpadd_s32(sum, sum);
+  return vget_lane_s32(sum, 0);
+}
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  int32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4i sum1, sum2, sum;
+  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
+  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
+  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
+  sum = vaddq_s32(sum1, sum2);
+  return sum;
+}
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, prod;
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  prod = vmul_f32(a_lo, a_hi);
+  prod = vmul_f32(prod, vrev64_f32(prod));
+  return vget_lane_f32(prod, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, prod;
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  prod = vmul_s32(a_lo, a_hi);
+  prod = vmul_s32(prod, vrev64_s32(prod));
+  return vget_lane_s32(prod, 0);
+}
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, min;
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  min = vpmin_f32(a_lo, a_hi);
+  min = vpmin_f32(min, min);
+  return vget_lane_f32(min, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, min;
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  min = vpmin_s32(a_lo, a_hi);
+  min = vpmin_s32(min, min);
+  return vget_lane_s32(min, 0);
+}
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  float32x2_t a_lo, a_hi, max;
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  max = vpmax_f32(a_lo, a_hi);
+  max = vpmax_f32(max, max);
+  return vget_lane_f32(max, 0);
+}
+template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
+{
+  int32x2_t a_lo, a_hi, max;
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  max = vpmax_s32(a_lo, a_hi);
+  max = vpmax_s32(max, max);
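+  // after the two pairwise folds, both lanes hold the overall maximum;
+  // lane 0 is extracted below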
+  return vget_lane_s32(max, 0);
+}
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet4f,vextq_f32)
+PALIGN_NEON(1,Packet4f,vextq_f32)
+PALIGN_NEON(2,Packet4f,vextq_f32)
+PALIGN_NEON(3,Packet4f,vextq_f32)
+PALIGN_NEON(0,Packet4i,vextq_s32)
+PALIGN_NEON(1,Packet4i,vextq_s32)
+PALIGN_NEON(2,Packet4i,vextq_s32)
+PALIGN_NEON(3,Packet4i,vextq_s32)
+#undef PALIGN_NEON
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
+  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
+  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+}
+#ifdef __apple_build_version__
+// Apple toolchains before 6.1 miscompile some 64-bit NEON intrinsics; double
+// vectorization is gated on this flag below.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+// defined as templates so they do not conflict with declarations that newer
+// arm_neon.h versions already provide
+template <typename T>
+uint64x2_t vreinterpretq_u64_f64(T a)
+{
+  return (uint64x2_t) a;
+}
+template <typename T>
+float64x2_t vreinterpretq_f64_u64(T a)
+{
+  return (float64x2_t) a;
+}
+typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
+{
+  const double countdown_raw[] = {0.0,1.0};
+  const Packet2d countdown = vld1q_f64(countdown_raw);
+  return vaddq_f64(pset1<Packet2d>(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+#ifdef __ARM_FEATURE_FMA
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d&
a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
+#endif
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
+{
+  return vld1q_dup_f64(from);
+}
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(from[0*stride], res, 0);
+  res = vsetq_lane_f64(from[1*stride], res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f64(from, 0);
+  to[stride*1] = vgetq_lane_f64(from, 1);
+}
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// indexing the vector directly works around an apple-clang compiler bug
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+#endif
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  float64x2_t trn1, trn2;
+  trn1 = vzip1q_f64(vecs[0], vecs[1]);
+  trn2 = vzip2q_f64(vecs[0], vecs[1]);
+  return vaddq_f64(trn1, trn2);
+}
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+#endif
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
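+  // vpminq_f64(a,a) folds the two lanes pairwise, so lane 0 holds min(a[0], a[1])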
+  return vgetq_lane_f64(vpminq_f64(a, a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet2d,vextq_f64)
+PALIGN_NEON(1,Packet2d,vextq_f64)
+#undef PALIGN_NEON
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+#endif
+}
+}
+#endif
+// end #include "src/Core/arch/NEON/PacketMath.h"
+// #include "src/Core/arch/NEON/MathFunctions.h"
+#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
+#define EIGEN_MATH_FUNCTIONS_NEON_H
+namespace Eigen {
+namespace internal {
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  Packet4f tmp, fx;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+  // clamp x to the range where expf stays finite
+  x = vminq_f32(x, p4f_exp_hi);
+  x = vmaxq_f32(x, p4f_exp_lo);
+  // express exp(x) as exp(g + n*log(2)); fx = x/log(2) + 1/2, truncated below
+  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+  // if truncation went above fx (negative inputs), subtract one to get floor(fx)
+  Packet4ui mask = vcgtq_f32(tmp, fx);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
+  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+  // polynomial approximation of exp(g) on the reduced range, Horner form
+  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
+  z = vmulq_f32(x, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p5);
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, p4f_1);
+  // build 2^n by placing n+127 into the float exponent bits
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, p4i_0x7f);
+  mm = vshlq_n_s32(mm, 23);
+  Packet4f pow2n = vreinterpretq_f32_s32(mm);
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+}
+}
+#endif
+// end #include "src/Core/arch/NEON/MathFunctions.h"
+// #include "src/Core/arch/NEON/Complex.h"
+#ifndef EIGEN_COMPLEX_NEON_H
+#define EIGEN_COMPLEX_NEON_H
+namespace Eigen {
+namespace internal {
+inline uint32x4_t p4ui_CONJ_XOR() {
+#if EIGEN_COMP_CLANG
+  uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return ret;
+#else
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  return vld1q_u32( conj_XOR_DATA );
+#endif
+}
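+// The CONJ_XOR masks (p4ui_CONJ_XOR above, p2ui_CONJ_XOR below) set only the
+// IEEE-754 sign bit (0x80000000) of the imaginary lanes; XOR-ing a packet
+// with them flips the sign of every imaginary part, i.e. performs complex
+// conjugation without any floating-point arithmetic.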
+inline uint32x2_t p2ui_CONJ_XOR() {
+  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
+  return vld1_u32( conj_XOR_DATA );
+}
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f v;
+};
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  float32x2_t r64;
+  r64 = vld1_f32((float *)&from);
+  return Packet2cf(vcombine_f32(r64, r64));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet4ui b = vreinterpretq_u32_f32(a.v);
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+  // v1 = | a1_re | a1_re | a2_re | a2_re |, v2 = | a1_im | a1_im | a2_im | a2_im |
+  v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
+  v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
+  // re(a)*b and im(a)*b
+  v1 = vmulq_f32(v1, b.v);
+  v2 = vmulq_f32(v2, b.v);
+  // negate the im(a)*im(b) lanes, then swap real/imag in v2 so the final add
+  // yields (re*re - im*im, re*im + im*re)
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
+  v2 = vrev64q_f32(v2);
+  return Packet2cf(vaddq_f32(v1, v2));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template<> EIGEN_DEVICE_FUNC inline
Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
+  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
+  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
+  return Packet2cf(res);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+}
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 x[2];
+  vst1q_f32((float *)x, a.v);
+  return x[0];
+}
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r128;
+  a_lo = vget_low_f32(a.v);
+  a_hi = vget_high_f32(a.v);
+  a_r128 = vcombine_f32(a_hi, a_lo);
+  return Packet2cf(a_r128);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
+{
+  return Packet2cf(vrev64q_f32(a.v));
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2;
+  std::complex<float> s;
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+  a2 = vadd_f32(a1, a2);
+  vst1_f32((float *)&s, a2);
+  return s;
+}
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f sum1, sum2, sum;
+  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
+  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
+  sum = vaddq_f32(sum1, sum2);
+  return Packet2cf(sum);
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2, v1, v2, prod;
+  std::complex<float> s;
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+  v1 = vdup_lane_f32(a1, 0);
+  v2 = vdup_lane_f32(a1, 1);
+  v1 = vmul_f32(v1, a2);
+  v2 = vmul_f32(v2, a2);
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
+  v2 = vrev64_f32(v2);
+  prod = vadd_f32(v1, v2);
+  vst1_f32((float *)&s, prod);
+  return s;
+}
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vextq_f32(first.v, second.v, 2);
+    }
+  }
+};
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
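+  // a/b = a*conj(b) / |b|^2: the numerator is in res; below, |b|^2 is formed
+  // per complex by adding b.v*b.v to a copy with its real/imag lanes swapped.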
Packet4f s, rev_s; + s = vmulq_f32(b.v, b.v); + rev_s = vrev64q_f32(s); + return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +#if EIGEN_COMP_CLANG + static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; +#else + const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; + static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA ); +#endif +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d v1, v2; + v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); + v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); + v1 = vmulq_f64(v1, b.v); + v2 = vmulq_f64(v2, b.v); + v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); + v2 = preverse(v2); + return Packet1cd(vaddq_f64(v1, v2)); +} +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const 
std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); } +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +{ + Packet2d res = pset1(0.0); + res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1); + return Packet1cd(res); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) +{ + to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + std::complex EIGEN_ALIGN16 res; + pstore >(&res, a); + return res; +} +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& , const Packet1cd& ) + { + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = pmul(b.v, b.v); + Packet2d rev_s = preverse(s); + return Packet1cd(pdiv(res.v, padd(s,rev_s))); +} +EIGEN_STRONG_INLINE Packet1cd pcplxflip(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} +#endif +} +} +#endif +// end #include "src/Core/arch/NEON/Complex.h" +#elif defined EIGEN_VECTORIZE_ZVECTOR +// #include "src/Core/arch/ZVector/PacketMath.h" +#ifndef EIGEN_PACKET_MATH_ZVECTOR_H +#define EIGEN_PACKET_MATH_ZVECTOR_H +#include +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#endif +#ifndef 
EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#endif +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8i; +typedef __vector unsigned char Packet16uc; +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; +typedef struct { + Packet2d v4f[2]; +} Packet4f; +typedef union { + int32_t i[4]; + uint32_t ui[4]; + int64_t l[2]; + uint64_t ul[2]; + double d[2]; + Packet4i v4i; + Packet4ui v4ui; + Packet2l v2l; + Packet2ul v2ul; + Packet2d v2d; +} Packet; +#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = reinterpret_cast(vec_splat_s32(X)) +#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = reinterpret_cast(vec_splat_s64(X)) +#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = reinterpret_cast(vec_splat_s64(X)) +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = pset1(X) +#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = pset1(X) +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); +static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); +static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 +#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) +static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8); +static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); +inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) +{ + Packet vt; + vt.v4i = v; + s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3]; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) +{ + Packet vt; + vt.v4ui = v; + s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3]; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) +{ + Packet vt; + vt.v2l = v; + s << vt.l[0] << ", " << vt.l[1]; + return s; +} +inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v) +{ + Packet vt; + vt.v2ul = v; + s << vt.ul[0] << ", " << vt.ul[1] ; + return s; +} +inline 
std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + Packet vt; + vt.v2d = v; + s << vt.d[0] << ", " << vt.d[1]; + return s; +} +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); + } +}; +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4i; +} +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v2d; +} +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4i = from; +} +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v2d = from; +} +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) +{ + return vec_splats(from); +} +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + return vec_splats(from); +} +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 
= vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(a); + a0 = vec_splat(a1, 0); + a1 = vec_splat(a1, 1); + a3 = pload(a+2); + a2 = vec_splat(a3, 0); + a3 = vec_splat(a3, 1); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + pstore((int *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + 
return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template<>
EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); + res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_round(a.v4f[0]); + res.v4f[1] = vec_round(a.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_ceil(a.v4f[0]); + res.v4f[1] = vec_ceil(a.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_floor(a.v4f[0]); + res.v4f[1] = vec_floor(a.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +{ + Packet4i p = pload(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + p.v4f[1] = vec_splat(p.v4f[0], 1); + p.v4f[0] = vec_splat(p.v4f[0], 0); + return p; +} +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ + Packet2d p = pload(from); + return vec_perm(p, p, p16uc_PSET64_HI); +} +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); +} +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + Packet4f rev; + rev.v4f[0] = preverse(a.v4f[1]); + rev.v4f[1] = preverse(a.v4f[0]); + return rev; +} +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = pabs(a.v4f[0]); + res.v4f[1] = 
pabs(a.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4i b, sum; + b = vec_sld(a, a, 8); + sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ + Packet2d b, sum; + b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); + sum = padd(a, b); + return pfirst(sum); +} +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet2d sum; + sum = padd(a.v4f[0], a.v4f[1]); + double first = predux(sum); + return static_cast(first); +} +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + Packet4i v[4], sum[4]; + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + sum[0] = padd(sum[0], sum[1]); + sum[1] = padd(sum[2], sum[3]); + sum[0] = padd(sum[0], sum[1]); + return sum[0]; +} +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = padd(vecs[0], reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8))); + v[1] = padd(vecs[1], reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8))); + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); + return sum; +} +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + transpose.packet[2] = vecs[2]; + transpose.packet[3] = vecs[3]; + ptranspose(transpose); + Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); + sum = padd(sum, transpose.packet[2]); + sum = padd(sum, transpose.packet[3]); + return sum; +} +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ + return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); +} +template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +{ + Packet4i b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ + return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ + Packet4i b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ + return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, 
reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock t0,t1,t2,t3; + t0.packet[0] = kernel.packet[0].v4f[0]; + t0.packet[1] = kernel.packet[1].v4f[0]; + t1.packet[0] = kernel.packet[0].v4f[1]; + t1.packet[1] = kernel.packet[1].v4f[1]; + t2.packet[0] = kernel.packet[2].v4f[0]; + t2.packet[1] = kernel.packet[3].v4f[0]; + t3.packet[0] = kernel.packet[2].v4f[1]; + t3.packet[1] = kernel.packet[3].v4f[1]; + ptranspose(t0); + ptranspose(t1); + ptranspose(t2); + ptranspose(t3); + kernel.packet[0].v4f[0] = t0.packet[0]; + kernel.packet[0].v4f[1] = t2.packet[0]; + kernel.packet[1].v4f[0] = t0.packet[1]; + kernel.packet[1].v4f[1] = t2.packet[1]; + kernel.packet[2].v4f[0] = t1.packet[0]; + kernel.packet[2].v4f[1] = t3.packet[0]; + kernel.packet[3].v4f[0] = t1.packet[1]; + kernel.packet[3].v4f[1] = t3.packet[1]; +} +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; + Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast(p2l_ONE)); + Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast(p2l_ONE)); + Packet4f result; + result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi); + result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); + return result; +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} +} +} +#endif +// end #include "src/Core/arch/ZVector/PacketMath.h" +// #include "src/Core/arch/ZVector/MathFunctions.h" +#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H +namespace Eigen { +namespace internal { +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); +static 
_EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& _x) +{ + Packet2d x = _x; + Packet2d tmp, fx; + Packet2l emm0; + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); + fx = vec_floor(fx); + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + Packet2d x2 = pmul(x,x); + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + emm0 = vec_ctsl(fx, 0); + static const Packet2l p2l_1023 = { 1023, 1023 }; + static const Packet2ul p2ul_52 = { 52, 52 }; + emm0 = emm0 + p2l_1023; + emm0 = emm0 << reinterpret_cast(p2ul_52); + Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), + isnumber_mask); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = pexp(x.v4f[0]); + res.v4f[1] = pexp(x.v4f[1]); + return res; +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt(const Packet2d& x) +{ + return __builtin_s390_vfsqdb(x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = psqrt(x.v4f[0]); + res.v4f[1] = psqrt(x.v4f[1]); + return res; +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt(const Packet2d& x) { + return pset1(1.0) / psqrt(x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) { + Packet4f res; + res.v4f[0] = prsqrt(x.v4f[0]); + res.v4f[1] = prsqrt(x.v4f[1]); + return res; +} +} +} +#endif +// end #include "src/Core/arch/ZVector/MathFunctions.h" +// #include "src/Core/arch/ZVector/Complex.h" +#ifndef EIGEN_COMPLEX32_ALTIVEC_H +#define EIGEN_COMPLEX32_ALTIVEC_H +namespace Eigen { +namespace internal { +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8); +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8); +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; +struct Packet2cf +{ + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& 
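// Illustrative sketch: a scalar walk-through of the Cephes scheme used by the
// vectorized pexp above -- clamp, range-reduce x = n*ln2 + r (subtracting
// n*ln2 in two pieces, C1 + C2, for accuracy), evaluate a rational
// approximation of e^r, then multiply by 2^n built directly in the exponent
// field ((n + 1023) << 52). The constants are the p2d_cephes_* values above;
// the function name is a placeholder:
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cmath>

static double exp_like_pexp(double x) {
  const double exp_hi = 709.437, exp_lo = -709.436139303;
  x = std::fmin(std::fmax(x, exp_lo), exp_hi);                 // pmax/pmin clamp
  double fx = std::floor(1.4426950408889634073599 * x + 0.5);  // n = round(x/ln2)
  x -= fx * 0.693145751953125;                                 // subtract n*ln2
  x -= fx * 1.42860682030941723212e-6;                         // in two pieces
  double x2 = x * x;
  double px = ((1.26177193074810590878e-4 * x2 + 3.02994407707441961300e-2) * x2
               + 9.99999999999999999910e-1) * x;
  double qx = ((3.00198505138664455042e-6 * x2 + 2.52448340349684104192e-3) * x2
               + 2.27265548208155028766e-1) * x2 + 2.0;
  double r = 2.0 * (px / (qx - px)) + 1.0;                     // e^r, rational fit
  uint64_t bits = (uint64_t)((int64_t)fx + 1023) << 52;        // 2^n as raw bits
  double scale;
  std::memcpy(&scale, &bits, sizeof scale);
  return r * scale;
}

int main() {
  for (double x : {-2.0, 0.5, 10.0})
    std::printf("x=%5.2f approx=%.15g std=%.15g\n", x, exp_like_pexp(x), std::exp(x));
}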
a) : v(a) {} + union { + Packet4f v; + Packet1cd cd[2]; + }; +}; +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasBlend = 1, + HasSetLinear = 0 + }; +}; +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 1, + HasHalfPacket = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) +{ + return pload(from); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) +{ + pstore >(to, from); +} +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const 
Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d a_re, a_im, v1, v2;
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+  return Packet1cd(v1 + v2);
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);
+  return res[0];
+}
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
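// Illustrative sketch: pmul<Packet1cd> above computes a complex product
// without extracting scalars -- splat a's real part, splat a's imaginary
// part, multiply both against b, swap the halves of the second product and
// flip one sign (the CONJ_XOR mask), then add. The same decomposition on
// plain doubles, checked against std::complex:
#include <complex>
#include <cstdio>

static std::complex<double> mul_like_packet(std::complex<double> a,
                                            std::complex<double> b) {
  // v1 = a_re * b                    -> (a.re*b.re, a.re*b.im)
  double v1_re = a.real() * b.real(), v1_im = a.real() * b.imag();
  // v2 = a_im * b, halves swapped, first lane negated (xor with sign mask)
  double v2_re = -(a.imag() * b.imag()), v2_im = a.imag() * b.real();
  return {v1_re + v2_re, v1_im + v2_im};   // v1 + v2
}

int main() {
  std::complex<double> a(1.0, 2.0), b(3.0, -4.0);
  std::complex<double> p = mul_like_packet(a, b), q = a * b;
  std::printf("packet-style: (%g,%g)  std::complex: (%g,%g)\n",
              p.real(), p.imag(), q.real(), q.imag());  // both (11,2)
}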
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) +{ + return pfirst(a); +} +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + return padd(transpose.packet[0], transpose.packet[1]); +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) +{ + return pfirst(a); +} +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& , const Packet1cd& ) + { + } +}; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) 
+{ + Packet2cf res; + res.cd[0] = pdiv(a.cd[0], b.cd[0]); + res.cd[1] = pdiv(a.cd[1], b.cd[1]); + return res; +} +EIGEN_STRONG_INLINE Packet1cd pcplxflip(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + Packet2cf res; + res.cd[0] = pcplxflip(x.cd[0]); + res.cd[1] = pcplxflip(x.cd[1]); + return res; +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet1cd tmp = kernel.packet[0].cd[1]; + kernel.packet[0].cd[1] = kernel.packet[1].cd[0]; + kernel.packet[1].cd[0] = tmp; +} +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] }; + result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); + return result; +} +} +} +#endif +// end #include "src/Core/arch/ZVector/Complex.h" +#endif +#if defined EIGEN_VECTORIZE_CUDA +// #include "src/Core/arch/CUDA/PacketMath.h" +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H +namespace Eigen { +namespace internal { +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, + HasBetaInc = 1, + HasBlend = 0, + }; +}; +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, + HasBetaInc = 1, + HasBlend = 0, + }; +}; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + 
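// Illustrative sketch: pdiv<Packet1cd> above divides by forming a * conj(b)
// (the conj_helper<...,false,true> step), squaring b lane-wise, and adding
// the swapped squares so both lanes hold |b|^2 = br^2 + bi^2 before a single
// vector division. The same computation on scalars:
#include <complex>
#include <cstdio>

static std::complex<double> div_like_packet(std::complex<double> a,
                                            std::complex<double> b) {
  // numerator = a * conj(b)
  double num_re = a.real() * b.real() + a.imag() * b.imag();
  double num_im = a.imag() * b.real() - a.real() * b.imag();
  // s = b*b lane-wise; s + swap(s) broadcasts |b|^2 to both lanes
  double denom = b.real() * b.real() + b.imag() * b.imag();
  return {num_re / denom, num_im / denom};
}

int main() {
  std::complex<double> a(1.0, 2.0), b(3.0, -4.0);
  std::complex<double> p = div_like_packet(a, b), q = a / b;
  std::printf("packet-style: (%g,%g)  std::complex: (%g,%g)\n",
              p.real(), p.imag(), q.real(), q.imag());  // both (-0.2,0.4)
}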
return make_double2(a.x+b.x, a.y+b.y); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} +template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(__CUDA_ARCH__) 
&& __CUDA_ARCH__ >= 350 + return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return make_double2(__ldg(from+0), __ldg(from+1)); +#else + return make_double2(from[0], from[1]); +#endif +} +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { + return make_double2(from[0*stride], from[1*stride]); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} +template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { + return a.x * a.y * a.z * a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { + return a.x * a.y; +} +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(fabs(a.x), fabs(a.y)); +} +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + 
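// Illustrative sketch: pgather/pscatter (in the complex code earlier and in
// the CUDA versions here) assemble a packet from, or spread it back to, every
// stride-th element of an array. A generic scalar model with placeholder
// names; N plays the role of the packet width:
#include <array>
#include <cstddef>
#include <cstdio>

template <typename T, std::size_t N>
std::array<T, N> gather(const T* from, std::ptrdiff_t stride) {
  std::array<T, N> p{};
  for (std::size_t i = 0; i < N; ++i) p[i] = from[i * stride];
  return p;
}

template <typename T, std::size_t N>
void scatter(T* to, const std::array<T, N>& p, std::ptrdiff_t stride) {
  for (std::size_t i = 0; i < N; ++i) to[i * stride] = p[i];
}

int main() {
  // Gather one column (stride = row length) of a 4x4 row-major matrix.
  float m[16];
  for (int i = 0; i < 16; ++i) m[i] = float(i);
  std::array<float, 4> col = gather<float, 4>(m + 1, 4);          // column 1
  std::printf("%g %g %g %g\n", col[0], col[1], col[2], col[3]);   // 1 5 9 13
  scatter<float, 4>(m + 2, col, 4);                               // into column 2
}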
kernel.packet[3].z = tmp;
+}
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+#endif
+}
+}
+#endif
+// end #include "src/Core/arch/CUDA/PacketMath.h"
+// #include "src/Core/arch/CUDA/MathFunctions.h"
+#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
+#define EIGEN_MATH_FUNCTIONS_CUDA_H
+namespace Eigen {
+namespace internal {
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog<float4>(const float4& a)
+{
+  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog<double2>(const double2& a)
+{
+  using ::log;
+  return make_double2(log(a.x), log(a.y));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog1p<float4>(const float4& a)
+{
+  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog1p<double2>(const double2& a)
+{
+  return make_double2(log1p(a.x), log1p(a.y));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexp<float4>(const float4& a)
+{
+  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexp<double2>(const double2& a)
+{
+  using ::exp;
+  return make_double2(exp(a.x), exp(a.y));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 psqrt<float4>(const float4& a)
+{
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 psqrt<double2>(const double2& a)
+{
+  using ::sqrt;
+  return make_double2(sqrt(a.x), sqrt(a.y));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 prsqrt<float4>(const float4& a)
+{
+  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 prsqrt<double2>(const double2& a)
+{
+  return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+#endif
+}
+}
+#endif
+// end #include "src/Core/arch/CUDA/MathFunctions.h"
+#endif
+// #include "src/Core/arch/Default/Settings.h"
+#ifndef EIGEN_DEFAULT_SETTINGS_H
+#define EIGEN_DEFAULT_SETTINGS_H
+#ifndef EIGEN_UNROLLING_LIMIT
+#define EIGEN_UNROLLING_LIMIT 100
+#endif
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
+#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
+#endif
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#endif
+#endif
+// end #include "src/Core/arch/Default/Settings.h"
+// #include "src/Core/functors/BinaryFunctors.h"
+#ifndef EIGEN_BINARY_FUNCTORS_H
+#define EIGEN_BINARY_FUNCTORS_H
+namespace Eigen {
+namespace internal {
+template<typename Arg1, typename Arg2>
+struct binary_op_base
+{
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+};
+template<typename LhsScalar,typename RhsScalar>
+struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_sum_op>::ReturnType result_type;
+#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+#else
+  scalar_sum_op() {
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::padd(a,b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
{ return internal::predux(a); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd + }; +}; +template<> struct scalar_sum_op : scalar_sum_op { + EIGEN_DEPRECATED + scalar_sum_op() {} +}; +template +struct scalar_product_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) +#else + scalar_product_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pmul(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + { return internal::predux_mul(a); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, + PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul + }; +}; +template +struct scalar_conj_product_op : binary_op_base +{ + enum { + Conj = NumTraits::IsComplex + }; + typedef typename ScalarBinaryOpTraits::ReturnType result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const + { return conj_helper().pmul(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return conj_helper().pmul(a,b); } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::MulCost, + PacketAccess = internal::is_same::value && packet_traits::HasMul + }; +}; +template +struct scalar_min_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pmin(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + { return internal::predux_min(a); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = internal::is_same::value && packet_traits::HasMin + }; +}; +template +struct scalar_max_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pmax(a,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const + { return internal::predux_max(a); } +}; +template +struct functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = internal::is_same::value && packet_traits::HasMax + }; +}; +template struct scalar_cmp_op; +template +struct functor_traits > { + enum { + Cost = 
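// Illustrative sketch: the arithmetic functors above (sum, product, min, max)
// share three hooks -- a scalar operator(), a packetOp() forwarding to the
// SIMD primitive, and a predux() that collapses an accumulator packet. A toy
// model of that contract, with a plain array standing in for the packet type
// (all names here are illustrative, not Eigen's):
#include <array>
#include <cstdio>

using Packet = std::array<float, 4>;

struct sum_op {
  float operator()(float a, float b) const { return a + b; }       // scalar path
  Packet packetOp(const Packet& a, const Packet& b) const {        // SIMD path
    Packet r;
    for (int i = 0; i < 4; ++i) r[i] = a[i] + b[i];
    return r;
  }
  float predux(const Packet& p) const {                            // reduction
    return p[0] + p[1] + p[2] + p[3];
  }
};

// A reducer written against the contract: combine packet-wise, reduce once,
// then finish the tail with the scalar path.
template <typename Op>
float reduce(const float* data, int n, Op op) {
  Packet acc{};                                  // zero is the identity for sum
  int i = 0;
  for (; i + 4 <= n; i += 4)
    acc = op.packetOp(acc, Packet{data[i], data[i+1], data[i+2], data[i+3]});
  float r = op.predux(acc);
  for (; i < n; ++i) r = op(r, data[i]);
  return r;
}

int main() {
  float v[7] = {1, 2, 3, 4, 5, 6, 7};
  std::printf("%g\n", reduce(v, 7, sum_op{}));   // 28
}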
(NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = false + }; +}; +template +struct result_of(LhsScalar,RhsScalar)> { + typedef bool type; +}; +template +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;} +}; +template +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;} +}; +template +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;} +}; +template +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;} +}; +template +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);} +}; +template +struct scalar_cmp_op : binary_op_base +{ + typedef bool result_type; + EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;} +}; +template +struct scalar_hypot_op : binary_op_base +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const + { + EIGEN_USING_STD_MATH(sqrt) + Scalar p, qp; + if(_x>_y) + { + p = _x; + qp = _y / p; + } + else + { + p = _y; + qp = _x / p; + } + return p * sqrt(Scalar(1) + qp*qp); + } +}; +template +struct functor_traits > { + enum + { + Cost = 3 * NumTraits::AddCost + + 2 * NumTraits::MulCost + + 2 * scalar_div_cost::value, + PacketAccess = false + }; +}; +template +struct scalar_pow_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op) +#else + scalar_pow_op() { + typedef Scalar LhsScalar; + typedef Exponent RhsScalar; + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC + inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); } +}; +template +struct functor_traits > { + enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; +}; +template +struct scalar_difference_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) +#else + scalar_difference_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::psub(a,b); } +}; +template +struct 
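// Illustrative sketch: scalar_hypot_op above avoids the overflow in
// sqrt(x*x + y*y) by factoring out the larger operand:
// hypot(x,y) = p * sqrt(1 + (q/p)^2) with p the larger, q the smaller value.
// A scalar model (like the functor, it compares the raw values rather than
// magnitudes, so it assumes the caller's convention):
#include <cmath>
#include <cstdio>

static double hypot_like(double x, double y) {
  double p = x > y ? x : y;
  double q = x > y ? y : x;
  double qp = q / p;
  return p * std::sqrt(1.0 + qp * qp);
}

int main() {
  // The naive form overflows for magnitudes near 1e200; the factored one
  // does not.
  double x = 3e200, y = 4e200;
  std::printf("factored: %g   naive: %g\n",
              hypot_like(x, y), std::sqrt(x * x + y * y));  // 5e200 vs inf
}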
functor_traits > { + enum { + Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub + }; +}; +template +struct scalar_quotient_op : binary_op_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType result_type; +#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN + EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) +#else + scalar_quotient_op() { + EIGEN_SCALAR_BINARY_OP_PLUGIN + } +#endif + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const + { return internal::pdiv(a,b); } +}; +template +struct functor_traits > { + typedef typename scalar_quotient_op::result_type result_type; + enum { + PacketAccess = is_same::value && packet_traits::HasDiv && packet_traits::HasDiv, + Cost = scalar_div_cost::value + }; +}; +struct scalar_boolean_and_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } +}; +template<> struct functor_traits { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; +struct scalar_boolean_or_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; } +}; +template<> struct functor_traits { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; +struct scalar_boolean_xor_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } +}; +template<> struct functor_traits { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; +template struct bind1st_op : BinaryOp { + typedef typename BinaryOp::first_argument_type first_argument_type; + typedef typename BinaryOp::second_argument_type second_argument_type; + typedef typename BinaryOp::result_type result_type; + bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const + { return BinaryOp::packetOp(internal::pset1(m_value), b); } + first_argument_type m_value; +}; +template struct functor_traits > : functor_traits {}; +template struct bind2nd_op : BinaryOp { + typedef typename BinaryOp::first_argument_type first_argument_type; + typedef typename BinaryOp::second_argument_type second_argument_type; + typedef typename BinaryOp::result_type result_type; + bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return BinaryOp::packetOp(a,internal::pset1(m_value)); } + second_argument_type m_value; +}; +template struct functor_traits > : functor_traits {}; +} +} +#endif +// end #include "src/Core/functors/BinaryFunctors.h" +// #include "src/Core/functors/UnaryFunctors.h" +#ifndef EIGEN_UNARY_FUNCTORS_H +#define EIGEN_UNARY_FUNCTORS_H +namespace Eigen { +namespace internal { +template struct scalar_opposite_op { + 
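// Illustrative sketch: bind1st_op/bind2nd_op above turn a binary functor into
// a unary one by storing one operand, so an "array op scalar" expression can
// reuse the array-vs-array kernel. The same idea in miniature (placeholder
// names, not Eigen's):
#include <cstdio>

struct quotient_op {
  double operator()(double a, double b) const { return a / b; }
};

template <typename BinaryOp>
struct bind2nd : BinaryOp {
  double m_value;
  explicit bind2nd(double v) : m_value(v) {}
  double operator()(double a) const { return BinaryOp::operator()(a, m_value); }
};

int main() {
  bind2nd<quotient_op> div_by_4(4.0);                   // x -> x / 4
  double v[3] = {2.0, 6.0, 10.0};
  for (double x : v) std::printf("%g ", div_by_4(x));   // 0.5 1.5 2.5
  std::printf("\n");
}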
EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::pnegate(a); } +}; +template +struct functor_traits > +{ enum { + Cost = NumTraits::AddCost, + PacketAccess = packet_traits::HasNegate }; +}; +template struct scalar_abs_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::pabs(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::AddCost, + PacketAccess = packet_traits::HasAbs + }; +}; +template struct scalar_score_coeff_op : scalar_abs_op +{ + typedef void Score_is_abs; +}; +template +struct functor_traits > : functor_traits > {}; +template struct abs_knowing_score +{ + EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) + typedef typename NumTraits::Real result_type; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { return numext::abs(a); } +}; +template struct abs_knowing_score::Score_is_abs> +{ + EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score) + typedef typename NumTraits::Real result_type; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; } +}; +template struct scalar_abs2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::pmul(a,a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasAbs2 }; }; +template struct scalar_conjugate_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::IsComplex ? NumTraits::AddCost : 0, + PacketAccess = packet_traits::HasConj + }; +}; +template struct scalar_arg_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const + { return internal::parg(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::IsComplex ? 5 * NumTraits::MulCost : NumTraits::AddCost, + PacketAccess = packet_traits::HasArg + }; +}; +template +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef NewType result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast(a); } +}; +template +struct functor_traits > +{ enum { Cost = is_same::value ? 
0 : NumTraits::AddCost, PacketAccess = false }; }; +template +struct scalar_real_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); } +}; +template +struct functor_traits > +{ enum { Cost = 0, PacketAccess = false }; }; +template +struct scalar_imag_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); } +}; +template +struct functor_traits > +{ enum { Cost = 0, PacketAccess = false }; }; +template +struct scalar_real_ref_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast(&a)); } +}; +template +struct functor_traits > +{ enum { Cost = 0, PacketAccess = false }; }; +template +struct scalar_imag_ref_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op) + typedef typename NumTraits::Real result_type; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast(&a)); } +}; +template +struct functor_traits > +{ enum { Cost = 0, PacketAccess = false }; }; +template struct scalar_exp_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::exp(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexp(a); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasExp, +#ifdef EIGEN_VECTORIZE_FMA + Cost = + (sizeof(Scalar) == 4 + ? (8 * NumTraits::AddCost + 6 * NumTraits::MulCost) + : (14 * NumTraits::AddCost + + 6 * NumTraits::MulCost + + scalar_div_cost::HasDiv>::value)) +#else + Cost = + (sizeof(Scalar) == 4 + ? (21 * NumTraits::AddCost + 13 * NumTraits::MulCost) + : (23 * NumTraits::AddCost + + 12 * NumTraits::MulCost + + scalar_div_cost::HasDiv>::value)) +#endif + }; +}; +template struct scalar_log_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog(a); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasLog, + Cost = + (PacketAccess +#ifdef EIGEN_VECTORIZE_FMA + ? (20 * NumTraits::AddCost + 7 * NumTraits::MulCost) +#else + ? (36 * NumTraits::AddCost + 14 * NumTraits::MulCost) +#endif + : sizeof(Scalar)==4 ? 
40 : 85) + }; +}; +template struct scalar_log1p_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_log1p_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::log1p(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog1p(a); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasLog1p, + Cost = functor_traits >::Cost + }; +}; +template struct scalar_log10_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { EIGEN_USING_STD_MATH(log10) return log10(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plog10(a); } +}; +template +struct functor_traits > +{ enum { Cost = 5 * NumTraits::MulCost, PacketAccess = packet_traits::HasLog10 }; }; +template struct scalar_sqrt_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sqrt(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); } +}; +template +struct functor_traits > { + enum { +#if EIGEN_FAST_MATH + Cost = (sizeof(Scalar) == 8 ? 28 + : (3 * NumTraits::AddCost + + 5 * NumTraits::MulCost)), +#else + Cost = (sizeof(Scalar) == 8 ? 28 : 14), +#endif + PacketAccess = packet_traits::HasSqrt + }; +}; +template struct scalar_rsqrt_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return Scalar(1)/numext::sqrt(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::prsqrt(a); } +}; +template +struct functor_traits > +{ enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasRsqrt + }; +}; +template struct scalar_cos_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op) + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return numext::cos(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcos(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasCos + }; +}; +template struct scalar_sin_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sin(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psin(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasSin + }; +}; +template struct scalar_tan_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::tan(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::ptan(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasTan + }; +}; +template struct scalar_acos_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::acos(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pacos(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasACos + }; +}; +template struct 
scalar_asin_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::asin(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pasin(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasASin + }; +}; +template struct scalar_atan_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::atan(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::patan(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasATan + }; +}; +template +struct scalar_tanh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return ptanh(x); } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasTanh, + Cost = ( (EIGEN_FAST_MATH && is_same::value) +#ifdef EIGEN_VECTORIZE_FMA + ? (2 * NumTraits::AddCost + + 6 * NumTraits::MulCost + + scalar_div_cost::HasDiv>::value) +#else + ? (11 * NumTraits::AddCost + + 11 * NumTraits::MulCost + + scalar_div_cost::HasDiv>::value) +#endif + : (6 * NumTraits::AddCost + + 3 * NumTraits::MulCost + + 2 * scalar_div_cost::HasDiv>::value + + functor_traits >::Cost)) + }; +}; +template struct scalar_sinh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::sinh(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psinh(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasSinh + }; +}; +template struct scalar_cosh_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::cosh(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pcosh(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = 5 * NumTraits::MulCost, + PacketAccess = packet_traits::HasCosh + }; +}; +template +struct scalar_inverse_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op) + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; } + template + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return internal::pdiv(pset1(Scalar(1)),a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; +template +struct scalar_square_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op) + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; } + template + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return internal::pmul(a,a); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +template +struct scalar_cube_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op) + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; } + template + EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const + { return 
internal::pmul(a,pmul(a,a)); } +}; +template +struct functor_traits > +{ enum { Cost = 2*NumTraits::MulCost, PacketAccess = packet_traits::HasMul }; }; +template struct scalar_round_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = packet_traits::HasRound + }; +}; +template struct scalar_floor_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = packet_traits::HasFloor + }; +}; +template struct scalar_ceil_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); } + template + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = packet_traits::HasCeil + }; +}; +template struct scalar_isnan_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op) + typedef bool result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = false + }; +}; +template struct scalar_isinf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op) + typedef bool result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = false + }; +}; +template struct scalar_isfinite_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op) + typedef bool result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); } +}; +template +struct functor_traits > +{ + enum { + Cost = NumTraits::MulCost, + PacketAccess = false + }; +}; +template struct scalar_boolean_not_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false + }; +}; +template::IsComplex!=0) > struct scalar_sign_op; +template +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + { + return Scalar( (a>Scalar(0)) - (a +struct scalar_sign_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const + { + typedef typename NumTraits::Real real_type; + real_type aa = numext::abs(a); + if (aa==real_type(0)) + return Scalar(0); + aa = real_type(1)/aa; + return Scalar(real(a)*aa, imag(a)*aa ); + } +}; +template +struct functor_traits > +{ enum { + Cost = + NumTraits::IsComplex + ? 
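// Illustrative sketch: the complex scalar_sign_op above generalizes sign() to
// a/|a| (a point on the unit circle), with an explicit zero guard and the
// division turned into one reciprocal and two multiplies:
#include <cmath>
#include <complex>
#include <cstdio>

static std::complex<double> sign_like(const std::complex<double>& a) {
  double aa = std::abs(a);              // |a|
  if (aa == 0.0) return {0.0, 0.0};     // sign(0) == 0
  double inv = 1.0 / aa;                // one divide, two multiplies
  return {a.real() * inv, a.imag() * inv};
}

int main() {
  std::complex<double> s = sign_like({3.0, -4.0});
  std::printf("(%g,%g) |s|=%g\n", s.real(), s.imag(), std::abs(s));  // (0.6,-0.8) 1
}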
( 8*NumTraits::MulCost ) + : ( 3*NumTraits::AddCost), + PacketAccess = packet_traits::HasSign + }; +}; +} +} +#endif +// end #include "src/Core/functors/UnaryFunctors.h" +// #include "src/Core/functors/NullaryFunctors.h" +#ifndef EIGEN_NULLARY_FUNCTORS_H +#define EIGEN_NULLARY_FUNCTORS_H +namespace Eigen { +namespace internal { +template +struct scalar_constant_op { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() () const { return m_other; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const { return internal::pset1(m_other); } + const Scalar m_other; +}; +template +struct functor_traits > +{ enum { Cost = 0 , + PacketAccess = packet_traits::Vectorizable, IsRepeatable = true }; }; +template struct scalar_identity_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op) + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType row, IndexType col) const { return row==col ? Scalar(1) : Scalar(0); } +}; +template +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; +template struct linspaced_op_impl; +template +struct linspaced_op_impl +{ + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), + m_flip(numext::abs(high) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { + typedef typename NumTraits::Real RealScalar; + if(m_flip) + return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step); + else + return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const + { + if(m_flip) + { + Packet pi = plset(Scalar(i-m_size1)); + Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); + if(i==0) + res = pinsertfirst(res, m_low); + return res; + } + else + { + Packet pi = plset(Scalar(i)); + Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); + if(i==m_size1-unpacket_traits::size+1) + res = pinsertlast(res, m_high); + return res; + } + } + const Scalar m_low; + const Scalar m_high; + const Index m_size1; + const Scalar m_step; + const bool m_flip; +}; +template +struct linspaced_op_impl +{ + linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : + m_low(low), + m_multiplier((high-low)/convert_index(num_steps<=1 ? 
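// Illustrative sketch: the floating-point linspaced_op_impl above evaluates
// low + i*step but pins the far endpoint exactly instead of trusting rounded
// arithmetic (the m_flip branch and pinsertfirst/pinsertlast do the same for
// the packet path). A scalar model of the coefficient it produces:
#include <cstdio>

static double linspaced_at(double low, double high, long num_steps, long i) {
  if (num_steps == 1) return high;           // the ctor collapses low to high
  double step = (high - low) / double(num_steps - 1);
  return i == num_steps - 1 ? high           // endpoint pinned, no rounding
                            : low + double(i) * step;
}

int main() {
  for (long i = 0; i < 5; ++i)
    std::printf("%g ", linspaced_at(0.0, 1.0, 5, i));  // 0 0.25 0.5 0.75 1
  std::printf("\n");
}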
1 : num_steps-1)), + m_divisor(convert_index((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))), + m_use_divisor(num_steps>1 && (numext::abs(high-low)+1) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar operator() (IndexType i) const + { + if(m_use_divisor) return m_low + convert_index(i)/m_divisor; + else return m_low + convert_index(i)*m_multiplier; + } + const Scalar m_low; + const Scalar m_multiplier; + const Scalar m_divisor; + const bool m_use_divisor; +}; +template struct linspaced_op; +template struct functor_traits< linspaced_op > +{ + enum + { + Cost = 1, + PacketAccess = (!NumTraits::IsInteger) && packet_traits::HasSetLinear && packet_traits::HasBlend, + IsRepeatable = true + }; +}; +template struct linspaced_op +{ + linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) + : impl((num_steps==1 ? high : low),high,num_steps) + {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); } + const linspaced_op_impl::IsInteger> impl; +}; +template struct functor_has_linear_access { enum { ret = !has_binary_operator::value }; }; +#if !( (EIGEN_COMP_MSVC>1600) || (EIGEN_GNUC_AT_LEAST(4,8)) || (EIGEN_COMP_ICC>=1600)) +template +struct has_nullary_operator,IndexType> { enum { value = 1}; }; +template +struct has_unary_operator,IndexType> { enum { value = 0}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 0}; }; +template +struct has_binary_operator,IndexType> { enum { value = 1}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 1}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 1}; }; +template +struct has_unary_operator,IndexType> { enum { value = 0}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; +#endif +} +} +#endif +// end #include "src/Core/functors/NullaryFunctors.h" +// #include "src/Core/functors/AssignmentFunctors.h" +#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H +#define EIGEN_ASSIGNMENT_FUNCTORS_H +namespace Eigen { +namespace internal { +template struct assign_op { + EIGEN_EMPTY_STRUCT_CTOR(assign_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; } + template + EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const + { internal::pstoret(a,b); } +}; +template struct assign_op {}; +template +struct functor_traits > { + enum { + Cost = NumTraits::ReadCost, + PacketAccess = is_same::value && packet_traits::Vectorizable && packet_traits::Vectorizable + }; +}; +template struct add_assign_op { + EIGEN_EMPTY_STRUCT_CTOR(add_assign_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a += b; } + template + EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const + { internal::pstoret(a,internal::padd(internal::ploadt(a),b)); } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::ReadCost + NumTraits::AddCost, + PacketAccess = is_same::value && packet_traits::HasAdd + }; +}; +template struct sub_assign_op { 
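+ // sub_assign_op mirrors add_assign_op above: it applies "a -= b" both per
+ // coefficient (assignCoeff) and per SIMD packet (assignPacket).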
+ EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a -= b; } + template + EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const + { internal::pstoret(a,internal::psub(internal::ploadt(a),b)); } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::ReadCost + NumTraits::AddCost, + PacketAccess = is_same::value && packet_traits::HasSub + }; +}; +template +struct mul_assign_op { + EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; } + template + EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const + { internal::pstoret(a,internal::pmul(internal::ploadt(a),b)); } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::ReadCost + NumTraits::MulCost, + PacketAccess = is_same::value && packet_traits::HasMul + }; +}; +template struct div_assign_op { + EIGEN_EMPTY_STRUCT_CTOR(div_assign_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a /= b; } + template + EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const + { internal::pstoret(a,internal::pdiv(internal::ploadt(a),b)); } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::ReadCost + NumTraits::MulCost, + PacketAccess = is_same::value && packet_traits::HasDiv + }; +}; +template struct swap_assign_op { + EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const + { +#ifdef __CUDACC__ + Scalar t=b; const_cast(b)=a; a=t; +#else + using std::swap; + swap(a,const_cast(b)); +#endif + } +}; +template +struct functor_traits > { + enum { + Cost = 3 * NumTraits::ReadCost, + PacketAccess = packet_traits::Vectorizable + }; +}; +} +} +#endif +// end #include "src/Core/functors/AssignmentFunctors.h" +// #include "src/Core/DenseCoeffsBase.h" +#ifndef EIGEN_DENSECOEFFSBASE_H +#define EIGEN_DENSECOEFFSBASE_H +namespace Eigen { +namespace internal { +template struct add_const_on_value_type_if_arithmetic +{ + typedef typename conditional::value, T, typename add_const_on_value_type::type>::type type; +}; +} +template +class DenseCoeffsBase : public EigenBase +{ + public: + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename internal::conditional::Flags&LvalueBit), + const Scalar&, + typename internal::conditional::value, Scalar, const Scalar>::type + >::type CoeffReturnType; + typedef typename internal::add_const_on_value_type_if_arithmetic< + typename internal::packet_traits::type + >::type PacketReturnType; + typedef EigenBase Base; + using Base::rows; + using Base::cols; + using Base::size; + using Base::derived; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const + { + return int(Derived::RowsAtCompileTime) == 1 ? 0 + : int(Derived::ColsAtCompileTime) == 1 ? inner + : int(Derived::Flags)&RowMajorBit ? outer + : inner; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const + { + return int(Derived::ColsAtCompileTime) == 1 ? 0 + : int(Derived::RowsAtCompileTime) == 1 ? inner + : int(Derived::Flags)&RowMajorBit ? 
inner + : outer; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const + { + eigen_internal_assert(row >= 0 && row < rows() + && col >= 0 && col < cols()); + return internal::evaluator(derived()).coeff(row,col); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const + { + return coeff(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const + { + eigen_assert(row >= 0 && row < rows() + && col >= 0 && col < cols()); + return coeff(row, col); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + coeff(Index index) const + { + EIGEN_STATIC_ASSERT(internal::evaluator::Flags & LinearAccessBit, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS) + eigen_internal_assert(index >= 0 && index < size()); + return internal::evaluator(derived()).coeff(index); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + operator[](Index index) const + { + EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD) + eigen_assert(index >= 0 && index < size()); + return coeff(index); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + operator()(Index index) const + { + eigen_assert(index >= 0 && index < size()); + return coeff(index); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + x() const { return (*this)[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + y() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + z() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CoeffReturnType + w() const + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const + { + typedef typename internal::packet_traits::type DefaultPacketType; + eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols()); + return internal::evaluator(derived()).template packet(row,col); + } + template + EIGEN_STRONG_INLINE PacketReturnType packetByOuterInner(Index outer, Index inner) const + { + return packet(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner)); + } + template + EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT(internal::evaluator::Flags & LinearAccessBit, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS) + typedef typename internal::packet_traits::type DefaultPacketType; + eigen_internal_assert(index >= 0 && index < size()); + return internal::evaluator(derived()).template packet(index); + } + protected: + void coeffRef(); + void coeffRefByOuterInner(); + void writePacket(); + void writePacketByOuterInner(); + void copyCoeff(); + void copyCoeffByOuterInner(); + void copyPacket(); + void copyPacketByOuterInner(); + void stride(); + void innerStride(); + void outerStride(); + void rowStride(); + void colStride(); +}; +template +class DenseCoeffsBase : public DenseCoeffsBase +{ + 
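+ // Readability note (comment only, behavior unchanged): the DenseCoeffsBase
+ // hierarchy layers capabilities level by level -- read-only coeff() access in
+ // the base above, writable coeffRef() in this specialization, and the
+ // inner/outer stride interface further below for direct-access expressions.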
public: + typedef DenseCoeffsBase Base; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + using Base::coeff; + using Base::rows; + using Base::cols; + using Base::size; + using Base::derived; + using Base::rowIndexByOuterInner; + using Base::colIndexByOuterInner; + using Base::operator[]; + using Base::operator(); + using Base::x; + using Base::y; + using Base::z; + using Base::w; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) + { + eigen_internal_assert(row >= 0 && row < rows() + && col >= 0 && col < cols()); + return internal::evaluator(derived()).coeffRef(row,col); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + coeffRefByOuterInner(Index outer, Index inner) + { + return coeffRef(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + operator()(Index row, Index col) + { + eigen_assert(row >= 0 && row < rows() + && col >= 0 && col < cols()); + return coeffRef(row, col); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + coeffRef(Index index) + { + EIGEN_STATIC_ASSERT(internal::evaluator::Flags & LinearAccessBit, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS) + eigen_internal_assert(index >= 0 && index < size()); + return internal::evaluator(derived()).coeffRef(index); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + operator[](Index index) + { + EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD) + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + x() { return (*this)[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + y() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=2, OUT_OF_RANGE_ACCESS); + return (*this)[1]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + z() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=3, OUT_OF_RANGE_ACCESS); + return (*this)[2]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& + w() + { + EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime==-1 || Derived::SizeAtCompileTime>=4, OUT_OF_RANGE_ACCESS); + return (*this)[3]; + } +}; +template +class DenseCoeffsBase : public DenseCoeffsBase +{ + public: + typedef DenseCoeffsBase Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + using Base::rows; + using Base::cols; + using Base::size; + using Base::derived; + EIGEN_DEVICE_FUNC + inline Index innerStride() const + { + return derived().innerStride(); + } + EIGEN_DEVICE_FUNC + inline Index outerStride() const + { + return derived().outerStride(); + } + inline Index stride() const + { + return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); + } + EIGEN_DEVICE_FUNC + inline Index rowStride() const + { + return Derived::IsRowMajor ? outerStride() : innerStride(); + } + EIGEN_DEVICE_FUNC + inline Index colStride() const + { + return Derived::IsRowMajor ? 
innerStride() : outerStride(); + } +}; +template +class DenseCoeffsBase + : public DenseCoeffsBase +{ + public: + typedef DenseCoeffsBase Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + using Base::rows; + using Base::cols; + using Base::size; + using Base::derived; + EIGEN_DEVICE_FUNC + inline Index innerStride() const + { + return derived().innerStride(); + } + EIGEN_DEVICE_FUNC + inline Index outerStride() const + { + return derived().outerStride(); + } + inline Index stride() const + { + return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); + } + EIGEN_DEVICE_FUNC + inline Index rowStride() const + { + return Derived::IsRowMajor ? outerStride() : innerStride(); + } + EIGEN_DEVICE_FUNC + inline Index colStride() const + { + return Derived::IsRowMajor ? innerStride() : outerStride(); + } +}; +namespace internal { +template +struct first_aligned_impl +{ + static inline Index run(const Derived&) + { return 0; } +}; +template +struct first_aligned_impl +{ + static inline Index run(const Derived& m) + { + return internal::first_aligned(m.data(), m.size()); + } +}; +template +static inline Index first_aligned(const DenseBase& m) +{ + enum { ReturnZero = (int(evaluator::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) }; + return first_aligned_impl::run(m.derived()); +} +template +static inline Index first_default_aligned(const DenseBase& m) +{ + typedef typename Derived::Scalar Scalar; + typedef typename packet_traits::type DefaultPacketType; + return internal::first_aligned::alignment),Derived>(m); +} +template::ret> +struct inner_stride_at_compile_time +{ + enum { ret = traits::InnerStrideAtCompileTime }; +}; +template +struct inner_stride_at_compile_time +{ + enum { ret = 0 }; +}; +template::ret> +struct outer_stride_at_compile_time +{ + enum { ret = traits::OuterStrideAtCompileTime }; +}; +template +struct outer_stride_at_compile_time +{ + enum { ret = 0 }; +}; +} +} +#endif +// end #include "src/Core/DenseCoeffsBase.h" +// #include "src/Core/DenseBase.h" +#ifndef EIGEN_DENSEBASE_H +#define EIGEN_DENSEBASE_H +namespace Eigen { +namespace internal { +static inline void check_DenseIndex_is_signed() { + EIGEN_STATIC_ASSERT(NumTraits::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); +} +} +template class DenseBase +#ifndef EIGEN_PARSED_BY_DOXYGEN + : public DenseCoeffsBase +#else + : public DenseCoeffsBase +#endif +{ + public: + typedef Eigen::InnerIterator InnerIterator; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::StorageIndex StorageIndex; + typedef typename internal::traits::Scalar Scalar; + typedef Scalar value_type; + typedef typename NumTraits::Real RealScalar; + typedef DenseCoeffsBase Base; + using Base::derived; + using Base::const_cast_derived; + using Base::rows; + using Base::cols; + using Base::size; + using Base::rowIndexByOuterInner; + using Base::colIndexByOuterInner; + using Base::coeff; + using Base::coeffByOuterInner; + using Base::operator(); + using Base::operator[]; + using Base::x; + using Base::y; + using Base::z; + using Base::w; + using Base::stride; + using Base::innerStride; + using Base::outerStride; + using Base::rowStride; + using Base::colStride; + typedef typename Base::CoeffReturnType CoeffReturnType; + enum { + RowsAtCompileTime = internal::traits::RowsAtCompileTime, + ColsAtCompileTime = internal::traits::ColsAtCompileTime, + SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, + 
internal::traits::ColsAtCompileTime>::ret), + MaxRowsAtCompileTime = internal::traits::MaxRowsAtCompileTime, + MaxColsAtCompileTime = internal::traits::MaxColsAtCompileTime, + MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime>::ret), + IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 + || internal::traits::MaxColsAtCompileTime == 1, + Flags = internal::traits::Flags, + IsRowMajor = int(Flags) & RowMajorBit, + InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime) + : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime), + InnerStrideAtCompileTime = internal::inner_stride_at_compile_time::ret, + OuterStrideAtCompileTime = internal::outer_stride_at_compile_time::ret + }; + typedef typename internal::find_best_packet::type PacketScalar; + enum { IsPlainObjectBase = 0 }; + typedef Matrix::Scalar, + internal::traits::RowsAtCompileTime, + internal::traits::ColsAtCompileTime, + AutoAlign | (internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor), + internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime + > PlainMatrix; + typedef Array::Scalar, + internal::traits::RowsAtCompileTime, + internal::traits::ColsAtCompileTime, + AutoAlign | (internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor), + internal::traits::MaxRowsAtCompileTime, + internal::traits::MaxColsAtCompileTime + > PlainArray; + typedef typename internal::conditional::XprKind,MatrixXpr >::value, + PlainMatrix, PlainArray>::type PlainObject; + EIGEN_DEVICE_FUNC + inline Index nonZeros() const { return size(); } + EIGEN_DEVICE_FUNC + Index outerSize() const + { + return IsVectorAtCompileTime ? 1 + : int(IsRowMajor) ? this->rows() : this->cols(); + } + EIGEN_DEVICE_FUNC + Index innerSize() const + { + return IsVectorAtCompileTime ? this->size() + : int(IsRowMajor) ? 
this->cols() : this->rows(); + } + EIGEN_DEVICE_FUNC + void resize(Index newSize) + { + EIGEN_ONLY_USED_FOR_DEBUG(newSize); + eigen_assert(newSize == this->size() + && "DenseBase::resize() does not actually allow to resize."); + } + EIGEN_DEVICE_FUNC + void resize(Index rows, Index cols) + { + EIGEN_ONLY_USED_FOR_DEBUG(rows); + EIGEN_ONLY_USED_FOR_DEBUG(cols); + eigen_assert(rows == this->rows() && cols == this->cols() + && "DenseBase::resize() does not actually allow to resize."); + } +#ifndef EIGEN_PARSED_BY_DOXYGEN + typedef CwiseNullaryOp,PlainObject> ConstantReturnType; + typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; +#endif + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator=(const DenseBase& other); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator=(const DenseBase& other); + template + EIGEN_DEVICE_FUNC + Derived& operator=(const EigenBase &other); + template + EIGEN_DEVICE_FUNC + Derived& operator+=(const EigenBase &other); + template + EIGEN_DEVICE_FUNC + Derived& operator-=(const EigenBase &other); + template + EIGEN_DEVICE_FUNC + Derived& operator=(const ReturnByValue& func); + template + EIGEN_DEVICE_FUNC + Derived& lazyAssign(const DenseBase& other); + EIGEN_DEVICE_FUNC + CommaInitializer operator<< (const Scalar& s); + template + EIGEN_DEPRECATED + const Derived& flagged() const + { return derived(); } + template + EIGEN_DEVICE_FUNC + CommaInitializer operator<< (const DenseBase& other); + typedef Transpose TransposeReturnType; + EIGEN_DEVICE_FUNC + TransposeReturnType transpose(); + typedef typename internal::add_const >::type ConstTransposeReturnType; + EIGEN_DEVICE_FUNC + ConstTransposeReturnType transpose() const; + EIGEN_DEVICE_FUNC + void transposeInPlace(); + EIGEN_DEVICE_FUNC static const ConstantReturnType + Constant(Index rows, Index cols, const Scalar& value); + EIGEN_DEVICE_FUNC static const ConstantReturnType + Constant(Index size, const Scalar& value); + EIGEN_DEVICE_FUNC static const ConstantReturnType + Constant(const Scalar& value); + EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType + LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType + LinSpaced(Index size, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType + LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType + LinSpaced(const Scalar& low, const Scalar& high); + template EIGEN_DEVICE_FUNC + static const CwiseNullaryOp + NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func); + template EIGEN_DEVICE_FUNC + static const CwiseNullaryOp + NullaryExpr(Index size, const CustomNullaryOp& func); + template EIGEN_DEVICE_FUNC + static const CwiseNullaryOp + NullaryExpr(const CustomNullaryOp& func); + EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols); + EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size); + EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(); + EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols); + EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size); + EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(); + EIGEN_DEVICE_FUNC void fill(const Scalar& value); + 
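+ // Usage sketch (illustrative only, not part of the Eigen sources): the static
+ // nullary constructors declared above build lazy CwiseNullaryOp expressions
+ // that are evaluated on assignment, e.g.
+ //   Eigen::VectorXf v = Eigen::VectorXf::LinSpaced(5, 0.f, 1.f); // 0, 0.25, 0.5, 0.75, 1
+ //   Eigen::MatrixXf m = Eigen::MatrixXf::Constant(2, 3, 42.f);   // every coefficient 42
+ //   m = Eigen::MatrixXf::Zero(2, 3);                             // same machinery as Ones()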
EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value); + EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC Derived& setZero(); + EIGEN_DEVICE_FUNC Derived& setOnes(); + EIGEN_DEVICE_FUNC Derived& setRandom(); + template EIGEN_DEVICE_FUNC + bool isApprox(const DenseBase& other, + const RealScalar& prec = NumTraits::dummy_precision()) const; + EIGEN_DEVICE_FUNC + bool isMuchSmallerThan(const RealScalar& other, + const RealScalar& prec = NumTraits::dummy_precision()) const; + template EIGEN_DEVICE_FUNC + bool isMuchSmallerThan(const DenseBase& other, + const RealScalar& prec = NumTraits::dummy_precision()) const; + EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits::dummy_precision()) const; + EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits::dummy_precision()) const; + EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits::dummy_precision()) const; + EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits::dummy_precision()) const; + inline bool hasNaN() const; + inline bool allFinite() const; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const Scalar& other); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const Scalar& other); + typedef typename internal::add_const_on_value_type::type>::type EvalReturnType; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE EvalReturnType eval() const + { + return typename internal::eval::type(derived()); + } + template + EIGEN_DEVICE_FUNC + void swap(const DenseBase& other) + { + EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + eigen_assert(rows()==other.rows() && cols()==other.cols()); + call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op()); + } + template + EIGEN_DEVICE_FUNC + void swap(PlainObjectBase& other) + { + eigen_assert(rows()==other.rows() && cols()==other.cols()); + call_assignment(derived(), other.derived(), internal::swap_assign_op()); + } + EIGEN_DEVICE_FUNC inline const NestByValue nestByValue() const; + EIGEN_DEVICE_FUNC inline const ForceAlignedAccess forceAlignedAccess() const; + EIGEN_DEVICE_FUNC inline ForceAlignedAccess forceAlignedAccess(); + template EIGEN_DEVICE_FUNC + inline const typename internal::conditional,Derived&>::type forceAlignedAccessIf() const; + template EIGEN_DEVICE_FUNC + inline typename internal::conditional,Derived&>::type forceAlignedAccessIf(); + EIGEN_DEVICE_FUNC Scalar sum() const; + EIGEN_DEVICE_FUNC Scalar mean() const; + EIGEN_DEVICE_FUNC Scalar trace() const; + EIGEN_DEVICE_FUNC Scalar prod() const; + EIGEN_DEVICE_FUNC typename internal::traits::Scalar minCoeff() const; + EIGEN_DEVICE_FUNC typename internal::traits::Scalar maxCoeff() const; + template EIGEN_DEVICE_FUNC + typename internal::traits::Scalar minCoeff(IndexType* row, IndexType* col) const; + template EIGEN_DEVICE_FUNC + typename internal::traits::Scalar maxCoeff(IndexType* row, IndexType* col) const; + template EIGEN_DEVICE_FUNC + typename internal::traits::Scalar minCoeff(IndexType* index) const; + template EIGEN_DEVICE_FUNC + typename internal::traits::Scalar maxCoeff(IndexType* index) const; + template + EIGEN_DEVICE_FUNC + Scalar redux(const BinaryOp& func) const; + template + EIGEN_DEVICE_FUNC + void visit(Visitor& func) const; + inline const WithFormat format(const IOFormat& 
fmt) const + { + return WithFormat(derived(), fmt); + } + EIGEN_DEVICE_FUNC + CoeffReturnType value() const + { + EIGEN_STATIC_ASSERT_SIZE_1x1(Derived) + eigen_assert(this->rows() == 1 && this->cols() == 1); + return derived().coeff(0,0); + } + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; + typedef VectorwiseOp RowwiseReturnType; + typedef const VectorwiseOp ConstRowwiseReturnType; + typedef VectorwiseOp ColwiseReturnType; + typedef const VectorwiseOp ConstColwiseReturnType; + EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const { + return ConstRowwiseReturnType(derived()); + } + EIGEN_DEVICE_FUNC RowwiseReturnType rowwise(); + EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const { + return ConstColwiseReturnType(derived()); + } + EIGEN_DEVICE_FUNC ColwiseReturnType colwise(); + typedef CwiseNullaryOp,PlainObject> RandomReturnType; + static const RandomReturnType Random(Index rows, Index cols); + static const RandomReturnType Random(Index size); + static const RandomReturnType Random(); + template + const Select + select(const DenseBase& thenMatrix, + const DenseBase& elseMatrix) const; + template + inline const Select + select(const DenseBase& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const; + template + inline const Select + select(const typename ElseDerived::Scalar& thenScalar, const DenseBase& elseMatrix) const; + template RealScalar lpNorm() const; + template + EIGEN_DEVICE_FUNC + const Replicate replicate() const; + EIGEN_DEVICE_FUNC + const Replicate replicate(Index rowFactor, Index colFactor) const + { + return Replicate(derived(), rowFactor, colFactor); + } + typedef Reverse ReverseReturnType; + typedef const Reverse ConstReverseReturnType; + EIGEN_DEVICE_FUNC ReverseReturnType reverse(); + EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const + { + return ConstReverseReturnType(derived()); + } + EIGEN_DEVICE_FUNC void reverseInPlace(); +#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase +#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) +// # include "../plugins/BlockMethods.h" +#ifndef EIGEN_PARSED_BY_DOXYGEN +typedef Block::RowsAtCompileTime, 1, !IsRowMajor> ColXpr; +typedef const Block::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr; +typedef Block::ColsAtCompileTime, IsRowMajor> RowXpr; +typedef const Block::ColsAtCompileTime, IsRowMajor> ConstRowXpr; +typedef Block::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr; +typedef const Block::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr; +typedef Block::ColsAtCompileTime, IsRowMajor> RowsBlockXpr; +typedef const Block::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr; +template struct NColsBlockXpr { typedef Block::RowsAtCompileTime, N, !IsRowMajor> Type; }; +template struct ConstNColsBlockXpr { typedef const Block::RowsAtCompileTime, N, !IsRowMajor> Type; }; +template struct NRowsBlockXpr { typedef Block::ColsAtCompileTime, IsRowMajor> Type; }; +template struct ConstNRowsBlockXpr { typedef const Block::ColsAtCompileTime, IsRowMajor> Type; }; +typedef Block BlockXpr; +typedef const Block ConstBlockXpr; +template struct FixedBlockXpr { typedef Block Type; }; +template struct ConstFixedBlockXpr { typedef Block Type; }; +typedef VectorBlock SegmentReturnType; +typedef const VectorBlock ConstSegmentReturnType; +template struct FixedSegmentReturnType { typedef VectorBlock Type; }; +template struct ConstFixedSegmentReturnType { typedef const 
VectorBlock Type; }; +#endif +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +EIGEN_DEVICE_FUNC +inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) +{ + return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); +} +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const +{ + return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +EIGEN_DEVICE_FUNC +inline BlockXpr topRightCorner(Index cRows, Index cCols) +{ + return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); +} +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const +{ + return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type topRightCorner() +{ + return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); +} +template +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type topRightCorner() const +{ + return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) +{ + return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); +} +template +inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const +{ + return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +EIGEN_DEVICE_FUNC +inline BlockXpr topLeftCorner(Index cRows, Index cCols) +{ + return BlockXpr(derived(), 0, 0, cRows, cCols); +} +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const +{ + return ConstBlockXpr(derived(), 0, 0, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type topLeftCorner() +{ + return typename FixedBlockXpr::Type(derived(), 0, 0); +} +template +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type topLeftCorner() const +{ + return typename ConstFixedBlockXpr::Type(derived(), 0, 0); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) +{ + return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); +} +template +inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const +{ + return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +EIGEN_DEVICE_FUNC +inline BlockXpr bottomRightCorner(Index cRows, Index cCols) +{ + return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); +} +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const +{ + return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type bottomRightCorner() +{ + return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); +} +template +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const +{ + return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +inline typename 
FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) +{ + return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); +} +template +inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const +{ + return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +EIGEN_DEVICE_FUNC +inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) +{ + return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); +} +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const +{ + return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type bottomLeftCorner() +{ + return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); +} +template +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const +{ + return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) +{ + return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); +} +template +inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const +{ + return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +EIGEN_DEVICE_FUNC +inline RowsBlockXpr topRows(Index n) +{ + return RowsBlockXpr(derived(), 0, 0, n, cols()); +} +EIGEN_DEVICE_FUNC +inline ConstRowsBlockXpr topRows(Index n) const +{ + return ConstRowsBlockXpr(derived(), 0, 0, n, cols()); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +template +EIGEN_DEVICE_FUNC +inline typename NRowsBlockXpr::Type topRows(Index n = N) +{ + return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const +{ + return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +EIGEN_DEVICE_FUNC +inline RowsBlockXpr bottomRows(Index n) +{ + return RowsBlockXpr(derived(), rows() - n, 0, n, cols()); +} +EIGEN_DEVICE_FUNC +inline ConstRowsBlockXpr bottomRows(Index n) const +{ + return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols()); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +template +EIGEN_DEVICE_FUNC +inline typename NRowsBlockXpr::Type bottomRows(Index n = N) +{ + return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const +{ + return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +EIGEN_DEVICE_FUNC +inline RowsBlockXpr middleRows(Index startRow, Index n) +{ + return RowsBlockXpr(derived(), startRow, 0, n, cols()); +} +EIGEN_DEVICE_FUNC +inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const +{ + return ConstRowsBlockXpr(derived(), startRow, 0, n, cols()); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +template +EIGEN_DEVICE_FUNC +inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) +{ + return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); +} +template 
+EIGEN_DEVICE_FUNC +inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const +{ + return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +EIGEN_DEVICE_FUNC +inline ColsBlockXpr leftCols(Index n) +{ + return ColsBlockXpr(derived(), 0, 0, rows(), n); +} +EIGEN_DEVICE_FUNC +inline ConstColsBlockXpr leftCols(Index n) const +{ + return ConstColsBlockXpr(derived(), 0, 0, rows(), n); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +template +EIGEN_DEVICE_FUNC +inline typename NColsBlockXpr::Type leftCols(Index n = N) +{ + return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const +{ + return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +EIGEN_DEVICE_FUNC +inline ColsBlockXpr rightCols(Index n) +{ + return ColsBlockXpr(derived(), 0, cols() - n, rows(), n); +} +EIGEN_DEVICE_FUNC +inline ConstColsBlockXpr rightCols(Index n) const +{ + return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +template +EIGEN_DEVICE_FUNC +inline typename NColsBlockXpr::Type rightCols(Index n = N) +{ + return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const +{ + return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +EIGEN_DEVICE_FUNC +inline ColsBlockXpr middleCols(Index startCol, Index numCols) +{ + return ColsBlockXpr(derived(), 0, startCol, rows(), numCols); +} +EIGEN_DEVICE_FUNC +inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const +{ + return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +template +EIGEN_DEVICE_FUNC +inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) +{ + return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const +{ + return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) +{ + return typename FixedBlockXpr::Type(derived(), startRow, startCol); +} +template +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const +{ + return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); +} +EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +template +inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, + Index blockRows, Index blockCols) +{ + return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); +} +template +inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, + Index blockRows, Index blockCols) const +{ + return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) +EIGEN_DEVICE_FUNC +inline ColXpr col(Index i) +{ + return ColXpr(derived(), i); +} +EIGEN_DEVICE_FUNC +inline ConstColXpr col(Index i) const +{ + return 
ConstColXpr(derived(), i); +} +EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) +EIGEN_DEVICE_FUNC +inline RowXpr row(Index i) +{ + return RowXpr(derived(), i); +} +EIGEN_DEVICE_FUNC +inline ConstRowXpr row(Index i) const +{ + return ConstRowXpr(derived(), i); +} +EIGEN_DEVICE_FUNC +inline SegmentReturnType segment(Index start, Index n) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return SegmentReturnType(derived(), start, n); +} +EIGEN_DEVICE_FUNC +inline ConstSegmentReturnType segment(Index start, Index n) const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return ConstSegmentReturnType(derived(), start, n); +} +EIGEN_DEVICE_FUNC +inline SegmentReturnType head(Index n) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return SegmentReturnType(derived(), 0, n); +} +EIGEN_DEVICE_FUNC +inline ConstSegmentReturnType head(Index n) const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return ConstSegmentReturnType(derived(), 0, n); +} +EIGEN_DEVICE_FUNC +inline SegmentReturnType tail(Index n) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return SegmentReturnType(derived(), this->size() - n, n); +} +EIGEN_DEVICE_FUNC +inline ConstSegmentReturnType tail(Index n) const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return ConstSegmentReturnType(derived(), this->size() - n, n); +} +template +EIGEN_DEVICE_FUNC +inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return typename FixedSegmentReturnType::Type(derived(), start, n); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return typename ConstFixedSegmentReturnType::Type(derived(), start, n); +} +template +EIGEN_DEVICE_FUNC +inline typename FixedSegmentReturnType::Type head(Index n = N) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return typename FixedSegmentReturnType::Type(derived(), 0, n); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); +} +template +EIGEN_DEVICE_FUNC +inline typename FixedSegmentReturnType::Type tail(Index n = N) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return typename FixedSegmentReturnType::Type(derived(), size() - n); +} +template +EIGEN_DEVICE_FUNC +inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); +} +// end # include "../plugins/BlockMethods.h" +# ifdef EIGEN_DENSEBASE_PLUGIN +# include EIGEN_DENSEBASE_PLUGIN +# endif +#undef EIGEN_CURRENT_STORAGE_BASE_CLASS +#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL +#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF + template + EIGEN_DEVICE_FUNC + inline void evalTo(Dest& ) const + { + EIGEN_STATIC_ASSERT((internal::is_same::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS); + } + protected: + EIGEN_DEVICE_FUNC DenseBase() + { +#ifdef EIGEN_INTERNAL_DEBUGGING + EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) + && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, int(!IsRowMajor))), + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION) +#endif + } + private: + EIGEN_DEVICE_FUNC explicit DenseBase(int); + EIGEN_DEVICE_FUNC DenseBase(int,int); + template 
EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase&); +}; +} +#endif +// end #include "src/Core/DenseBase.h" +// #include "src/Core/MatrixBase.h" +#ifndef EIGEN_MATRIXBASE_H +#define EIGEN_MATRIXBASE_H +namespace Eigen { +template class MatrixBase + : public DenseBase +{ + public: +#ifndef EIGEN_PARSED_BY_DOXYGEN + typedef MatrixBase StorageBaseType; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::StorageIndex StorageIndex; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef DenseBase Base; + using Base::RowsAtCompileTime; + using Base::ColsAtCompileTime; + using Base::SizeAtCompileTime; + using Base::MaxRowsAtCompileTime; + using Base::MaxColsAtCompileTime; + using Base::MaxSizeAtCompileTime; + using Base::IsVectorAtCompileTime; + using Base::Flags; + using Base::derived; + using Base::const_cast_derived; + using Base::rows; + using Base::cols; + using Base::size; + using Base::coeff; + using Base::coeffRef; + using Base::lazyAssign; + using Base::eval; + using Base::operator+=; + using Base::operator-=; + using Base::operator*=; + using Base::operator/=; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType; + typedef typename Base::RowXpr RowXpr; + typedef typename Base::ColXpr ColXpr; +#endif +#ifndef EIGEN_PARSED_BY_DOXYGEN + typedef Matrix SquareMatrixType; +#endif + EIGEN_DEVICE_FUNC + inline Index diagonalSize() const { return (numext::mini)(rows(),cols()); } + typedef typename Base::PlainObject PlainObject; +#ifndef EIGEN_PARSED_BY_DOXYGEN + typedef CwiseNullaryOp,PlainObject> ConstantReturnType; + typedef typename internal::conditional::IsComplex, + CwiseUnaryOp, ConstTransposeReturnType>, + ConstTransposeReturnType + >::type AdjointReturnType; + typedef Matrix, internal::traits::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType; + typedef CwiseNullaryOp,PlainObject> IdentityReturnType; + typedef Block, SquareMatrixType>, + internal::traits::RowsAtCompileTime, + internal::traits::ColsAtCompileTime> BasisReturnType; +#endif +#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase +#define EIGEN_DOC_UNARY_ADDONS(X,Y) +// # include "../plugins/CommonCwiseUnaryOps.h" +#ifndef EIGEN_PARSED_BY_DOXYGEN +typedef typename internal::conditional::IsComplex, + const CwiseUnaryOp, const Derived>, + const Derived& + >::type ConjugateReturnType; +typedef typename internal::conditional::IsComplex, + const CwiseUnaryOp, const Derived>, + const Derived& + >::type RealReturnType; +typedef typename internal::conditional::IsComplex, + CwiseUnaryView, Derived>, + Derived& + >::type NonConstRealReturnType; +typedef CwiseUnaryOp, const Derived> ImagReturnType; +typedef CwiseUnaryView, Derived> NonConstImagReturnType; +typedef CwiseUnaryOp, const Derived> NegativeReturnType; +#endif +EIGEN_DOC_UNARY_ADDONS(operator-,opposite) +EIGEN_DEVICE_FUNC +inline const NegativeReturnType +operator-() const { return NegativeReturnType(derived()); } +template struct CastXpr { typedef typename internal::cast_return_type, const Derived> >::type Type; }; +EIGEN_DOC_UNARY_ADDONS(cast,conversion function) +template +EIGEN_DEVICE_FUNC +typename CastXpr::Type +cast() const +{ + return typename CastXpr::Type(derived()); +} +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +EIGEN_DEVICE_FUNC +inline ConjugateReturnType +conjugate() const +{ + return 
ConjugateReturnType(derived()); +} +EIGEN_DOC_UNARY_ADDONS(real,real part function) +EIGEN_DEVICE_FUNC +inline RealReturnType +real() const { return RealReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function) +EIGEN_DEVICE_FUNC +inline const ImagReturnType +imag() const { return ImagReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function) +template +EIGEN_DEVICE_FUNC +inline const CwiseUnaryOp +unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const +{ + return CwiseUnaryOp(derived(), func); +} +EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function) +template +EIGEN_DEVICE_FUNC +inline const CwiseUnaryView +unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const +{ + return CwiseUnaryView(derived(), func); +} +EIGEN_DOC_UNARY_ADDONS(real,real part function) +EIGEN_DEVICE_FUNC +inline NonConstRealReturnType +real() { return NonConstRealReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function) +EIGEN_DEVICE_FUNC +inline NonConstImagReturnType +imag() { return NonConstImagReturnType(derived()); } +// end # include "../plugins/CommonCwiseUnaryOps.h" +// # include "../plugins/CommonCwiseBinaryOps.h" +EIGEN_MAKE_CWISE_BINARY_OP(operator-,difference) +EIGEN_MAKE_CWISE_BINARY_OP(operator+,sum) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp +binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other, const CustomBinaryOp& func = CustomBinaryOp()) const +{ + return CwiseBinaryOp(derived(), other.derived(), func); +} +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP(operator*,product) +#else +template +const CwiseBinaryOp,Derived,Constant > operator*(const T& scalar) const; +template friend +const CwiseBinaryOp,Constant,Derived> operator*(const T& scalar, const StorageBaseType& expr); +#endif +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/,quotient) +#else +template +const CwiseBinaryOp,Derived,Constant > operator/(const T& scalar) const; +#endif +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp +operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value && internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp(derived(),other.derived()); +} +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp +operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value && internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp(derived(),other.derived()); +} +// end # include "../plugins/CommonCwiseBinaryOps.h" +// # include "../plugins/MatrixCwiseUnaryOps.h" +typedef CwiseUnaryOp, const Derived> CwiseAbsReturnType; +typedef CwiseUnaryOp, const Derived> CwiseAbs2ReturnType; +typedef CwiseUnaryOp, const Derived> CwiseSqrtReturnType; +typedef CwiseUnaryOp, const Derived> CwiseSignReturnType; +typedef CwiseUnaryOp, const Derived> CwiseInverseReturnType; +EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value) +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseAbsReturnType +cwiseAbs() const { return CwiseAbsReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value) +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseAbs2ReturnType +cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root) +EIGEN_DEVICE_FUNC +inline const CwiseSqrtReturnType +cwiseSqrt() const { return CwiseSqrtReturnType(derived()); } 
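+// Usage sketch (illustrative only, not part of the Eigen sources): these
+// coefficient-wise helpers return lazy expressions that fuse into a single
+// loop when assigned, e.g.
+//   Eigen::Vector3d a(1.0, -4.0, 9.0);
+//   Eigen::Vector3d b = a.cwiseAbs().cwiseSqrt(); // (1, 2, 3)
+//   double s = a.cwiseAbs2().sum();               // 1 + 16 + 81 = 98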
+EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function) +EIGEN_DEVICE_FUNC +inline const CwiseSignReturnType +cwiseSign() const { return CwiseSignReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse) +EIGEN_DEVICE_FUNC +inline const CwiseInverseReturnType +cwiseInverse() const { return CwiseInverseReturnType(derived()); } +// end # include "../plugins/MatrixCwiseUnaryOps.h" +// # include "../plugins/MatrixCwiseBinaryOps.h" +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product) +cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMin(const Scalar &other) const +{ + return cwiseMin(Derived::Constant(rows(), cols(), other)); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMax(const Scalar &other) const +{ + return cwiseMax(Derived::Constant(rows(), cols(), other)); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +typedef CwiseBinaryOp, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType; +EIGEN_DEVICE_FUNC +inline const CwiseScalarEqualReturnType +cwiseEqual(const Scalar& s) const +{ + return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op()); +} +// end # include "../plugins/MatrixCwiseBinaryOps.h" +# ifdef EIGEN_MATRIXBASE_PLUGIN +# include EIGEN_MATRIXBASE_PLUGIN +# endif +#undef EIGEN_CURRENT_STORAGE_BASE_CLASS +#undef EIGEN_DOC_UNARY_ADDONS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator=(const MatrixBase& other); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator=(const DenseBase& other); + template + EIGEN_DEVICE_FUNC + Derived& operator=(const EigenBase& other); + template + EIGEN_DEVICE_FUNC + Derived& operator=(const ReturnByValue& other); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const MatrixBase& other); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const MatrixBase& other); +#ifdef __CUDACC__ + template + EIGEN_DEVICE_FUNC + const Product 
+ operator*(const MatrixBase &other) const + { return this->lazyProduct(other); } +#else + template + const Product + operator*(const MatrixBase &other) const; +#endif + template + EIGEN_DEVICE_FUNC + const Product + lazyProduct(const MatrixBase &other) const; + template + Derived& operator*=(const EigenBase& other); + template + void applyOnTheLeft(const EigenBase& other); + template + void applyOnTheRight(const EigenBase& other); + template + EIGEN_DEVICE_FUNC + const Product + operator*(const DiagonalBase &diagonal) const; + template + EIGEN_DEVICE_FUNC + typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType + dot(const MatrixBase& other) const; + EIGEN_DEVICE_FUNC RealScalar squaredNorm() const; + EIGEN_DEVICE_FUNC RealScalar norm() const; + RealScalar stableNorm() const; + RealScalar blueNorm() const; + RealScalar hypotNorm() const; + EIGEN_DEVICE_FUNC const PlainObject normalized() const; + EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const; + EIGEN_DEVICE_FUNC void normalize(); + EIGEN_DEVICE_FUNC void stableNormalize(); + EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const; + EIGEN_DEVICE_FUNC void adjointInPlace(); + typedef Diagonal DiagonalReturnType; + EIGEN_DEVICE_FUNC + DiagonalReturnType diagonal(); + typedef typename internal::add_const >::type ConstDiagonalReturnType; + EIGEN_DEVICE_FUNC + ConstDiagonalReturnType diagonal() const; + template struct DiagonalIndexReturnType { typedef Diagonal Type; }; + template struct ConstDiagonalIndexReturnType { typedef const Diagonal Type; }; + template + EIGEN_DEVICE_FUNC + typename DiagonalIndexReturnType::Type diagonal(); + template + EIGEN_DEVICE_FUNC + typename ConstDiagonalIndexReturnType::Type diagonal() const; + typedef Diagonal DiagonalDynamicIndexReturnType; + typedef typename internal::add_const >::type ConstDiagonalDynamicIndexReturnType; + EIGEN_DEVICE_FUNC + DiagonalDynamicIndexReturnType diagonal(Index index); + EIGEN_DEVICE_FUNC + ConstDiagonalDynamicIndexReturnType diagonal(Index index) const; + template struct TriangularViewReturnType { typedef TriangularView Type; }; + template struct ConstTriangularViewReturnType { typedef const TriangularView Type; }; + template + EIGEN_DEVICE_FUNC + typename TriangularViewReturnType::Type triangularView(); + template + EIGEN_DEVICE_FUNC + typename ConstTriangularViewReturnType::Type triangularView() const; + template struct SelfAdjointViewReturnType { typedef SelfAdjointView Type; }; + template struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView Type; }; + template + EIGEN_DEVICE_FUNC + typename SelfAdjointViewReturnType::Type selfadjointView(); + template + EIGEN_DEVICE_FUNC + typename ConstSelfAdjointViewReturnType::Type selfadjointView() const; + const SparseView sparseView(const Scalar& m_reference = Scalar(0), + const typename NumTraits::Real& m_epsilon = NumTraits::dummy_precision()) const; + EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(); + EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols); + EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i); + EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i); + EIGEN_DEVICE_FUNC static const BasisReturnType UnitX(); + EIGEN_DEVICE_FUNC static const BasisReturnType UnitY(); + EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ(); + EIGEN_DEVICE_FUNC static const BasisReturnType UnitW(); + EIGEN_DEVICE_FUNC + const DiagonalWrapper asDiagonal() const; + const PermutationWrapper asPermutation() 
const; + EIGEN_DEVICE_FUNC + Derived& setIdentity(); + EIGEN_DEVICE_FUNC + Derived& setIdentity(Index rows, Index cols); + bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; + bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; + bool isUpperTriangular(const RealScalar& prec = NumTraits::dummy_precision()) const; + bool isLowerTriangular(const RealScalar& prec = NumTraits::dummy_precision()) const; + template + bool isOrthogonal(const MatrixBase& other, + const RealScalar& prec = NumTraits::dummy_precision()) const; + bool isUnitary(const RealScalar& prec = NumTraits::dummy_precision()) const; + template + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase& other) const + { return cwiseEqual(other).all(); } + template + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const + { return cwiseNotEqual(other).any(); } + NoAlias noalias(); + inline const Derived& forceAlignedAccess() const { return derived(); } + inline Derived& forceAlignedAccess() { return derived(); } + template inline const Derived& forceAlignedAccessIf() const { return derived(); } + template inline Derived& forceAlignedAccessIf() { return derived(); } + EIGEN_DEVICE_FUNC Scalar trace() const; + template EIGEN_DEVICE_FUNC RealScalar lpNorm() const; + EIGEN_DEVICE_FUNC MatrixBase& matrix() { return *this; } + EIGEN_DEVICE_FUNC const MatrixBase& matrix() const { return *this; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper array() { return ArrayWrapper(derived()); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper array() const { return ArrayWrapper(derived()); } + inline const FullPivLU fullPivLu() const; + inline const PartialPivLU partialPivLu() const; + inline const PartialPivLU lu() const; + inline const Inverse inverse() const; + template + inline void computeInverseAndDetWithCheck( + ResultType& inverse, + typename ResultType::Scalar& determinant, + bool& invertible, + const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() + ) const; + template + inline void computeInverseWithCheck( + ResultType& inverse, + bool& invertible, + const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() + ) const; + Scalar determinant() const; + inline const LLT llt() const; + inline const LDLT ldlt() const; + inline const HouseholderQR householderQr() const; + inline const ColPivHouseholderQR colPivHouseholderQr() const; + inline const FullPivHouseholderQR fullPivHouseholderQr() const; + inline const CompleteOrthogonalDecomposition completeOrthogonalDecomposition() const; + inline EigenvaluesReturnType eigenvalues() const; + inline RealScalar operatorNorm() const; + inline JacobiSVD jacobiSvd(unsigned int computationOptions = 0) const; + inline BDCSVD bdcSvd(unsigned int computationOptions = 0) const; + #ifndef EIGEN_PARSED_BY_DOXYGEN + template struct cross_product_return_type { + typedef typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType Scalar; + typedef Matrix type; + }; + #endif + template + EIGEN_DEVICE_FUNC +#ifndef EIGEN_PARSED_BY_DOXYGEN + inline typename cross_product_return_type::type +#else + inline PlainObject +#endif + cross(const MatrixBase& other) const; + template + EIGEN_DEVICE_FUNC + inline PlainObject cross3(const MatrixBase& other) const; + EIGEN_DEVICE_FUNC + inline PlainObject unitOrthogonal(void) const; + EIGEN_DEVICE_FUNC + inline Matrix eulerAngles(Index a0, Index a1, Index a2) const; + enum { HomogeneousReturnTypeDirection = 
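+// Editor's note: illustrative sketch only. The declarations above wire the
+// dense decompositions (LU, LLT, QR, SVD, ...) into MatrixBase; a typical
+// solve path (helper name is hypothetical):
+#if 0
+#include <Eigen/Dense>
+Eigen::VectorXd solve_spd(const Eigen::MatrixXd& A, const Eigen::VectorXd& b)
+{
+  // llt() suits symmetric positive definite A; swap in partialPivLu() or
+  // colPivHouseholderQr() for general or rank-deficient systems.
+  return A.llt().solve(b);
+}
+#endif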
ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical) + : ColsAtCompileTime==1 ? Vertical : Horizontal }; + typedef Homogeneous HomogeneousReturnType; + EIGEN_DEVICE_FUNC + inline HomogeneousReturnType homogeneous() const; + enum { + SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1 + }; + typedef Block::ColsAtCompileTime==1 ? SizeMinusOne : 1, + internal::traits::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne; + typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne,Scalar,quotient) HNormalizedReturnType; + EIGEN_DEVICE_FUNC + inline const HNormalizedReturnType hnormalized() const; + void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); + template + void makeHouseholder(EssentialPart& essential, + Scalar& tau, RealScalar& beta) const; + template + void applyHouseholderOnTheLeft(const EssentialPart& essential, + const Scalar& tau, + Scalar* workspace); + template + void applyHouseholderOnTheRight(const EssentialPart& essential, + const Scalar& tau, + Scalar* workspace); + template + void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); + template + void applyOnTheRight(Index p, Index q, const JacobiRotation& j); + template + EIGEN_STRONG_INLINE const typename SparseMatrixBase::template CwiseProductDenseReturnType::Type + cwiseProduct(const SparseMatrixBase &other) const + { + return other.cwiseProduct(derived()); + } + typedef typename internal::stem_function::type StemFunction; + const MatrixExponentialReturnValue exp() const; + const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; + const MatrixFunctionReturnValue cosh() const; + const MatrixFunctionReturnValue sinh() const; + const MatrixFunctionReturnValue cos() const; + const MatrixFunctionReturnValue sin() const; + const MatrixSquareRootReturnValue sqrt() const; + const MatrixLogarithmReturnValue log() const; + const MatrixPowerReturnValue pow(const RealScalar& p) const; + const MatrixComplexPowerReturnValue pow(const std::complex& p) const; + protected: + EIGEN_DEVICE_FUNC MatrixBase() : Base() {} + private: + EIGEN_DEVICE_FUNC explicit MatrixBase(int); + EIGEN_DEVICE_FUNC MatrixBase(int,int); + template EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase&); + protected: + template Derived& operator+=(const ArrayBase& ) + {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;} + template Derived& operator-=(const ArrayBase& ) + {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;} +}; +template +template +inline Derived& +MatrixBase::operator*=(const EigenBase &other) +{ + other.derived().applyThisOnTheRight(derived()); + return derived(); +} +template +template +inline void MatrixBase::applyOnTheRight(const EigenBase &other) +{ + other.derived().applyThisOnTheRight(derived()); +} +template +template +inline void MatrixBase::applyOnTheLeft(const EigenBase &other) +{ + other.derived().applyThisOnTheLeft(derived()); +} +} +#endif +// end #include "src/Core/MatrixBase.h" +// #include "src/Core/EigenBase.h" +#ifndef EIGEN_EIGENBASE_H +#define EIGEN_EIGENBASE_H +namespace Eigen { +template struct EigenBase +{ + typedef Eigen::Index Index; + typedef typename internal::traits::StorageKind StorageKind; + EIGEN_DEVICE_FUNC + Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + const Derived& derived() const { return 
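+// Editor's note: illustrative sketch only. EigenBase is the root of the CRTP
+// hierarchy: derived() recovers the static type, so generic code dispatches at
+// compile time with no virtual calls (the helper below is hypothetical):
+#if 0
+#include <Eigen/Dense>
+template<typename Derived>
+void double_in_place(Eigen::EigenBase<Derived>& x)
+{
+  x.derived() *= 2.0;   // derived() yields the concrete matrix type
+}
+#endif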
+*static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC
+  inline Derived& const_cast_derived() const
+  { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
+  EIGEN_DEVICE_FUNC
+  inline const Derived& const_derived() const
+  { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC
+  inline Index rows() const { return derived().rows(); }
+  EIGEN_DEVICE_FUNC
+  inline Index cols() const { return derived().cols(); }
+  EIGEN_DEVICE_FUNC
+  inline Index size() const { return rows() * cols(); }
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void evalTo(Dest& dst) const
+  { derived().evalTo(dst); }
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void addTo(Dest& dst) const
+  {
+    typename Dest::PlainObject res(rows(),cols());
+    evalTo(res);
+    dst += res;
+  }
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void subTo(Dest& dst) const
+  {
+    typename Dest::PlainObject res(rows(),cols());
+    evalTo(res);
+    dst -= res;
+  }
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
+  {
+    dst = dst * this->derived();
+  }
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
+  {
+    dst = this->derived() * dst;
+  }
+};
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived());
+  return derived();
+}
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
+Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
+{
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
+  return derived();
+}
+}
+#endif
+// end #include "src/Core/EigenBase.h"
+// #include "src/Core/Product.h"
+#ifndef EIGEN_PRODUCT_H
+#define EIGEN_PRODUCT_H
+namespace Eigen {
+template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
+namespace internal {
+template<typename Lhs, typename Rhs, int Option>
+struct traits<Product<Lhs, Rhs, Option> >
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename remove_all<Rhs>::type RhsCleaned;
+  typedef traits<LhsCleaned> LhsTraits;
+  typedef traits<RhsCleaned> RhsTraits;
+  typedef MatrixXpr XprKind;
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar, typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
+  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
+                                                typename RhsTraits::StorageKind,
+                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
+  typedef typename promote_index_type<typename LhsTraits::StorageIndex,
+                                      typename RhsTraits::StorageIndex>::type StorageIndex;
+  enum {
+    RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
+    ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
+    Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
+          : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+          : (   ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
+             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) ) ?
+                RowMajorBit
+          : NoPreferredStorageOrderBit
+  };
+};
+}
+template<typename _Lhs, typename _Rhs, int Option>
+class Product : public ProductImpl<_Lhs,_Rhs,Option,
+                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
+                                                                                   typename internal::traits<_Rhs>::StorageKind,
+                                                                                   internal::product_type<_Lhs,_Rhs>::ret>::ret>
+{
+  public:
+    typedef _Lhs Lhs;
+    typedef _Rhs Rhs;
+    typedef typename ProductImpl<
+        Lhs, Rhs, Option,
+        typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                                        typename internal::traits<Rhs>::StorageKind,
+                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+    typedef typename internal::ref_selector<Lhs>::type LhsNested;
+    typedef typename internal::ref_selector<Rhs>::type RhsNested;
+    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
+  protected:
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+namespace internal {
+template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
+class dense_product_base
+  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{};
+template<typename Lhs, typename Rhs, int Option>
+class dense_product_base<Lhs, Rhs, Option, InnerProduct>
+  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{
+  typedef Product<Lhs,Rhs,Option> ProductXpr;
+  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
+public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+  operator const Scalar() const
+  {
+    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
+  }
+};
+}
+template<typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
+};
+template<typename Lhs, typename Rhs, int Option>
+class ProductImpl<Lhs,Rhs,Option,Dense>
+  : public internal::dense_product_base<Lhs,Rhs,Option>
+{
+    typedef Product<Lhs, Rhs, Option> Derived;
+  public:
+    typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  protected:
+    enum {
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
+                   (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
+      EnableCoeff = IsOneByOne || Option==LazyProduct
+    };
+  public:
+    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
+    }
+    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+      return internal::evaluator<Derived>(derived()).coeff(i);
+    }
+};
+}
+#endif
+// end #include "src/Core/Product.h"
+// #include "src/Core/CoreEvaluators.h"
+#ifndef EIGEN_COREEVALUATORS_H
+#define EIGEN_COREEVALUATORS_H
+namespace Eigen {
+namespace internal {
+template<typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
+};
+template<typename StorageKind> struct storage_kind_to_shape;
+template<> struct storage_kind_to_shape<Dense>         { typedef DenseShape Shape; };
+template<> struct storage_kind_to_shape<SolverStorage> { typedef SolverShape Shape; };
+template<> struct
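+// Editor's note: illustrative sketch only. Product<Lhs,Rhs,Option> defined
+// above is a lazy expression: operator* builds it, assignment evaluates it.
+#if 0
+#include <Eigen/Dense>
+void product_demo()
+{
+  Eigen::MatrixXd a(4,4), b(4,4), c(4,4);
+  a.setRandom(); b.setRandom();
+  c.noalias() = a * b;               // evaluate straight into c, no temporary
+  c = a.lazyProduct(b);              // LazyProduct option: coefficient-wise evaluation
+  double c00 = a.row(0) * b.col(0);  // 1x1 inner product: dense_product_base's
+  (void)c00;                         // scalar conversion above kicks in
+}
+#endif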
storage_kind_to_shape { typedef PermutationShape Shape; }; +template<> struct storage_kind_to_shape { typedef TranspositionsShape Shape; }; +template< typename T, + typename Arg1Kind = typename evaluator_traits::Kind, + typename Arg2Kind = typename evaluator_traits::Kind, + typename Arg3Kind = typename evaluator_traits::Kind, + typename Arg1Scalar = typename traits::Scalar, + typename Arg2Scalar = typename traits::Scalar, + typename Arg3Scalar = typename traits::Scalar> struct ternary_evaluator; +template< typename T, + typename LhsKind = typename evaluator_traits::Kind, + typename RhsKind = typename evaluator_traits::Kind, + typename LhsScalar = typename traits::Scalar, + typename RhsScalar = typename traits::Scalar> struct binary_evaluator; +template< typename T, + typename Kind = typename evaluator_traits::Kind, + typename Scalar = typename T::Scalar> struct unary_evaluator; +template +struct evaluator_traits_base +{ + typedef typename storage_kind_to_evaluator_kind::StorageKind>::Kind Kind; + typedef typename storage_kind_to_shape::StorageKind>::Shape Shape; +}; +template +struct evaluator_traits : public evaluator_traits_base +{ +}; +template::Shape > +struct evaluator_assume_aliasing { + static const bool value = false; +}; +template +struct evaluator : public unary_evaluator +{ + typedef unary_evaluator Base; + EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {} +}; +template +struct evaluator + : evaluator +{ + EIGEN_DEVICE_FUNC + explicit evaluator(const T& xpr) : evaluator(xpr) {} +}; +template +struct evaluator_base : public noncopyable +{ + typedef traits ExpressionTraits; + enum { + Alignment = 0 + }; +}; +template +struct evaluator > + : evaluator_base +{ + typedef PlainObjectBase PlainObjectType; + typedef typename PlainObjectType::Scalar Scalar; + typedef typename PlainObjectType::CoeffReturnType CoeffReturnType; + enum { + IsRowMajor = PlainObjectType::IsRowMajor, + IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime, + RowsAtCompileTime = PlainObjectType::RowsAtCompileTime, + ColsAtCompileTime = PlainObjectType::ColsAtCompileTime, + CoeffReadCost = NumTraits::ReadCost, + Flags = traits::EvaluatorFlags, + Alignment = traits::Alignment + }; + EIGEN_DEVICE_FUNC evaluator() + : m_data(0), + m_outerStride(IsVectorAtCompileTime ? 0 + : int(IsRowMajor) ? ColsAtCompileTime + : RowsAtCompileTime) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m) + : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 
0 : m.outerStride()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + if (IsRowMajor) + return m_data[row * m_outerStride.value() + col]; + else + return m_data[row + col * m_outerStride.value()]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_data[index]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + if (IsRowMajor) + return const_cast(m_data)[row * m_outerStride.value() + col]; + else + return const_cast(m_data)[row + col * m_outerStride.value()]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return const_cast(m_data)[index]; + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + if (IsRowMajor) + return ploadt(m_data + row * m_outerStride.value() + col); + else + return ploadt(m_data + row + col * m_outerStride.value()); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return ploadt(m_data + index); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index row, Index col, const PacketType& x) + { + if (IsRowMajor) + return pstoret + (const_cast(m_data) + row * m_outerStride.value() + col, x); + else + return pstoret + (const_cast(m_data) + row + col * m_outerStride.value(), x); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketType& x) + { + return pstoret(const_cast(m_data) + index, x); + } +protected: + const Scalar *m_data; + variable_if_dynamic m_outerStride; +}; +template +struct evaluator > + : evaluator > > +{ + typedef Matrix XprType; + EIGEN_DEVICE_FUNC evaluator() {} + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) + : evaluator >(m) + { } +}; +template +struct evaluator > + : evaluator > > +{ + typedef Array XprType; + EIGEN_DEVICE_FUNC evaluator() {} + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) + : evaluator >(m) + { } +}; +template +struct unary_evaluator, IndexBased> + : evaluator_base > +{ + typedef Transpose XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost, + Flags = evaluator::Flags ^ RowMajorBit, + Alignment = evaluator::Alignment + }; + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(col, row); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_argImpl.coeff(index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(col, row); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename XprType::Scalar& coeffRef(Index index) + { + return m_argImpl.coeffRef(index); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + return m_argImpl.template packet(col, row); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return m_argImpl.template packet(index); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index row, Index col, const PacketType& x) + { + m_argImpl.template writePacket(col, row, x); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketType& x) + { + m_argImpl.template writePacket(index, x); + } +protected: + 
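+// Editor's note: illustrative sketch only (internal API, shown to document the
+// mechanism). evaluator<Xpr> is the engine behind every expression; the
+// Transpose evaluator above merely swaps (row,col) before forwarding:
+#if 0
+#include <Eigen/Dense>
+double transpose_eval_demo()
+{
+  Eigen::Matrix2d m; m << 1, 2, 3, 4;
+  Eigen::Transpose<Eigen::Matrix2d> xpr = m.transpose();
+  Eigen::internal::evaluator<Eigen::Transpose<Eigen::Matrix2d> > ev(xpr);
+  return ev.coeff(0, 1);   // forwards to m.coeff(1, 0) == 3
+}
+#endif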
evaluator m_argImpl; +}; +template::value, + bool has_unary = has_unary_operator::value, + bool has_binary = has_binary_operator::value> +struct nullary_wrapper +{ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { return op(i,j); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { return op.template packetOp(i,j); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp(i); } +}; +template +struct nullary_wrapper +{ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType=0, IndexType=0) const { return op(); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType=0, IndexType=0) const { return op.template packetOp(); } +}; +template +struct nullary_wrapper +{ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j=0) const { return op(i,j); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j=0) const { return op.template packetOp(i,j); } +}; +template +struct nullary_wrapper +{ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { + eigen_assert(i==0 || j==0); + return op(i+j); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { + eigen_assert(i==0 || j==0); + return op.template packetOp(i+j); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { return op(i); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { return op.template packetOp(i); } +}; +template +struct nullary_wrapper {}; +#if 0 && EIGEN_COMP_MSVC>0 +template struct nullary_wrapper_workaround_msvc { + nullary_wrapper_workaround_msvc(const T&); + operator T()const; +}; +template +struct nullary_wrapper +{ + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const { + return nullary_wrapper >::value, + has_unary_operator >::value, + has_binary_operator >::value>().operator()(op,i,j); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const { + return nullary_wrapper >::value, + has_unary_operator >::value, + has_binary_operator >::value>().operator()(op,i); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const { + return nullary_wrapper >::value, + has_unary_operator >::value, + has_binary_operator >::value>().template packetOp(op,i,j); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const { + return nullary_wrapper >::value, + has_unary_operator >::value, + has_binary_operator >::value>().template packetOp(op,i); + } +}; +#endif +template +struct evaluator > + : evaluator_base > +{ + typedef CwiseNullaryOp XprType; + typedef typename internal::remove_all::type PlainObjectTypeCleaned; + enum { + CoeffReadCost = internal::functor_traits::Cost, + Flags = (evaluator::Flags + & ( HereditaryBits + | 
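+// Editor's note: illustrative sketch only. nullary_wrapper above dispatches on
+// whichever operator() a functor provides -- (), (i), or (i,j) -- which is what
+// lets NullaryExpr accept all three shapes (the functor below is hypothetical):
+#if 0
+#include <Eigen/Dense>
+struct Iota {
+  double operator()(Eigen::Index i) const { return double(i); }
+};
+Eigen::VectorXd iota(Eigen::Index n)
+{
+  return Eigen::VectorXd::NullaryExpr(n, Iota());  // unary path is selected
+}
+#endif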
(functor_has_linear_access::ret ? LinearAccessBit : 0) + | (functor_traits::PacketAccess ? PacketAccessBit : 0))) + | (functor_traits::IsRepeatable ? 0 : EvalBeforeNestingBit), + Alignment = AlignedMax + }; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n) + : m_functor(n.functor()), m_wrapper() + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::CoeffReturnType CoeffReturnType; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(IndexType row, IndexType col) const + { + return m_wrapper(m_functor, row, col); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(IndexType index) const + { + return m_wrapper(m_functor,index); + } + template + EIGEN_STRONG_INLINE + PacketType packet(IndexType row, IndexType col) const + { + return m_wrapper.template packetOp(m_functor, row, col); + } + template + EIGEN_STRONG_INLINE + PacketType packet(IndexType index) const + { + return m_wrapper.template packetOp(m_functor, index); + } +protected: + const NullaryOp m_functor; + const internal::nullary_wrapper m_wrapper; +}; +template +struct unary_evaluator, IndexBased > + : evaluator_base > +{ + typedef CwiseUnaryOp XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + Flags = evaluator::Flags + & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? PacketAccessBit : 0)), + Alignment = evaluator::Alignment + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit unary_evaluator(const XprType& op) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_functor(m_argImpl.coeff(row, col)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + return m_functor.packetOp(m_argImpl.template packet(row, col)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet(index)); + } +protected: + const UnaryOp m_functor; + evaluator m_argImpl; +}; +template +struct evaluator > + : public ternary_evaluator > +{ + typedef CwiseTernaryOp XprType; + typedef ternary_evaluator > Base; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} +}; +template +struct ternary_evaluator, IndexBased, IndexBased> + : evaluator_base > +{ + typedef CwiseTernaryOp XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + Arg1Flags = evaluator::Flags, + Arg2Flags = evaluator::Flags, + Arg3Flags = evaluator::Flags, + SameType = is_same::value && is_same::value, + StorageOrdersAgree = (int(Arg1Flags)&RowMajorBit)==(int(Arg2Flags)&RowMajorBit) && (int(Arg1Flags)&RowMajorBit)==(int(Arg3Flags)&RowMajorBit), + Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) & ( + HereditaryBits + | (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) & + ( (StorageOrdersAgree ? LinearAccessBit : 0) + | (functor_traits::PacketAccess && StorageOrdersAgree && SameType ? 
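+// Editor's note: illustrative sketch only. The CwiseUnaryOp evaluator above
+// just applies the functor on top of the nested evaluator's coeff()/packet():
+#if 0
+#include <Eigen/Dense>
+Eigen::ArrayXd unary_demo(const Eigen::ArrayXd& a)
+{
+  // Builds a CwiseUnaryOp expression; no loop runs until the result is assigned.
+  return a.abs();
+}
+#endif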
PacketAccessBit : 0) + ) + ) + ), + Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit), + Alignment = EIGEN_PLAIN_ENUM_MIN( + EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment), + evaluator::Alignment) + }; + EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + m_arg1Impl(xpr.arg1()), + m_arg2Impl(xpr.arg2()), + m_arg3Impl(xpr.arg3()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + return m_functor.packetOp(m_arg1Impl.template packet(row, col), + m_arg2Impl.template packet(row, col), + m_arg3Impl.template packet(row, col)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return m_functor.packetOp(m_arg1Impl.template packet(index), + m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } +protected: + const TernaryOp m_functor; + evaluator m_arg1Impl; + evaluator m_arg2Impl; + evaluator m_arg3Impl; +}; +template +struct evaluator > + : public binary_evaluator > +{ + typedef CwiseBinaryOp XprType; + typedef binary_evaluator > Base; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} +}; +template +struct binary_evaluator, IndexBased, IndexBased> + : evaluator_base > +{ + typedef CwiseBinaryOp XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + LhsFlags = evaluator::Flags, + RhsFlags = evaluator::Flags, + SameType = is_same::value, + StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit), + Flags0 = (int(LhsFlags) | int(RhsFlags)) & ( + HereditaryBits + | (int(LhsFlags) & int(RhsFlags) & + ( (StorageOrdersAgree ? LinearAccessBit : 0) + | (functor_traits::PacketAccess && StorageOrdersAgree && SameType ? 
PacketAccessBit : 0) + ) + ) + ), + Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit), + Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) + }; + EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + m_lhsImpl(xpr.lhs()), + m_rhsImpl(xpr.rhs()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + return m_functor.packetOp(m_lhsImpl.template packet(row, col), + m_rhsImpl.template packet(row, col)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return m_functor.packetOp(m_lhsImpl.template packet(index), + m_rhsImpl.template packet(index)); + } +protected: + const BinaryOp m_functor; + evaluator m_lhsImpl; + evaluator m_rhsImpl; +}; +template +struct unary_evaluator, IndexBased> + : evaluator_base > +{ + typedef CwiseUnaryView XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), + Alignment = 0 + }; + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) + : m_unaryOp(op.functor()), + m_argImpl(op.nestedExpression()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_unaryOp(m_argImpl.coeff(row, col)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_unaryOp(m_argImpl.coeff(index)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_unaryOp(m_argImpl.coeffRef(row, col)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return m_unaryOp(m_argImpl.coeffRef(index)); + } +protected: + const UnaryOp m_unaryOp; + evaluator m_argImpl; +}; +template +struct mapbase_evaluator; +template +struct mapbase_evaluator : evaluator_base +{ + typedef Derived XprType; + typedef typename XprType::PointerType PointerType; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + enum { + IsRowMajor = XprType::RowsAtCompileTime, + ColsAtCompileTime = XprType::ColsAtCompileTime, + CoeffReadCost = NumTraits::ReadCost + }; + EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map) + : m_data(const_cast(map.data())), + m_innerStride(map.innerStride()), + m_outerStride(map.outerStride()) + { + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator::Flags&PacketAccessBit, internal::inner_stride_at_compile_time::ret==1), + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_data[col * colStride() + row * rowStride()]; + } + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_data[index * m_innerStride.value()]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_data[col * colStride() + row * rowStride()]; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return m_data[index * m_innerStride.value()]; + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + return internal::ploadt(ptr); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return internal::ploadt(m_data + index * m_innerStride.value()); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index row, Index col, const PacketType& x) + { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + return internal::pstoret(ptr, x); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketType& x) + { + internal::pstoret(m_data + index * m_innerStride.value(), x); + } +protected: + EIGEN_DEVICE_FUNC + inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } + EIGEN_DEVICE_FUNC + inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } + PointerType m_data; + const internal::variable_if_dynamic m_innerStride; + const internal::variable_if_dynamic m_outerStride; +}; +template +struct evaluator > + : public mapbase_evaluator, PlainObjectType> +{ + typedef Map XprType; + typedef typename XprType::Scalar Scalar; + typedef typename packet_traits::type PacketScalar; + enum { + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? int(PlainObjectType::InnerStrideAtCompileTime) + : int(StrideType::InnerStrideAtCompileTime), + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? int(PlainObjectType::OuterStrideAtCompileTime) + : int(StrideType::OuterStrideAtCompileTime), + HasNoInnerStride = InnerStrideAtCompileTime == 1, + HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0, + HasNoStride = HasNoInnerStride && HasNoOuterStride, + IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, + PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit), + LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? 
~int(0) : ~int(LinearAccessBit), + Flags = int( evaluator::Flags) & (LinearAccessMask&PacketAccessMask), + Alignment = int(MapOptions)&int(AlignedMask) + }; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) + : mapbase_evaluator(map) + { } +}; +template +struct evaluator > + : public mapbase_evaluator, PlainObjectType> +{ + typedef Ref XprType; + enum { + Flags = evaluator >::Flags, + Alignment = evaluator >::Alignment + }; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref) + : mapbase_evaluator(ref) + { } +}; +template::ret> struct block_evaluator; +template +struct evaluator > + : block_evaluator +{ + typedef Block XprType; + typedef typename XprType::Scalar Scalar; + typedef typename packet_traits::type PacketScalar; + enum { + CoeffReadCost = evaluator::CoeffReadCost, + RowsAtCompileTime = traits::RowsAtCompileTime, + ColsAtCompileTime = traits::ColsAtCompileTime, + MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, + MaxColsAtCompileTime = traits::MaxColsAtCompileTime, + ArgTypeIsRowMajor = (int(evaluator::Flags)&RowMajorBit) != 0, + IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0 + : ArgTypeIsRowMajor, + HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor), + InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime), + InnerStrideAtCompileTime = HasSameStorageOrderAsArgType + ? int(inner_stride_at_compile_time::ret) + : int(outer_stride_at_compile_time::ret), + OuterStrideAtCompileTime = HasSameStorageOrderAsArgType + ? int(outer_stride_at_compile_time::ret) + : int(inner_stride_at_compile_time::ret), + MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, + FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, + FlagsRowMajorBit = XprType::Flags&RowMajorBit, + Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | + DirectAccessBit | + MaskPacketAccessBit), + Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, + PacketAlignment = unpacket_traits::alignment, + Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, + Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) + }; + typedef block_evaluator block_evaluator_type; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } +}; +template +struct block_evaluator + : unary_evaluator > +{ + typedef Block XprType; + EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) + : unary_evaluator(block) + {} +}; +template +struct unary_evaluator, IndexBased> + : evaluator_base > +{ + typedef Block XprType; + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) + : m_argImpl(block.nestedExpression()), + m_startRow(block.startRow()), + m_startCol(block.startCol()) + { } + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + enum { + RowsAtCompileTime = XprType::RowsAtCompileTime + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return coeff(RowsAtCompileTime == 1 ? 
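+// Editor's note: illustrative sketch only. The Map/Ref evaluators above read
+// through a raw pointer with the inner/outer strides captured at construction:
+#if 0
+#include <Eigen/Dense>
+double map_demo()
+{
+  double buf[6] = {0, 1, 2, 3, 4, 5};
+  // View the buffer as a 2x3 row-major matrix without copying it.
+  Eigen::Map<Eigen::Matrix<double, 2, 3, Eigen::RowMajor> > m(buf);
+  return m(1, 2);   // reads buf[1*3 + 2] == 5
+}
+#endif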
0 : index, RowsAtCompileTime == 1 ? index : 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + return m_argImpl.template packet(m_startRow.value() + row, m_startCol.value() + col); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return packet(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index row, Index col, const PacketType& x) + { + return m_argImpl.template writePacket(m_startRow.value() + row, m_startCol.value() + col, x); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketType& x) + { + return writePacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, + x); + } +protected: + evaluator m_argImpl; + const variable_if_dynamic m_startRow; + const variable_if_dynamic m_startCol; +}; +template +struct block_evaluator + : mapbase_evaluator, + typename Block::PlainObject> +{ + typedef Block XprType; + typedef typename XprType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) + : mapbase_evaluator(block) + { + eigen_assert(((internal::UIntPtr(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); + } +}; +template +struct evaluator > + : evaluator_base > +{ + typedef Select XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost + + EIGEN_PLAIN_ENUM_MAX(evaluator::CoeffReadCost, + evaluator::CoeffReadCost), + Flags = (unsigned int)evaluator::Flags & evaluator::Flags & HereditaryBits, + Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) + }; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) + : m_conditionImpl(select.conditionMatrix()), + m_thenImpl(select.thenMatrix()), + m_elseImpl(select.elseMatrix()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + if (m_conditionImpl.coeff(row, col)) + return m_thenImpl.coeff(row, col); + else + return m_elseImpl.coeff(row, col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + if (m_conditionImpl.coeff(index)) + return m_thenImpl.coeff(index); + else + return m_elseImpl.coeff(index); + } +protected: + evaluator m_conditionImpl; + evaluator m_thenImpl; + evaluator m_elseImpl; +}; +template +struct unary_evaluator > + : evaluator_base > +{ + typedef Replicate XprType; + typedef typename XprType::CoeffReturnType CoeffReturnType; + enum { + Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor + }; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + enum { + CoeffReadCost = evaluator::CoeffReadCost, + LinearAccessMask = XprType::IsVectorAtCompileTime ? 
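+// Editor's note: illustrative sketch only. The Select evaluator above chooses
+// between two branches per coefficient; the user-facing spelling:
+#if 0
+#include <Eigen/Dense>
+Eigen::ArrayXd clamp_negatives(const Eigen::ArrayXd& a)
+{
+  return (a < 0.0).select(0.0, a);  // condition.select(then, else)
+}
+#endif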
LinearAccessBit : 0, + Flags = (evaluator::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits::Flags & RowMajorBit), + Alignment = evaluator::Alignment + }; + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate) + : m_arg(replicate.nestedExpression()), + m_argImpl(m_arg), + m_rows(replicate.nestedExpression().rows()), + m_cols(replicate.nestedExpression().cols()) + {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + const Index actual_row = internal::traits::RowsAtCompileTime==1 ? 0 + : RowFactor==1 ? row + : row % m_rows.value(); + const Index actual_col = internal::traits::ColsAtCompileTime==1 ? 0 + : ColFactor==1 ? col + : col % m_cols.value(); + return m_argImpl.coeff(actual_row, actual_col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + const Index actual_index = internal::traits::RowsAtCompileTime==1 + ? (ColFactor==1 ? index : index%m_cols.value()) + : (RowFactor==1 ? index : index%m_rows.value()); + return m_argImpl.coeff(actual_index); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + const Index actual_row = internal::traits::RowsAtCompileTime==1 ? 0 + : RowFactor==1 ? row + : row % m_rows.value(); + const Index actual_col = internal::traits::ColsAtCompileTime==1 ? 0 + : ColFactor==1 ? col + : col % m_cols.value(); + return m_argImpl.template packet(actual_row, actual_col); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + const Index actual_index = internal::traits::RowsAtCompileTime==1 + ? (ColFactor==1 ? index : index%m_cols.value()) + : (RowFactor==1 ? index : index%m_rows.value()); + return m_argImpl.template packet(actual_index); + } +protected: + const ArgTypeNested m_arg; + evaluator m_argImpl; + const variable_if_dynamic m_rows; + const variable_if_dynamic m_cols; +}; +template< typename ArgType, typename MemberOp, int Direction> +struct evaluator > + : evaluator_base > +{ + typedef PartialReduxExpr XprType; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef typename ArgType::Scalar InputScalar; + typedef typename XprType::Scalar Scalar; + enum { + TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) + }; + typedef typename MemberOp::template Cost CostOpType; + enum { + CoeffReadCost = TraversalSize==Dynamic ? HugeCost + : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), + Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, + Alignment = 0 + }; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) + : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? 
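+// Editor's note: illustrative sketch only. Replicate maps an output index back
+// onto the source via modulo, and PartialReduxExpr above applies a functor per
+// row or column:
+#if 0
+#include <Eigen/Dense>
+void redux_demo()
+{
+  Eigen::MatrixXd m(2,3); m.setOnes();
+  Eigen::MatrixXd tiled = m.replicate(2, 2);    // 4x6; coeff(i,j) = m(i%2, j%3)
+  Eigen::RowVectorXd sums = m.colwise().sum();  // one redux coeff per column
+  (void)tiled; (void)sums;
+}
+#endif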
HugeCost : int(CostOpType::value)); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const + { + if (Direction==Vertical) + return m_functor(m_arg.col(j)); + else + return m_functor(m_arg.row(i)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const + { + if (Direction==Vertical) + return m_functor(m_arg.col(index)); + else + return m_functor(m_arg.row(index)); + } +protected: + typename internal::add_const_on_value_type::type m_arg; + const MemberOp m_functor; +}; +template +struct evaluator_wrapper_base + : evaluator_base +{ + typedef typename remove_all::type ArgType; + enum { + CoeffReadCost = evaluator::CoeffReadCost, + Flags = evaluator::Flags, + Alignment = evaluator::Alignment + }; + EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} + typedef typename ArgType::Scalar Scalar; + typedef typename ArgType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(row, col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_argImpl.coeff(index); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(row, col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return m_argImpl.coeffRef(index); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + return m_argImpl.template packet(row, col); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + return m_argImpl.template packet(index); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index row, Index col, const PacketType& x) + { + m_argImpl.template writePacket(row, col, x); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketType& x) + { + m_argImpl.template writePacket(index, x); + } +protected: + evaluator m_argImpl; +}; +template +struct unary_evaluator > + : evaluator_wrapper_base > +{ + typedef ArrayWrapper XprType; + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) + : evaluator_wrapper_base >(wrapper.nestedExpression()) + { } +}; +template struct reverse_packet_cond; +template +struct unary_evaluator > + : evaluator_base > +{ + typedef Reverse XprType; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + enum { + IsRowMajor = XprType::IsRowMajor, + IsColMajor = !IsRowMajor, + ReverseRow = (Direction == Vertical) || (Direction == BothDirections), + ReverseCol = (Direction == Horizontal) || (Direction == BothDirections), + ReversePacket = (Direction == BothDirections) + || ((Direction == Vertical) && IsColMajor) + || ((Direction == Horizontal) && IsRowMajor), + CoeffReadCost = evaluator::CoeffReadCost, + Flags0 = evaluator::Flags, + LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) ) + || ((ReverseRow && XprType::ColsAtCompileTime==1) || (ReverseCol && XprType::RowsAtCompileTime==1)) + ? LinearAccessBit : 0, + Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess), + Alignment = 0 + }; + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse) + : m_argImpl(reverse.nestedExpression()), + m_rows(ReverseRow ? 
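+// Editor's note: illustrative sketch only. evaluator_wrapper_base above
+// forwards everything to the wrapped expression, which is why switching
+// between matrix and array semantics is free:
+#if 0
+#include <Eigen/Dense>
+Eigen::MatrixXd plus_one(const Eigen::MatrixXd& m)
+{
+  return (m.array() + 1.0).matrix();  // ArrayWrapper in, matrix view out; no copy
+}
+#endif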
reverse.nestedExpression().rows() : 1), + m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index col) const + { + return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row, + ReverseCol ? m_cols.value() - col - 1 : col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index col) + { + return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row, + ReverseCol ? m_cols.value() - col - 1 : col); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index row, Index col) const + { + enum { + PacketSize = unpacket_traits::size, + OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1, + OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1 + }; + typedef internal::reverse_packet_cond reverse_packet; + return reverse_packet::run(m_argImpl.template packet( + ReverseRow ? m_rows.value() - row - OffsetRow : row, + ReverseCol ? m_cols.value() - col - OffsetCol : col)); + } + template + EIGEN_STRONG_INLINE + PacketType packet(Index index) const + { + enum { PacketSize = unpacket_traits::size }; + return preverse(m_argImpl.template packet(m_rows.value() * m_cols.value() - index - PacketSize)); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index row, Index col, const PacketType& x) + { + enum { + PacketSize = unpacket_traits::size, + OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1, + OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1 + }; + typedef internal::reverse_packet_cond reverse_packet; + m_argImpl.template writePacket( + ReverseRow ? m_rows.value() - row - OffsetRow : row, + ReverseCol ? 
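+// Editor's note: illustrative sketch only. The Reverse evaluator above remaps
+// (row,col) to (rows-1-row, cols-1-col) along the reversed direction(s):
+#if 0
+#include <Eigen/Dense>
+double reverse_demo()
+{
+  Eigen::Vector4d v(0, 1, 2, 3);
+  return v.reverse()(0);   // reads v(3) == 3
+}
+#endif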
m_cols.value() - col - OffsetCol : col, + reverse_packet::run(x)); + } + template + EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketType& x) + { + enum { PacketSize = unpacket_traits::size }; + m_argImpl.template writePacket + (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); + } +protected: + evaluator m_argImpl; + const variable_if_dynamic m_rows; + const variable_if_dynamic m_cols; +}; +template +struct evaluator > + : evaluator_base > +{ + typedef Diagonal XprType; + enum { + CoeffReadCost = evaluator::CoeffReadCost, + Flags = (unsigned int)(evaluator::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit, + Alignment = 0 + }; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal) + : m_argImpl(diagonal.nestedExpression()), + m_index(diagonal.index()) + { } + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index row, Index) const + { + return m_argImpl.coeff(row + rowOffset(), row + colOffset()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + CoeffReturnType coeff(Index index) const + { + return m_argImpl.coeff(index + rowOffset(), index + colOffset()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index row, Index) + { + return m_argImpl.coeffRef(row + rowOffset(), row + colOffset()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Scalar& coeffRef(Index index) + { + return m_argImpl.coeffRef(index + rowOffset(), index + colOffset()); + } +protected: + evaluator m_argImpl; + const internal::variable_if_dynamicindex m_index; +private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? 
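+// Editor's note: illustrative sketch only. The Diagonal evaluator above
+// addresses coefficient k of diagonal d as (k + rowOffset, k + colOffset), so
+// the main, super- and sub-diagonals share one code path:
+#if 0
+#include <Eigen/Dense>
+double diagonal_demo()
+{
+  Eigen::Matrix3d m = Eigen::Matrix3d::Zero();
+  m(0, 1) = 7.0;
+  return m.diagonal<1>()(0);  // d=1: rowOffset=0, colOffset=1 -> m(0,1) == 7
+}
+#endif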
m_index.value() : 0; } +}; +template class EvalToTemp; +template +struct traits > + : public traits +{ }; +template +class EvalToTemp + : public dense_xpr_base >::type +{ + public: + typedef typename dense_xpr_base::type Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp) + explicit EvalToTemp(const ArgType& arg) + : m_arg(arg) + { } + const ArgType& arg() const + { + return m_arg; + } + Index rows() const + { + return m_arg.rows(); + } + Index cols() const + { + return m_arg.cols(); + } + private: + const ArgType& m_arg; +}; +template +struct evaluator > + : public evaluator +{ + typedef EvalToTemp XprType; + typedef typename ArgType::PlainObject PlainObject; + typedef evaluator Base; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + : m_result(xpr.arg()) + { + ::new (static_cast(this)) Base(m_result); + } + EIGEN_DEVICE_FUNC evaluator(const ArgType& arg) + : m_result(arg) + { + ::new (static_cast(this)) Base(m_result); + } +protected: + PlainObject m_result; +}; +} +} +#endif +// end #include "src/Core/CoreEvaluators.h" +// #include "src/Core/AssignEvaluator.h" +#ifndef EIGEN_ASSIGN_EVALUATOR_H +#define EIGEN_ASSIGN_EVALUATOR_H +namespace Eigen { +namespace internal { +template +struct copy_using_evaluator_traits +{ + typedef typename DstEvaluator::XprType Dst; + typedef typename Dst::Scalar DstScalar; + enum { + DstFlags = DstEvaluator::Flags, + SrcFlags = SrcEvaluator::Flags + }; +public: + enum { + DstAlignment = DstEvaluator::Alignment, + SrcAlignment = SrcEvaluator::Alignment, + DstHasDirectAccess = DstFlags & DirectAccessBit, + JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) + }; +private: + enum { + InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime) + : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime) + : int(Dst::RowsAtCompileTime), + InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) + : int(DstFlags)&RowMajorBit ? 
int(Dst::MaxColsAtCompileTime) + : int(Dst::MaxRowsAtCompileTime), + OuterStride = int(outer_stride_at_compile_time::ret), + MaxSizeAtCompileTime = Dst::SizeAtCompileTime + }; + typedef typename find_best_packet::type LinearPacketType; + typedef typename find_best_packet::type InnerPacketType; + enum { + LinearPacketSize = unpacket_traits::size, + InnerPacketSize = unpacket_traits::size + }; +public: + enum { + LinearRequiredAlignment = unpacket_traits::alignment, + InnerRequiredAlignment = unpacket_traits::alignment + }; +private: + enum { + DstIsRowMajor = DstFlags&RowMajorBit, + SrcIsRowMajor = SrcFlags&RowMajorBit, + StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)), + MightVectorize = bool(StorageOrdersAgree) + && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) + && bool(functor_traits::PacketAccess), + MayInnerVectorize = MightVectorize + && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0 + && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 + && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), + MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), + MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), + MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) + && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize))) + }; +public: + enum { + Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) + : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) + : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) + : int(MayLinearize) ? int(LinearTraversal) + : int(DefaultTraversal), + Vectorized = int(Traversal) == InnerVectorizedTraversal + || int(Traversal) == LinearVectorizedTraversal + || int(Traversal) == SliceVectorizedTraversal + }; + typedef typename conditional::type PacketType; +private: + enum { + ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize + : Vectorized ? InnerPacketSize + : 1, + UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize, + MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic + && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit), + MayUnrollInner = int(InnerSize) != Dynamic + && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit) + }; +public: + enum { + Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal)) + ? ( + int(MayUnrollCompletely) ? int(CompleteUnrolling) + : int(MayUnrollInner) ? int(InnerUnrolling) + : int(NoUnrolling) + ) + : int(Traversal) == int(LinearVectorizedTraversal) + ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment))) + ? int(CompleteUnrolling) + : int(NoUnrolling) ) + : int(Traversal) == int(LinearTraversal) + ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) + : int(NoUnrolling) ) +#if EIGEN_UNALIGNED_VECTORIZE + : int(Traversal) == int(SliceVectorizedTraversal) + ? ( bool(MayUnrollInner) ? 
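+// Editor's note: simplified restatement (editor's assumption, not Eigen code)
+// of the decision encoded above: agreeing storage orders plus packet support
+// gate vectorization, inner-size divisibility selects inner vectorization,
+// and linear access alone still permits a flat 1-D loop:
+#if 0
+enum Traversal { Default, Linear, LinearVectorized, InnerVectorized };
+constexpr Traversal pick(bool ordersAgree, bool packetOk, bool linearOk,
+                         int innerSize, int packetSize)
+{
+  return (ordersAgree && packetOk && innerSize % packetSize == 0) ? InnerVectorized
+       : (ordersAgree && packetOk && linearOk)                    ? LinearVectorized
+       : linearOk                                                 ? Linear
+                                                                  : Default;
+}
+#endif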
int(InnerUnrolling) + : int(NoUnrolling) ) +#endif + : int(NoUnrolling) + }; +#ifdef EIGEN_DEBUG_ASSIGN + static void debug() + { + std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl; + std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl; + std::cerr.setf(std::ios::hex, std::ios::basefield); + std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl; + std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl; + std::cerr.unsetf(std::ios::hex); + EIGEN_DEBUG_VAR(DstAlignment) + EIGEN_DEBUG_VAR(SrcAlignment) + EIGEN_DEBUG_VAR(LinearRequiredAlignment) + EIGEN_DEBUG_VAR(InnerRequiredAlignment) + EIGEN_DEBUG_VAR(JointAlignment) + EIGEN_DEBUG_VAR(InnerSize) + EIGEN_DEBUG_VAR(InnerMaxSize) + EIGEN_DEBUG_VAR(LinearPacketSize) + EIGEN_DEBUG_VAR(InnerPacketSize) + EIGEN_DEBUG_VAR(ActualPacketSize) + EIGEN_DEBUG_VAR(StorageOrdersAgree) + EIGEN_DEBUG_VAR(MightVectorize) + EIGEN_DEBUG_VAR(MayLinearize) + EIGEN_DEBUG_VAR(MayInnerVectorize) + EIGEN_DEBUG_VAR(MayLinearVectorize) + EIGEN_DEBUG_VAR(MaySliceVectorize) + std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; + EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) + EIGEN_DEBUG_VAR(UnrollingLimit) + EIGEN_DEBUG_VAR(MayUnrollCompletely) + EIGEN_DEBUG_VAR(MayUnrollInner) + std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; + std::cerr << std::endl; + } +#endif +}; +template +struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling +{ + typedef typename Kernel::DstEvaluatorType DstEvaluatorType; + typedef typename DstEvaluatorType::XprType DstXprType; + enum { + outer = Index / DstXprType::InnerSizeAtCompileTime, + inner = Index % DstXprType::InnerSizeAtCompileTime + }; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + kernel.assignCoeffByOuterInner(outer, inner); + copy_using_evaluator_DefaultTraversal_CompleteUnrolling::run(kernel); + } +}; +template +struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } +}; +template +struct copy_using_evaluator_DefaultTraversal_InnerUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) + { + kernel.assignCoeffByOuterInner(outer, Index_); + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + } +}; +template +struct copy_using_evaluator_DefaultTraversal_InnerUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { } +}; +template +struct copy_using_evaluator_LinearTraversal_CompleteUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) + { + kernel.assignCoeff(Index); + copy_using_evaluator_LinearTraversal_CompleteUnrolling::run(kernel); + } +}; +template +struct copy_using_evaluator_LinearTraversal_CompleteUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } +}; +template +struct copy_using_evaluator_innervec_CompleteUnrolling +{ + typedef typename Kernel::DstEvaluatorType DstEvaluatorType; + typedef typename DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + enum { + outer = Index / DstXprType::InnerSizeAtCompileTime, + inner = Index % DstXprType::InnerSizeAtCompileTime, + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = 
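+// The *_CompleteUnrolling / *_InnerUnrolling helpers in this section unroll
+// loops by template recursion: each instantiation assigns one coefficient or
+// one packet at a compile-time-known index, then recurses with the index
+// advanced, until the Index==Stop specialization ends the chain with an
+// empty run(). A sketch of what a fully unrolled Vector4f copy expands to:
+//   kernel.assignCoeffByOuterInner(0,0);
+//   kernel.assignCoeffByOuterInner(0,1);
+//   kernel.assignCoeffByOuterInner(0,2);
+//   kernel.assignCoeffByOuterInner(0,3);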
Kernel::AssignmentTraits::DstAlignment + }; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + kernel.template assignPacketByOuterInner(outer, inner); + enum { NextIndex = Index + unpacket_traits::size }; + copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); + } +}; +template +struct copy_using_evaluator_innervec_CompleteUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } +}; +template +struct copy_using_evaluator_innervec_InnerUnrolling +{ + typedef typename Kernel::PacketType PacketType; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) + { + kernel.template assignPacketByOuterInner(outer, Index_); + enum { NextIndex = Index_ + unpacket_traits::size }; + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + } +}; +template +struct copy_using_evaluator_innervec_InnerUnrolling +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { } +}; +template +struct dense_assignment_loop; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE run(Kernel &kernel) + { + for(Index outer = 0; outer < kernel.outerSize(); ++outer) { + for(Index inner = 0; inner < kernel.innerSize(); ++inner) { + kernel.assignCoeffByOuterInner(outer, inner); + } + } + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + copy_using_evaluator_DefaultTraversal_CompleteUnrolling::run(kernel); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + const Index outerSize = kernel.outerSize(); + for(Index outer = 0; outer < outerSize; ++outer) + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + } +}; +template +struct unaligned_dense_assignment_loop +{ + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {} +}; +template <> +struct unaligned_dense_assignment_loop +{ +#if EIGEN_COMP_MSVC + template + static EIGEN_DONT_INLINE void run(Kernel &kernel, + Index start, + Index end) +#else + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, + Index start, + Index end) +#endif + { + for (Index index = start; index < end; ++index) + kernel.assignCoeff(index); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + const Index size = kernel.size(); + typedef typename Kernel::Scalar Scalar; + typedef typename Kernel::PacketType PacketType; + enum { + requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment, + packetSize = unpacket_traits::size, + dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), + dstAlignment = packet_traits::AlignedOnScalar ? int(requestedAlignment) + : int(Kernel::AssignmentTraits::DstAlignment), + srcAlignment = Kernel::AssignmentTraits::JointAlignment + }; + const Index alignedStart = dstIsAligned ? 
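+// Shape of a linearly vectorized run: scalar prologue [0, alignedStart),
+// packet body [alignedStart, alignedEnd), scalar epilogue [alignedEnd, size).
+// E.g. (sketch, 4-wide packets) copying 19 floats with alignedStart==1 does
+// 1 scalar store, then 4 packet stores covering indices 1..16, then 2 scalar
+// stores for indices 17..18.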
0 : internal::first_aligned(kernel.dstDataPtr(), size); + const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize; + unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); + for(Index index = alignedStart; index < alignedEnd; index += packetSize) + kernel.template assignPacket(index); + unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + enum { size = DstXprType::SizeAtCompileTime, + packetSize =unpacket_traits::size, + alignedSize = (size/packetSize)*packetSize }; + copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); + copy_using_evaluator_DefaultTraversal_CompleteUnrolling::run(kernel); + } +}; +template +struct dense_assignment_loop +{ + typedef typename Kernel::PacketType PacketType; + enum { + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = Kernel::AssignmentTraits::DstAlignment + }; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + const Index innerSize = kernel.innerSize(); + const Index outerSize = kernel.outerSize(); + const Index packetSize = unpacket_traits::size; + for(Index outer = 0; outer < outerSize; ++outer) + for(Index inner = 0; inner < innerSize; inner+=packetSize) + kernel.template assignPacketByOuterInner(outer, inner); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::AssignmentTraits Traits; + const Index outerSize = kernel.outerSize(); + for(Index outer = 0; outer < outerSize; ++outer) + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + const Index size = kernel.size(); + for(Index i = 0; i < size; ++i) + kernel.assignCoeff(i); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + copy_using_evaluator_LinearTraversal_CompleteUnrolling::run(kernel); + } +}; +template +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::Scalar Scalar; + typedef typename Kernel::PacketType PacketType; + enum { + packetSize = unpacket_traits::size, + requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment), + alignable = packet_traits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar), + dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment), + dstAlignment = alignable ? 
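+// Slice vectorization vectorizes each inner slice (e.g. each column of a
+// column-major matrix) independently: the aligned range is re-derived per
+// outer iteration via alignedStep, because the outer stride generally
+// changes the packet-alignment phase from one slice to the next. For
+// instance, a column-major float matrix with 7 rows shifts the 16-byte phase
+// of every column, so each column gets its own scalar head and tail.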
int(requestedAlignment) + : int(Kernel::AssignmentTraits::DstAlignment) + }; + const Scalar *dst_ptr = kernel.dstDataPtr(); + if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0) + { + return dense_assignment_loop::run(kernel); + } + const Index packetAlignedMask = packetSize - 1; + const Index innerSize = kernel.innerSize(); + const Index outerSize = kernel.outerSize(); + const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0; + Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize); + for(Index outer = 0; outer < outerSize; ++outer) + { + const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask); + for(Index inner = 0; inner(outer, inner); + for(Index inner = alignedEnd; inner +struct dense_assignment_loop +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) + { + typedef typename Kernel::DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + enum { size = DstXprType::InnerSizeAtCompileTime, + packetSize =unpacket_traits::size, + vectorizableSize = (size/packetSize)*packetSize }; + for(Index outer = 0; outer < kernel.outerSize(); ++outer) + { + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); + } + } +}; +#endif +template +class generic_dense_assignment_kernel +{ +protected: + typedef typename DstEvaluatorTypeT::XprType DstXprType; + typedef typename SrcEvaluatorTypeT::XprType SrcXprType; +public: + typedef DstEvaluatorTypeT DstEvaluatorType; + typedef SrcEvaluatorTypeT SrcEvaluatorType; + typedef typename DstEvaluatorType::Scalar Scalar; + typedef copy_using_evaluator_traits AssignmentTraits; + typedef typename AssignmentTraits::PacketType PacketType; + EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) + : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) + { + #ifdef EIGEN_DEBUG_ASSIGN + AssignmentTraits::debug(); + #endif + } + EIGEN_DEVICE_FUNC Index size() const { return m_dstExpr.size(); } + EIGEN_DEVICE_FUNC Index innerSize() const { return m_dstExpr.innerSize(); } + EIGEN_DEVICE_FUNC Index outerSize() const { return m_dstExpr.outerSize(); } + EIGEN_DEVICE_FUNC Index rows() const { return m_dstExpr.rows(); } + EIGEN_DEVICE_FUNC Index cols() const { return m_dstExpr.cols(); } + EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); } + EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; } + EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col) + { + m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index) + { + m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner) + { + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); + assignCoeff(row, col); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) + { + m_functor.template assignPacket(&m_dst.coeffRef(row,col), m_src.template packet(row,col)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
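+// generic_dense_assignment_kernel is the glue between the traversal loops
+// above and the two evaluators: the loops only call assignCoeff /
+// assignPacket, and the Functor defines what "assign" means. Dispatch
+// sketch, using the functor names defined elsewhere in this file:
+//   dst  = src   ->  assign_op      (a = b)
+//   dst += src   ->  add_assign_op  (a += b)
+//   dst -= src   ->  sub_assign_op  (a -= b)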
assignPacket(Index index) + { + m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) + { + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); + assignPacket(row, col); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) + { + typedef typename DstEvaluatorType::ExpressionTraits Traits; + return int(Traits::RowsAtCompileTime) == 1 ? 0 + : int(Traits::ColsAtCompileTime) == 1 ? inner + : int(DstEvaluatorType::Flags)&RowMajorBit ? outer + : inner; + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) + { + typedef typename DstEvaluatorType::ExpressionTraits Traits; + return int(Traits::ColsAtCompileTime) == 1 ? 0 + : int(Traits::RowsAtCompileTime) == 1 ? inner + : int(DstEvaluatorType::Flags)&RowMajorBit ? inner + : outer; + } + EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const + { + return m_dstExpr.data(); + } +protected: + DstEvaluatorType& m_dst; + const SrcEvaluatorType& m_src; + const Functor &m_functor; + DstXprType& m_dstExpr; +}; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &) +{ + EIGEN_ONLY_USED_FOR_DEBUG(dst); + EIGEN_ONLY_USED_FOR_DEBUG(src); + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op &) +{ + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols))) + dst.resize(dstRows, dstCols); + eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func) +{ + typedef evaluator DstEvaluatorType; + typedef evaluator SrcEvaluatorType; + SrcEvaluatorType srcEvaluator(src); + resize_if_allowed(dst, src, func); + DstEvaluatorType dstEvaluator(dst); + typedef generic_dense_assignment_kernel Kernel; + Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); + dense_assignment_loop::run(kernel); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) +{ + call_dense_assignment_loop(dst, src, internal::assign_op()); +} +template struct AssignmentKind; +struct Dense2Dense {}; +struct EigenBase2EigenBase {}; +template struct AssignmentKind { typedef EigenBase2EigenBase Kind; }; +template<> struct AssignmentKind { typedef Dense2Dense Kind; }; +template< typename DstXprType, typename SrcXprType, typename Functor, + typename Kind = typename AssignmentKind< typename evaluator_traits::Shape , typename evaluator_traits::Shape >::Kind, + typename EnableIf = void> +struct Assignment; +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src) +{ + call_assignment(dst, src, internal::assign_op()); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(const Dst& dst, const Src& src) +{ + call_assignment(dst, src, internal::assign_op()); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing::value, void*>::type = 0) +{ + typename 
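+// This overload is selected when the source expression declares, through
+// evaluator_assume_aliasing, that it must be assumed to alias the
+// destination (matrix products are the canonical case). The expression is
+// evaluated into the temporary created just below, then copied over.
+// User-visible effect (hypothetical matrices a, b, c):
+//   c = a * b;            // safe even if c overlaps a or b: temporary used
+//   c.noalias() = a * b;  // caller's promise; the NoAlias overload below
+//                         // writes directly into c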
plain_matrix_type::type tmp(src); + call_assignment_no_alias(dst, tmp, func); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if::value, void*>::type = 0) +{ + call_assignment_no_alias(dst, src, func); +} +template class StorageBase, typename Src, typename Func> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment(NoAlias& dst, const Src& src, const Func& func) +{ + call_assignment_no_alias(dst.expression(), src, func); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) +{ + enum { + NeedToTranspose = ( (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) + || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1) + ) && int(Dst::SizeAtCompileTime) != 1 + }; + typedef typename internal::conditional, Dst>::type ActualDstTypeCleaned; + typedef typename internal::conditional, Dst&>::type ActualDstType; + ActualDstType actualDst(dst); + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src) + EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar); + Assignment::run(actualDst, src, func); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias(Dst& dst, const Src& src) +{ + call_assignment_no_alias(dst, src, internal::assign_op()); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func) +{ + EIGEN_STATIC_ASSERT_LVALUE(Dst) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src) + EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar); + Assignment::run(dst, src, func); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) +{ + call_assignment_no_alias_no_transpose(dst, src, internal::assign_op()); +} +template void check_for_aliasing(const Dst &dst, const Src &src); +template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> +struct Assignment +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func) + { +#ifndef EIGEN_NO_DEBUG + internal::check_for_aliasing(dst, src); +#endif + call_dense_assignment_loop(dst, src, func); + } +}; +template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> +struct Assignment +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) + { + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) + dst.resize(dstRows, dstCols); + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + src.evalTo(dst); + } + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) + { + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) + dst.resize(dstRows, dstCols); + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + src.addTo(dst); + } + template + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) + { + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) + 
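+// This Assignment specialization (keyed on EigenBase2EigenBase) handles
+// expressions that write themselves into the destination rather than being
+// read coefficient-by-coefficient: plain assignment maps to src.evalTo(dst),
+// while the add/sub functors map to src.addTo(dst) and src.subTo(dst), as in
+// this run() just below, letting such expressions accumulate in place.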
dst.resize(dstRows, dstCols); + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + src.subTo(dst); + } +}; +} +} +#endif +// end #include "src/Core/AssignEvaluator.h" +#ifndef EIGEN_PARSED_BY_DOXYGEN +// #include "src/Core/Assign.h" +#ifndef EIGEN_ASSIGN_H +#define EIGEN_ASSIGN_H +namespace Eigen { +template +template +EIGEN_STRONG_INLINE Derived& DenseBase + ::lazyAssign(const DenseBase& other) +{ + enum{ + SameType = internal::is_same::value + }; + EIGEN_STATIC_ASSERT_LVALUE(Derived) + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived) + EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + eigen_assert(rows() == other.rows() && cols() == other.cols()); + internal::call_assignment_no_alias(derived(),other.derived()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE Derived& DenseBase::operator=(const DenseBase& other) +{ + internal::call_assignment(derived(), other.derived()); + return derived(); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE Derived& DenseBase::operator=(const DenseBase& other) +{ + internal::call_assignment(derived(), other.derived()); + return derived(); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE Derived& MatrixBase::operator=(const MatrixBase& other) +{ + internal::call_assignment(derived(), other.derived()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE Derived& MatrixBase::operator=(const DenseBase& other) +{ + internal::call_assignment(derived(), other.derived()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE Derived& MatrixBase::operator=(const EigenBase& other) +{ + internal::call_assignment(derived(), other.derived()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE Derived& MatrixBase::operator=(const ReturnByValue& other) +{ + other.derived().evalTo(derived()); + return derived(); +} +} +#endif +// end #include "src/Core/Assign.h" +#endif +// #include "src/Core/ArrayBase.h" +#ifndef EIGEN_ARRAYBASE_H +#define EIGEN_ARRAYBASE_H +namespace Eigen { +template class ArrayBase + : public DenseBase +{ + public: +#ifndef EIGEN_PARSED_BY_DOXYGEN + typedef ArrayBase StorageBaseType; + typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef DenseBase Base; + using Base::RowsAtCompileTime; + using Base::ColsAtCompileTime; + using Base::SizeAtCompileTime; + using Base::MaxRowsAtCompileTime; + using Base::MaxColsAtCompileTime; + using Base::MaxSizeAtCompileTime; + using Base::IsVectorAtCompileTime; + using Base::Flags; + using Base::derived; + using Base::const_cast_derived; + using Base::rows; + using Base::cols; + using Base::size; + using Base::coeff; + using Base::coeffRef; + using Base::lazyAssign; + using Base::operator=; + using Base::operator+=; + using Base::operator-=; + using Base::operator*=; + using Base::operator/=; + typedef typename Base::CoeffReturnType CoeffReturnType; +#endif +#ifndef EIGEN_PARSED_BY_DOXYGEN + typedef typename Base::PlainObject PlainObject; + typedef CwiseNullaryOp,PlainObject> ConstantReturnType; +#endif +#define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase +#define EIGEN_DOC_UNARY_ADDONS(X,Y) +// # include 
"../plugins/CommonCwiseUnaryOps.h" +#ifndef EIGEN_PARSED_BY_DOXYGEN +typedef typename internal::conditional::IsComplex, + const CwiseUnaryOp, const Derived>, + const Derived& + >::type ConjugateReturnType; +typedef typename internal::conditional::IsComplex, + const CwiseUnaryOp, const Derived>, + const Derived& + >::type RealReturnType; +typedef typename internal::conditional::IsComplex, + CwiseUnaryView, Derived>, + Derived& + >::type NonConstRealReturnType; +typedef CwiseUnaryOp, const Derived> ImagReturnType; +typedef CwiseUnaryView, Derived> NonConstImagReturnType; +typedef CwiseUnaryOp, const Derived> NegativeReturnType; +#endif +EIGEN_DOC_UNARY_ADDONS(operator-,opposite) +EIGEN_DEVICE_FUNC +inline const NegativeReturnType +operator-() const { return NegativeReturnType(derived()); } +template struct CastXpr { typedef typename internal::cast_return_type, const Derived> >::type Type; }; +EIGEN_DOC_UNARY_ADDONS(cast,conversion function) +template +EIGEN_DEVICE_FUNC +typename CastXpr::Type +cast() const +{ + return typename CastXpr::Type(derived()); +} +EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) +EIGEN_DEVICE_FUNC +inline ConjugateReturnType +conjugate() const +{ + return ConjugateReturnType(derived()); +} +EIGEN_DOC_UNARY_ADDONS(real,real part function) +EIGEN_DEVICE_FUNC +inline RealReturnType +real() const { return RealReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function) +EIGEN_DEVICE_FUNC +inline const ImagReturnType +imag() const { return ImagReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(unaryExpr,unary function) +template +EIGEN_DEVICE_FUNC +inline const CwiseUnaryOp +unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const +{ + return CwiseUnaryOp(derived(), func); +} +EIGEN_DOC_UNARY_ADDONS(unaryViewExpr,unary function) +template +EIGEN_DEVICE_FUNC +inline const CwiseUnaryView +unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const +{ + return CwiseUnaryView(derived(), func); +} +EIGEN_DOC_UNARY_ADDONS(real,real part function) +EIGEN_DEVICE_FUNC +inline NonConstRealReturnType +real() { return NonConstRealReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(imag,imaginary part function) +EIGEN_DEVICE_FUNC +inline NonConstImagReturnType +imag() { return NonConstImagReturnType(derived()); } +// end # include "../plugins/CommonCwiseUnaryOps.h" +// # include "../plugins/MatrixCwiseUnaryOps.h" +typedef CwiseUnaryOp, const Derived> CwiseAbsReturnType; +typedef CwiseUnaryOp, const Derived> CwiseAbs2ReturnType; +typedef CwiseUnaryOp, const Derived> CwiseSqrtReturnType; +typedef CwiseUnaryOp, const Derived> CwiseSignReturnType; +typedef CwiseUnaryOp, const Derived> CwiseInverseReturnType; +EIGEN_DOC_UNARY_ADDONS(cwiseAbs,absolute value) +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseAbsReturnType +cwiseAbs() const { return CwiseAbsReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseAbs2,squared absolute value) +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseAbs2ReturnType +cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseSqrt,square-root) +EIGEN_DEVICE_FUNC +inline const CwiseSqrtReturnType +cwiseSqrt() const { return CwiseSqrtReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseSign,sign function) +EIGEN_DEVICE_FUNC +inline const CwiseSignReturnType +cwiseSign() const { return CwiseSignReturnType(derived()); } +EIGEN_DOC_UNARY_ADDONS(cwiseInverse,inverse) +EIGEN_DEVICE_FUNC +inline const CwiseInverseReturnType +cwiseInverse() const { return CwiseInverseReturnType(derived()); } +// end # include 
"../plugins/MatrixCwiseUnaryOps.h" +// # include "../plugins/ArrayCwiseUnaryOps.h" +typedef CwiseUnaryOp, const Derived> AbsReturnType; +typedef CwiseUnaryOp, const Derived> ArgReturnType; +typedef CwiseUnaryOp, const Derived> Abs2ReturnType; +typedef CwiseUnaryOp, const Derived> SqrtReturnType; +typedef CwiseUnaryOp, const Derived> RsqrtReturnType; +typedef CwiseUnaryOp, const Derived> SignReturnType; +typedef CwiseUnaryOp, const Derived> InverseReturnType; +typedef CwiseUnaryOp, const Derived> BooleanNotReturnType; +typedef CwiseUnaryOp, const Derived> ExpReturnType; +typedef CwiseUnaryOp, const Derived> LogReturnType; +typedef CwiseUnaryOp, const Derived> Log1pReturnType; +typedef CwiseUnaryOp, const Derived> Log10ReturnType; +typedef CwiseUnaryOp, const Derived> CosReturnType; +typedef CwiseUnaryOp, const Derived> SinReturnType; +typedef CwiseUnaryOp, const Derived> TanReturnType; +typedef CwiseUnaryOp, const Derived> AcosReturnType; +typedef CwiseUnaryOp, const Derived> AsinReturnType; +typedef CwiseUnaryOp, const Derived> AtanReturnType; +typedef CwiseUnaryOp, const Derived> TanhReturnType; +typedef CwiseUnaryOp, const Derived> SinhReturnType; +typedef CwiseUnaryOp, const Derived> CoshReturnType; +typedef CwiseUnaryOp, const Derived> SquareReturnType; +typedef CwiseUnaryOp, const Derived> CubeReturnType; +typedef CwiseUnaryOp, const Derived> RoundReturnType; +typedef CwiseUnaryOp, const Derived> FloorReturnType; +typedef CwiseUnaryOp, const Derived> CeilReturnType; +typedef CwiseUnaryOp, const Derived> IsNaNReturnType; +typedef CwiseUnaryOp, const Derived> IsInfReturnType; +typedef CwiseUnaryOp, const Derived> IsFiniteReturnType; +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const AbsReturnType +abs() const +{ + return AbsReturnType(derived()); +} +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const ArgReturnType +arg() const +{ + return ArgReturnType(derived()); +} +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const Abs2ReturnType +abs2() const +{ + return Abs2ReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const ExpReturnType +exp() const +{ + return ExpReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const LogReturnType +log() const +{ + return LogReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const Log1pReturnType +log1p() const +{ + return Log1pReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const Log10ReturnType +log10() const +{ + return Log10ReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const SqrtReturnType +sqrt() const +{ + return SqrtReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const RsqrtReturnType +rsqrt() const +{ + return RsqrtReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const SignReturnType +sign() const +{ + return SignReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const CosReturnType +cos() const +{ + return CosReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const SinReturnType +sin() const +{ + return SinReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const TanReturnType +tan() const +{ + return TanReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const AtanReturnType +atan() const +{ + return AtanReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const AcosReturnType +acos() const +{ + return AcosReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const AsinReturnType +asin() const +{ + return AsinReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const TanhReturnType +tanh() const +{ + return TanhReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const SinhReturnType +sinh() const +{ + return 
SinhReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const CoshReturnType +cosh() const +{ + return CoshReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const InverseReturnType +inverse() const +{ + return InverseReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const SquareReturnType +square() const +{ + return SquareReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const CubeReturnType +cube() const +{ + return CubeReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const RoundReturnType +round() const +{ + return RoundReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const FloorReturnType +floor() const +{ + return FloorReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const CeilReturnType +ceil() const +{ + return CeilReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const IsNaNReturnType +isNaN() const +{ + return IsNaNReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const IsInfReturnType +isInf() const +{ + return IsInfReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const IsFiniteReturnType +isFinite() const +{ + return IsFiniteReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const BooleanNotReturnType +operator!() const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return BooleanNotReturnType(derived()); +} +typedef CwiseUnaryOp, const Derived> LgammaReturnType; +typedef CwiseUnaryOp, const Derived> DigammaReturnType; +typedef CwiseUnaryOp, const Derived> ErfReturnType; +typedef CwiseUnaryOp, const Derived> ErfcReturnType; +EIGEN_DEVICE_FUNC +inline const LgammaReturnType +lgamma() const +{ + return LgammaReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const DigammaReturnType +digamma() const +{ + return DigammaReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const ErfReturnType +erf() const +{ + return ErfReturnType(derived()); +} +EIGEN_DEVICE_FUNC +inline const ErfcReturnType +erfc() const +{ + return ErfcReturnType(derived()); +} +// end # include "../plugins/ArrayCwiseUnaryOps.h" +// # include "../plugins/CommonCwiseBinaryOps.h" +EIGEN_MAKE_CWISE_BINARY_OP(operator-,difference) +EIGEN_MAKE_CWISE_BINARY_OP(operator+,sum) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp +binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other, const CustomBinaryOp& func = CustomBinaryOp()) const +{ + return CwiseBinaryOp(derived(), other.derived(), func); +} +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP(operator*,product) +#else +template +const CwiseBinaryOp,Derived,Constant > operator*(const T& scalar) const; +template friend +const CwiseBinaryOp,Constant,Derived> operator*(const T& scalar, const StorageBaseType& expr); +#endif +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/,quotient) +#else +template +const CwiseBinaryOp,Derived,Constant > operator/(const T& scalar) const; +#endif +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp +operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value && internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp(derived(),other.derived()); +} +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp +operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value && internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp(derived(),other.derived()); +} +// end # include 
"../plugins/CommonCwiseBinaryOps.h" +// # include "../plugins/MatrixCwiseBinaryOps.h" +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product) +cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMin(const Scalar &other) const +{ + return cwiseMin(Derived::Constant(rows(), cols(), other)); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMax(const Scalar &other) const +{ + return cwiseMax(Derived::Constant(rows(), cols(), other)); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +typedef CwiseBinaryOp, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType; +EIGEN_DEVICE_FUNC +inline const CwiseScalarEqualReturnType +cwiseEqual(const Scalar& s) const +{ + return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op()); +} +// end # include "../plugins/MatrixCwiseBinaryOps.h" +// # include "../plugins/ArrayCwiseBinaryOps.h" +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product) +operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived,OtherDerived,product)(derived(), other.derived()); +} +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} +EIGEN_MAKE_CWISE_BINARY_OP(min,min) +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const Scalar &other) const +{ + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); +} +EIGEN_MAKE_CWISE_BINARY_OP(max,max) +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +max 
+#else +(max) +#endif +(const Scalar &other) const +{ + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); +} +EIGEN_MAKE_CWISE_BINARY_OP(pow,pow) +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(pow,pow) +#else +template +const CwiseBinaryOp,Derived,Constant > pow(const T& exponent) const; +#endif +#define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \ +template \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> \ +OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const \ +{ \ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); \ +}\ +typedef CwiseBinaryOp, const Derived, const CwiseNullaryOp, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \ +typedef CwiseBinaryOp, const CwiseNullaryOp, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \ +OP(const Scalar& s) const { \ + return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \ +} \ +EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \ +OP(const Scalar& s, const Derived& d) { \ + return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \ +} +#define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \ +template \ +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp, const OtherDerived, const Derived> \ +OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const \ +{ \ + return CwiseBinaryOp, const OtherDerived, const Derived>(other.derived(), derived()); \ +} \ +EIGEN_DEVICE_FUNC \ +inline const RCmp ## RCOMPARATOR ## ReturnType \ +OP(const Scalar& s) const { \ + return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this); \ +} \ +friend inline const Cmp ## RCOMPARATOR ## ReturnType \ +OP(const Scalar& s, const Derived& d) { \ + return d.R_OP(Derived::PlainObject::Constant(d.rows(), d.cols(), s)); \ +} +EIGEN_MAKE_CWISE_COMP_OP(operator<, LT) +EIGEN_MAKE_CWISE_COMP_OP(operator<=, LE) +EIGEN_MAKE_CWISE_COMP_R_OP(operator>, operator<, LT) +EIGEN_MAKE_CWISE_COMP_R_OP(operator>=, operator<=, LE) +EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ) +EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ) +#undef EIGEN_MAKE_CWISE_COMP_OP +#undef EIGEN_MAKE_CWISE_COMP_R_OP +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP(operator+,sum) +#else +template +const CwiseBinaryOp,Derived,Constant > operator+(const T& scalar) const; +template friend +const CwiseBinaryOp,Constant,Derived> operator+(const T& scalar, const StorageBaseType& expr); +#endif +#ifndef EIGEN_PARSED_BY_DOXYGEN +EIGEN_MAKE_SCALAR_BINARY_OP(operator-,difference) +#else +template +const CwiseBinaryOp,Derived,Constant > operator-(const T& scalar) const; +template friend +const CwiseBinaryOp,Constant,Derived> operator-(const T& scalar, const StorageBaseType& expr); +#endif +#ifndef EIGEN_PARSED_BY_DOXYGEN + EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(operator/,quotient) +#else + template friend + inline const CwiseBinaryOp,Constant,Derived> + operator/(const T& s,const StorageBaseType& a); +#endif +template +EIGEN_DEVICE_FUNC +inline const CwiseBinaryOp +operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + EIGEN_STATIC_ASSERT((internal::is_same::value && internal::is_same::value), + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL); + return CwiseBinaryOp(derived(),other.derived()); +} +#if 0 +template +inline const CwiseBinaryOp, const DerivedN, const Derived> +polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS &n) 
const +{ + return CwiseBinaryOp, const DerivedN, const Derived>(n.derived(), this->derived()); +} +#endif +template +inline const CwiseBinaryOp, const Derived, const DerivedQ> +zeta(const EIGEN_CURRENT_STORAGE_BASE_CLASS &q) const +{ + return CwiseBinaryOp, const Derived, const DerivedQ>(this->derived(), q.derived()); +} +// end # include "../plugins/ArrayCwiseBinaryOps.h" +# ifdef EIGEN_ARRAYBASE_PLUGIN +# include EIGEN_ARRAYBASE_PLUGIN +# endif +#undef EIGEN_CURRENT_STORAGE_BASE_CLASS +#undef EIGEN_DOC_UNARY_ADDONS + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator=(const ArrayBase& other) + { + internal::call_assignment(derived(), other.derived()); + return derived(); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator=(const Scalar &value) + { Base::setConstant(value); return derived(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const Scalar& scalar); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const Scalar& scalar); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const ArrayBase& other); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const ArrayBase& other); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const ArrayBase& other); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const ArrayBase& other); + public: + EIGEN_DEVICE_FUNC + ArrayBase& array() { return *this; } + EIGEN_DEVICE_FUNC + const ArrayBase& array() const { return *this; } + protected: + EIGEN_DEVICE_FUNC + ArrayBase() : Base() {} + private: + explicit ArrayBase(Index); + ArrayBase(Index,Index); + template explicit ArrayBase(const ArrayBase&); + protected: + template Derived& operator+=(const MatrixBase& ) + {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;} + template Derived& operator-=(const MatrixBase& ) + {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;} +}; +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & +ArrayBase::operator-=(const ArrayBase &other) +{ + call_assignment(derived(), other.derived(), internal::sub_assign_op()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & +ArrayBase::operator+=(const ArrayBase& other) +{ + call_assignment(derived(), other.derived(), internal::add_assign_op()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & +ArrayBase::operator*=(const ArrayBase& other) +{ + call_assignment(derived(), other.derived(), internal::mul_assign_op()); + return derived(); +} +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & +ArrayBase::operator/=(const ArrayBase& other) +{ + call_assignment(derived(), other.derived(), internal::div_assign_op()); + return derived(); +} +} +#endif +// end #include "src/Core/ArrayBase.h" +// #include "src/Core/util/BlasUtil.h" +#ifndef EIGEN_BLASUTIL_H +#define EIGEN_BLASUTIL_H +namespace Eigen { +namespace internal { +template +struct gebp_kernel; +template +struct gemm_pack_rhs; +template +struct gemm_pack_lhs; +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder> +struct general_matrix_matrix_product; +template +struct general_matrix_vector_product; +template struct conj_if; +template<> struct conj_if { + 
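+// conj_if<true> / conj_if<false> let the product kernels decide at compile
+// time whether an operand is conjugated, with no runtime branch; the
+// conj_helper specializations below then spell out complex
+// multiply-accumulate for every (ConjLhs, ConjRhs) pair. E.g. the
+// (false,true) case computes x * conj(y), expanded over real/imaginary parts
+// so it can also map onto packet operations.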
template + inline T operator()(const T& x) const { return numext::conj(x); } + template + inline T pconj(const T& x) const { return internal::pconj(x); } +}; +template<> struct conj_if { + template + inline const T& operator()(const T& x) const { return x; } + template + inline const T& pconj(const T& x) const { return x; } +}; +template +struct conj_helper +{ + typedef typename ScalarBinaryOpTraits::ReturnType Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const + { return conj_if()(x) * conj_if()(y); } +}; +template struct conj_helper +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } +}; +template struct conj_helper, std::complex, false,true> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const + { return c + pmul(x,y); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } +}; +template struct conj_helper, std::complex, true,false> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const + { return c + pmul(x,y); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } +}; +template struct conj_helper, std::complex, true,true> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const + { return c + pmul(x,y); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } +}; +template struct conj_helper, RealScalar, Conj,false> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const + { return conj_if()(x)*y; } +}; +template struct conj_helper, false,Conj> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const + { return padd(c, pmul(x,y)); } + EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const + { return x*conj_if()(y); } +}; +template struct get_factor { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } +}; +template struct get_factor::Real> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE typename NumTraits::Real run(const Scalar& x) { return numext::real(x); } +}; +template +class BlasVectorMapper { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + return m_data[i]; + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) 
const { + return ploadt(m_data + i); + } + template + EIGEN_DEVICE_FUNC bool aligned(Index i) const { + return (UIntPtr(m_data+i)%sizeof(Packet))==0; + } + protected: + Scalar* m_data; +}; +template +class BlasLinearMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i]; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt(m_data + i); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt(m_data + i); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { + pstoret(m_data + i, p); + } + protected: + Scalar *m_data; +}; +template +class blas_data_mapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + typedef BlasLinearMapper LinearMapper; + typedef BlasVectorMapper VectorMapper; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j)); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(&operator()(i, j)); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? 
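+// The mapper classes abstract raw storage for the product kernels: a pointer
+// plus a stride covers both layouts, since
+//   row-major:     element(i,j) = data[j + i*stride]
+//   column-major:  element(i,j) = data[i + j*stride]
+// and getSubMapper(i,j) simply re-bases the pointer so a kernel can address
+// a sub-block as if it started at (0,0).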
j + i*m_stride : i + j*m_stride]; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + pscatter(&operator()(i, j), p, m_stride); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC Index firstAligned(Index size) const { + if (UIntPtr(m_data)%sizeof(Scalar)) { + return -1; + } + return internal::first_default_aligned(m_data, size); + } + protected: + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; +}; +template +class const_blas_data_mapper : public blas_data_mapper { + public: + EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper(data, stride) {} + EIGEN_ALWAYS_INLINE const_blas_data_mapper getSubMapper(Index i, Index j) const { + return const_blas_data_mapper(&(this->operator()(i, j)), this->m_stride); + } +}; +template struct blas_traits +{ + typedef typename traits::Scalar Scalar; + typedef const XprType& ExtractType; + typedef XprType _ExtractType; + enum { + IsComplex = NumTraits::IsComplex, + IsTransposed = false, + NeedToConjugate = false, + HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit) + && ( bool(XprType::IsVectorAtCompileTime) + || int(inner_stride_at_compile_time::ret) == 1) + ) ? 1 : 0 + }; + typedef typename conditional::type DirectLinearAccessType; + static inline ExtractType extract(const XprType& x) { return x; } + static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } +}; +template +struct blas_traits, NestedXpr> > + : blas_traits +{ + typedef blas_traits Base; + typedef CwiseUnaryOp, NestedXpr> XprType; + typedef typename Base::ExtractType ExtractType; + enum { + IsComplex = NumTraits::IsComplex, + NeedToConjugate = Base::NeedToConjugate ? 
0 : IsComplex + }; + static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); } + static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); } +}; +template +struct blas_traits, const CwiseNullaryOp,Plain>, NestedXpr> > + : blas_traits +{ + typedef blas_traits Base; + typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; + typedef typename Base::ExtractType ExtractType; + static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline Scalar extractScalarFactor(const XprType& x) + { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } +}; +template +struct blas_traits, NestedXpr, const CwiseNullaryOp,Plain> > > + : blas_traits +{ + typedef blas_traits Base; + typedef CwiseBinaryOp, NestedXpr, const CwiseNullaryOp,Plain> > XprType; + typedef typename Base::ExtractType ExtractType; + static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); } + static inline Scalar extractScalarFactor(const XprType& x) + { return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other; } +}; +template +struct blas_traits, const CwiseNullaryOp,Plain1>, + const CwiseNullaryOp,Plain2> > > + : blas_traits,Plain1> > +{}; +template +struct blas_traits, NestedXpr> > + : blas_traits +{ + typedef blas_traits Base; + typedef CwiseUnaryOp, NestedXpr> XprType; + typedef typename Base::ExtractType ExtractType; + static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); } + static inline Scalar extractScalarFactor(const XprType& x) + { return - Base::extractScalarFactor(x.nestedExpression()); } +}; +template +struct blas_traits > + : blas_traits +{ + typedef typename NestedXpr::Scalar Scalar; + typedef blas_traits Base; + typedef Transpose XprType; + typedef Transpose ExtractType; + typedef Transpose _ExtractType; + typedef typename conditional::type DirectLinearAccessType; + enum { + IsTransposed = Base::IsTransposed ? 
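+// blas_traits recursively strips conjugate(), scalar factors and transposes
+// from an expression, accumulating the net scalar factor plus conjugation
+// and transposition flags. A product such as
+//   y = (alpha * a.adjoint()) * x;
+// (hypothetical complex-valued a, x, y and scalar alpha) can then be handed
+// to one gemv-style kernel as "data of a, scalar factor alpha, conjugate and
+// transpose flags set" instead of materializing the adjoint first.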
0 : 1 + }; + static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); } + static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); } +}; +template +struct blas_traits + : blas_traits +{}; +template::HasUsableDirectAccess> +struct extract_data_selector { + static const typename T::Scalar* run(const T& m) + { + return blas_traits::extract(m).data(); + } +}; +template +struct extract_data_selector { + static typename T::Scalar* run(const T&) { return 0; } +}; +template const typename T::Scalar* extract_data(const T& m) +{ + return extract_data_selector::run(m); +} +} +} +#endif +// end #include "src/Core/util/BlasUtil.h" +// #include "src/Core/DenseStorage.h" +#ifndef EIGEN_MATRIXSTORAGE_H +#define EIGEN_MATRIXSTORAGE_H +#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN; +#else + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) +#endif +namespace Eigen { +namespace internal { +struct constructor_without_unaligned_array_assert {}; +template +EIGEN_DEVICE_FUNC +void check_static_allocation_size() +{ + #if EIGEN_STACK_ALLOCATION_LIMIT + EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG); + #endif +} +template ::value > +struct plain_array +{ + T array[Size]; + EIGEN_DEVICE_FUNC + plain_array() + { + check_static_allocation_size(); + } + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; +#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) + #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) +#elif EIGEN_GNUC_AT_LEAST(4,7) + template + EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; } + #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ + eigen_assert((internal::UIntPtr(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \ + && "this assertion is explained here: " \ + "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ + " **** READ THIS WEB PAGE !!! ****"); +#else + #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ + eigen_assert((internal::UIntPtr(array) & (sizemask)) == 0 \ + && "this assertion is explained here: " \ + "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ + " **** READ THIS WEB PAGE !!! 
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
+    check_static_allocation_size<T,Size>();
+  }
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
+    check_static_allocation_size<T,Size>();
+  }
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
+  EIGEN_DEVICE_FUNC
+  plain_array()
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
+    check_static_allocation_size<T,Size>();
+  }
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
+  {
+    check_static_allocation_size<T,Size>();
+  }
+};
+template <typename T, int MatrixOrArrayOptions, int Alignment>
+struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
+{
+  T array[1];
+  EIGEN_DEVICE_FUNC plain_array() {}
+  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
+};
+}
+template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage;
+template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage
+{
+    internal::plain_array<T,Size,_Options> m_data;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+    EIGEN_DEVICE_FUNC
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()) {}
+    EIGEN_DEVICE_FUNC
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other) m_data = other.m_data;
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols);
+      EIGEN_UNUSED_VARIABLE(size);
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
+{
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
+    EIGEN_DEVICE_FUNC T *data() { return 0; }
+};
+template<typename T, int _Options> class DenseStorage<T, 0, Dynamic, Dynamic, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+template<typename T, int _Rows, int _Options> class DenseStorage<T, 0, _Rows, Dynamic, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic, _Cols, _Options>
+: public DenseStorage<T, 0, 0, 0, _Options> { };
+template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    Index m_rows;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    Index m_rows;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
+};
+template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
+{
+    internal::plain_array<T,Size,_Options> m_data;
+    Index m_cols;
+  public:
+    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
+      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_cols = other.m_cols;
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
+    EIGEN_DEVICE_FUNC
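+// Illustrative sketch (editorial): which DenseStorage specialization is selected
+// follows directly from the Matrix template arguments. Assuming <Eigen/Dense>:
+//
+//   Eigen::Matrix3f a;   // Size=9 known at compile time -> plain_array on the stack
+//   Eigen::MatrixXf b;   // fully dynamic -> heap pointer plus runtime m_rows/m_cols
+//   Eigen::Matrix<float,3,Eigen::Dynamic> c;  // fixed rows, dynamic cols -> only m_cols stored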
void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } + EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;} + EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + void conservativeResize(Index, Index, Index cols) { m_cols = cols; } + void resize(Index, Index, Index cols) { m_cols = cols; } + EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } + EIGEN_DEVICE_FUNC T *data() { return m_data.array; } +}; +template class DenseStorage +{ + T *m_data; + Index m_rows; + Index m_cols; + public: + EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {} + EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) + : m_data(0), m_rows(0), m_cols(0) {} + EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) + : m_data(internal::conditional_aligned_new_auto(size)), m_rows(rows), m_cols(cols) + { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); + } + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) + : m_data(internal::conditional_aligned_new_auto(other.m_rows*other.m_cols)) + , m_rows(other.m_rows) + , m_cols(other.m_cols) + { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*m_cols) + internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data); + } + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) + { + if (this != &other) + { + DenseStorage tmp(other); + this->swap(tmp); + } + return *this; + } +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT + : m_data(std::move(other.m_data)) + , m_rows(std::move(other.m_rows)) + , m_cols(std::move(other.m_cols)) + { + other.m_data = nullptr; + other.m_rows = 0; + other.m_cols = 0; + } + EIGEN_DEVICE_FUNC + DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT + { + using std::swap; + swap(m_data, other.m_data); + swap(m_rows, other.m_rows); + swap(m_cols, other.m_cols); + return *this; + } +#endif + EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto(m_data, m_rows*m_cols); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) + { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } + EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} + EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + void conservativeResize(Index size, Index rows, Index cols) + { + m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*m_cols); + m_rows = rows; + m_cols = cols; + } + EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols) + { + if(size != m_rows*m_cols) + { + internal::conditional_aligned_delete_auto(m_data, m_rows*m_cols); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_rows = rows; + m_cols = cols; + } + EIGEN_DEVICE_FUNC const T *data() const { return m_data; } + EIGEN_DEVICE_FUNC T *data() { return m_data; } +}; +template class DenseStorage +{ + T *m_data; + Index m_cols; + public: + EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {} + explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {} + EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto(size)), m_cols(cols) + { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + 
eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); + EIGEN_UNUSED_VARIABLE(rows); + } + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) + : m_data(internal::conditional_aligned_new_auto(_Rows*other.m_cols)) + , m_cols(other.m_cols) + { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows) + internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); + } + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) + { + if (this != &other) + { + DenseStorage tmp(other); + this->swap(tmp); + } + return *this; + } +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT + : m_data(std::move(other.m_data)) + , m_cols(std::move(other.m_cols)) + { + other.m_data = nullptr; + other.m_cols = 0; + } + EIGEN_DEVICE_FUNC + DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT + { + using std::swap; + swap(m_data, other.m_data); + swap(m_cols, other.m_cols); + return *this; + } +#endif + EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto(m_data, _Rows*m_cols); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } + EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} + EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} + EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) + { + m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, _Rows*m_cols); + m_cols = cols; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols) + { + if(size != _Rows*m_cols) + { + internal::conditional_aligned_delete_auto(m_data, _Rows*m_cols); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_cols = cols; + } + EIGEN_DEVICE_FUNC const T *data() const { return m_data; } + EIGEN_DEVICE_FUNC T *data() { return m_data; } +}; +template class DenseStorage +{ + T *m_data; + Index m_rows; + public: + EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {} + explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {} + EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto(size)), m_rows(rows) + { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); + EIGEN_UNUSED_VARIABLE(cols); + } + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) + : m_data(internal::conditional_aligned_new_auto(other.m_rows*_Cols)) + , m_rows(other.m_rows) + { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols) + internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); + } + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) + { + if (this != &other) + { + DenseStorage tmp(other); + this->swap(tmp); + } + return *this; + } +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT + : m_data(std::move(other.m_data)) + , m_rows(std::move(other.m_rows)) + { + other.m_data = nullptr; + other.m_rows = 0; + } + EIGEN_DEVICE_FUNC + DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT + { + using std::swap; + swap(m_data, other.m_data); + swap(m_rows, other.m_rows); + return *this; + } +#endif + EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto(m_data, _Cols*m_rows); } + 
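+// Illustrative sketch (editorial): the EIGEN_HAS_RVALUE_REFERENCES blocks above give the
+// heap-backed storages a cheap move that steals the pointer and empties the source:
+//
+//   Eigen::MatrixXd a = Eigen::MatrixXd::Random(1000, 1000);
+//   Eigen::MatrixXd b = std::move(a);   // pointer handoff, no element copy; a is left empty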
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } + EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} + EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} + void conservativeResize(Index size, Index rows, Index) + { + m_data = internal::conditional_aligned_realloc_new_auto(m_data, size, m_rows*_Cols); + m_rows = rows; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index) + { + if(size != m_rows*_Cols) + { + internal::conditional_aligned_delete_auto(m_data, _Cols*m_rows); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_rows = rows; + } + EIGEN_DEVICE_FUNC const T *data() const { return m_data; } + EIGEN_DEVICE_FUNC T *data() { return m_data; } +}; +} +#endif +// end #include "src/Core/DenseStorage.h" +// #include "src/Core/NoAlias.h" +#ifndef EIGEN_NOALIAS_H +#define EIGEN_NOALIAS_H +namespace Eigen { +template class StorageBase> +class NoAlias +{ + public: + typedef typename ExpressionType::Scalar Scalar; + explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase& other) + { + call_assignment_no_alias(m_expression, other.derived(), internal::assign_op()); + return m_expression; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase& other) + { + call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op()); + return m_expression; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase& other) + { + call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op()); + return m_expression; + } + EIGEN_DEVICE_FUNC + ExpressionType& expression() const + { + return m_expression; + } + protected: + ExpressionType& m_expression; +}; +template +NoAlias MatrixBase::noalias() +{ + return NoAlias(derived()); +} +} +#endif +// end #include "src/Core/NoAlias.h" +// #include "src/Core/PlainObjectBase.h" +#ifndef EIGEN_DENSESTORAGEBASE_H +#define EIGEN_DENSESTORAGEBASE_H +#if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO) +# define EIGEN_INITIALIZE_COEFFS +# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i::quiet_NaN(); +#else +# undef EIGEN_INITIALIZE_COEFFS +# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED +#endif +namespace Eigen { +namespace internal { +template struct check_rows_cols_for_overflow { + template + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE void run(Index, Index) + { + } +}; +template<> struct check_rows_cols_for_overflow { + template + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols) + { + Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; + bool error = (rows == 0 || cols == 0) ? 
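+// Illustrative sketch (editorial): NoAlias is what MatrixBase::noalias() returns; it
+// promises the destination does not overlap the right-hand side, so a product can be
+// evaluated directly into it without a temporary. Assuming <Eigen/Dense>:
+//
+//   Eigen::MatrixXd a(64,64), b(64,64), c(64,64);
+//   c.noalias() = a * b;   // safe: c is distinct from a and b, no temporary allocated
+//   a = a * b;             // without noalias(), Eigen evaluates into a temporary for safety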
false + : (rows > max_index / cols); + if (error) + throw_std_bad_alloc(); + } +}; +template +struct conservative_resize_like_impl; +template struct matrix_swap_impl; +} +#ifdef EIGEN_PARSED_BY_DOXYGEN +namespace doxygen { +template struct dense_xpr_base_dispatcher; +template +struct dense_xpr_base_dispatcher > + : public MatrixBase {}; +template +struct dense_xpr_base_dispatcher > + : public ArrayBase {}; +} +template +class PlainObjectBase : public doxygen::dense_xpr_base_dispatcher +#else +template +class PlainObjectBase : public internal::dense_xpr_base::type +#endif +{ + public: + enum { Options = internal::traits::Options }; + typedef typename internal::dense_xpr_base::type Base; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef Derived DenseType; + using Base::RowsAtCompileTime; + using Base::ColsAtCompileTime; + using Base::SizeAtCompileTime; + using Base::MaxRowsAtCompileTime; + using Base::MaxColsAtCompileTime; + using Base::MaxSizeAtCompileTime; + using Base::IsVectorAtCompileTime; + using Base::Flags; + template friend class Eigen::Map; + friend class Eigen::Map; + typedef Eigen::Map MapType; + friend class Eigen::Map; + typedef const Eigen::Map ConstMapType; +#if EIGEN_MAX_ALIGN_BYTES>0 + friend class Eigen::Map; + friend class Eigen::Map; +#endif + typedef Eigen::Map AlignedMapType; + typedef const Eigen::Map ConstAlignedMapType; + template struct StridedMapType { typedef Eigen::Map type; }; + template struct StridedConstMapType { typedef Eigen::Map type; }; + template struct StridedAlignedMapType { typedef Eigen::Map type; }; + template struct StridedConstAlignedMapType { typedef Eigen::Map type; }; + protected: + DenseStorage m_storage; + public: + enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits::Alignment>0) }; + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) + EIGEN_DEVICE_FUNC + Base& base() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + const Base& base() const { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const + { + if(Flags & RowMajorBit) + return m_storage.data()[colId + rowId * m_storage.cols()]; + else + return m_storage.data()[rowId + colId * m_storage.rows()]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + return m_storage.data()[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId) + { + if(Flags & RowMajorBit) + return m_storage.data()[colId + rowId * m_storage.cols()]; + else + return m_storage.data()[rowId + colId * m_storage.rows()]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_storage.data()[index]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const + { + if(Flags & RowMajorBit) + return m_storage.data()[colId + rowId * m_storage.cols()]; + else + return m_storage.data()[rowId + colId * m_storage.rows()]; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const + { + return m_storage.data()[index]; + } + template + EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) 
const + { + return internal::ploadt + (m_storage.data() + (Flags & RowMajorBit + ? colId + rowId * m_storage.cols() + : rowId + colId * m_storage.rows())); + } + template + EIGEN_STRONG_INLINE PacketScalar packet(Index index) const + { + return internal::ploadt(m_storage.data() + index); + } + template + EIGEN_STRONG_INLINE void writePacket(Index rowId, Index colId, const PacketScalar& val) + { + internal::pstoret + (m_storage.data() + (Flags & RowMajorBit + ? colId + rowId * m_storage.cols() + : rowId + colId * m_storage.rows()), val); + } + template + EIGEN_STRONG_INLINE void writePacket(Index index, const PacketScalar& val) + { + internal::pstoret(m_storage.data() + index, val); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const + { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() + { return m_storage.data(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void resize(Index rows, Index cols) + { + eigen_assert( EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime) + && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime) + && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime) + && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime) + && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array."); + internal::check_rows_cols_for_overflow::run(rows, cols); + #ifdef EIGEN_INITIALIZE_COEFFS + Index size = rows*cols; + bool size_changed = size != this->size(); + m_storage.resize(size, rows, cols); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #else + m_storage.resize(rows*cols, rows, cols); + #endif + } + EIGEN_DEVICE_FUNC + inline void resize(Index size) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase) + eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime==Dynamic || size<=MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && size>=0); + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + #endif + if(RowsAtCompileTime == 1) + m_storage.resize(size, 1, size); + else + m_storage.resize(size, size, 1); + #ifdef EIGEN_INITIALIZE_COEFFS + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + #endif + } + EIGEN_DEVICE_FUNC + inline void resize(NoChange_t, Index cols) + { + resize(rows(), cols); + } + EIGEN_DEVICE_FUNC + inline void resize(Index rows, NoChange_t) + { + resize(rows, cols()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void resizeLike(const EigenBase& _other) + { + const OtherDerived& other = _other.derived(); + internal::check_rows_cols_for_overflow::run(other.rows(), other.cols()); + const Index othersize = other.rows()*other.cols(); + if(RowsAtCompileTime == 1) + { + eigen_assert(other.rows() == 1 || other.cols() == 1); + resize(1, othersize); + } + else if(ColsAtCompileTime == 1) + { + eigen_assert(other.rows() == 1 || other.cols() == 1); + resize(othersize, 1); + } + else resize(other.rows(), other.cols()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols) + { + internal::conservative_resize_like_impl::run(*this, rows, cols); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t) + { + conservativeResize(rows, cols()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols) + { + conservativeResize(rows(), cols); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void 
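+// Illustrative sketch (editorial): the resize() overloads above reallocate only when the
+// total size changes, and coefficient values are not preserved in general (use
+// conservativeResize for that). Assuming <Eigen/Dense>:
+//
+//   Eigen::MatrixXd m(2,3);
+//   m.resize(3,2);                    // same total size 6: no reallocation
+//   m.resize(Eigen::NoChange, 5);     // keep the row count, change the column count
+//   Eigen::VectorXd v; v.resize(10);  // vector-only single-argument form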
conservativeResize(Index size) + { + internal::conservative_resize_like_impl::run(*this, size); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase& other) + { + internal::conservative_resize_like_impl::run(*this, other); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other) + { + return _set(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase& other) + { + _resize_to_match(other); + return Base::lazyAssign(other.derived()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue& func) + { + resize(func.rows(), func.cols()); + return Base::operator=(func); + } + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PlainObjectBase() : m_storage() + { + } +#ifndef EIGEN_PARSED_BY_DOXYGEN + EIGEN_DEVICE_FUNC + explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert) + : m_storage(internal::constructor_without_unaligned_array_assert()) + { + } +#endif +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + PlainObjectBase(PlainObjectBase&& other) EIGEN_NOEXCEPT + : m_storage( std::move(other.m_storage) ) + { + } + EIGEN_DEVICE_FUNC + PlainObjectBase& operator=(PlainObjectBase&& other) EIGEN_NOEXCEPT + { + using std::swap; + swap(m_storage, other.m_storage); + return *this; + } +#endif + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other) + : Base(), m_storage(other.m_storage) { } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols) + : m_storage(size, rows, cols) + { + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase &other) + : m_storage() + { + _check_template_params(); + resizeLike(other); + _set_noalias(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase &other) + : m_storage() + { + _check_template_params(); + resizeLike(other); + *this = other.derived(); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue& other) + { + _check_template_params(); + resize(other.rows(), other.cols()); + other.evalTo(this->derived()); + } + public: + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other) + { + _resize_to_match(other); + Base::operator=(other.derived()); + return this->derived(); + } + static inline ConstMapType Map(const Scalar* data) + { return ConstMapType(data); } + static inline MapType Map(Scalar* data) + { return MapType(data); } + static inline ConstMapType Map(const Scalar* data, Index size) + { return ConstMapType(data, size); } + static inline MapType Map(Scalar* data, Index size) + { return MapType(data, size); } + static inline ConstMapType Map(const Scalar* data, Index rows, Index cols) + { return ConstMapType(data, rows, cols); } + static inline MapType Map(Scalar* data, Index rows, Index cols) + { return MapType(data, rows, cols); } + static inline ConstAlignedMapType MapAligned(const Scalar* data) + { return ConstAlignedMapType(data); } + static inline AlignedMapType MapAligned(Scalar* data) + { return AlignedMapType(data); } + static inline ConstAlignedMapType MapAligned(const Scalar* data, Index size) + { return ConstAlignedMapType(data, size); } + static inline AlignedMapType MapAligned(Scalar* data, Index size) + { return AlignedMapType(data, size); } + static inline ConstAlignedMapType MapAligned(const Scalar* data, Index rows, Index cols) 
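+// Illustrative sketch (editorial): the static Map()/MapAligned() helpers above wrap an
+// existing buffer as a matrix without copying; MapAligned additionally asserts that the
+// pointer satisfies the packet alignment. Assuming <Eigen/Dense>:
+//
+//   float buf[6] = {1, 2, 3, 4, 5, 6};
+//   auto m = Eigen::MatrixXf::Map(buf, 2, 3);  // 2x3 column-major view over buf
+//   m(0,1) = 42.f;                             // writes through to buf[2]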
+ { return ConstAlignedMapType(data, rows, cols); } + static inline AlignedMapType MapAligned(Scalar* data, Index rows, Index cols) + { return AlignedMapType(data, rows, cols); } + template + static inline typename StridedConstMapType >::type Map(const Scalar* data, const Stride& stride) + { return typename StridedConstMapType >::type(data, stride); } + template + static inline typename StridedMapType >::type Map(Scalar* data, const Stride& stride) + { return typename StridedMapType >::type(data, stride); } + template + static inline typename StridedConstMapType >::type Map(const Scalar* data, Index size, const Stride& stride) + { return typename StridedConstMapType >::type(data, size, stride); } + template + static inline typename StridedMapType >::type Map(Scalar* data, Index size, const Stride& stride) + { return typename StridedMapType >::type(data, size, stride); } + template + static inline typename StridedConstMapType >::type Map(const Scalar* data, Index rows, Index cols, const Stride& stride) + { return typename StridedConstMapType >::type(data, rows, cols, stride); } + template + static inline typename StridedMapType >::type Map(Scalar* data, Index rows, Index cols, const Stride& stride) + { return typename StridedMapType >::type(data, rows, cols, stride); } + template + static inline typename StridedConstAlignedMapType >::type MapAligned(const Scalar* data, const Stride& stride) + { return typename StridedConstAlignedMapType >::type(data, stride); } + template + static inline typename StridedAlignedMapType >::type MapAligned(Scalar* data, const Stride& stride) + { return typename StridedAlignedMapType >::type(data, stride); } + template + static inline typename StridedConstAlignedMapType >::type MapAligned(const Scalar* data, Index size, const Stride& stride) + { return typename StridedConstAlignedMapType >::type(data, size, stride); } + template + static inline typename StridedAlignedMapType >::type MapAligned(Scalar* data, Index size, const Stride& stride) + { return typename StridedAlignedMapType >::type(data, size, stride); } + template + static inline typename StridedConstAlignedMapType >::type MapAligned(const Scalar* data, Index rows, Index cols, const Stride& stride) + { return typename StridedConstAlignedMapType >::type(data, rows, cols, stride); } + template + static inline typename StridedAlignedMapType >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride& stride) + { return typename StridedAlignedMapType >::type(data, rows, cols, stride); } + using Base::setConstant; + EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val); + EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val); + using Base::setZero; + EIGEN_DEVICE_FUNC Derived& setZero(Index size); + EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols); + using Base::setOnes; + EIGEN_DEVICE_FUNC Derived& setOnes(Index size); + EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols); + using Base::setRandom; + Derived& setRandom(Index size); + Derived& setRandom(Index rows, Index cols); + #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN + #include EIGEN_PLAINOBJECTBASE_PLUGIN + #endif + protected: + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase& other) + { + #ifdef EIGEN_NO_AUTOMATIC_RESIZING + eigen_assert((this->size()==0 || (IsVectorAtCompileTime ? (this->size() == other.size()) + : (rows() == other.rows() && cols() == other.cols()))) + && "Size mismatch. 
Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined"); + EIGEN_ONLY_USED_FOR_DEBUG(other); + #else + resizeLike(other); + #endif + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& _set(const DenseBase& other) + { + internal::call_assignment(this->derived(), other.derived()); + return this->derived(); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase& other) + { + internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op()); + return this->derived(); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if::type* = 0) + { + EIGEN_STATIC_ASSERT(bool(NumTraits::IsInteger) && + bool(NumTraits::IsInteger), + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) + resize(rows,cols); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if::type* = 0) + { + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) + m_storage.data()[0] = Scalar(val0); + m_storage.data()[1] = Scalar(val1); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1, + typename internal::enable_if< (!internal::is_same::value) + && (internal::is_same::value) + && (internal::is_same::value) + && Base::SizeAtCompileTime==2,T1>::type* = 0) + { + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) + m_storage.data()[0] = Scalar(val0); + m_storage.data()[1] = Scalar(val1); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if< (Base::SizeAtCompileTime!=1 || !internal::is_convertible::value) + && ((!internal::is_same::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) + { + const bool is_integer = NumTraits::IsInteger; + EIGEN_UNUSED_VARIABLE(is_integer); + EIGEN_STATIC_ASSERT(is_integer, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) + resize(size); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) + { + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) + m_storage.data()[0] = val0; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Index& val0, + typename internal::enable_if< (!internal::is_same::value) + && (internal::is_same::value) + && Base::SizeAtCompileTime==1 + && internal::is_convertible::value,T*>::type* = 0) + { + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) + m_storage.data()[0] = Scalar(val0); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Scalar* data){ + this->_set_noalias(ConstMapType(data)); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const DenseBase& other){ + this->_set_noalias(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Derived& other){ + this->_set_noalias(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const EigenBase& other){ + this->derived() = other; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const ReturnByValue& other) + { + resize(other.rows(), other.cols()); + other.evalTo(this->derived()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const RotationBase& r) + { + this->derived() = r; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Scalar& val0, + typename 
internal::enable_if< Base::SizeAtCompileTime!=Dynamic + && Base::SizeAtCompileTime!=1 + && internal::is_convertible::value + && internal::is_same::XprKind,ArrayXpr>::value,T>::type* = 0) + { + Base::setConstant(val0); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Index& val0, + typename internal::enable_if< (!internal::is_same::value) + && (internal::is_same::value) + && Base::SizeAtCompileTime!=Dynamic + && Base::SizeAtCompileTime!=1 + && internal::is_convertible::value + && internal::is_same::XprKind,ArrayXpr>::value,T*>::type* = 0) + { + Base::setConstant(val0); + } + template + friend struct internal::matrix_swap_impl; + public: +#ifndef EIGEN_PARSED_BY_DOXYGEN + template + EIGEN_DEVICE_FUNC + void swap(DenseBase & other) + { + enum { SwapPointers = internal::is_same::value && Base::SizeAtCompileTime==Dynamic }; + internal::matrix_swap_impl::run(this->derived(), other.derived()); + } + template + EIGEN_DEVICE_FUNC + void swap(DenseBase const & other) + { Base::swap(other.derived()); } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void _check_template_params() + { + EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor) + && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0) + && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0)) + && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0)) + && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0)) + && ((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0)) + && (MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime==Dynamic) + && (MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime==Dynamic) + && (Options & (DontAlign|RowMajor)) == Options), + INVALID_MATRIX_TEMPLATE_PARAMETERS) + } + enum { IsPlainObjectBase = 1 }; +#endif +}; +namespace internal { +template +struct conservative_resize_like_impl +{ + static void run(DenseBase& _this, Index rows, Index cols) + { + if (_this.rows() == rows && _this.cols() == cols) return; + EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) + if ( ( Derived::IsRowMajor && _this.cols() == cols) || + (!Derived::IsRowMajor && _this.rows() == rows) ) + { + internal::check_rows_cols_for_overflow::run(rows, cols); + _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); + } + else + { + typename Derived::PlainObject tmp(rows,cols); + const Index common_rows = numext::mini(rows, _this.rows()); + const Index common_cols = numext::mini(cols, _this.cols()); + tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); + _this.derived().swap(tmp); + } + } + static void run(DenseBase& _this, const DenseBase& other) + { + if (_this.rows() == other.rows() && _this.cols() == other.cols()) return; + EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) + EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) + if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || + (!Derived::IsRowMajor && _this.rows() == other.rows()) ) + { + const Index new_rows = other.rows() - _this.rows(); + const Index new_cols = other.cols() - _this.cols(); + _this.derived().m_storage.conservativeResize(other.size(),other.rows(),other.cols()); + if (new_rows>0) + _this.bottomRightCorner(new_rows, other.cols()) = other.bottomRows(new_rows); + else if (new_cols>0) + _this.bottomRightCorner(other.rows(), new_cols) = other.rightCols(new_cols); + } + else + { + typename Derived::PlainObject tmp(other); + const Index common_rows = 
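+// Illustrative sketch (editorial): the _init1/_init2 helpers above disambiguate size-like
+// from value-like constructor arguments. Assuming <Eigen/Dense>:
+//
+//   Eigen::VectorXd v(5);          // one integer + dynamic size -> allocates 5 entries
+//   Eigen::Vector2d w(1.0, 2.0);   // two scalars + fixed size 2  -> coefficient values
+//   Eigen::MatrixXd m(3, 4);       // two integers -> 3x4 allocation, coefficients uninitialized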
numext::mini(tmp.rows(), _this.rows()); + const Index common_cols = numext::mini(tmp.cols(), _this.cols()); + tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); + _this.derived().swap(tmp); + } + } +}; +template +struct conservative_resize_like_impl + : conservative_resize_like_impl +{ + using conservative_resize_like_impl::run; + static void run(DenseBase& _this, Index size) + { + const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; + const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1; + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + } + static void run(DenseBase& _this, const DenseBase& other) + { + if (_this.rows() == other.rows() && _this.cols() == other.cols()) return; + const Index num_new_elements = other.size() - _this.size(); + const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); + const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + if (num_new_elements > 0) + _this.tail(num_new_elements) = other.tail(num_new_elements); + } +}; +template +struct matrix_swap_impl +{ + EIGEN_DEVICE_FUNC + static inline void run(MatrixTypeA& a, MatrixTypeB& b) + { + a.base().swap(b); + } +}; +template +struct matrix_swap_impl +{ + EIGEN_DEVICE_FUNC + static inline void run(MatrixTypeA& a, MatrixTypeB& b) + { + static_cast(a).m_storage.swap(static_cast(b).m_storage); + } +}; +} +} +#endif +// end #include "src/Core/PlainObjectBase.h" +// #include "src/Core/Matrix.h" +#ifndef EIGEN_MATRIX_H +#define EIGEN_MATRIX_H +namespace Eigen { +namespace internal { +template +struct traits > +{ +private: + enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret }; + typedef typename find_best_packet<_Scalar,size>::type PacketScalar; + enum { + row_major_bit = _Options&RowMajor ? RowMajorBit : 0, + is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, + max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, + default_alignment = compute_default_alignment<_Scalar,max_size>::value, + actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, + required_alignment = unpacket_traits::alignment, + packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0 + }; +public: + typedef _Scalar Scalar; + typedef Dense StorageKind; + typedef Eigen::Index StorageIndex; + typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = _Rows, + ColsAtCompileTime = _Cols, + MaxRowsAtCompileTime = _MaxRows, + MaxColsAtCompileTime = _MaxCols, + Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, + Options = _Options, + InnerStrideAtCompileTime = 1, + OuterStrideAtCompileTime = (Options&RowMajor) ? 
ColsAtCompileTime : RowsAtCompileTime, + EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit, + Alignment = actual_alignment + }; +}; +} +template +class Matrix + : public PlainObjectBase > +{ + public: + typedef PlainObjectBase Base; + enum { Options = _Options }; + EIGEN_DENSE_PUBLIC_INTERFACE(Matrix) + typedef typename Base::PlainObject PlainObject; + using Base::base; + using Base::coeffRef; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other) + { + return Base::_set(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase& other) + { + return Base::_set(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase &other) + { + return Base::operator=(other); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue& func) + { + return Base::operator=(func); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix() : Base() + { + Base::_check_template_params(); + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + EIGEN_DEVICE_FUNC + explicit Matrix(internal::constructor_without_unaligned_array_assert) + : Base(internal::constructor_without_unaligned_array_assert()) + { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) + : Base(std::move(other)) + { + Base::_check_template_params(); + if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) + Base::_set_noalias(other); + } + EIGEN_DEVICE_FUNC + Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) + { + other.swap(*this); + return *this; + } +#endif + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit Matrix(const T& x) + { + Base::_check_template_params(); + Base::template _init1(x); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) + { + Base::_check_template_params(); + Base::template _init2(x, y); + } + #else + EIGEN_DEVICE_FUNC + explicit Matrix(const Scalar *data); + EIGEN_STRONG_INLINE explicit Matrix(Index dim); + Matrix(const Scalar& x); + EIGEN_DEVICE_FUNC + Matrix(Index rows, Index cols); + Matrix(const Scalar& x, const Scalar& y); + #endif + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) + { + Base::_check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3) + m_storage.data()[0] = x; + m_storage.data()[1] = y; + m_storage.data()[2] = z; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) + { + Base::_check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4) + m_storage.data()[0] = x; + m_storage.data()[1] = y; + m_storage.data()[2] = z; + m_storage.data()[3] = w; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other) + { } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix(const EigenBase &other) + : Base(other.derived()) + { } + EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } + EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + template + EIGEN_DEVICE_FUNC + explicit Matrix(const RotationBase& r); + template + EIGEN_DEVICE_FUNC + Matrix& operator=(const RotationBase& r); + #ifdef EIGEN_MATRIX_PLUGIN + 
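+// Illustrative sketch (editorial): the three- and four-scalar constructors above are only
+// enabled for vectors of matching fixed size. Assuming <Eigen/Dense>:
+//
+//   Eigen::Vector3f p(1.f, 2.f, 3.f);       // uses the (x,y,z) constructor
+//   Eigen::Vector4f q(1.f, 2.f, 3.f, 4.f);  // uses the (x,y,z,w) constructor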
#include EIGEN_MATRIX_PLUGIN + #endif + protected: + template + friend struct internal::conservative_resize_like_impl; + using Base::m_storage; +}; +#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \ + \ +typedef Matrix Matrix##SizeSuffix##TypeSuffix; \ + \ +typedef Matrix Vector##SizeSuffix##TypeSuffix; \ + \ +typedef Matrix RowVector##SizeSuffix##TypeSuffix; +#define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size) \ + \ +typedef Matrix Matrix##Size##X##TypeSuffix; \ + \ +typedef Matrix Matrix##X##Size##TypeSuffix; +#define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \ +EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \ +EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \ +EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \ +EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \ +EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \ +EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \ +EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 4) +EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int, i) +EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float, f) +EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double, d) +EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cf) +EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) +#undef EIGEN_MAKE_TYPEDEFS_ALL_SIZES +#undef EIGEN_MAKE_TYPEDEFS +#undef EIGEN_MAKE_FIXED_TYPEDEFS +} +#endif +// end #include "src/Core/Matrix.h" +// #include "src/Core/Array.h" +#ifndef EIGEN_ARRAY_H +#define EIGEN_ARRAY_H +namespace Eigen { +namespace internal { +template +struct traits > : traits > +{ + typedef ArrayXpr XprKind; + typedef ArrayBase > XprBase; +}; +} +template +class Array + : public PlainObjectBase > +{ + public: + typedef PlainObjectBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(Array) + enum { Options = _Options }; + typedef typename Base::PlainObject PlainObject; + protected: + template + friend struct internal::conservative_resize_like_impl; + using Base::m_storage; + public: + using Base::base; + using Base::coeff; + using Base::coeffRef; + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array& operator=(const EigenBase &other) + { + return Base::operator=(other); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array& operator=(const Scalar &value) + { + Base::setConstant(value); + return *this; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array& operator=(const DenseBase& other) + { + return Base::_set(other); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array& operator=(const Array& other) + { + return Base::_set(other); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array() : Base() + { + Base::_check_template_params(); + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } +#ifndef EIGEN_PARSED_BY_DOXYGEN + EIGEN_DEVICE_FUNC + Array(internal::constructor_without_unaligned_array_assert) + : Base(internal::constructor_without_unaligned_array_assert()) + { + Base::_check_template_params(); + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } +#endif +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + Array(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) + : Base(std::move(other)) + { + Base::_check_template_params(); + if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) + Base::_set_noalias(other); + } + EIGEN_DEVICE_FUNC + Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) + { + other.swap(*this); + return *this; + } +#endif + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit Array(const T& x) + { + Base::_check_template_params(); + Base::template _init1(x); + } + template + 
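+// Illustrative sketch (editorial): the typedef macros above stamp out the familiar names;
+// the scheme is Matrix|Vector|RowVector + size (2,3,4,X) + scalar suffix (i,f,d,cf,cd):
+//
+//   Eigen::Matrix3f    == Eigen::Matrix<float, 3, 3>
+//   Eigen::VectorXd    == Eigen::Matrix<double, Eigen::Dynamic, 1>
+//   Eigen::RowVector2i == Eigen::Matrix<int, 1, 2>
+//   Eigen::Matrix2Xd   == Eigen::Matrix<double, 2, Eigen::Dynamic>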
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1) + { + Base::_check_template_params(); + this->template _init2(val0, val1); + } + #else + EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit Array(Index dim); + Array(const Scalar& value); + Array(Index rows, Index cols); + Array(const Scalar& val0, const Scalar& val1); + #endif + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) + { + Base::_check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 3) + m_storage.data()[0] = val0; + m_storage.data()[1] = val1; + m_storage.data()[2] = val2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) + { + Base::_check_template_params(); + EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 4) + m_storage.data()[0] = val0; + m_storage.data()[1] = val1; + m_storage.data()[2] = val2; + m_storage.data()[3] = val3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const Array& other) + : Base(other) + { } + private: + struct PrivateType {}; + public: + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Array(const EigenBase &other, + typename internal::enable_if::value, + PrivateType>::type = PrivateType()) + : Base(other.derived()) + { } + EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; } + EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); } + #ifdef EIGEN_ARRAY_PLUGIN + #include EIGEN_ARRAY_PLUGIN + #endif + private: + template + friend struct internal::matrix_swap_impl; +}; +#define EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \ + \ +typedef Array Array##SizeSuffix##SizeSuffix##TypeSuffix; \ + \ +typedef Array Array##SizeSuffix##TypeSuffix; +#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, Size) \ + \ +typedef Array Array##Size##X##TypeSuffix; \ + \ +typedef Array Array##X##Size##TypeSuffix; +#define EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \ +EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 2, 2) \ +EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 3, 3) \ +EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 4, 4) \ +EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \ +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \ +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \ +EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 4) +EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(int, i) +EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(float, f) +EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(double, d) +EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cf) +EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) +#undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES +#undef EIGEN_MAKE_ARRAY_TYPEDEFS +#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE +#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ +using Eigen::Matrix##SizeSuffix##TypeSuffix; \ +using Eigen::Vector##SizeSuffix##TypeSuffix; \ +using Eigen::RowVector##SizeSuffix##TypeSuffix; +#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(TypeSuffix) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) +#define EIGEN_USING_ARRAY_TYPEDEFS \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(i) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(f) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(d) \ 
+EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cf) \ +EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cd) +} +#endif +// end #include "src/Core/Array.h" +// #include "src/Core/CwiseBinaryOp.h" +#ifndef EIGEN_CWISE_BINARY_OP_H +#define EIGEN_CWISE_BINARY_OP_H +namespace Eigen { +namespace internal { +template +struct traits > +{ + typedef typename remove_all::type Ancestor; + typedef typename traits::XprKind XprKind; + enum { + RowsAtCompileTime = traits::RowsAtCompileTime, + ColsAtCompileTime = traits::ColsAtCompileTime, + MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, + MaxColsAtCompileTime = traits::MaxColsAtCompileTime + }; + typedef typename result_of< + BinaryOp( + const typename Lhs::Scalar&, + const typename Rhs::Scalar& + ) + >::type Scalar; + typedef typename cwise_promote_storage_type::StorageKind, + typename traits::StorageKind, + BinaryOp>::ret StorageKind; + typedef typename promote_index_type::StorageIndex, + typename traits::StorageIndex>::type StorageIndex; + typedef typename Lhs::Nested LhsNested; + typedef typename Rhs::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + enum { + Flags = cwise_promote_storage_order::StorageKind,typename traits::StorageKind,_LhsNested::Flags & RowMajorBit,_RhsNested::Flags & RowMajorBit>::value + }; +}; +} +template +class CwiseBinaryOpImpl; +template +class CwiseBinaryOp : + public CwiseBinaryOpImpl< + BinaryOp, LhsType, RhsType, + typename internal::cwise_promote_storage_type::StorageKind, + typename internal::traits::StorageKind, + BinaryOp>::ret>, + internal::no_assignment_operator +{ + public: + typedef typename internal::remove_all::type Functor; + typedef typename internal::remove_all::type Lhs; + typedef typename internal::remove_all::type Rhs; + typedef typename CwiseBinaryOpImpl< + BinaryOp, LhsType, RhsType, + typename internal::cwise_promote_storage_type::StorageKind, + typename internal::traits::StorageKind, + BinaryOp>::ret>::Base Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp) + typedef typename internal::ref_selector::type LhsNested; + typedef typename internal::ref_selector::type RhsNested; + typedef typename internal::remove_reference::type _LhsNested; + typedef typename internal::remove_reference::type _RhsNested; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) + : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) + { + EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs) + eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rows() const { + if (internal::traits::type>::RowsAtCompileTime==Dynamic) + return m_rhs.rows(); + else + return m_lhs.rows(); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index cols() const { + if (internal::traits::type>::ColsAtCompileTime==Dynamic) + return m_rhs.cols(); + else + return m_lhs.cols(); + } + EIGEN_DEVICE_FUNC + const _LhsNested& lhs() const { return m_lhs; } + EIGEN_DEVICE_FUNC + const _RhsNested& rhs() const { return m_rhs; } + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + protected: + LhsNested m_lhs; + RhsNested m_rhs; + const BinaryOp m_functor; +}; +template +class CwiseBinaryOpImpl + : public internal::generic_xpr_base >::type +{ +public: + typedef typename internal::generic_xpr_base >::type Base; +}; +template +template +EIGEN_STRONG_INLINE Derived & 
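+// Illustrative sketch (editorial): CwiseBinaryOp is the expression-template node behind
+// operator+ and friends; constructing one performs no arithmetic, evaluation happens at
+// the assignment. Assuming <Eigen/Dense>:
+//
+//   Eigen::ArrayXXf a(2,2), b(2,2);
+//   a.setOnes(); b.setOnes();
+//   auto e = a + b;         // a CwiseBinaryOp node: nothing computed yet
+//   Eigen::ArrayXXf r = e;  // the coefficient loop runs here, fused into one pass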
+MatrixBase::operator-=(const MatrixBase &other) +{ + call_assignment(derived(), other.derived(), internal::sub_assign_op()); + return derived(); +} +template +template +EIGEN_STRONG_INLINE Derived & +MatrixBase::operator+=(const MatrixBase& other) +{ + call_assignment(derived(), other.derived(), internal::add_assign_op()); + return derived(); +} +} +#endif +// end #include "src/Core/CwiseBinaryOp.h" +// #include "src/Core/CwiseUnaryOp.h" +#ifndef EIGEN_CWISE_UNARY_OP_H +#define EIGEN_CWISE_UNARY_OP_H +namespace Eigen { +namespace internal { +template +struct traits > + : traits +{ + typedef typename result_of< + UnaryOp(const typename XprType::Scalar&) + >::type Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + enum { + Flags = _XprTypeNested::Flags & RowMajorBit + }; +}; +} +template +class CwiseUnaryOpImpl; +template +class CwiseUnaryOp : public CwiseUnaryOpImpl::StorageKind>, internal::no_assignment_operator +{ + public: + typedef typename CwiseUnaryOpImpl::StorageKind>::Base Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp) + typedef typename internal::ref_selector::type XprTypeNested; + typedef typename internal::remove_all::type NestedExpression; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const UnaryOp& functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::remove_all::type& + nestedExpression() { return m_xpr; } + protected: + XprTypeNested m_xpr; + const UnaryOp m_functor; +}; +template +class CwiseUnaryOpImpl + : public internal::generic_xpr_base >::type +{ +public: + typedef typename internal::generic_xpr_base >::type Base; +}; +} +#endif +// end #include "src/Core/CwiseUnaryOp.h" +// #include "src/Core/CwiseNullaryOp.h" +#ifndef EIGEN_CWISE_NULLARY_OP_H +#define EIGEN_CWISE_NULLARY_OP_H +namespace Eigen { +namespace internal { +template +struct traits > : traits +{ + enum { + Flags = traits::Flags & RowMajorBit + }; +}; +} +template +class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp >::type, internal::no_assignment_operator +{ + public: + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp) + EIGEN_DEVICE_FUNC + CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp()) + : m_rows(rows), m_cols(cols), m_functor(func) + { + eigen_assert(rows >= 0 + && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) + && cols >= 0 + && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); } + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + protected: + const internal::variable_if_dynamic m_rows; + const internal::variable_if_dynamic m_cols; + const NullaryOp m_functor; +}; +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +DenseBase::NullaryExpr(Index rows, Index cols, const 
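+// Illustrative sketch (editorial): CwiseUnaryOp plays the same role for single-operand
+// functors; -m, m.conjugate() and m.cwiseAbs() all build this node type. Assuming
+// <Eigen/Dense>:
+//
+//   Eigen::MatrixXd m = Eigen::MatrixXd::Random(2,2);
+//   Eigen::MatrixXd n = -m.cwiseAbs();  // two nested CwiseUnaryOps, one evaluation pass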
CustomNullaryOp& func) +{ + return CwiseNullaryOp(rows, cols, func); +} +template +template +EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + if(RowsAtCompileTime == 1) return CwiseNullaryOp(1, size, func); + else return CwiseNullaryOp(size, 1, func); +} +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +DenseBase::NullaryExpr(const CustomNullaryOp& func) +{ + return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); +} +template +EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Constant(Index rows, Index cols, const Scalar& value) +{ + return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Constant(Index size, const Scalar& value) +{ + return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Constant(const Scalar& value) +{ + EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) + return DenseBase::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op(value)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +DenseBase::LinSpaced(const Scalar& low, const Scalar& high) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); +} +template +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant +(const Scalar& val, const RealScalar& prec) const +{ + typename internal::nested_eval::type self(derived()); + for(Index j = 0; j < cols(); ++j) + for(Index i = 0; i < rows(); ++i) + if(!internal::isApprox(self.coeff(i, j), val, prec)) + return false; + return true; +} +template +EIGEN_DEVICE_FUNC bool DenseBase::isConstant +(const Scalar& val, const RealScalar& prec) const +{ + return isApproxToConstant(val, prec); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +{ + setConstant(val); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +{ + return derived() = Constant(rows(), cols(), val); +} +template +EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(Index size, const Scalar& val) +{ + resize(size); + return setConstant(val); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) +{ + resize(rows, cols); + return setConstant(val); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return setLinSpaced(size(), low, high); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Zero(Index rows, Index cols) +{ + return Constant(rows, cols, Scalar(0)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Zero(Index size) +{ + return Constant(size, Scalar(0)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Zero() +{ + return Constant(Scalar(0)); +} +template +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const +{ + typename internal::nested_eval::type self(derived()); + for(Index j = 0; j < cols(); ++j) + for(Index i = 0; i < rows(); ++i) + if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast(1), prec)) + return false; + return true; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +{ + return setConstant(Scalar(0)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(Index newSize) +{ + resize(newSize); + return setConstant(Scalar(0)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setZero(Index rows, Index cols) +{ + resize(rows, cols); + return setConstant(Scalar(0)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Ones(Index rows, Index cols) +{ + return Constant(rows, cols, Scalar(1)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Ones(Index newSize) +{ + return Constant(newSize, Scalar(1)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +DenseBase::Ones() +{ + return Constant(Scalar(1)); +} +template +EIGEN_DEVICE_FUNC bool DenseBase::isOnes +(const RealScalar& prec) const +{ + return isApproxToConstant(Scalar(1), prec); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +{ + return setConstant(Scalar(1)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(Index newSize) +{ + resize(newSize); + return setConstant(Scalar(1)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& +PlainObjectBase::setOnes(Index rows, Index cols) +{ + resize(rows, cols); + return setConstant(Scalar(1)); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +MatrixBase::Identity(Index rows, Index cols) +{ + return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +MatrixBase::Identity() +{ + EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) + 
return MatrixBase::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_identity_op()); +} +template +bool MatrixBase::isIdentity +(const RealScalar& prec) const +{ + typename internal::nested_eval::type self(derived()); + for(Index j = 0; j < cols(); ++j) + { + for(Index i = 0; i < rows(); ++i) + { + if(i == j) + { + if(!internal::isApprox(self.coeff(i, j), static_cast(1), prec)) + return false; + } + else + { + if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast(1), prec)) + return false; + } + } + } + return true; +} +namespace internal { +template=16)> +struct setIdentity_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Derived& run(Derived& m) + { + return m = Derived::Identity(m.rows(), m.cols()); + } +}; +template +struct setIdentity_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Derived& run(Derived& m) + { + m.setZero(); + const Index size = numext::mini(m.rows(), m.cols()); + for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1); + return m; + } +}; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +{ + return internal::setIdentity_impl::run(derived()); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +{ + derived().resize(rows, cols); + return setIdentity(); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + return BasisReturnType(SquareMatrixType::Identity(),i); +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +{ return Derived::Unit(0); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +{ return Derived::Unit(1); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +{ return Derived::Unit(2); } +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +{ return Derived::Unit(3); } +} +#endif +// end #include "src/Core/CwiseNullaryOp.h" +// #include "src/Core/Stride.h" +#ifndef EIGEN_STRIDE_H +#define EIGEN_STRIDE_H +namespace Eigen { +template +class Stride +{ + public: + typedef Eigen::Index Index; + enum { + InnerStrideAtCompileTime = _InnerStrideAtCompileTime, + OuterStrideAtCompileTime = _OuterStrideAtCompileTime + }; + EIGEN_DEVICE_FUNC + Stride() + : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) + { + eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); + } + EIGEN_DEVICE_FUNC + Stride(Index outerStride, Index innerStride) + : m_outer(outerStride), m_inner(innerStride) + { + eigen_assert(innerStride>=0 && outerStride>=0); + } + EIGEN_DEVICE_FUNC + Stride(const Stride& other) + : m_outer(other.outer()), m_inner(other.inner()) + {} + EIGEN_DEVICE_FUNC + inline Index outer() const { return m_outer.value(); } + EIGEN_DEVICE_FUNC + inline Index inner() const { return m_inner.value(); } + protected: + internal::variable_if_dynamic m_outer; + internal::variable_if_dynamic m_inner; +}; +template +class InnerStride : public Stride<0, Value> +{ + typedef 
Stride<0, Value> Base; + public: + EIGEN_DEVICE_FUNC InnerStride() : Base() {} + EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} +}; +template +class OuterStride : public Stride +{ + typedef Stride Base; + public: + EIGEN_DEVICE_FUNC OuterStride() : Base() {} + EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} +}; +} +#endif +// end #include "src/Core/Stride.h" +// #include "src/Core/MapBase.h" +#ifndef EIGEN_MAPBASE_H +#define EIGEN_MAPBASE_H +#define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \ + EIGEN_STATIC_ASSERT((int(internal::evaluator::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \ + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT) +namespace Eigen { +template class MapBase + : public internal::dense_xpr_base::type +{ + public: + typedef typename internal::dense_xpr_base::type Base; + enum { + RowsAtCompileTime = internal::traits::RowsAtCompileTime, + ColsAtCompileTime = internal::traits::ColsAtCompileTime, + SizeAtCompileTime = Base::SizeAtCompileTime + }; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::packet_traits::type PacketScalar; + typedef typename NumTraits::Real RealScalar; + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *>::type + PointerType; + using Base::derived; + using Base::MaxRowsAtCompileTime; + using Base::MaxColsAtCompileTime; + using Base::MaxSizeAtCompileTime; + using Base::IsVectorAtCompileTime; + using Base::Flags; + using Base::IsRowMajor; + using Base::rows; + using Base::cols; + using Base::size; + using Base::coeff; + using Base::coeffRef; + using Base::lazyAssign; + using Base::eval; + using Base::innerStride; + using Base::outerStride; + using Base::rowStride; + using Base::colStride; + using Base::operator=; + typedef typename Base::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); } + EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); } + EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC + inline const Scalar& coeff(Index rowId, Index colId) const + { + return m_data[colId * colStride() + rowId * rowStride()]; + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeff(Index index) const + { + EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) + return m_data[index * innerStride()]; + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + return this->m_data[colId * colStride() + rowId * rowStride()]; + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) + return this->m_data[index * innerStride()]; + } + template + inline PacketScalar packet(Index rowId, Index colId) const + { + return internal::ploadt + (m_data + (colId * colStride() + rowId * rowStride())); + } + template + inline PacketScalar packet(Index index) const + { + EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) + return internal::ploadt(m_data + index * innerStride()); + } + EIGEN_DEVICE_FUNC + explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) + { + EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) + checkSanity(); + } + EIGEN_DEVICE_FUNC + inline MapBase(PointerType dataPtr, Index vecSize) + : m_data(dataPtr), + m_rows(RowsAtCompileTime == Dynamic ? 
vecSize : Index(RowsAtCompileTime)), + m_cols(ColsAtCompileTime == Dynamic ? vecSize : Index(ColsAtCompileTime)) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) + eigen_assert(vecSize >= 0); + eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize); + checkSanity(); + } + EIGEN_DEVICE_FUNC + inline MapBase(PointerType dataPtr, Index rows, Index cols) + : m_data(dataPtr), m_rows(rows), m_cols(cols) + { + eigen_assert( (dataPtr == 0) + || ( rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) + && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols))); + checkSanity(); + } + #ifdef EIGEN_MAPBASE_PLUGIN + #include EIGEN_MAPBASE_PLUGIN + #endif + protected: + template + EIGEN_DEVICE_FUNC + void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const + { +#if EIGEN_MAX_ALIGN_BYTES>0 + eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits::Alignment) == 0) + || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); +#endif + } + template + EIGEN_DEVICE_FUNC + void checkSanity(typename internal::enable_if::Alignment==0,void*>::type = 0) const + {} + PointerType m_data; + const internal::variable_if_dynamic m_rows; + const internal::variable_if_dynamic m_cols; +}; +template class MapBase + : public MapBase +{ + typedef MapBase ReadOnlyMapBase; + public: + typedef MapBase Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::PacketScalar PacketScalar; + typedef typename Base::StorageIndex StorageIndex; + typedef typename Base::PointerType PointerType; + using Base::derived; + using Base::rows; + using Base::cols; + using Base::size; + using Base::coeff; + using Base::coeffRef; + using Base::innerStride; + using Base::outerStride; + using Base::rowStride; + using Base::colStride; + typedef typename internal::conditional< + internal::is_lvalue::value, + Scalar, + const Scalar + >::type ScalarWithConstIfNotLvalue; + EIGEN_DEVICE_FUNC + inline const Scalar* data() const { return this->m_data; } + EIGEN_DEVICE_FUNC + inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } + EIGEN_DEVICE_FUNC + inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col) + { + return this->m_data[col * colStride() + row * rowStride()]; + } + EIGEN_DEVICE_FUNC + inline ScalarWithConstIfNotLvalue& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) + return this->m_data[index * innerStride()]; + } + template + inline void writePacket(Index row, Index col, const PacketScalar& val) + { + internal::pstoret + (this->m_data + (col * colStride() + row * rowStride()), val); + } + template + inline void writePacket(Index index, const PacketScalar& val) + { + EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) + internal::pstoret + (this->m_data + index * innerStride(), val); + } + EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {} + EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {} + EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {} + EIGEN_DEVICE_FUNC + Derived& operator=(const MapBase& other) + { + ReadOnlyMapBase::Base::operator=(other); + return derived(); + } + using ReadOnlyMapBase::Base::operator=; +}; +#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS +} +#endif +// end #include "src/Core/MapBase.h" +// #include "src/Core/Map.h" +#ifndef EIGEN_MAP_H +#define 
EIGEN_MAP_H +namespace Eigen { +namespace internal { +template +struct traits > + : public traits +{ + typedef traits TraitsBase; + enum { + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 + ? int(PlainObjectType::InnerStrideAtCompileTime) + : int(StrideType::InnerStrideAtCompileTime), + OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 + ? int(PlainObjectType::OuterStrideAtCompileTime) + : int(StrideType::OuterStrideAtCompileTime), + Alignment = int(MapOptions)&int(AlignedMask), + Flags0 = TraitsBase::Flags & (~NestByRefBit), + Flags = is_lvalue::value ? int(Flags0) : (int(Flags0) & ~LvalueBit) + }; +private: + enum { Options }; +}; +} +template class Map + : public MapBase > +{ + public: + typedef MapBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(Map) + typedef typename Base::PointerType PointerType; + typedef PointerType PointerArgType; + EIGEN_DEVICE_FUNC + inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; } + EIGEN_DEVICE_FUNC + inline Index innerStride() const + { + return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; + } + EIGEN_DEVICE_FUNC + inline Index outerStride() const + { + return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() + : IsVectorAtCompileTime ? this->size() + : int(Flags)&RowMajorBit ? this->cols() + : this->rows(); + } + EIGEN_DEVICE_FUNC + explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType()) + : Base(cast_to_pointer_type(dataPtr)), m_stride(stride) + { + PlainObjectType::Base::_check_template_params(); + } + EIGEN_DEVICE_FUNC + inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType()) + : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride) + { + PlainObjectType::Base::_check_template_params(); + } + EIGEN_DEVICE_FUNC + inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType()) + : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride) + { + PlainObjectType::Base::_check_template_params(); + } + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) + protected: + StrideType m_stride; +}; +} +#endif +// end #include "src/Core/Map.h" +// #include "src/Core/Block.h" +#ifndef EIGEN_BLOCK_H +#define EIGEN_BLOCK_H +namespace Eigen { +namespace internal { +template +struct traits > : traits +{ + typedef typename traits::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::XprKind XprKind; + typedef typename ref_selector::type XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + enum{ + MatrixRows = traits::RowsAtCompileTime, + MatrixCols = traits::ColsAtCompileTime, + RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows, + ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols, + MaxRowsAtCompileTime = BlockRows==0 ? 0 + : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) + : int(traits::MaxRowsAtCompileTime), + MaxColsAtCompileTime = BlockCols==0 ? 0 + : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) + : int(traits::MaxColsAtCompileTime), + XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0, + IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : XprTypeIsRowMajor, + HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor), + InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime), + InnerStrideAtCompileTime = HasSameStorageOrderAsXprType + ? 
int(inner_stride_at_compile_time::ret) + : int(outer_stride_at_compile_time::ret), + OuterStrideAtCompileTime = HasSameStorageOrderAsXprType + ? int(outer_stride_at_compile_time::ret) + : int(inner_stride_at_compile_time::ret), + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, + Flags = (traits::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit, + Alignment = 0 + }; +}; +template::ret> class BlockImpl_dense; +} +template class BlockImpl; +template class Block + : public BlockImpl::StorageKind> +{ + typedef BlockImpl::StorageKind> Impl; + public: + typedef Impl Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(Block) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block) + typedef typename internal::remove_all::type NestedExpression; + EIGEN_DEVICE_FUNC + inline Block(XprType& xpr, Index i) : Impl(xpr,i) + { + eigen_assert( (i>=0) && ( + ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows() + && startCol >= 0 && BlockCols >= 0 && startCol + BlockCols <= xpr.cols()); + } + EIGEN_DEVICE_FUNC + inline Block(XprType& xpr, + Index startRow, Index startCol, + Index blockRows, Index blockCols) + : Impl(xpr, startRow, startCol, blockRows, blockCols) + { + eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows) + && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols)); + eigen_assert(startRow >= 0 && blockRows >= 0 && startRow <= xpr.rows() - blockRows + && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols); + } +}; +template +class BlockImpl + : public internal::BlockImpl_dense +{ + typedef internal::BlockImpl_dense Impl; + typedef typename XprType::StorageIndex StorageIndex; + public: + typedef Impl Base; + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) + EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} + EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} + EIGEN_DEVICE_FUNC + inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + : Impl(xpr, startRow, startCol, blockRows, blockCols) {} +}; +namespace internal { +template class BlockImpl_dense + : public internal::dense_xpr_base >::type +{ + typedef Block BlockType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; + public: + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(BlockType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense) + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index i) + : m_xpr(xpr), + m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0), + m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0), + m_blockRows(BlockRows==1 ? 1 : xpr.rows()), + m_blockCols(BlockCols==1 ? 
1 : xpr.cols()) + {} + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol), + m_blockRows(BlockRows), m_blockCols(BlockCols) + {} + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, + Index startRow, Index startCol, + Index blockRows, Index blockCols) + : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol), + m_blockRows(blockRows), m_blockCols(blockCols) + {} + EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); } + EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); } + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index rowId, Index colId) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value()); + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const + { + return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value()); + } + EIGEN_DEVICE_FUNC + inline Scalar& coeffRef(Index index) + { + EIGEN_STATIC_ASSERT_LVALUE(XprType) + return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + } + EIGEN_DEVICE_FUNC + inline const CoeffReturnType coeff(Index index) const + { + return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + } + template + inline PacketScalar packet(Index rowId, Index colId) const + { + return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); + } + template + inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + { + m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); + } + template + inline PacketScalar packet(Index index) const + { + return m_xpr.template packet + (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0)); + } + template + inline void writePacket(Index index, const PacketScalar& val) + { + m_xpr.template writePacket + (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), + m_startCol.value() + (RowsAtCompileTime == 1 ? 
index : 0), val); + } + #ifdef EIGEN_PARSED_BY_DOXYGEN + EIGEN_DEVICE_FUNC inline const Scalar* data() const; + EIGEN_DEVICE_FUNC inline Index innerStride() const; + EIGEN_DEVICE_FUNC inline Index outerStride() const; + #endif + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& nestedExpression() const + { + return m_xpr; + } + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } + EIGEN_DEVICE_FUNC + StorageIndex startRow() const + { + return m_startRow.value(); + } + EIGEN_DEVICE_FUNC + StorageIndex startCol() const + { + return m_startCol.value(); + } + protected: + XprTypeNested m_xpr; + const internal::variable_if_dynamic m_startRow; + const internal::variable_if_dynamic m_startCol; + const internal::variable_if_dynamic m_blockRows; + const internal::variable_if_dynamic m_blockCols; +}; +template +class BlockImpl_dense + : public MapBase > +{ + typedef Block BlockType; + typedef typename internal::ref_selector::non_const_type XprTypeNested; + enum { + XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0 + }; + public: + typedef MapBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(BlockType) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense) + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index i) + : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), + BlockRows==1 ? 1 : xpr.rows(), + BlockCols==1 ? 1 : xpr.cols()), + m_xpr(xpr), + m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0), + m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0) + { + init(); + } + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) + { + init(); + } + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, + Index startRow, Index startCol, + Index blockRows, Index blockCols) + : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), + m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) + { + init(); + } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& nestedExpression() const + { + return m_xpr; + } + EIGEN_DEVICE_FUNC + XprType& nestedExpression() { return m_xpr; } + EIGEN_DEVICE_FUNC + inline Index innerStride() const + { + return internal::traits::HasSameStorageOrderAsXprType + ? m_xpr.innerStride() + : m_xpr.outerStride(); + } + EIGEN_DEVICE_FUNC + inline Index outerStride() const + { + return m_outerStride; + } + EIGEN_DEVICE_FUNC + StorageIndex startRow() const + { + return m_startRow.value(); + } + EIGEN_DEVICE_FUNC + StorageIndex startCol() const + { + return m_startCol.value(); + } + #ifndef __SUNPRO_CC + protected: + #endif + #ifndef EIGEN_PARSED_BY_DOXYGEN + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) + : Base(data, blockRows, blockCols), m_xpr(xpr) + { + init(); + } + #endif + protected: + EIGEN_DEVICE_FUNC + void init() + { + m_outerStride = internal::traits::HasSameStorageOrderAsXprType + ? 
m_xpr.outerStride() + : m_xpr.innerStride(); + } + XprTypeNested m_xpr; + const internal::variable_if_dynamic m_startRow; + const internal::variable_if_dynamic m_startCol; + Index m_outerStride; +}; +} +} +#endif +// end #include "src/Core/Block.h" +// #include "src/Core/Transpose.h" +#ifndef EIGEN_TRANSPOSE_H +#define EIGEN_TRANSPOSE_H +namespace Eigen { +namespace internal { +template +struct traits > : public traits +{ + typedef typename ref_selector::type MatrixTypeNested; + typedef typename remove_reference::type MatrixTypeNestedPlain; + enum { + RowsAtCompileTime = MatrixType::ColsAtCompileTime, + ColsAtCompileTime = MatrixType::RowsAtCompileTime, + MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime, + MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime, + FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, + Flags0 = traits::Flags & ~(LvalueBit | NestByRefBit), + Flags1 = Flags0 | FlagsLvalueBit, + Flags = Flags1 ^ RowMajorBit, + InnerStrideAtCompileTime = inner_stride_at_compile_time::ret, + OuterStrideAtCompileTime = outer_stride_at_compile_time::ret + }; +}; +} +template class TransposeImpl; +template class Transpose + : public TransposeImpl::StorageKind> +{ + public: + typedef typename internal::ref_selector::non_const_type MatrixTypeNested; + typedef typename TransposeImpl::StorageKind>::Base Base; + EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose) + typedef typename internal::remove_all::type NestedExpression; + EIGEN_DEVICE_FUNC + explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {} + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) + EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_matrix; } + EIGEN_DEVICE_FUNC + typename internal::remove_reference::type& + nestedExpression() { return m_matrix; } + void resize(Index nrows, Index ncols) { + m_matrix.resize(ncols,nrows); + } + protected: + typename internal::ref_selector::non_const_type m_matrix; +}; +namespace internal { +template::ret> +struct TransposeImpl_base +{ + typedef typename dense_xpr_base >::type type; +}; +template +struct TransposeImpl_base +{ + typedef typename dense_xpr_base >::type type; +}; +} +template +class TransposeImpl + : public internal::generic_xpr_base >::type +{ +public: + typedef typename internal::generic_xpr_base >::type Base; +}; +template class TransposeImpl + : public internal::TransposeImpl_base::type +{ + public: + typedef typename internal::TransposeImpl_base::type Base; + using Base::coeffRef; + EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl) + EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); } + EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); } + typedef typename internal::conditional< + internal::is_lvalue::value, + Scalar, + const Scalar + >::type ScalarWithConstIfNotLvalue; + EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + return derived().nestedExpression().coeffRef(colId, rowId); + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + return 
derived().nestedExpression().coeffRef(index);
+    }
+};
+template<typename Derived>
+inline Transpose<Derived>
+DenseBase<Derived>::transpose()
+{
+  return TransposeReturnType(derived());
+}
+template<typename Derived>
+inline typename DenseBase<Derived>::ConstTransposeReturnType
+DenseBase<Derived>::transpose() const
+{
+  return ConstTransposeReturnType(derived());
+}
+template<typename Derived>
+inline const typename MatrixBase<Derived>::AdjointReturnType
+MatrixBase<Derived>::adjoint() const
+{
+  return AdjointReturnType(this->transpose());
+}
+namespace internal {
+template<typename MatrixType,
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
+  bool MatchPacketSize =
+        (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
+    &&  (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
+struct inplace_transpose_selector;
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,true,false> {
+  static void run(MatrixType& m) {
+    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+  }
+};
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,true,true> {
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+    const Index PacketSize = internal::packet_traits<Scalar>::size;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
+    PacketBlock<Packet> A;
+    for (Index i=0; i<PacketSize; ++i)
+      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
+    internal::ptranspose(A);
+    for (Index i=0; i<PacketSize; ++i)
+      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
+  }
+};
+template<typename MatrixType,bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> {
+  static void run(MatrixType& m) {
+    if (m.rows()==m.cols())
+      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+    else
+      m = m.transpose().eval();
+  }
+};
+}
+template<typename Derived>
+inline void DenseBase<Derived>::transposeInPlace()
+{
+  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
+    && "transposeInPlace() called on a non-square non-resizable matrix");
+  internal::inplace_transpose_selector<Derived>::run(derived());
+}
+template<typename Derived>
+inline void MatrixBase<Derived>::adjointInPlace()
+{
+  derived() = adjoint().eval();
+}
+#ifndef EIGEN_NO_DEBUG
+namespace internal {
+template<bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_compile_time_selector
+{
+  enum { ret = bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed };
+};
+template<bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_compile_time_selector<DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
+{
+  enum { ret =    bool(blas_traits<DerivedA>::IsTransposed) != DestIsTransposed
+               || bool(blas_traits<DerivedB>::IsTransposed) != DestIsTransposed
+  };
+};
+template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_run_time_selector
+{
+  static bool run(const Scalar* dest, const OtherDerived& src)
+  {
+    return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
+  }
+};
+template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
+{
+  static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
+  {
+    return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
+        || ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
+  }
+};
+template<typename Derived, typename OtherDerived,
+         bool MightHaveTransposeAliasing
+                 = check_transpose_aliasing_compile_time_selector
+                     <blas_traits<Derived>::IsTransposed,OtherDerived>::ret
+        >
+struct checkTransposeAliasing_impl
+{
+  static void run(const Derived& dst, const OtherDerived& other)
+  {
+    eigen_assert((!check_transpose_aliasing_run_time_selector
+                  <typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
+                  ::run(extract_data(dst), other))
+      && "aliasing detected during transposition, use transposeInPlace() "
+         "or evaluate the rhs into a temporary using .eval()");
+  }
+};
+template<typename Derived, typename OtherDerived>
+struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
+{
+  static void run(const Derived&, const OtherDerived&)
+  {
+  }
+};
+template<typename Dst, typename Src>
+void check_for_aliasing(const Dst &dst, const Src &src)
+{
+  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+}
+}
+#endif
+}
+#endif
+// end #include "src/Core/Transpose.h"
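+// The expression-template classes defined above (CwiseBinaryOp, CwiseUnaryOp,
+// CwiseNullaryOp, Map, Block, Transpose) implement Eigen's lazy operator
+// syntax: an arithmetic expression builds a lightweight expression tree that
+// is only evaluated on assignment, so no temporaries are materialized along
+// the way. A minimal usage sketch follows; this is an editorial illustration,
+// not part of the upstream Eigen sources, and it assumes client code sees the
+// usual dense-module API (e.g. via <Eigen/Dense>):
+//
+//   #include <Eigen/Dense>
+//   #include <iostream>
+//
+//   int main() {
+//     float raw[6] = {1, 2, 3, 4, 5, 6};
+//     // Map views existing memory as a 2x3 row-major matrix; no copy is made.
+//     Eigen::Map<Eigen::Matrix<float, 2, 3, Eigen::RowMajor> > m(raw);
+//     // operator+ builds a CwiseBinaryOp and Constant() a CwiseNullaryOp;
+//     // nothing is computed until the assignment to a concrete matrix.
+//     Eigen::Matrix<float, 2, 3, Eigen::RowMajor> sum =
+//         m + Eigen::Matrix<float, 2, 3, Eigen::RowMajor>::Constant(1.0f);
+//     // block() and transpose() are likewise lightweight views.
+//     std::cout << sum.block(0, 0, 2, 2).transpose() << std::endl;
+//     return 0;
+//   }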
+// #include "src/Core/Redux.h" +#ifndef EIGEN_REDUX_H +#define EIGEN_REDUX_H +namespace Eigen { +namespace internal { +template +struct redux_traits +{ +public: + typedef typename find_best_packet::type PacketType; + enum { + PacketSize = unpacket_traits::size, + InnerMaxSize = int(Derived::IsRowMajor) + ? Derived::MaxColsAtCompileTime + : Derived::MaxRowsAtCompileTime + }; + enum { + MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) + && (functor_traits::PacketAccess), + MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), + MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize + }; +public: + enum { + Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal) + : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) + : int(DefaultTraversal) + }; +public: + enum { + Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost + : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, + UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) + }; +public: + enum { + Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling + }; +#ifdef EIGEN_DEBUG_ASSIGN + static void debug() + { + std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; + std::cerr.setf(std::ios::hex, std::ios::basefield); + EIGEN_DEBUG_VAR(Derived::Flags) + std::cerr.unsetf(std::ios::hex); + EIGEN_DEBUG_VAR(InnerMaxSize) + EIGEN_DEBUG_VAR(PacketSize) + EIGEN_DEBUG_VAR(MightVectorize) + EIGEN_DEBUG_VAR(MayLinearVectorize) + EIGEN_DEBUG_VAR(MaySliceVectorize) + EIGEN_DEBUG_VAR(Traversal) + EIGEN_DEBUG_VAR(UnrollingLimit) + EIGEN_DEBUG_VAR(Unrolling) + std::cerr << std::endl; + } +#endif +}; +template +struct redux_novec_unroller +{ + enum { + HalfLength = Length/2 + }; + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + { + return func(redux_novec_unroller::run(mat,func), + redux_novec_unroller::run(mat,func)); + } +}; +template +struct redux_novec_unroller +{ + enum { + outer = Start / Derived::InnerSizeAtCompileTime, + inner = Start % Derived::InnerSizeAtCompileTime + }; + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) + { + return mat.coeffByOuterInner(outer, inner); + } +}; +template +struct redux_novec_unroller +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } +}; +template +struct redux_vec_unroller +{ + enum { + PacketSize = redux_traits::PacketSize, + HalfLength = Length/2 + }; + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) + { + return func.packetOp( + redux_vec_unroller::run(mat,func), + redux_vec_unroller::run(mat,func) ); + } +}; +template +struct redux_vec_unroller +{ + enum { + index = Start * redux_traits::PacketSize, + outer = index / int(Derived::InnerSizeAtCompileTime), + inner = index % int(Derived::InnerSizeAtCompileTime), + alignment = Derived::Alignment + }; + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) + { + return mat.template 
packetByOuterInner(outer, inner); + } +}; +template::Traversal, + int Unrolling = redux_traits::Unrolling +> +struct redux_impl; +template +struct redux_impl +{ + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + { + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + Scalar res; + res = mat.coeffByOuterInner(0, 0); + for(Index i = 1; i < mat.innerSize(); ++i) + res = func(res, mat.coeffByOuterInner(0, i)); + for(Index i = 1; i < mat.outerSize(); ++i) + for(Index j = 0; j < mat.innerSize(); ++j) + res = func(res, mat.coeffByOuterInner(i, j)); + return res; + } +}; +template +struct redux_impl + : public redux_novec_unroller +{}; +template +struct redux_impl +{ + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + static Scalar run(const Derived &mat, const Func& func) + { + const Index size = mat.size(); + const Index packetSize = redux_traits::PacketSize; + const int packetAlignment = unpacket_traits::alignment; + enum { + alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), + alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) + }; + const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); + const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); + const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); + const Index alignedEnd2 = alignedStart + alignedSize2; + const Index alignedEnd = alignedStart + alignedSize; + Scalar res; + if(alignedSize) + { + PacketScalar packet_res0 = mat.template packet(alignedStart); + if(alignedSize>packetSize) + { + PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); + for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) + { + packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); + packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); + } + packet_res0 = func.packetOp(packet_res0,packet_res1); + if(alignedEnd>alignedEnd2) + packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); + } + res = func.predux(packet_res0); + for(Index index = 0; index < alignedStart; ++index) + res = func(res,mat.coeff(index)); + for(Index index = alignedEnd; index < size; ++index) + res = func(res,mat.coeff(index)); + } + else + { + res = mat.coeff(0); + for(Index index = 1; index < size; ++index) + res = func(res,mat.coeff(index)); + } + return res; + } +}; +template +struct redux_impl +{ + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketType; + EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) + { + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + const Index innerSize = mat.innerSize(); + const Index outerSize = mat.outerSize(); + enum { + packetSize = redux_traits::PacketSize + }; + const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; + Scalar res; + if(packetedInnerSize) + { + PacketType packet_res = mat.template packet(0,0); + for(Index j=0; j(j,i)); + res = func.predux(packet_res); + for(Index j=0; j::run(mat, func); + } + return res; + } +}; +template +struct redux_impl +{ + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + enum { + PacketSize = 
redux_traits::PacketSize, + Size = Derived::SizeAtCompileTime, + VectorizedSize = (Size / PacketSize) * PacketSize + }; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + { + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + if (VectorizedSize > 0) { + Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + if (VectorizedSize != Size) + res = func(res,redux_novec_unroller::run(mat,func)); + return res; + } + else { + return redux_novec_unroller::run(mat,func); + } + } +}; +template +class redux_evaluator +{ +public: + typedef _XprType XprType; + EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketScalar PacketScalar; + typedef typename XprType::PacketReturnType PacketReturnType; + enum { + MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, + MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, + Flags = evaluator::Flags & ~DirectAccessBit, + IsRowMajor = XprType::IsRowMajor, + SizeAtCompileTime = XprType::SizeAtCompileTime, + InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, + CoeffReadCost = evaluator::CoeffReadCost, + Alignment = evaluator::Alignment + }; + EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } + EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } + EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } + EIGEN_DEVICE_FUNC + CoeffReturnType coeff(Index row, Index col) const + { return m_evaluator.coeff(row, col); } + EIGEN_DEVICE_FUNC + CoeffReturnType coeff(Index index) const + { return m_evaluator.coeff(index); } + template + PacketType packet(Index row, Index col) const + { return m_evaluator.template packet(row, col); } + template + PacketType packet(Index index) const + { return m_evaluator.template packet(index); } + EIGEN_DEVICE_FUNC + CoeffReturnType coeffByOuterInner(Index outer, Index inner) const + { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + template + PacketType packetByOuterInner(Index outer, Index inner) const + { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? 
inner : outer); } + const XprType & nestedExpression() const { return m_xpr; } +protected: + internal::evaluator m_evaluator; + const XprType &m_xpr; +}; +} +template +template +typename internal::traits::Scalar +DenseBase::redux(const Func& func) const +{ + eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); + typedef typename internal::redux_evaluator ThisEvaluator; + ThisEvaluator thisEval(derived()); + return internal::redux_impl::run(thisEval, func); +} +template +EIGEN_STRONG_INLINE typename internal::traits::Scalar +DenseBase::minCoeff() const +{ + return derived().redux(Eigen::internal::scalar_min_op()); +} +template +EIGEN_STRONG_INLINE typename internal::traits::Scalar +DenseBase::maxCoeff() const +{ + return derived().redux(Eigen::internal::scalar_max_op()); +} +template +EIGEN_STRONG_INLINE typename internal::traits::Scalar +DenseBase::sum() const +{ + if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) + return Scalar(0); + return derived().redux(Eigen::internal::scalar_sum_op()); +} +template +EIGEN_STRONG_INLINE typename internal::traits::Scalar +DenseBase::mean() const +{ +#ifdef __INTEL_COMPILER + #pragma warning push + #pragma warning ( disable : 2259 ) +#endif + return Scalar(derived().redux(Eigen::internal::scalar_sum_op())) / Scalar(this->size()); +#ifdef __INTEL_COMPILER + #pragma warning pop +#endif +} +template +EIGEN_STRONG_INLINE typename internal::traits::Scalar +DenseBase::prod() const +{ + if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) + return Scalar(1); + return derived().redux(Eigen::internal::scalar_product_op()); +} +template +EIGEN_STRONG_INLINE typename internal::traits::Scalar +MatrixBase::trace() const +{ + return derived().diagonal().sum(); +} +} +#endif +// end #include "src/Core/Redux.h" +// #include "src/Core/GeneralProduct.h" +#ifndef EIGEN_GENERAL_PRODUCT_H +#define EIGEN_GENERAL_PRODUCT_H +namespace Eigen { +enum { + Large = 2, + Small = 3 +}; +namespace internal { +template struct product_type_selector; +template struct product_size_category +{ + enum { is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + value = is_large ? Large + : Size == 1 ? 
1 + : Small + }; +}; +template struct product_type +{ + typedef typename remove_all::type _Lhs; + typedef typename remove_all::type _Rhs; + enum { + MaxRows = traits<_Lhs>::MaxRowsAtCompileTime, + Rows = traits<_Lhs>::RowsAtCompileTime, + MaxCols = traits<_Rhs>::MaxColsAtCompileTime, + Cols = traits<_Rhs>::ColsAtCompileTime, + MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime, + traits<_Rhs>::MaxRowsAtCompileTime), + Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime, + traits<_Rhs>::RowsAtCompileTime) + }; +private: + enum { + rows_select = product_size_category::value, + cols_select = product_size_category::value, + depth_select = product_size_category::value + }; + typedef product_type_selector selector; +public: + enum { + value = selector::ret, + ret = selector::ret + }; +#ifdef EIGEN_DEBUG_PRODUCT + static void debug() + { + EIGEN_DEBUG_VAR(Rows); + EIGEN_DEBUG_VAR(Cols); + EIGEN_DEBUG_VAR(Depth); + EIGEN_DEBUG_VAR(rows_select); + EIGEN_DEBUG_VAR(cols_select); + EIGEN_DEBUG_VAR(depth_select); + EIGEN_DEBUG_VAR(value); + } +#endif +}; +template struct product_type_selector { enum { ret = OuterProduct }; }; +template struct product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template struct product_type_selector<1, N, 1> { enum { ret = LazyCoeffBasedProductMode }; }; +template struct product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; +template<> struct product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector<1, Small,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; +template<> struct product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = GemvProduct }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = GemmProduct }; }; +template<> struct product_type_selector { enum { ret = GemmProduct }; }; +template<> struct product_type_selector { enum { ret = GemmProduct }; }; +template<> struct product_type_selector { enum { ret = GemmProduct }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct product_type_selector { enum { ret = GemmProduct }; }; +} +namespace internal { +template +struct gemv_dense_selector; +} +namespace internal { +template struct gemv_static_vector_if; +template +struct gemv_static_vector_if +{ + EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } +}; +template +struct gemv_static_vector_if +{ + EIGEN_STRONG_INLINE Scalar* data() { return 0; } +}; +template +struct gemv_static_vector_if +{ + 
enum { + ForceAlignment = internal::packet_traits::Vectorizable, + PacketSize = internal::packet_traits::size + }; + #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0 + internal::plain_array m_data; + EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; } + #else + internal::plain_array m_data; + EIGEN_STRONG_INLINE Scalar* data() { + return ForceAlignment + ? reinterpret_cast((internal::UIntPtr(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES) + : m_data.array; + } + #endif +}; +template +struct gemv_dense_selector +{ + template + static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + Transpose destT(dest); + enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor }; + gemv_dense_selector + ::run(rhs.transpose(), lhs.transpose(), destT, alpha); + } +}; +template<> struct gemv_dense_selector +{ + template + static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar ResScalar; + typedef typename Dest::RealScalar RealScalar; + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef Map, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits::size)> MappedDest; + ActualLhsType actualLhs = LhsBlasTraits::extract(lhs); + ActualRhsType actualRhs = RhsBlasTraits::extract(rhs); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) + * RhsBlasTraits::extractScalarFactor(rhs); + typedef typename conditional::type ActualDest; + enum { + EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), + ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), + MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal + }; + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); + if(!MightCannotUseDest) + { + general_matrix_vector_product + ::run( + actualLhs.rows(), actualLhs.cols(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), + dest.data(), 1, + compatibleAlpha); + } + else + { + gemv_static_vector_if static_dest; + const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0)); + const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + evalToDest ? 
dest.data() : static_dest.data()); + if(!evalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + Index size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + if(!alphaIsCompatible) + { + MappedDest(actualDestPtr, dest.size()).setZero(); + compatibleAlpha = RhsScalar(1); + } + else + MappedDest(actualDestPtr, dest.size()) = dest; + } + general_matrix_vector_product + ::run( + actualLhs.rows(), actualLhs.cols(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhs.data(), actualRhs.innerStride()), + actualDestPtr, 1, + compatibleAlpha); + if (!evalToDest) + { + if(!alphaIsCompatible) + dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size()); + else + dest = MappedDest(actualDestPtr, dest.size()); + } + } + } +}; +template<> struct gemv_dense_selector +{ + template + static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar ResScalar; + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all::type ActualRhsTypeCleaned; + typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); + typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) + * RhsBlasTraits::extractScalarFactor(rhs); + enum { + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 + }; + gemv_static_vector_if static_rhs; + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(), + DirectlyUseRhs ? 
const_cast(actualRhs.data()) : static_rhs.data()); + if(!DirectlyUseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + Index size = actualRhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map(actualRhsPtr, actualRhs.size()) = actualRhs; + } + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + general_matrix_vector_product + ::run( + actualLhs.rows(), actualLhs.cols(), + LhsMapper(actualLhs.data(), actualLhs.outerStride()), + RhsMapper(actualRhsPtr, 1), + dest.data(), dest.col(0).innerStride(), + actualAlpha); + } +}; +template<> struct gemv_dense_selector +{ + template + static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + EIGEN_STATIC_ASSERT((!nested_eval::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE); + typename nested_eval::type actual_rhs(rhs); + const Index size = rhs.rows(); + for(Index k=0; k struct gemv_dense_selector +{ + template + static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha) + { + EIGEN_STATIC_ASSERT((!nested_eval::Evaluate),EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE); + typename nested_eval::type actual_rhs(rhs); + const Index rows = dest.rows(); + for(Index i=0; i +template +inline const Product +MatrixBase::operator*(const MatrixBase &other) const +{ + enum { + ProductIsValid = Derived::ColsAtCompileTime==Dynamic + || OtherDerived::RowsAtCompileTime==Dynamic + || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime), + AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime, + SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived) + }; + EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes), + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS) + EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors), + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION) + EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT) +#ifdef EIGEN_DEBUG_PRODUCT + internal::product_type::debug(); +#endif + return Product(derived(), other.derived()); +} +#endif +template +template +const Product +MatrixBase::lazyProduct(const MatrixBase &other) const +{ + enum { + ProductIsValid = Derived::ColsAtCompileTime==Dynamic + || OtherDerived::RowsAtCompileTime==Dynamic + || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime), + AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime, + SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived) + }; + EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes), + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS) + EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors), + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION) + EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT) + return Product(derived(), other.derived()); +} +} +#endif +// end #include "src/Core/GeneralProduct.h" +// #include "src/Core/products/GeneralBlockPanelKernel.h" +#ifndef EIGEN_GENERAL_BLOCK_PANEL_H +#define EIGEN_GENERAL_BLOCK_PANEL_H +namespace Eigen { +namespace internal { +template +class gebp_traits; +inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) +{ + return a<=0 ? 
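// operator* above only validates dimensions at compile time and returns an
// unevaluated Product expression; lazyProduct yields the coefficient-based
// form whose (i,j) entry is a plain dot product over the inner dimension.
// That per-coefficient semantics as a standalone scalar sketch (hypothetical
// helper, not the Eigen evaluator):
#include <cstddef>

inline double product_coeff_sketch(std::size_t depth,
                                   const double* lhsRow, std::size_t lhsStride,
                                   const double* rhsCol, std::size_t rhsStride)
{
  double acc = 0.0;
  for (std::size_t k = 0; k < depth; ++k)   // walk the inner dimension
    acc += lhsRow[k * lhsStride] * rhsCol[k * rhsStride];
  return acc;                               // = C(row, col)
}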
b : a; +} +#if EIGEN_ARCH_i386_OR_x86_64 +const std::ptrdiff_t defaultL1CacheSize = 32*1024; +const std::ptrdiff_t defaultL2CacheSize = 256*1024; +const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +#else +const std::ptrdiff_t defaultL1CacheSize = 16*1024; +const std::ptrdiff_t defaultL2CacheSize = 512*1024; +const std::ptrdiff_t defaultL3CacheSize = 512*1024; +#endif +struct CacheSizes { + CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { + int l1CacheSize, l2CacheSize, l3CacheSize; + queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize); + m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize); + m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize); + m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize); + } + std::ptrdiff_t m_l1; + std::ptrdiff_t m_l2; + std::ptrdiff_t m_l3; +}; +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) +{ + static CacheSizes m_cacheSizes; + if(action==SetAction) + { + eigen_internal_assert(l1!=0 && l2!=0); + m_cacheSizes.m_l1 = *l1; + m_cacheSizes.m_l2 = *l2; + m_cacheSizes.m_l3 = *l3; + } + else if(action==GetAction) + { + eigen_internal_assert(l1!=0 && l2!=0); + *l1 = m_cacheSizes.m_l1; + *l2 = m_cacheSizes.m_l2; + *l3 = m_cacheSizes.m_l3; + } + else + { + eigen_internal_assert(false); + } +} +template +void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + typedef gebp_traits Traits; + std::ptrdiff_t l1, l2, l3; + manage_caching_sizes(GetAction, &l1, &l2, &l3); + if (num_threads > 1) { + typedef typename Traits::ResScalar ResScalar; + enum { + kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + ksub = Traits::mr * Traits::nr * sizeof(ResScalar), + kr = 8, + mr = Traits::mr, + nr = Traits::nr + }; + const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); + if (k_cache < k) { + k = k_cache - (k_cache % kr); + eigen_internal_assert(k > 0); + } + const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + const Index n_per_thread = numext::div_ceil(n, num_threads); + if (n_cache <= n_per_thread) { + eigen_internal_assert(n_cache >= static_cast(nr)); + n = n_cache - (n_cache % nr); + eigen_internal_assert(n > 0); + } else { + n = (numext::mini)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr)); + } + if (l3 > l2) { + const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + const Index m_per_thread = numext::div_ceil(m, num_threads); + if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { + m = m_cache - (m_cache % mr); + eigen_internal_assert(m > 0); + } else { + m = (numext::mini)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr)); + } + } + } + else { +#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + l1 = 9*1024; + l2 = 32*1024; + l3 = 512*1024; +#endif + if((numext::maxi)(k,(numext::maxi)(m,n))<48) + return; + typedef typename Traits::ResScalar ResScalar; + enum { + k_peeling = 8, + k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) + }; + const Index max_kc = numext::maxi(((l1-k_sub)/k_div) & (~(k_peeling-1)),1); + const Index old_k = k; + if(k>max_kc) + { + k = (k%max_kc)==0 ? 
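// evaluateProductBlockingSizesHeuristic above sizes the GEMM panels so that a
// depth*mr sliver of the lhs plus a depth*nr sliver of the rhs stay resident
// in L1 while the larger packed panels fill L2/L3. A simplified
// single-threaded version of that computation (constants and rounding here
// are illustrative assumptions, not Eigen's exact formula):
#include <algorithm>
#include <cstddef>

struct BlockSizes { std::ptrdiff_t kc, mc, nc; };

inline BlockSizes blocking_sketch(std::ptrdiff_t l1, std::ptrdiff_t l2,
                                  std::ptrdiff_t mr, std::ptrdiff_t nr,
                                  std::ptrdiff_t scalar_bytes)
{
  // kc: how much of the inner dimension fits in L1 alongside both slivers.
  std::ptrdiff_t kc = l1 / ((mr + nr) * scalar_bytes);
  kc = std::max<std::ptrdiff_t>(kc & ~std::ptrdiff_t(7), 8); // multiple of 8
  // mc: rows of the packed lhs panel filling roughly a third of L2.
  std::ptrdiff_t mc = (l2 / 3) / (kc * scalar_bytes);
  mc = std::max(mc - mc % mr, mr);
  // nc: Eigen derives this from L3; keep a fixed multiple of nr here.
  std::ptrdiff_t nc = 256 * nr;
  return BlockSizes{kc, mc, nc};
}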
max_kc + : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); + } + #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + const Index actual_l2 = l3; + #else + const Index actual_l2 = 1572864; + #endif + Index max_nc; + const Index lhs_bytes = m * k * sizeof(LhsScalar); + const Index remaining_l1 = l1- k_sub - lhs_bytes; + if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k) + { + max_nc = remaining_l1 / (k*sizeof(RhsScalar)); + } + else + { + max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); + } + Index nc = numext::mini(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + if(n>nc) + { + n = (n%nc)==0 ? nc + : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1)))); + } + else if(old_k==k) + { + Index problem_size = k*n*sizeof(LhsScalar); + Index actual_lm = actual_l2; + Index max_mc = m; + if(problem_size<=1024) + { + actual_lm = l1; + } + else if(l3!=0 && problem_size<=32768) + { + actual_lm = l2; + max_mc = (numext::mini)(576,max_mc); + } + Index mc = (numext::mini)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + if (mc > Traits::mr) mc -= mc % Traits::mr; + else if (mc==0) return; + m = (m%mc)==0 ? mc + : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1)))); + } + } +} +template +inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) +{ +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + k = numext::mini(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + return true; + } +#else + EIGEN_UNUSED_VARIABLE(k) + EIGEN_UNUSED_VARIABLE(m) + EIGEN_UNUSED_VARIABLE(n) +#endif + return false; +} +template +void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + if (!useSpecificBlockingSizes(k, m, n)) { + evaluateProductBlockingSizesHeuristic(k, m, n, num_threads); + } +} +template +inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + computeProductBlockingSizes(k, m, n, num_threads); +} +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD + #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#else + template struct gebp_madd_selector { + EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& ) + { + c = cj.pmadd(a,b,c); + } + }; + template struct gebp_madd_selector { + EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) + { + t = b; t = cj.pmul(a,t); c = padd(c,t); + } + }; + template + EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) + { + gebp_madd_selector::run(cj,a,b,c,t); + } + #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +#endif +template +class gebp_traits +{ +public: + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
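// The CJMADD/gebp_madd machinery above compiles to a single fused
// multiply-add when the target provides one (EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
// and otherwise keeps an explicit temporary so the multiply and add remain
// independent instructions with shorter dependency chains. The two code
// shapes on scalars (double stands in for the packet type):
inline void madd_fused_sketch(double a, double b, double& c)
{
  c = a * b + c;      // maps to one fma instruction where available
}

inline void madd_split_sketch(double a, double b, double& c, double& tmp)
{
  tmp = b;            // copy so b itself is not clobbered
  tmp = a * tmp;      // separate multiply...
  c   = c + tmp;      // ...then separate add
}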
packet_traits::size : 1, + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, + default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) + mr = Vectorizable ? 3*LhsPacketSize : default_mr, +#else + mr = default_mr, +#endif + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + typedef ResPacket AccPacket; + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1(ResScalar(0)); + } + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + { + dest = pset1(*b); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = ploadquad(b); + } + template + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const + { + dest = pload(a); + } + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu(a); + } + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + { + conj_helper cj; +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c = cj.pmadd(a,b,c); +#else + tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp); +#endif + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = pmadd(c,alpha,r); + } + template + EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const + { + r = pmadd(c,alpha,r); + } +}; +template +class gebp_traits, RealScalar, _ConjLhs, false> +{ +public: + typedef std::complex LhsScalar; + typedef RealScalar RhsScalar; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + enum { + ConjLhs = _ConjLhs, + ConjRhs = false, + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1, + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + typedef ResPacket AccPacket; + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1(ResScalar(0)); + } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pset1(*b); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + dest = pset1(*b); + } + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload(a); + } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu(a); + } + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename conditional::type()); + } + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + { +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a.v,b,c.v); +#else + tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); +#endif + } + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const + { + c += a * b; + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(c,alpha,r); + } +protected: + conj_helper cj; +}; +template +struct DoublePacket +{ + Packet first; + Packet second; +}; +template +DoublePacket padd(const DoublePacket &a, const DoublePacket &b) +{ + DoublePacket res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} +template +const DoublePacket& predux_downto4(const DoublePacket &a) +{ + return a; +} +template struct unpacket_traits > { typedef DoublePacket half; }; +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +{ +public: + typedef std::complex Scalar; + typedef std::complex LhsScalar; + typedef std::complex RhsScalar; + typedef std::complex ResScalar; + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits::Vectorizable + && packet_traits::Vectorizable, + RealPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? 
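// For complex*complex kernels the traits above keep each broadcast rhs value
// as a DoublePacket: one packet holding only real parts and one holding only
// imaginary parts, so the hot loop runs two independent real madds and the
// (re,im) recombination is deferred to acc(). A scalar sketch of that split
// accumulation for one coefficient (std::complex stands in for the packets;
// the final swap of acc_im's lanes is what pcplxflip does per packet):
#include <complex>

inline std::complex<double>
split_cplx_mac_sketch(const std::complex<double>* a,
                      const std::complex<double>* b, int depth)
{
  std::complex<double> acc_re(0, 0), acc_im(0, 0);
  for (int k = 0; k < depth; ++k) {
    acc_re += a[k] * b[k].real();   // "first": lhs scaled by re(rhs)
    acc_im += a[k] * b[k].imag();   // "second": lhs scaled by im(rhs)
  }
  // (x+iy)*(u+iv) = (xu - yv) + i(xv + yu), recovered from the two halves:
  return std::complex<double>(acc_re.real() - acc_im.imag(),
                              acc_re.imag() + acc_im.real());
}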
packet_traits::size : 1, + nr = 4, + mr = ResPacketSize, + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + typedef typename packet_traits::type RealPacket; + typedef typename packet_traits::type ScalarPacket; + typedef DoublePacket DoublePacketType; + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + typedef typename conditional::type AccPacket; + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) + { + p.first = pset1(RealScalar(0)); + p.second = pset1(RealScalar(0)); + } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + { + dest = pset1(*b); + } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + { + dest.first = pset1(real(*b)); + dest.second = pset1(imag(*b)); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + { + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload((const typename unpacket_traits::type*)(a)); + } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu((const typename unpacket_traits::type*)(a)); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& ) const + { + c.first = padd(pmul(a,b.first), c.first); + c.second = padd(pmul(a,b.second),c.second); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& ) const + { + c = cj.pmadd(a,b,c); + } + EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + { + ResPacket tmp; + if((!ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(pconj(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); + } + else if((!ConjLhs)&&(ConjRhs)) + { + tmp = pconj(pcplxflip(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); + } + else if((ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(ResPacket(c.second)); + tmp = padd(pconj(ResPacket(c.first)),tmp); + } + else if((ConjLhs)&&(ConjRhs)) + { + tmp = pcplxflip(ResPacket(c.second)); + tmp = psub(pconj(ResPacket(c.first)),tmp); + } + r = pmadd(tmp,alpha,r); + } +protected: + conj_helper cj; +}; +template +class gebp_traits, false, _ConjRhs > +{ +public: + typedef std::complex Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + enum { + ConjLhs = false, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits::Vectorizable + && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? 
packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + typedef ResPacket AccPacket; + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1(ResScalar(0)); + } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pset1(*b); + } + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup(a); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup(a); + } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename conditional::type()); + } + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + { +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else + tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + } + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const + { + c += a * b; + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(alpha,c,r); + } +protected: + conj_helper cj; +}; +template +struct gebp_kernel +{ + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + typedef typename Traits::AccPacket AccPacket; + typedef gebp_traits SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { + Vectorizable = Traits::Vectorizable, + LhsProgress = Traits::LhsProgress, + RhsProgress = Traits::RhsProgress, + ResPacketSize = Traits::ResPacketSize + }; + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; +template +EIGEN_DONT_INLINE +void gebp_kernel + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + Traits traits; + SwappedTraits straits; + if(strideA==-1) strideA = depth; + 
if(strideB==-1) strideB = depth; + conj_helper cj; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; + const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + enum { pk = 8 }; + const Index peeled_kc = depth & ~(pk-1); + const Index prefetch_res_offset = 32/sizeof(ResScalar); + if(mr>=3*Traits::LhsProgress) + { + const Index l1 = defaultL1CacheSize; + const Index actual_panel_rows = (3*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); + for(Index i1=0; i1(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C5, alphav, R1); + traits.acc(C9, alphav, R2); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C10, alphav, R2); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C7, alphav, R1); + traits.acc(C11, alphav, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); + } + } + for(Index j2=packet_cols4; j2(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + } + } + } + } + if(mr>=2*Traits::LhsProgress) + { + const Index l1 = defaultL1CacheSize; + Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); + for(Index i1=peeled_mc3; i1(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C5, alphav, R3); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + 
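// The peeled loop above is the classic GEBP micro-kernel: twelve accumulator
// packets (C0..C11) stay in registers for a (3*PacketSize) x 4 tile, the
// packed lhs/rhs panels are streamed through, and the tile is merged into the
// result once per (i,j) pair via acc(). A scalar model with mr=3, nr=4, with
// the packed operands assumed laid out as the pack routines below produce:
#include <cstddef>

inline void micro_kernel_3x4_sketch(std::size_t depth,
                                    const double* blockA,  // 3 rows, packed
                                    const double* blockB,  // 4 cols, packed
                                    double alpha,
                                    double* C, std::size_t ldc)
{
  double acc[3][4] = {};                      // all accumulators in registers
  for (std::size_t k = 0; k < depth; ++k) {
    const double* a = blockA + 3 * k;         // one mr-sliver per k
    const double* b = blockB + 4 * k;         // one nr-sliver per k
    for (int i = 0; i < 3; ++i)
      for (int j = 0; j < 4; ++j)
        acc[i][j] += a[i] * b[j];
  }
  for (int j = 0; j < 4; ++j)                 // single pass over the result
    for (int i = 0; i < 3; ++i)
      C[i + j * ldc] += alpha * acc[i][j];
}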
r1.storePacket(1 * Traits::ResPacketSize, R3); + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C7, alphav, R3); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); + } + } + for(Index j2=packet_cols4; j2(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + } + } + } + } + if(mr>=1*Traits::LhsProgress) + { + for(Index i=peeled_mc2; i(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); + } + for(Index j2=packet_cols4; j2(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); + } + } + } + if(peeled_mc1::half SResPacketHalf; + if ((SwappedTraits::LhsProgress % 4) == 0 && + (SwappedTraits::LhsProgress <= 8) && + (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + { + SAccPacket C0, C1, C2, C3; + straits.initAcc(C0); + straits.initAcc(C1); + straits.initAcc(C2); + straits.initAcc(C3); + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); + const Index endk = (depth/spk)*spk; + const Index endk4 = (depth/(spk*4))*(spk*4); + Index k=0; + for(; k=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; + SResPacketHalf R = res.template gatherPacket(i, j2); + SResPacketHalf alphav = pset1(alpha); + if(depth-endk>0) + { + SLhsPacketHalf a0; + SRhsPacketHalf b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + SAccPacketHalf c0 = predux_downto4(C0); + straits.madd(a0,b0,c0,b0); + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_downto4(C0), alphav, R); + } + res.scatterPacket(i, j2, R); + } + else + { + SResPacket R = res.template gatherPacket(i, j2); + SResPacket alphav = pset1(alpha); + straits.acc(C0, alphav, R); + res.scatterPacket(i, j2, R); + } + } + else + { + ResScalar C0(0), C1(0), C2(0), C3(0); + for(Index k=0; k +struct gemm_pack_lhs +{ + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename 
packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); + conj_if::IsComplex && Conjugate> cj; + Index count = 0; + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + Index i=0; + if(Pack1>=3*PacketSize) + { + for(; i=2*PacketSize) + { + for(; i=1*PacketSize) + { + for(; i1) + { + for(; i +struct gemm_pack_lhs +{ + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; +template +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index count = 0; + int pack = Pack1; + Index i = 0; + while(pack>0) + { + Index remaining_rows = rows-i; + Index peeled_mc = i+(remaining_rows/pack)*pack; + for(; i=PacketSize) + { + for(; k kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } + count += PacketSize*pack; + } + } + for(; k +struct gemm_pack_rhs +{ + typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { PacketSize = packet_traits::size }; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
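// gemm_pack_lhs above rearranges the lhs into mr-tall slivers so the micro
// kernel reads it with unit stride; the row-major variant uses ptranspose on
// PacketSize x PacketSize tiles to reach the same layout. A simplified packer
// for column-major input with mr=3 (padding, conjugation, and panel mode
// omitted; this is an editor's sketch, not the Eigen signature):
#include <cstddef>

inline void pack_lhs_sketch(double* blockA, const double* A, std::size_t lda,
                            std::size_t depth, std::size_t rows)
{
  std::size_t count = 0;
  for (std::size_t i = 0; i + 3 <= rows; i += 3)   // one 3-row sliver
    for (std::size_t k = 0; k < depth; ++k)        // walk the depth
      for (std::size_t r = 0; r < 3; ++r)
        blockA[count++] = A[(i + r) + k * lda];    // contiguous in blockA
  // Leftover rows (rows % 3) would be packed one row at a time.
}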
(cols/4) * 4 : 0; + Index count = 0; + const Index peeled_k = (depth/PacketSize)*PacketSize; + if(nr>=4) + { + for(Index j2=packet_cols8; j2 kernel; + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1%PacketSize] = dm1.loadPacket(k); + kernel.packet[2%PacketSize] = dm2.loadPacket(k); + kernel.packet[3%PacketSize] = dm3.loadPacket(k); + ptranspose(kernel); + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize])); + count+=4*PacketSize; + } + } + for(; k +struct gemm_pack_rhs +{ + typedef typename packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; + enum { PacketSize = packet_traits::size }; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; +template +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; + Index count = 0; + if(nr>=4) + { + for(Index j2=packet_cols8; j2 +namespace Eigen { +namespace internal { +inline void manage_multi_threading(Action action, int* v) +{ + static EIGEN_UNUSED int m_maxThreads = -1; + if(action==SetAction) + { + eigen_internal_assert(v!=0); + m_maxThreads = *v; + } + else if(action==GetAction) + { + eigen_internal_assert(v!=0); + #ifdef EIGEN_HAS_OPENMP + if(m_maxThreads>0) + *v = m_maxThreads; + else + *v = omp_get_max_threads(); + #else + *v = 1; + #endif + } + else + { + eigen_internal_assert(false); + } +} +} +inline void initParallel() +{ + int nbt; + internal::manage_multi_threading(GetAction, &nbt); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); +} +inline int nbThreads() +{ + int ret; + internal::manage_multi_threading(GetAction, &ret); + return ret; +} +inline void setNbThreads(int v) +{ + internal::manage_multi_threading(SetAction, &v); +} +namespace internal { +template struct GemmParallelInfo +{ + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} + std::atomic sync; + std::atomic users; + Index lhs_start; + Index lhs_length; +}; +template +void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose) +{ +#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS) + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(transpose); + func(0,rows, 0,cols); +#else + Index size = transpose ? 
rows : cols; + Index pb_max_threads = std::max(1,size / Functor::Traits::nr); + double work = static_cast(rows) * static_cast(cols) * + static_cast(depth); + double kMinTaskSize = 50000; + pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); + Index threads = std::min(nbThreads(), pb_max_threads); + if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) + return func(0,rows, 0,cols); + Eigen::initParallel(); + func.initParallelSession(threads); + if(transpose) + std::swap(rows,cols); + ei_declare_aligned_stack_constructed_variable(GemmParallelInfo,info,threads,0); + #pragma omp parallel num_threads(threads) + { + Index i = omp_get_thread_num(); + Index actual_threads = omp_get_num_threads(); + Index blockCols = (cols / actual_threads) & ~Index(0x3); + Index blockRows = (rows / actual_threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; + Index r0 = i*blockRows; + Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; + Index c0 = i*blockCols; + Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols; + info[i].lhs_start = r0; + info[i].lhs_length = actualBlockRows; + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); + } +#endif +} +} +} +#endif +// end #include "src/Core/products/Parallelizer.h" +// #include "src/Core/ProductEvaluators.h" +#ifndef EIGEN_PRODUCTEVALUATORS_H +#define EIGEN_PRODUCTEVALUATORS_H +namespace Eigen { +namespace internal { +template +struct evaluator > + : public product_evaluator > +{ + typedef Product XprType; + typedef product_evaluator Base; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} +}; +template +struct evaluator_assume_aliasing, + const CwiseNullaryOp, Plain1>, + const Product > > +{ + static const bool value = true; +}; +template +struct evaluator, + const CwiseNullaryOp, Plain1>, + const Product > > + : public evaluator > +{ + typedef CwiseBinaryOp, + const CwiseNullaryOp, Plain1>, + const Product > XprType; + typedef evaluator > Base; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) + {} +}; +template +struct evaluator, DiagIndex> > + : public evaluator, DiagIndex> > +{ + typedef Diagonal, DiagIndex> XprType; + typedef evaluator, DiagIndex> > Base; + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + : Base(Diagonal, DiagIndex>( + Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), + xpr.index() )) + {} +}; +template< typename Lhs, typename Rhs, + typename LhsShape = typename evaluator_traits::Shape, + typename RhsShape = typename evaluator_traits::Shape, + int ProductType = internal::product_type::value> +struct generic_product_impl; +template +struct evaluator_assume_aliasing > { + static const bool value = true; +}; +template +struct product_evaluator, ProductTag, LhsShape, RhsShape> + : public evaluator::PlainObject> +{ + typedef Product XprType; + typedef typename XprType::PlainObject PlainObject; + typedef evaluator Base; + enum { + Flags = Base::Flags | EvalBeforeNestingBit + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) + : m_result(xpr.rows(), xpr.cols()) + { + ::new (static_cast(this)) Base(m_result); + generic_product_impl::evalTo(m_result, xpr.lhs(), xpr.rhs()); + } +protected: + PlainObject m_result; +}; +template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar> +struct Assignment, 
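// parallelize_gemm above slices the result into per-thread row/column bands,
// rounding the row band down to a multiple of the kernel's mr (and the column
// band to a multiple of 4), with the last thread absorbing the remainder.
// The band arithmetic pulled out as a standalone helper (hypothetical name;
// mirrors only the index computation, not the OpenMP session handling):
#include <cstddef>

struct Band { std::ptrdiff_t begin, length; };

inline Band thread_band_sketch(std::ptrdiff_t n, int thread, int num_threads,
                               std::ptrdiff_t multiple)
{
  std::ptrdiff_t block = n / num_threads;
  block -= block % multiple;                   // keep kernel alignment
  std::ptrdiff_t begin = thread * block;
  std::ptrdiff_t length =
      (thread + 1 == num_threads) ? n - begin  // last thread takes the rest
                                  : block;
  return Band{begin, length};
}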
internal::assign_op, Dense2Dense, + typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> +{ + typedef Product SrcXprType; + static EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) + { + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) + dst.resize(dstRows, dstCols); + generic_product_impl::evalTo(dst, src.lhs(), src.rhs()); + } +}; +template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar> +struct Assignment, internal::add_assign_op, Dense2Dense, + typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> +{ + typedef Product SrcXprType; + static EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) + { + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + generic_product_impl::addTo(dst, src.lhs(), src.rhs()); + } +}; +template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar> +struct Assignment, internal::sub_assign_op, Dense2Dense, + typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> +{ + typedef Product SrcXprType; + static EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) + { + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); + generic_product_impl::subTo(dst, src.lhs(), src.rhs()); + } +}; +template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis, typename Plain> +struct Assignment, const CwiseNullaryOp,Plain>, + const Product >, AssignFunc, Dense2Dense> +{ + typedef CwiseBinaryOp, + const CwiseNullaryOp,Plain>, + const Product > SrcXprType; + static EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) + { + call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); + } +}; +template +struct evaluator_assume_aliasing::Scalar>, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; +template +struct evaluator_assume_aliasing::Scalar>, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; +template +struct assignment_from_xpr_op_product +{ + template + static EIGEN_STRONG_INLINE + void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& ) + { + call_assignment_no_alias(dst, src.lhs(), Func1()); + call_assignment_no_alias(dst, src.rhs(), Func2()); + } +}; +#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP,BINOP,ASSIGN_OP2) \ + template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, typename SrcScalar, typename OtherScalar,typename ProdScalar> \ + struct Assignment, const OtherXpr, \ + const Product >, internal::ASSIGN_OP, Dense2Dense> \ + : assignment_from_xpr_op_product, internal::ASSIGN_OP, internal::ASSIGN_OP2 > \ + {} +EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op, scalar_sum_op,add_assign_op); +EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_sum_op,add_assign_op); +EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_sum_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op, scalar_difference_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op,scalar_difference_op,sub_assign_op); +EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op,scalar_difference_op,add_assign_op); +template +struct 
generic_product_impl +{ + template + static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); + } + template + static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); + } + template + static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } +}; +template +void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +{ + evaluator rhsEval(rhs); + typename nested_eval::type actual_lhs(lhs); + const Index cols = dst.cols(); + for (Index j=0; j +void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +{ + evaluator lhsEval(lhs); + typename nested_eval::type actual_rhs(rhs); + const Index rows = dst.rows(); + for (Index i=0; i +struct generic_product_impl +{ + template struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {}; + typedef typename Product::Scalar Scalar; + struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct adds { + Scalar m_scale; + explicit adds(const Scalar& s) : m_scale(s) {} + template void operator()(const Dst& dst, const Src& src) const { + dst.const_cast_derived() += m_scale * src; + } + }; + template + static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); + } + template + static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); + } + template + static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); + } + template + static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + { + internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); + } +}; +template +struct generic_product_impl_base +{ + typedef typename Product::Scalar Scalar; + template + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } + template + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } + template + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } + template + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } +}; +template +struct generic_product_impl + : generic_product_impl_base > +{ + typedef typename nested_eval::type LhsNested; + typedef typename nested_eval::type RhsNested; + typedef typename Product::Scalar Scalar; + enum { Side = Lhs::IsVectorAtCompileTime ? 
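// The outer-product path above never forms a temporary: for a column-major
// destination, outer_product_selector_run walks the columns and applies
// func(dst.col(j), lhs * rhs(j)), where func is one of the set/add/sub/adds
// functors. The column-major case as plain loops, with the functor fixed to
// += for illustration:
#include <cstddef>

inline void outer_product_addto_sketch(std::size_t rows, std::size_t cols,
                                       const double* x,  // lhs column vector
                                       const double* y,  // rhs row vector
                                       double* C, std::size_t ldc)
{
  for (std::size_t j = 0; j < cols; ++j) {
    const double yj = y[j];
    for (std::size_t i = 0; i < rows; ++i)
      C[i + j * ldc] += x[i] * yj;   // rank-1 update, one column at a time
  }
}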
OnTheLeft : OnTheRight }; + typedef typename internal::remove_all::type>::type MatrixType; + template + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + { + LhsNested actual_lhs(lhs); + RhsNested actual_rhs(rhs); + internal::gemv_dense_selector::HasUsableDirectAccess) + >::run(actual_lhs, actual_rhs, dst, alpha); + } +}; +template +struct generic_product_impl +{ + typedef typename Product::Scalar Scalar; + template + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); + } + template + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); + } + template + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); + } +}; +template +struct generic_product_impl + : generic_product_impl {}; +template +struct etor_product_coeff_impl; +template +struct etor_product_packet_impl; +template +struct product_evaluator, ProductTag, DenseShape, DenseShape> + : evaluator_base > +{ + typedef Product XprType; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit product_evaluator(const XprType& xpr) + : m_lhs(xpr.lhs()), + m_rhs(xpr.rhs()), + m_lhsImpl(m_lhs), + m_rhsImpl(m_rhs), + m_innerDim(xpr.lhs().cols()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); + EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::AddCost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); +#if 0 + std::cerr << "LhsOuterStrideBytes= " << LhsOuterStrideBytes << "\n"; + std::cerr << "RhsOuterStrideBytes= " << RhsOuterStrideBytes << "\n"; + std::cerr << "LhsAlignment= " << LhsAlignment << "\n"; + std::cerr << "RhsAlignment= " << RhsAlignment << "\n"; + std::cerr << "CanVectorizeLhs= " << CanVectorizeLhs << "\n"; + std::cerr << "CanVectorizeRhs= " << CanVectorizeRhs << "\n"; + std::cerr << "CanVectorizeInner= " << CanVectorizeInner << "\n"; + std::cerr << "EvalToRowMajor= " << EvalToRowMajor << "\n"; + std::cerr << "Alignment= " << Alignment << "\n"; + std::cerr << "Flags= " << Flags << "\n"; +#endif + } + typedef typename internal::nested_eval::type LhsNested; + typedef typename internal::nested_eval::type RhsNested; + typedef typename internal::remove_all::type LhsNestedCleaned; + typedef typename internal::remove_all::type RhsNestedCleaned; + typedef evaluator LhsEtorType; + typedef evaluator RhsEtorType; + enum { + RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime, + ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime, + InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime), + MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime, + MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime + }; + typedef typename find_best_packet::type LhsVecPacketType; + typedef typename find_best_packet::type RhsVecPacketType; + enum { + LhsCoeffReadCost = LhsEtorType::CoeffReadCost, + RhsCoeffReadCost = RhsEtorType::CoeffReadCost, + CoeffReadCost = InnerSize==0 ? NumTraits::ReadCost + : InnerSize == Dynamic ? 
HugeCost + : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + + (InnerSize - 1) * NumTraits::AddCost, + Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT, + LhsFlags = LhsEtorType::Flags, + RhsFlags = RhsEtorType::Flags, + LhsRowMajor = LhsFlags & RowMajorBit, + RhsRowMajor = RhsFlags & RowMajorBit, + LhsVecPacketSize = unpacket_traits::size, + RhsVecPacketSize = unpacket_traits::size, + LhsAlignment = EIGEN_PLAIN_ENUM_MIN(LhsEtorType::Alignment,LhsVecPacketSize*int(sizeof(typename LhsNestedCleaned::Scalar))), + RhsAlignment = EIGEN_PLAIN_ENUM_MIN(RhsEtorType::Alignment,RhsVecPacketSize*int(sizeof(typename RhsNestedCleaned::Scalar))), + SameType = is_same::value, + CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime!=1), + CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime!=1), + EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : (bool(RhsRowMajor) && !CanVectorizeLhs), + Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) + | (EvalToRowMajor ? RowMajorBit : 0) + | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) + | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0), + LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)), + RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)), + Alignment = bool(CanVectorizeLhs) ? (LhsOuterStrideBytes<=0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment) + : bool(CanVectorizeRhs) ? (RhsOuterStrideBytes<=0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment) + : 0, + CanVectorizeInner = SameType + && LhsRowMajor + && (!RhsRowMajor) + && (LhsFlags & RhsFlags & ActualPacketAccessBit) + && (InnerSize % packet_traits::size == 0) + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const + { + return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); + } + EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const + { + const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; + const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; + return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum(); + } + template + const PacketType packet(Index row, Index col) const + { + PacketType res; + typedef etor_product_packet_impl PacketImpl; + PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res); + return res; + } + template + const PacketType packet(Index index) const + { + const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; + const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 
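// The evaluator above unrolls the inner dot product whenever the estimated
// CoeffReadCost stays under EIGEN_UNROLLING_LIMIT; etor_product_packet_impl
// below is the recursive template that performs that unrolling. The same
// pattern on scalars, with the inner depth as a compile-time constant
// (hypothetical name, editor's sketch):
template <int K>
struct unrolled_dot_sketch {
  static double run(const double* a, const double* b)
  {
    // Peel the last term; recursion bottoms out at K == 1 below.
    return unrolled_dot_sketch<K - 1>::run(a, b) + a[K - 1] * b[K - 1];
  }
};
template <>
struct unrolled_dot_sketch<1> {
  static double run(const double* a, const double* b) { return a[0] * b[0]; }
};
// unrolled_dot_sketch<4>::run(a, b) expands to a[0]*b[0] + ... + a[3]*b[3]
// with no loop, mirroring the Unroll==true specializations.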
index : 0; + return packet(row,col); + } +protected: + typename internal::add_const_on_value_type::type m_lhs; + typename internal::add_const_on_value_type::type m_rhs; + LhsEtorType m_lhsImpl; + RhsEtorType m_rhsImpl; + Index m_innerDim; +}; +template +struct product_evaluator, LazyCoeffBasedProductMode, DenseShape, DenseShape> + : product_evaluator, CoeffBasedProductMode, DenseShape, DenseShape> +{ + typedef Product XprType; + typedef Product BaseProduct; + typedef product_evaluator Base; + enum { + Flags = Base::Flags | EvalBeforeNestingBit + }; + EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + : Base(BaseProduct(xpr.lhs(),xpr.rhs())) + {} +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + { + etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); + res = pmadd(pset1(lhs.coeff(row, Index(UnrollingIndex-1))), rhs.template packet(Index(UnrollingIndex-1), col), res); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res) + { + etor_product_packet_impl::run(row, col, lhs, rhs, innerDim, res); + res = pmadd(lhs.template packet(row, Index(UnrollingIndex-1)), pset1(rhs.coeff(Index(UnrollingIndex-1), col)), res); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index , Packet &res) + { + res = pmul(pset1(lhs.coeff(row, Index(0))),rhs.template packet(Index(0), col)); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index , Packet &res) + { + res = pmul(lhs.template packet(row, Index(0)), pset1(rhs.coeff(Index(0), col))); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index , Index , const Lhs& , const Rhs& , Index , Packet &res) + { + res = pset1(typename unpacket_traits::type(0)); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index , Index , const Lhs& , const Rhs& , Index , Packet &res) + { + res = pset1(typename unpacket_traits::type(0)); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + { + res = pset1(typename unpacket_traits::type(0)); + for(Index i = 0; i < innerDim; ++i) + res = pmadd(pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res); + } +}; +template +struct etor_product_packet_impl +{ + static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) + { + res = pset1(typename unpacket_traits::type(0)); + for(Index i = 0; i < innerDim; ++i) + res = pmadd(lhs.template packet(row, i), pset1(rhs.coeff(i, col)), res); + } +}; +template +struct triangular_product_impl; +template +struct generic_product_impl + : generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; + template + static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + { + triangular_product_impl + ::run(dst, lhs.nestedExpression(), rhs, alpha); + } +}; +template +struct generic_product_impl +: generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; + template + static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const 
Rhs& rhs, const Scalar& alpha) + { + triangular_product_impl::run(dst, lhs, rhs.nestedExpression(), alpha); + } +}; +template +struct selfadjoint_product_impl; +template +struct generic_product_impl + : generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; + template + static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + { + selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); + } +}; +template +struct generic_product_impl +: generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; + template + static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + { + selfadjoint_product_impl::run(dst, lhs, rhs.nestedExpression(), alpha); + } +}; +template +struct diagonal_product_evaluator_base + : evaluator_base +{ + typedef typename ScalarBinaryOpTraits::ReturnType Scalar; +public: + enum { + CoeffReadCost = NumTraits::MulCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost, + MatrixFlags = evaluator::Flags, + DiagFlags = evaluator::Flags, + _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor, + _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) + ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), + _SameTypes = is_same::value, + _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), + _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, + Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), + Alignment = evaluator::Alignment + }; + diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) + : m_diagImpl(diag), m_matImpl(mat) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const + { + return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); + } +protected: + template + EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const + { + return internal::pmul(m_matImpl.template packet(row, col), + internal::pset1(m_diagImpl.coeff(id))); + } + template + EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const + { + enum { + InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, + DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator::Alignment)) + }; + return internal::pmul(m_matImpl.template packet(row, col), + m_diagImpl.template packet(id)); + } + evaluator m_diagImpl; + evaluator m_matImpl; +}; +template +struct product_evaluator, ProductTag, DiagonalShape, DenseShape> + : diagonal_product_evaluator_base, OnTheLeft> +{ + typedef diagonal_product_evaluator_base, OnTheLeft> Base; + using Base::m_diagImpl; + using Base::m_matImpl; + using Base::coeff; + typedef typename Base::Scalar Scalar; + typedef Product XprType; + typedef typename XprType::PlainObject PlainObject; + enum { + StorageOrder = int(Rhs::Flags) & RowMajorBit ? 
RowMajor : ColMajor + }; + EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + : Base(xpr.rhs(), xpr.lhs().diagonal()) + { + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const + { + return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); + } +#ifndef __CUDACC__ + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const + { + return this->template packet_impl(row,col, row, + typename internal::conditional::type()); + } + template + EIGEN_STRONG_INLINE PacketType packet(Index idx) const + { + return packet(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx); + } +#endif +}; +template +struct product_evaluator, ProductTag, DenseShape, DiagonalShape> + : diagonal_product_evaluator_base, OnTheRight> +{ + typedef diagonal_product_evaluator_base, OnTheRight> Base; + using Base::m_diagImpl; + using Base::m_matImpl; + using Base::coeff; + typedef typename Base::Scalar Scalar; + typedef Product XprType; + typedef typename XprType::PlainObject PlainObject; + enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor }; + EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) + : Base(xpr.lhs(), xpr.rhs().diagonal()) + { + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const + { + return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); + } +#ifndef __CUDACC__ + template + EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const + { + return this->template packet_impl(row,col, col, + typename internal::conditional::type()); + } + template + EIGEN_STRONG_INLINE PacketType packet(Index idx) const + { + return packet(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx); + } +#endif +}; +template +struct permutation_matrix_product; +template +struct permutation_matrix_product +{ + typedef typename nested_eval::type MatrixType; + typedef typename remove_all::type MatrixTypeCleaned; + template + static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) + { + MatrixType mat(xpr); + const Index n = Side==OnTheLeft ? mat.rows() : mat.cols(); + if(is_same_dense(dst, mat)) + { + Matrix mask(perm.size()); + mask.fill(false); + Index r = 0; + while(r < perm.size()) + { + while(r=perm.size()) + break; + Index k0 = r++; + Index kPrev = k0; + mask.coeffRef(k0) = true; + for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k)) + { + Block(dst, k) + .swap(Block + (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev)); + mask.coeffRef(k) = true; + kPrev = k; + } + } + } + else + { + for(Index i = 0; i < n; ++i) + { + Block + (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i) + = + Block + (mat, ((Side==OnTheRight) ^ Transposed) ? 
perm.indices().coeff(i) : i); + } + } + } +}; +template +struct generic_product_impl +{ + template + static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + permutation_matrix_product::run(dst, lhs, rhs); + } +}; +template +struct generic_product_impl +{ + template + static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + permutation_matrix_product::run(dst, rhs, lhs); + } +}; +template +struct generic_product_impl, Rhs, PermutationShape, MatrixShape, ProductTag> +{ + template + static void evalTo(Dest& dst, const Inverse& lhs, const Rhs& rhs) + { + permutation_matrix_product::run(dst, lhs.nestedExpression(), rhs); + } +}; +template +struct generic_product_impl, MatrixShape, PermutationShape, ProductTag> +{ + template + static void evalTo(Dest& dst, const Lhs& lhs, const Inverse& rhs) + { + permutation_matrix_product::run(dst, rhs.nestedExpression(), lhs); + } +}; +template +struct transposition_matrix_product +{ + typedef typename nested_eval::type MatrixType; + typedef typename remove_all::type MatrixTypeCleaned; + template + static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr) + { + MatrixType mat(xpr); + typedef typename TranspositionType::StorageIndex StorageIndex; + const Index size = tr.size(); + StorageIndex j = 0; + if(!is_same_dense(dst,mat)) + dst = mat; + for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k +struct generic_product_impl +{ + template + static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + transposition_matrix_product::run(dst, lhs, rhs); + } +}; +template +struct generic_product_impl +{ + template + static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) + { + transposition_matrix_product::run(dst, rhs, lhs); + } +}; +template +struct generic_product_impl, Rhs, TranspositionsShape, MatrixShape, ProductTag> +{ + template + static void evalTo(Dest& dst, const Transpose& lhs, const Rhs& rhs) + { + transposition_matrix_product::run(dst, lhs.nestedExpression(), rhs); + } +}; +template +struct generic_product_impl, MatrixShape, TranspositionsShape, ProductTag> +{ + template + static void evalTo(Dest& dst, const Lhs& lhs, const Transpose& rhs) + { + transposition_matrix_product::run(dst, rhs.nestedExpression(), lhs); + } +}; +} +} +#endif +// end #include "src/Core/ProductEvaluators.h" +// #include "src/Core/products/GeneralMatrixVector.h" +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H +#define EIGEN_GENERAL_MATRIX_VECTOR_H +namespace Eigen { +namespace internal { +template +struct general_matrix_vector_product +{ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; +enum { + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable + && int(packet_traits::size)==int(packet_traits::size), + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1 +}; +typedef typename packet_traits::type _LhsPacket; +typedef typename packet_traits::type _RhsPacket; +typedef typename packet_traits::type _ResPacket; +typedef typename conditional::type LhsPacket; +typedef typename conditional::type RhsPacket; +typedef typename conditional::type ResPacket; +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + RhsScalar alpha); +}; +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + RhsScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ + pstore(&res[j], \ + padd(pload(&res[j]), \ + padd( \ + padd(pcj.pmul(lhs0.template load(j), ptmp0), \ + pcj.pmul(lhs1.template load(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load(j), ptmp2), \ + pcj.pmul(lhs3.template load(j), ptmp3)) ))) + typedef typename LhsMapper::VectorMapper LhsScalars; + conj_helper cj; + conj_helper pcj; + if(ConjugateRhs) + alpha = numext::conj(alpha); + enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; + const Index columnsAtOnce = 4; + const Index peels = 2; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index size = rows; + const Index lhsStride = lhs.stride(); + Index alignedStart = internal::first_default_aligned(res,size); + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; + const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? 
EvenAligned + : FirstAligned; + const Index lhsAlignmentOffset = lhs.firstAligned(size); + Index skipColumns = 0; + if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) ) + { + alignedSize = 0; + alignedStart = 0; + alignmentPattern = NoneAligned; + } + else if(LhsPacketSize > 4) + { + alignmentPattern = NoneAligned; + } + else if (LhsPacketSize>1) + { + while (skipColumns(alpha*rhs(i, 0)), + ptmp1 = pset1(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1(alpha*rhs(i+2, 0)), + ptmp3 = pset1(alpha*rhs(i+offset3, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); + if (Vectorizable) + { + for (Index j=0; jalignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j1) + { + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; + ResPacket T0, T1; + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); + for (; j(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); + A00 = lhs0.template load(j); + A10 = lhs0.template load(j+LhsPacketSize); + T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); + T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); + T0 = pcj.pmadd(A01, ptmp1, T0); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); + T0 = pcj.pmadd(A02, ptmp2, T0); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); + T0 = pcj.pmadd(A03, ptmp3, T0); + pstore(&res[j],T0); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); + T1 = pcj.pmadd(A11, ptmp1, T1); + T1 = pcj.pmadd(A12, ptmp2, T1); + T1 = pcj.pmadd(A13, ptmp3, T1); + pstore(&res[j+ResPacketSize],T1); + } + } + for (; j(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); + if (Vectorizable) + { + for (Index j=0; j(alignedStart)) + for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); + else + for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); + } + for (Index i=alignedSize; i +struct general_matrix_vector_product +{ +typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; +enum { + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable + && int(packet_traits::size)==int(packet_traits::size), + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1 +}; +typedef typename packet_traits::type _LhsPacket; +typedef typename packet_traits::type _RhsPacket; +typedef typename packet_traits::type _ResPacket; +typedef typename conditional::type LhsPacket; +typedef typename conditional::type RhsPacket; +typedef typename conditional::type ResPacket; +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha); +}; +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( + Index rows, Index cols, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } + conj_helper cj; + conj_helper pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; + const Index rowsAtOnce = 4; + const Index peels = 2; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index depth = cols; + const Index lhsStride = lhs.stride(); + Index alignedStart = rhs.firstAligned(depth); + Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; + const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? 
EvenAligned + : FirstAligned; + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); + Index skipRows = 0; + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || + (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || + (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) + { + alignedSize = 0; + alignedStart = 0; + alignmentPattern = NoneAligned; + } + else if(LhsPacketSize > 4) + { + alignmentPattern = NoneAligned; + } + else if (LhsPacketSize>1) + { + while (skipRows(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), + ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + for (Index j=0; jalignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j1) + { + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); + for (; j(0); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); + ptmp1 = pcj.pmadd(A01, b, ptmp1); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); + ptmp2 = pcj.pmadd(A02, b, ptmp2); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); + ptmp3 = pcj.pmadd(A03, b, ptmp3); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); + ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); + ptmp1 = pcj.pmadd(A11, b, ptmp1); + ptmp2 = pcj.pmadd(A12, b, ptmp2); + ptmp3 = pcj.pmadd(A13, b, ptmp3); + } + } + for (; j(tmp0); + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); + for (Index j=0; jalignedStart) + { + if (lhs0.template aligned(alignedStart)) + for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); + else + for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); + tmp0 += predux(ptmp0); + } + for (Index j=alignedSize; j class level3_blocking; +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct general_matrix_matrix_product +{ + typedef gebp_traits Traits; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + level3_blocking& blocking, + GemmParallelInfo* info = 0) + { + general_matrix_matrix_product + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + } +}; +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct general_matrix_matrix_product +{ +typedef gebp_traits Traits; +typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; +static void run(Index rows, Index cols, Index depth, + const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resStride, + ResScalar alpha, + level3_blocking& blocking, + GemmParallelInfo* info = 0) +{ + typedef const_blas_data_mapper LhsMapper; + typedef const_blas_data_mapper RhsMapper; + typedef blas_data_mapper 
ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + Index kc = blocking.kc(); + Index mc = (std::min)(rows,blocking.mc()); + Index nc = (std::min)(cols,blocking.nc()); + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; +#ifdef EIGEN_HAS_OPENMP + if(info) + { + int tid = omp_get_thread_num(); + int threads = omp_get_num_threads(); + LhsScalar* blockA = blocking.blockA(); + eigen_internal_assert(blockA!=0); + std::size_t sizeB = kc*nc; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); + for(Index k=0; k0) { + while(info[i].sync!=k) { + } + } + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); + } + for(Index j=nc; j +struct gemm_functor +{ + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking) + : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) + {} + void initParallelSession(Index num_threads) const + { + m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads); + m_blocking.allocateA(); + } + void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo* info=0) const + { + if(cols==-1) + cols = m_rhs.cols(); + Gemm::run(rows, cols, m_lhs.cols(), + &m_lhs.coeffRef(row,0), m_lhs.outerStride(), + &m_rhs.coeffRef(0,col), m_rhs.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), + m_actualAlpha, m_blocking, info); + } + typedef typename Gemm::Traits Traits; + protected: + const Lhs& m_lhs; + const Rhs& m_rhs; + Dest& m_dest; + Scalar m_actualAlpha; + BlockingType& m_blocking; +}; +template class gemm_blocking_space; +template +class level3_blocking +{ + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + protected: + LhsScalar* m_blockA; + RhsScalar* m_blockB; + Index m_mc; + Index m_nc; + Index m_kc; + public: + level3_blocking() + : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) + {} + inline Index mc() const { return m_mc; } + inline Index nc() const { return m_nc; } + inline Index kc() const { return m_kc; } + inline LhsScalar* blockA() { return m_blockA; } + inline RhsScalar* blockB() { return m_blockB; } +}; +template +class gemm_blocking_space + : public level3_blocking< + typename conditional::type, + typename conditional::type> +{ + enum { + Transpose = StorageOrder==RowMajor, + ActualRows = Transpose ? MaxCols : MaxRows, + ActualCols = Transpose ? 
MaxRows : MaxCols + }; + typedef typename conditional::type LhsScalar; + typedef typename conditional::type RhsScalar; + typedef gebp_traits Traits; + enum { + SizeA = ActualRows * MaxDepth, + SizeB = ActualCols * MaxDepth + }; +#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES + EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB]; +#else + EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1]; + EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1]; +#endif + public: + gemm_blocking_space(Index , Index , Index , Index , bool ) + { + this->m_mc = ActualRows; + this->m_nc = ActualCols; + this->m_kc = MaxDepth; +#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES + this->m_blockA = m_staticA; + this->m_blockB = m_staticB; +#else + this->m_blockA = reinterpret_cast((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); + this->m_blockB = reinterpret_cast((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); +#endif + } + void initParallel(Index, Index, Index, Index) + {} + inline void allocateA() {} + inline void allocateB() {} + inline void allocateAll() {} +}; +template +class gemm_blocking_space + : public level3_blocking< + typename conditional::type, + typename conditional::type> +{ + enum { + Transpose = StorageOrder==RowMajor + }; + typedef typename conditional::type LhsScalar; + typedef typename conditional::type RhsScalar; + typedef gebp_traits Traits; + Index m_sizeA; + Index m_sizeB; + public: + gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? rows : cols; + this->m_kc = depth; + if(l3_blocking) + { + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc, num_threads); + } + else + { + Index n = this->m_nc; + computeProductBlockingSizes(this->m_kc, this->m_mc, n, num_threads); + } + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void initParallel(Index rows, Index cols, Index depth, Index num_threads) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? 
rows : cols; + this->m_kc = depth; + eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0); + Index m = this->m_mc; + computeProductBlockingSizes(this->m_kc, m, this->m_nc, num_threads); + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + void allocateA() + { + if(this->m_blockA==0) + this->m_blockA = aligned_new(m_sizeA); + } + void allocateB() + { + if(this->m_blockB==0) + this->m_blockB = aligned_new(m_sizeB); + } + void allocateAll() + { + allocateA(); + allocateB(); + } + ~gemm_blocking_space() + { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + } +}; +} +namespace internal { +template +struct generic_product_impl + : generic_product_impl_base > +{ + typedef typename Product::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all::type ActualLhsTypeCleaned; + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all::type ActualRhsTypeCleaned; + enum { + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + }; + typedef generic_product_impl lazyproduct; + template + static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + lazyproduct::evalTo(dst, lhs, rhs); + else + { + dst.setZero(); + scaleAndAddTo(dst, lhs, rhs, Scalar(1)); + } + } + template + static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + lazyproduct::addTo(dst, lhs, rhs); + else + scaleAndAddTo(dst,lhs, rhs, Scalar(1)); + } + template + static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + lazyproduct::subTo(dst, lhs, rhs); + else + scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); + } + template + static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) + return; + typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> + (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); + } +}; +} +} +#endif +// end #include "src/Core/products/GeneralMatrixMatrix.h" +// #include "src/Core/VectorwiseOp.h" +#ifndef EIGEN_PARTIAL_REDUX_H +#define EIGEN_PARTIAL_REDUX_H +namespace Eigen { +template< typename MatrixType, typename MemberOp, int Direction> +class PartialReduxExpr; +namespace internal { +template +struct traits > + : traits +{ + typedef typename MemberOp::result_type Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::XprKind XprKind; + typedef typename MatrixType::Scalar InputScalar; + enum { + RowsAtCompileTime = Direction==Vertical ? 1 : MatrixType::RowsAtCompileTime, + ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime, + MaxRowsAtCompileTime = Direction==Vertical ? 1 : MatrixType::MaxRowsAtCompileTime, + MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime, + Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0, + TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime : MatrixType::ColsAtCompileTime + }; +}; +} +template< typename MatrixType, typename MemberOp, int Direction> +class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr >::type, + internal::no_assignment_operator +{ + public: + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr) + EIGEN_DEVICE_FUNC + explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp()) + : m_matrix(mat), m_functor(func) {} + EIGEN_DEVICE_FUNC + Index rows() const { return (Direction==Vertical ? 1 : m_matrix.rows()); } + EIGEN_DEVICE_FUNC + Index cols() const { return (Direction==Horizontal ? 
1 : m_matrix.cols()); } + EIGEN_DEVICE_FUNC + typename MatrixType::Nested nestedExpression() const { return m_matrix; } + EIGEN_DEVICE_FUNC + const MemberOp& functor() const { return m_functor; } + protected: + typename MatrixType::Nested m_matrix; + const MemberOp m_functor; +}; +#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ + template \ + struct member_##MEMBER { \ + EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ + typedef ResultType result_type; \ + template struct Cost \ + { enum { value = COST }; }; \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + ResultType operator()(const XprType& mat) const \ + { return mat.MEMBER(); } \ + } +namespace internal { +EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits >::Cost ); +EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits::AddCost + NumTraits::MulCost); +EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); +template +struct member_lpnorm { + typedef ResultType result_type; + template struct Cost + { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; + EIGEN_DEVICE_FUNC member_lpnorm() {} + template + EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const + { return mat.template lpNorm
<p>
(); } +}; +template +struct member_redux { + typedef typename result_of< + BinaryOp(const Scalar&,const Scalar&) + >::type result_type; + template struct Cost + { enum { value = (Size-1) * functor_traits::Cost }; }; + EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {} + template + EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase& mat) const + { return mat.redux(m_functor); } + const BinaryOp m_functor; +}; +} +template class VectorwiseOp +{ + public: + typedef typename ExpressionType::Scalar Scalar; + typedef typename ExpressionType::RealScalar RealScalar; + typedef Eigen::Index Index; + typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; + typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; + template class Functor, + typename Scalar_=Scalar> struct ReturnType + { + typedef PartialReduxExpr, + Direction + > Type; + }; + template struct ReduxReturnType + { + typedef PartialReduxExpr, + Direction + > Type; + }; + enum { + isVertical = (Direction==Vertical) ? 1 : 0, + isHorizontal = (Direction==Horizontal) ? 1 : 0 + }; + protected: + typedef typename internal::conditional::type SubVector; + EIGEN_DEVICE_FUNC + SubVector subVector(Index i) + { + return SubVector(m_matrix.derived(),i); + } + EIGEN_DEVICE_FUNC + Index subVectors() const + { return isVertical?m_matrix.cols():m_matrix.rows(); } + template struct ExtendedType { + typedef Replicate Type; + }; + template + EIGEN_DEVICE_FUNC + typename ExtendedType::Type + extendedTo(const DenseBase& other) const + { + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1), + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED) + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1), + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED) + return typename ExtendedType::Type + (other.derived(), + isVertical ? 1 : m_matrix.rows(), + isHorizontal ? 1 : m_matrix.cols()); + } + template struct OppositeExtendedType { + typedef Replicate Type; + }; + template + EIGEN_DEVICE_FUNC + typename OppositeExtendedType::Type + extendedToOpposite(const DenseBase& other) const + { + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1), + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED) + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1), + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED) + return typename OppositeExtendedType::Type + (other.derived(), + isHorizontal ? 1 : m_matrix.rows(), + isVertical ? 
1 : m_matrix.cols()); + } + public: + EIGEN_DEVICE_FUNC + explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {} + EIGEN_DEVICE_FUNC + inline const ExpressionType& _expression() const { return m_matrix; } + template + EIGEN_DEVICE_FUNC + const typename ReduxReturnType::Type + redux(const BinaryOp& func = BinaryOp()) const + { return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); } + typedef typename ReturnType::Type MinCoeffReturnType; + typedef typename ReturnType::Type MaxCoeffReturnType; + typedef typename ReturnType::Type SquaredNormReturnType; + typedef typename ReturnType::Type NormReturnType; + typedef typename ReturnType::Type BlueNormReturnType; + typedef typename ReturnType::Type StableNormReturnType; + typedef typename ReturnType::Type HypotNormReturnType; + typedef typename ReturnType::Type SumReturnType; + typedef typename ReturnType::Type MeanReturnType; + typedef typename ReturnType::Type AllReturnType; + typedef typename ReturnType::Type AnyReturnType; + typedef PartialReduxExpr, Direction> CountReturnType; + typedef typename ReturnType::Type ProdReturnType; + typedef Reverse ConstReverseReturnType; + typedef Reverse ReverseReturnType; + template struct LpNormReturnType { + typedef PartialReduxExpr,Direction> Type; + }; + EIGEN_DEVICE_FUNC + const MinCoeffReturnType minCoeff() const + { return MinCoeffReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const MaxCoeffReturnType maxCoeff() const + { return MaxCoeffReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const SquaredNormReturnType squaredNorm() const + { return SquaredNormReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const NormReturnType norm() const + { return NormReturnType(_expression()); } + template + EIGEN_DEVICE_FUNC + const typename LpNormReturnType
<p>
::Type lpNorm() const + { return typename LpNormReturnType
<p>
::Type(_expression()); } + EIGEN_DEVICE_FUNC + const BlueNormReturnType blueNorm() const + { return BlueNormReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const StableNormReturnType stableNorm() const + { return StableNormReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const HypotNormReturnType hypotNorm() const + { return HypotNormReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const SumReturnType sum() const + { return SumReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const MeanReturnType mean() const + { return MeanReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const AllReturnType all() const + { return AllReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const AnyReturnType any() const + { return AnyReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const CountReturnType count() const + { return CountReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const ProdReturnType prod() const + { return ProdReturnType(_expression()); } + EIGEN_DEVICE_FUNC + const ConstReverseReturnType reverse() const + { return ConstReverseReturnType( _expression() ); } + EIGEN_DEVICE_FUNC + ReverseReturnType reverse() + { return ReverseReturnType( _expression() ); } + typedef Replicate ReplicateReturnType; + EIGEN_DEVICE_FUNC + const ReplicateReturnType replicate(Index factor) const; + template const Replicate + EIGEN_DEVICE_FUNC + replicate(Index factor = Factor) const + { + return Replicate + (_expression(),isVertical?factor:1,isHorizontal?factor:1); + } + template + EIGEN_DEVICE_FUNC + ExpressionType& operator=(const DenseBase& other) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return const_cast(m_matrix = extendedTo(other.derived())); + } + template + EIGEN_DEVICE_FUNC + ExpressionType& operator+=(const DenseBase& other) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return const_cast(m_matrix += extendedTo(other.derived())); + } + template + EIGEN_DEVICE_FUNC + ExpressionType& operator-=(const DenseBase& other) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return const_cast(m_matrix -= extendedTo(other.derived())); + } + template + EIGEN_DEVICE_FUNC + ExpressionType& operator*=(const DenseBase& other) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + m_matrix *= extendedTo(other.derived()); + return const_cast(m_matrix); + } + template + EIGEN_DEVICE_FUNC + ExpressionType& operator/=(const DenseBase& other) + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + m_matrix /= extendedTo(other.derived()); + return const_cast(m_matrix); + } + template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + CwiseBinaryOp, + const ExpressionTypeNestedCleaned, + const typename ExtendedType::Type> + operator+(const DenseBase& other) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return m_matrix + extendedTo(other.derived()); + } + template + EIGEN_DEVICE_FUNC + CwiseBinaryOp, + const ExpressionTypeNestedCleaned, + const typename ExtendedType::Type> + operator-(const DenseBase& other) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + 
EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return m_matrix - extendedTo(other.derived()); + } + template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + CwiseBinaryOp, + const ExpressionTypeNestedCleaned, + const typename ExtendedType::Type> + EIGEN_DEVICE_FUNC + operator*(const DenseBase& other) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return m_matrix * extendedTo(other.derived()); + } + template + EIGEN_DEVICE_FUNC + CwiseBinaryOp, + const ExpressionTypeNestedCleaned, + const typename ExtendedType::Type> + operator/(const DenseBase& other) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) + EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) + EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) + return m_matrix / extendedTo(other.derived()); + } + EIGEN_DEVICE_FUNC + CwiseBinaryOp, + const ExpressionTypeNestedCleaned, + const typename OppositeExtendedType::Type>::Type> + normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); } + EIGEN_DEVICE_FUNC void normalize() { + m_matrix = this->normalized(); + } + EIGEN_DEVICE_FUNC inline void reverseInPlace(); + typedef Homogeneous HomogeneousReturnType; + EIGEN_DEVICE_FUNC + HomogeneousReturnType homogeneous() const; + typedef typename ExpressionType::PlainObject CrossReturnType; + template + EIGEN_DEVICE_FUNC + const CrossReturnType cross(const MatrixBase& other) const; + enum { + HNormalized_Size = Direction==Vertical ? internal::traits::RowsAtCompileTime + : internal::traits::ColsAtCompileTime, + HNormalized_SizeMinusOne = HNormalized_Size==Dynamic ? Dynamic : HNormalized_Size-1 + }; + typedef Block::RowsAtCompileTime), + Direction==Horizontal ? int(HNormalized_SizeMinusOne) + : int(internal::traits::ColsAtCompileTime)> + HNormalized_Block; + typedef Block::RowsAtCompileTime), + Direction==Horizontal ? 1 : int(internal::traits::ColsAtCompileTime)> + HNormalized_Factors; + typedef CwiseBinaryOp::Scalar>, + const HNormalized_Block, + const Replicate > + HNormalizedReturnType; + EIGEN_DEVICE_FUNC + const HNormalizedReturnType hnormalized() const; + protected: + ExpressionTypeNested m_matrix; +}; +template +inline typename DenseBase::ColwiseReturnType +DenseBase::colwise() +{ + return ColwiseReturnType(derived()); +} +template +inline typename DenseBase::RowwiseReturnType +DenseBase::rowwise() +{ + return RowwiseReturnType(derived()); +} +} +#endif +// end #include "src/Core/VectorwiseOp.h" +// #include "src/Core/Replicate.h" +#ifndef EIGEN_REPLICATE_H +#define EIGEN_REPLICATE_H +namespace Eigen { +namespace internal { +template +struct traits > + : traits +{ + typedef typename MatrixType::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::XprKind XprKind; + typedef typename ref_selector::type MatrixTypeNested; + typedef typename remove_reference::type _MatrixTypeNested; + enum { + RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic + ? Dynamic + : RowFactor * MatrixType::RowsAtCompileTime, + ColsAtCompileTime = ColFactor==Dynamic || int(MatrixType::ColsAtCompileTime)==Dynamic + ? Dynamic + : ColFactor * MatrixType::ColsAtCompileTime, + MaxRowsAtCompileTime = RowsAtCompileTime, + MaxColsAtCompileTime = ColsAtCompileTime, + IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1 + : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 
0 + : (MatrixType::Flags & RowMajorBit) ? 1 : 0, + Flags = IsRowMajor ? RowMajorBit : 0 + }; +}; +} +template class Replicate + : public internal::dense_xpr_base< Replicate >::type +{ + typedef typename internal::traits::MatrixTypeNested MatrixTypeNested; + typedef typename internal::traits::_MatrixTypeNested _MatrixTypeNested; + public: + typedef typename internal::dense_xpr_base::type Base; + EIGEN_DENSE_PUBLIC_INTERFACE(Replicate) + typedef typename internal::remove_all::type NestedExpression; + template + EIGEN_DEVICE_FUNC + inline explicit Replicate(const OriginalMatrixType& matrix) + : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor) + { + EIGEN_STATIC_ASSERT((internal::is_same::type,OriginalMatrixType>::value), + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) + eigen_assert(RowFactor!=Dynamic && ColFactor!=Dynamic); + } + template + EIGEN_DEVICE_FUNC + inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor) + : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor) + { + EIGEN_STATIC_ASSERT((internal::is_same::type,OriginalMatrixType>::value), + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE) + } + EIGEN_DEVICE_FUNC + inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); } + EIGEN_DEVICE_FUNC + inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); } + EIGEN_DEVICE_FUNC + const _MatrixTypeNested& nestedExpression() const + { + return m_matrix; + } + protected: + MatrixTypeNested m_matrix; + const internal::variable_if_dynamic m_rowFactor; + const internal::variable_if_dynamic m_colFactor; +}; +template +template +const Replicate +DenseBase::replicate() const +{ + return Replicate(derived()); +} +template +const typename VectorwiseOp::ReplicateReturnType +VectorwiseOp::replicate(Index factor) const +{ + return typename VectorwiseOp::ReplicateReturnType + (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1); +} +} +#endif +// end #include "src/Core/Replicate.h" +// #include "src/Core/ArrayWrapper.h" +#ifndef EIGEN_ARRAYWRAPPER_H +#define EIGEN_ARRAYWRAPPER_H +namespace Eigen { +namespace internal { +template +struct traits > + : public traits::type > +{ + typedef ArrayXpr XprKind; + enum { + Flags0 = traits::type >::Flags, + LvalueBitFlag = is_lvalue::value ? 
LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag + }; +}; +} +template +class ArrayWrapper : public ArrayBase > +{ + public: + typedef ArrayBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper) + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper) + typedef typename internal::remove_all::type NestedExpression; + typedef typename internal::conditional< + internal::is_lvalue::value, + Scalar, + const Scalar + >::type ScalarWithConstIfNotLvalue; + typedef typename internal::ref_selector::non_const_type NestedExpressionType; + using Base::coeffRef; + EIGEN_DEVICE_FUNC + explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {} + EIGEN_DEVICE_FUNC + inline Index rows() const { return m_expression.rows(); } + EIGEN_DEVICE_FUNC + inline Index cols() const { return m_expression.cols(); } + EIGEN_DEVICE_FUNC + inline Index outerStride() const { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC + inline Index innerStride() const { return m_expression.innerStride(); } + EIGEN_DEVICE_FUNC + inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); } + EIGEN_DEVICE_FUNC + inline const Scalar* data() const { return m_expression.data(); } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const + { + return m_expression.coeffRef(rowId, colId); + } + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const + { + return m_expression.coeffRef(index); + } + template + EIGEN_DEVICE_FUNC + inline void evalTo(Dest& dst) const { dst = m_expression; } + const typename internal::remove_all::type& + EIGEN_DEVICE_FUNC + nestedExpression() const + { + return m_expression; + } + EIGEN_DEVICE_FUNC + void resize(Index newSize) { m_expression.resize(newSize); } + EIGEN_DEVICE_FUNC + void resize(Index rows, Index cols) { m_expression.resize(rows,cols); } + protected: + NestedExpressionType m_expression; +}; +} +#endif +// end #include "src/Core/ArrayWrapper.h" + diff --git a/contrib/nnc/libs/backend/soft/src/cpp_generator.cpp b/contrib/nnc/libs/backend/soft/src/cpp_generator.cpp index e4f7ac1..e552d31 100644 --- a/contrib/nnc/libs/backend/soft/src/cpp_generator.cpp +++ b/contrib/nnc/libs/backend/soft/src/cpp_generator.cpp @@ -13,6 +13,7 @@ using namespace nncc::contrib::core::IR::model; #include "cpp_operations.h" #include "param_constants.h" +#include "eigen.h" #include "cpp_common_funcs.h" #include "cpp_add_bias.h" #include "cpp_capped_relu.h" @@ -199,9 +200,9 @@ void CPPCodeGenerator::materializeCode(ostream &out, const ModelAnalyzer &ma, co string className = ma.getModelName() + "Model"; out << "#include \"" << _headerFile << "\"\n"; - out << "#include \"Eigen/Core\""; // put operations from tflite + out.write(eigen, sizeof(eigen)); out.write(cpp_common_funcs, sizeof(cpp_common_funcs)); out.write(cpp_add_bias, sizeof(cpp_add_bias)); out.write(cpp_capped_relu, sizeof(cpp_capped_relu)); -- 2.7.4