libstdc++: Annotate most lambdas with always_inline

author Matthias Kretz <m.kretz@gsi.de>

Sat, 14 Jan 2023 16:07:59 +0000 (17:07 +0100)

committer Matthias Kretz <m.kretz@gsi.de>

Thu, 16 Feb 2023 14:58:33 +0000 (15:58 +0100)
author Matthias Kretz <m.kretz@gsi.de>
Sat, 14 Jan 2023 16:07:59 +0000 (17:07 +0100)
committer Matthias Kretz <m.kretz@gsi.de>
Thu, 16 Feb 2023 14:58:33 +0000 (15:58 +0100)
diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h

index 3de966bbf2295b624061ddaa9cfbfa54beedf202..ffe72fa6ccf9e13a5affaa026cae7f4be06a4771 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -609,28 +609,34 @@ template <size_t _Bytes>
           operator&(_Ip __rhs) const
           {
             return __generate_from_n_evaluations<_Np, _Ip>(
-             [&](auto __i) { return __rhs._M_data[__i] & _M_data[__i]; });
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+               return __rhs._M_data[__i] & _M_data[__i];
+             });
           }
  
           _GLIBCXX_SIMD_INTRINSIC constexpr _Ip
           operator|(_Ip __rhs) const
           {
             return __generate_from_n_evaluations<_Np, _Ip>(
-             [&](auto __i) { return __rhs._M_data[__i] | _M_data[__i]; });
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+               return __rhs._M_data[__i] | _M_data[__i];
+             });
           }
  
           _GLIBCXX_SIMD_INTRINSIC constexpr _Ip
           operator^(_Ip __rhs) const
           {
             return __generate_from_n_evaluations<_Np, _Ip>(
-             [&](auto __i) { return __rhs._M_data[__i] ^ _M_data[__i]; });
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+               return __rhs._M_data[__i] ^ _M_data[__i];
+             });
           }
  
           _GLIBCXX_SIMD_INTRINSIC constexpr _Ip
           operator~() const
           {
             return __generate_from_n_evaluations<_Np, _Ip>(
-             [&](auto __i) { return ~_M_data[__i]; });
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return ~_M_data[__i]; });
           }
         };
         return _Ip{};
@@ -1391,7 +1397,7 @@ template <size_t _Np, bool _Sanitized>
      operator^=(const _BitMask& __b) & noexcept
      {
        __execute_n_times<_S_array_size>(
-       [&](auto __i) { _M_bits[__i] ^= __b._M_bits[__i]; });
+       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] ^= __b._M_bits[__i]; });
        return *this;
      }
  
@@ -1399,7 +1405,7 @@ template <size_t _Np, bool _Sanitized>
      operator|=(const _BitMask& __b) & noexcept
      {
        __execute_n_times<_S_array_size>(
-       [&](auto __i) { _M_bits[__i] |= __b._M_bits[__i]; });
+       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] |= __b._M_bits[__i]; });
        return *this;
      }
  
@@ -1407,7 +1413,7 @@ template <size_t _Np, bool _Sanitized>
      operator&=(const _BitMask& __b) & noexcept
      {
        __execute_n_times<_S_array_size>(
-       [&](auto __i) { _M_bits[__i] &= __b._M_bits[__i]; });
+       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { _M_bits[__i] &= __b._M_bits[__i]; });
        return *this;
      }
  
@@ -1797,8 +1803,9 @@ template <size_t _Np, typename _Tp>
    __vector_broadcast(_Tp __x)
    {
      return __call_with_n_evaluations<_Np>(
-      [](auto... __xx) { return __vector_type_t<_Tp, _Np>{__xx...}; },
-      [&__x](int) { return __x; });
+      [](auto... __xx) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+       return __vector_type_t<_Tp, _Np>{__xx...};
+      }, [&__x](int) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x; });
    }
  
  // }}}
@@ -2205,7 +2212,7 @@ template <int _Offset,
  #endif
         constexpr int _O = _Offset * __return_width;
         return __call_with_subscripts<__return_width, _O>(
-         __x, [](auto... __entries) {
+         __x, [](auto... __entries) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             return reinterpret_cast<_R>(_Up{__entries...});
           });
        }
@@ -2607,7 +2614,7 @@ template <typename _Tp, size_t _Width>
  
      _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(initializer_list<_Tp> __init)
        : _Base(__generate_from_n_evaluations<_Width, _BuiltinType>(
-       [&](auto __i) { return __init.begin()[__i.value]; })) {}
+       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __init.begin()[__i.value]; })) {}
  
      _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper() = default;
      _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper(const _SimdWrapper&)
@@ -2632,10 +2639,9 @@ template <typename _Tp, size_t _Width>
        _GLIBCXX_SIMD_INTRINSIC constexpr
        operator _SimdTuple<_Tp, _As...>() const
        {
-       const auto& dd = _M_data; // workaround for GCC7 ICE
-       return __generate_from_n_evaluations<sizeof...(_As),
-                                            _SimdTuple<_Tp, _As...>>([&](
-         auto __i) constexpr { return dd[int(__i)]; });
+       return __generate_from_n_evaluations<sizeof...(_As), _SimdTuple<_Tp, _As...>>(
+                [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                { return _M_data[int(__i)]; });
        }
  
      _GLIBCXX_SIMD_INTRINSIC constexpr operator const _BuiltinType&() const
@@ -3192,21 +3198,19 @@ template <typename _Tp, int _Np>
    { return __x; }
  
  template <typename _Tp, typename _Ap>
-  _GLIBCXX_SIMD_INTRINSIC auto
+  _GLIBCXX_SIMD_INTRINSIC fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>
    to_fixed_size(const simd<_Tp, _Ap>& __x)
    {
-    return simd<_Tp, simd_abi::fixed_size<simd_size_v<_Tp, _Ap>>>([&__x](
-      auto __i) constexpr { return __x[__i]; });
+    using _Rp = fixed_size_simd<_Tp, simd_size_v<_Tp, _Ap>>;
+    return _Rp([&__x](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
    }
  
  template <typename _Tp, typename _Ap>
-  _GLIBCXX_SIMD_INTRINSIC auto
+  _GLIBCXX_SIMD_INTRINSIC fixed_size_simd_mask<_Tp, simd_size_v<_Tp, _Ap>>
    to_fixed_size(const simd_mask<_Tp, _Ap>& __x)
    {
-    constexpr int _Np = simd_mask<_Tp, _Ap>::size();
-    fixed_size_simd_mask<_Tp, _Np> __r;
-    __execute_n_times<_Np>([&](auto __i) constexpr { __r[__i] = __x[__i]; });
-    return __r;
+    return {__private_init,
+           [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }};
    }
  
  // to_native {{{2
@@ -3225,7 +3229,9 @@ template <typename _Tp, size_t _Np>
    enable_if_t<(_Np == native_simd_mask<_Tp>::size()), native_simd_mask<_Tp>>
    to_native(const fixed_size_simd_mask<_Tp, _Np>& __x)
    {
-    return native_simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; });
+    return native_simd_mask<_Tp>(
+            __private_init,
+            [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
    }
  
  // to_compatible {{{2
@@ -3242,7 +3248,10 @@ template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC
    enable_if_t<(_Np == simd_mask<_Tp>::size()), simd_mask<_Tp>>
    to_compatible(const simd_mask<_Tp, simd_abi::fixed_size<_Np>>& __x)
-  { return simd_mask<_Tp>([&](auto __i) constexpr { return __x[__i]; }); }
+  {
+    return simd_mask<_Tp>(
+            [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
+  }
  
  // masked assignment [simd_mask.where] {{{1
  
@@ -3400,9 +3409,9 @@ template <typename _M, typename _Tp>
        _Impl::template _S_masked_cassign(                                       \
         __data(_M_k), __data(_M_value),                                        \
         __to_value_type_or_member_type<_Tp>(static_cast<_Up&&>(__x)),          \
-       [](auto __impl, auto __lhs, auto __rhs) constexpr {                    \
-       return __impl.__name(__lhs, __rhs);                                    \
-       });                                                                    \
+       [](auto __impl, auto __lhs, auto __rhs)                                \
+         constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                         \
+       { return __impl.__name(__lhs, __rhs); });                              \
      }                                                                          \
    static_assert(true)
      _GLIBCXX_SIMD_OP_(+, _S_plus);
@@ -3899,12 +3908,11 @@ template <typename _V, typename _Ap,
        }
      else if (__x._M_is_constprop())
        {
-       return __generate_from_n_evaluations<Parts, array<_V, Parts>>([&](
-         auto __i) constexpr {
-         return _V([&](auto __j) constexpr {
-           return __x[__i * _V::size() + __j];
-         });
-       });
+       return __generate_from_n_evaluations<Parts, array<_V, Parts>>(
+                [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                            { return __x[__i * _V::size() + __j]; });
+                });
        }
      else if constexpr (
        __is_fixed_size_abi_v<_Ap>
@@ -3917,41 +3925,40 @@ template <typename _V, typename _Ap,
  #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
        const __may_alias<_Tp>* const __element_ptr
         = reinterpret_cast<const __may_alias<_Tp>*>(&__data(__x));
-      return __generate_from_n_evaluations<Parts, array<_V, Parts>>([&](
-       auto __i) constexpr {
-       return _V(__element_ptr + __i * _V::size(), vector_aligned);
-      });
+      return __generate_from_n_evaluations<Parts, array<_V, Parts>>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+              { return _V(__element_ptr + __i * _V::size(), vector_aligned); });
  #else
        const auto& __xx = __data(__x);
-      return __generate_from_n_evaluations<Parts, array<_V, Parts>>([&](
-       auto __i) constexpr {
-       [[maybe_unused]] constexpr size_t __offset
-         = decltype(__i)::value * _V::size();
-       return _V([&](auto __j) constexpr {
-         constexpr _SizeConstant<__j + __offset> __k;
-         return __xx[__k];
-       });
-      });
+      return __generate_from_n_evaluations<Parts, array<_V, Parts>>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                [[maybe_unused]] constexpr size_t __offset
+                  = decltype(__i)::value * _V::size();
+                return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                         constexpr _SizeConstant<__j + __offset> __k;
+                         return __xx[__k];
+                       });
+              });
  #endif
      }
    else if constexpr (is_same_v<typename _V::abi_type, simd_abi::scalar>)
      {
        // normally memcpy should work here as well
-      return __generate_from_n_evaluations<Parts, array<_V, Parts>>([&](
-       auto __i) constexpr { return __x[__i]; });
+      return __generate_from_n_evaluations<Parts, array<_V, Parts>>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
      }
    else
      {
-      return __generate_from_n_evaluations<Parts, array<_V, Parts>>([&](
-       auto __i) constexpr {
-       if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>)
-         return _V([&](auto __j) constexpr {
-           return __x[__i * _V::size() + __j];
-         });
-       else
-         return _V(__private_init,
-                   __extract_part<decltype(__i)::value, Parts>(__data(__x)));
-      });
+      return __generate_from_n_evaluations<Parts, array<_V, Parts>>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                if constexpr (__is_fixed_size_abi_v<typename _V::abi_type>)
+                  return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                           return __x[__i * _V::size() + __j];
+                         });
+                else
+                  return _V(__private_init,
+                            __extract_part<decltype(__i)::value, Parts>(__data(__x)));
+              });
      }
    }
  
@@ -3975,22 +3982,22 @@ template <typename _V, typename _Ap,
      else if constexpr (_V::size() <= __CHAR_BIT__ * sizeof(_ULLong))
        {
         const bitset __bits = __x.__to_bitset();
-       return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-         auto __i) constexpr {
-         constexpr size_t __offset = __i * _V::size();
-         return _V(__bitset_init, (__bits >> __offset).to_ullong());
-       });
+       return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+                [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  constexpr size_t __offset = __i * _V::size();
+                  return _V(__bitset_init, (__bits >> __offset).to_ullong());
+                });
        }
      else
        {
-       return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>([&](
-         auto __i) constexpr {
-         constexpr size_t __offset = __i * _V::size();
-         return _V(
-           __private_init, [&](auto __j) constexpr {
-             return __x[__j + __offset];
-           });
-       });
+       return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>(
+                [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  constexpr size_t __offset = __i * _V::size();
+                  return _V(__private_init,
+                            [&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                              return __x[__j + __offset];
+                            });
+                });
        }
    }
  
@@ -4008,12 +4015,14 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
      using _V = __deduced_simd<_Tp, _N0>;
  
      if (__x._M_is_constprop())
-      return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
-       auto __i) constexpr {
-       using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
-       constexpr size_t __offset = _SL::_S_before(__i);
-       return _Vi([&](auto __j) constexpr { return __x[__offset + __j]; });
-      });
+      return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+                constexpr size_t __offset = _SL::_S_before(__i);
+                return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                         return __x[__offset + __j];
+                       });
+              });
      else if constexpr (_Np == _N0)
        {
         static_assert(sizeof...(_Sizes) == 1);
@@ -4080,28 +4089,28 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
  #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
      const __may_alias<_Tp>* const __element_ptr
        = reinterpret_cast<const __may_alias<_Tp>*>(&__x);
-    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
-      auto __i) constexpr {
-      using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
-      constexpr size_t __offset = _SL::_S_before(__i);
-      constexpr size_t __base_align = alignof(simd<_Tp, _Ap>);
-      constexpr size_t __a
-       = __base_align - ((__offset * sizeof(_Tp)) % __base_align);
-      constexpr size_t __b = ((__a - 1) & __a) ^ __a;
-      constexpr size_t __alignment = __b == 0 ? __a : __b;
-      return _Vi(__element_ptr + __offset, overaligned<__alignment>);
-    });
+    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+            [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+              using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+              constexpr size_t __offset = _SL::_S_before(__i);
+              constexpr size_t __base_align = alignof(simd<_Tp, _Ap>);
+              constexpr size_t __a
+                = __base_align - ((__offset * sizeof(_Tp)) % __base_align);
+              constexpr size_t __b = ((__a - 1) & __a) ^ __a;
+              constexpr size_t __alignment = __b == 0 ? __a : __b;
+              return _Vi(__element_ptr + __offset, overaligned<__alignment>);
+            });
  #else
-    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>([&](
-      auto __i) constexpr {
-      using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
-      const auto& __xx = __data(__x);
-      using _Offset = decltype(_SL::_S_before(__i));
-      return _Vi([&](auto __j) constexpr {
-       constexpr _SizeConstant<_Offset::value + __j> __k;
-       return __xx[__k];
-      });
-    });
+    return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>(
+            [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+              using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>;
+              const auto& __xx = __data(__x);
+              using _Offset = decltype(_SL::_S_before(__i));
+              return _Vi([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                       constexpr _SizeConstant<_Offset::value + __j> __k;
+                       return __xx[__k];
+                     });
+            });
  #endif
    }
  
@@ -4143,8 +4152,9 @@ template <typename _Tp, typename... _As, typename = __detail::__odr_helper>
        return simd_cast<_Rp>(__xs...);
      else if ((... && __xs._M_is_constprop()))
        return simd<_Tp,
-                 simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>([&](
-       auto __i) constexpr { return __subscript_in_pack<__i>(__xs...); });
+                 simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+              { return __subscript_in_pack<__i>(__xs...); });
      else
        {
         _Rp __r{};
@@ -4160,9 +4170,10 @@ template <typename _Tp, typename _Abi, size_t _Np>
    _GLIBCXX_SIMD_CONSTEXPR __deduced_simd<_Tp, simd_size_v<_Tp, _Abi> * _Np>
    concat(const array<simd<_Tp, _Abi>, _Np>& __x)
    {
-    return __call_with_subscripts<_Np>(__x, [](const auto&... __xs) {
-      return concat(__xs...);
-    });
+    return __call_with_subscripts<_Np>(
+            __x, [](const auto&... __xs) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+              return concat(__xs...);
+            });
    }
  
  // }}}
@@ -4695,7 +4706,7 @@ template <typename _Tp, typename _Abi>
        simd_mask(_PrivateInit, _Fp&& __gen)
        : _M_data()
        {
-       __execute_n_times<size()>([&](auto __i) constexpr {
+       __execute_n_times<size()>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           _Impl::_S_set(_M_data, __i, __gen(__i));
         });
        }
@@ -4881,7 +4892,9 @@ template <typename _Tp, typename _Abi>
      if (__builtin_is_constant_evaluated() || __k._M_is_constprop())
        {
         const int __r = __call_with_subscripts<simd_size_v<_Tp, _Abi>>(
-         __k, [](auto... __elements) { return ((__elements != 0) + ...); });
+                         __k, [](auto... __elements) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                           return ((__elements != 0) + ...);
+                         });
         if (__builtin_is_constant_evaluated() || __builtin_constant_p(__r))
           return __r;
        }
@@ -4896,8 +4909,11 @@ template <typename _Tp, typename _Abi>
        {
         constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
         const size_t _Idx = __call_with_n_evaluations<_Np>(
-         [](auto... __indexes) { return std::min({__indexes...}); },
-         [&](auto __i) { return __k[__i] ? +__i : _Np; });
+                             [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                               return std::min({__indexes...});
+                             }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                               return __k[__i] ? +__i : _Np;
+                             });
         if (_Idx >= _Np)
           __invoke_ub("find_first_set(empty mask) is UB");
         if (__builtin_constant_p(_Idx))
@@ -4914,8 +4930,11 @@ template <typename _Tp, typename _Abi>
        {
         constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
         const int _Idx = __call_with_n_evaluations<_Np>(
-         [](auto... __indexes) { return std::max({__indexes...}); },
-         [&](auto __i) { return __k[__i] ? int(__i) : -1; });
+                          [](auto... __indexes) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                            return std::max({__indexes...});
+                          }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                            return __k[__i] ? int(__i) : -1;
+                          });
         if (_Idx < 0)
           __invoke_ub("find_first_set(empty mask) is UB");
         if (__builtin_constant_p(_Idx))
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h

index 8851da69800f3bbdb65f725d0387b866918ae727..792439a81bf20db8032e24dadb18ceb06becc494 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -194,8 +194,11 @@ template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
         using _Up = decltype(__w);
         return __intrin_bitcast<_Tp>(
           __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
-           [](auto... __chunks) { return _Up{__chunks...}; },
-           [&](auto __i) { return __w[__shift / __chunksize + __i]; }));
+           [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+             return _Up{__chunks...};
+           }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+             return __w[__shift / __chunksize + __i];
+           }));
        }
    }
  
@@ -225,7 +228,9 @@ template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
         // by _Total");
         if (__x._M_is_constprop())
           return __generate_from_n_evaluations<__return_size, _R>(
-           [&](auto __i) { return __x[__values_to_skip + __i]; });
+           [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+             return __x[__values_to_skip + __i];
+           });
         if constexpr (_Index == 0 && _Total == 1)
           return __x;
         else if constexpr (_Index == 0)
@@ -570,7 +575,9 @@ template <typename _To,
         constexpr auto _Np
           = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
         return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
-         [&](auto __i) { return static_cast<_To>(__v[__i + _Offset]); });
+                [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return static_cast<_To>(__v[__i + _Offset]);
+                });
        }
      else
        {
@@ -611,13 +618,14 @@ template <typename _To,
               return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
             };
             [[maybe_unused]] const auto __vi = __to_intrin(__v);
-           auto&& __make_array = [](auto __x0, [[maybe_unused]] auto __x1) {
-             if constexpr (_Np == 1)
-               return _R{__intrin_bitcast<_To>(__x0)};
-             else
-               return _R{__intrin_bitcast<_To>(__x0),
-                         __intrin_bitcast<_To>(__x1)};
-           };
+           auto&& __make_array
+               = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                 if constexpr (_Np == 1)
+                   return _R{__intrin_bitcast<_To>(__x0)};
+                 else
+                   return _R{__intrin_bitcast<_To>(__x0),
+                             __intrin_bitcast<_To>(__x1)};
+               };
  
             if constexpr (_Np == 0)
               return _R{};
@@ -642,7 +650,7 @@ template <typename _To,
                       = __convert_all<__vector_type16_t<int>, _Np>(
                         __adjust(_SizeConstant<_Np * 4>(), __v));
                     return __generate_from_n_evaluations<_Np, _R>(
-                     [&](auto __i) {
+                     [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                         return __vector_convert<_To>(__as_wrapper(__ints[__i]));
                       });
                   }
@@ -687,36 +695,40 @@ template <typename _To,
                   __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
                   __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
                 if constexpr (sizeof(_ToT) == 4)
-                 return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-                   return __vector_convert<_To>(
-                     _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
-                 });
+                 return __generate_from_n_evaluations<_Np, _R>(
+                          [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                            return __vector_convert<_To>(
+                                     _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
+                          });
                 else if constexpr (is_integral_v<_ToT>)
-                 return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-                   const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
-                   const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
-                   return __vector_bitcast<_ToT>(
-                     __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
-                                  : _mm_unpackhi_epi32(__sx32, __signbits));
-                 });
+                 return __generate_from_n_evaluations<_Np, _R>(
+                          [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                            const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
+                            const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
+                            return __vector_bitcast<_ToT>(
+                                     __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
+                                                  : _mm_unpackhi_epi32(__sx32, __signbits));
+                          });
                 else
-                 return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-                   const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
-                   return __vector_convert<_To>(
-                     __i % 2 == 0 ? __int4
-                                  : _SimdWrapper<int, 4>(
-                                    _mm_unpackhi_epi64(__to_intrin(__int4),
-                                                       __to_intrin(__int4))));
-                 });
+                 return __generate_from_n_evaluations<_Np, _R>(
+                          [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                            const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
+                            return __vector_convert<_To>(
+                                     __i % 2 == 0 ? __int4
+                                                  : _SimdWrapper<int, 4>(
+                                                      _mm_unpackhi_epi64(__to_intrin(__int4),
+                                                                         __to_intrin(__int4))));
+                          });
               }
             else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
               {
                 const auto __shorts = __convert_all<__vector_type16_t<
                   conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
                   __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
-               return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-                 return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
-               });
+               return __generate_from_n_evaluations<_Np, _R>(
+                        [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                          return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
+                        });
               }
             else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
                                && is_signed_v<_FromT> && is_integral_v<_ToT>)
@@ -736,9 +748,10 @@ template <typename _To,
                      __vector_bitcast<int>(
                        _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
                                           _mm_srai_epi32(__vv[1], 31)))};
-               return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-                 return __vector_bitcast<_ToT>(__vvvv[__i]);
-               });
+               return __generate_from_n_evaluations<_Np, _R>(
+                        [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                          return __vector_bitcast<_ToT>(__vvvv[__i]);
+                        });
               }
             else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
               {
@@ -747,9 +760,10 @@ template <typename _To,
                     is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
                     unsigned int>>>(
                     __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
-               return __generate_from_n_evaluations<_Np, _R>([&](auto __i) {
-                 return __convert_all<_To>(__ints[__i / 2])[__i % 2];
-               });
+               return __generate_from_n_evaluations<_Np, _R>(
+                        [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                          return __convert_all<_To>(__ints[__i / 2])[__i % 2];
+                        });
               }
             else
               __assert_unreachable<_To>();
@@ -779,14 +793,14 @@ template <typename _To,
                 __extract_part<_Offset, _FromVT::_S_partial_width,
                                _ToVT::_S_full_size>(__v))};
             else
-             return __generate_from_n_evaluations<_Np, _R>([&](
-               auto __i) constexpr {
-               auto __part
-                 = __extract_part<__i * _ToVT::_S_full_size + _Offset,
-                                  _FromVT::_S_partial_width,
-                                  _ToVT::_S_full_size>(__v);
-               return __vector_convert<_To>(__part);
-             });
+             return __generate_from_n_evaluations<_Np, _R>(
+                      [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                        auto __part
+                          = __extract_part<__i * _ToVT::_S_full_size + _Offset,
+                                           _FromVT::_S_partial_width,
+                                           _ToVT::_S_full_size>(__v);
+                        return __vector_convert<_To>(__part);
+                      });
           }
         else if constexpr (_Offset == 0)
           return array<_To, 1>{__vector_convert<_To>(__v)};
@@ -1017,8 +1031,9 @@ template <int _UsedBytes>
         else
           {
             constexpr auto __size = _S_size<_Tp>;
-           _GLIBCXX_SIMD_USE_CONSTEXPR auto __r = __generate_vector<_UV>(
-             [](auto __i) constexpr { return __i < __size ? -1 : 0; });
+           _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
+             = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                                      { return __i < __size ? -1 : 0; });
             return __r;
           }
        }
@@ -1208,7 +1223,7 @@ template <int _UsedBytes>
             if constexpr (is_integral_v<typename _TVT::value_type>)
               return __x
                      | __generate_vector<_Tp, _S_full_size<_Tp>>(
-                      [](auto __i) -> _Tp {
+                      [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
                          if (__i < _Np)
                            return 0;
                          else
@@ -1348,26 +1363,27 @@ struct _CommonImplBuiltin
         }
        else
         {
-         __execute_n_times<__div_roundup(_Np, 4)>([&](auto __i) {
-           constexpr int __offset = __i * 4;
-           constexpr int __remaining = _Np - __offset;
-           if constexpr (__remaining > 4 && __remaining <= 7)
-             {
-               const _ULLong __bool7
-                 = (__x.template _M_extract<__offset>()._M_to_bits()
-                    * 0x40810204081ULL)
-                   & 0x0101010101010101ULL;
-               _S_store<__remaining>(__bool7, __mem + __offset);
-             }
-           else if constexpr (__remaining >= 4)
-             {
-               int __bits = __x.template _M_extract<__offset>()._M_to_bits();
-               if constexpr (__remaining > 7)
-                 __bits &= 0xf;
-               const int __bool4 = (__bits * 0x204081) & 0x01010101;
-               _S_store<4>(__bool4, __mem + __offset);
-             }
-         });
+         __execute_n_times<__div_roundup(_Np, 4)>(
+           [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+             constexpr int __offset = __i * 4;
+             constexpr int __remaining = _Np - __offset;
+             if constexpr (__remaining > 4 && __remaining <= 7)
+               {
+                 const _ULLong __bool7
+                   = (__x.template _M_extract<__offset>()._M_to_bits()
+                        * 0x40810204081ULL)
+                       & 0x0101010101010101ULL;
+                 _S_store<__remaining>(__bool7, __mem + __offset);
+               }
+             else if constexpr (__remaining >= 4)
+               {
+                 int __bits = __x.template _M_extract<__offset>()._M_to_bits();
+                 if constexpr (__remaining > 7)
+                   __bits &= 0xf;
+                 const int __bool4 = (__bits * 0x204081) & 0x01010101;
+                 _S_store<4>(__bool4, __mem + __offset);
+               }
+           });
         }
      }
  
@@ -1434,13 +1450,13 @@ template <typename _Abi, typename>
        inline static constexpr _SimdMember<_Tp> _S_generator(_Fp&& __gen,
                                                             _TypeTag<_Tp>)
        {
-       return __generate_vector<_Tp, _S_full_size<_Tp>>([&](
-         auto __i) constexpr {
-         if constexpr (__i < _S_size<_Tp>)
-           return __gen(__i);
-         else
-           return 0;
-       });
+       return __generate_vector<_Tp, _S_full_size<_Tp>>(
+                [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  if constexpr (__i < _S_size<_Tp>)
+                    return __gen(__i);
+                  else
+                    return 0;
+                });
        }
  
      // _S_load {{{2
@@ -1455,10 +1471,10 @@ template <typename _Abi, typename>
                                                                       : 16;
         constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
         if constexpr (sizeof(_Up) > 8)
-         return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&](
-           auto __i) constexpr {
-           return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
-         });
+         return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
+                  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                    return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
+                  });
         else if constexpr (is_same_v<_Up, _Tp>)
           return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
                                                _Np * sizeof(_Tp)>(__mem);
@@ -1470,13 +1486,12 @@ template <typename _Abi, typename>
             constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
             constexpr size_t __elements_per_load = _Np / __n_loads;
             return __call_with_n_evaluations<__n_loads>(
-             [](auto... __uncvted) {
-               return __convert<_SimdMember<_Tp>>(__uncvted...);
-             },
-             [&](auto __i) {
-               return _CommonImpl::template _S_load<_Up, __elements_per_load>(
-                 __mem + __i * __elements_per_load);
-             });
+                    [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                      return __convert<_SimdMember<_Tp>>(__uncvted...);
+                    }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                      return _CommonImpl::template _S_load<_Up, __elements_per_load>(
+                                                     __mem + __i * __elements_per_load);
+                    });
           }
         else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
                            && __max_load_size > 16)
@@ -1485,20 +1500,19 @@ template <typename _Abi, typename>
               = __bytes_to_load / (__max_load_size / 2);
             constexpr size_t __elements_per_load = _Np / __n_loads;
             return __call_with_n_evaluations<__n_loads>(
-             [](auto... __uncvted) {
-               return __convert<_SimdMember<_Tp>>(__uncvted...);
-             },
-             [&](auto __i) {
-               return _CommonImpl::template _S_load<_Up, __elements_per_load>(
-                 __mem + __i * __elements_per_load);
-             });
+                    [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                      return __convert<_SimdMember<_Tp>>(__uncvted...);
+                    }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                      return _CommonImpl::template _S_load<_Up, __elements_per_load>(
+                                                     __mem + __i * __elements_per_load);
+                    });
           }
         else // e.g. int[] -> <char, 9>
           return __call_with_subscripts(
-           __mem, make_index_sequence<_Np>(), [](auto... __args) {
-             return __vector_type_t<_Tp, _S_full_size<_Tp>>{
-               static_cast<_Tp>(__args)...};
-           });
+           __mem, make_index_sequence<_Np>(),
+                  [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                    return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
+                  });
        }
  
      // _S_masked_load {{{2
@@ -1507,9 +1521,10 @@ template <typename _Abi, typename>
        _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                      const _Up* __mem) noexcept
        {
-       _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), [&](auto __i) {
-         __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
-       });
+       _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
+                                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                                   __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
+                                 });
         return __merge;
        }
  
@@ -1523,7 +1538,7 @@ template <typename _Abi, typename>
         constexpr size_t __max_store_size
           = _SuperImpl::template _S_max_store_size<_Up>;
         if constexpr (sizeof(_Up) > 8)
-         __execute_n_times<_Np>([&](auto __i) constexpr {
+         __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __mem[__i] = __v[__i];
           });
         else if constexpr (is_same_v<_Up, _Tp>)
@@ -1540,9 +1555,10 @@ template <typename _Abi, typename>
             using _V = __vector_type_t<_Up, __vsize>;
             const array<_V, __stores> __converted
               = __convert_all<_V, __stores>(__v);
-           __execute_n_times<__full_stores>([&](auto __i) constexpr {
-             _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
-           });
+           __execute_n_times<__full_stores>(
+             [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+               _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
+             });
             if constexpr (__full_stores < __stores)
               _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
                                              * sizeof(_Up)>(
@@ -1557,7 +1573,8 @@ template <typename _Abi, typename>
                             _MaskMember<_Tp> __k)
        {
         _BitOps::_S_bit_iteration(
-         _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
+         _MaskImpl::_S_to_bits(__k),
+         [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __mem[__i] = __v[__i];
           });
        }
@@ -1579,7 +1596,7 @@ template <typename _Abi, typename>
             _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up)))
           {
             // bitwise or no conversion, reinterpret:
-           const _MaskMember<_Up> __kk = [&]() {
+           const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
               if constexpr (__is_bitmask_v<decltype(__k)>)
                 return _MaskMember<_Up>(__k._M_data);
               else
@@ -1618,7 +1635,7 @@ template <typename _Abi, typename>
                 constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
                 const array<_UV, _NAllStores> __converted
                   = __convert_all<_UV, _NAllStores>(__v);
-               __execute_n_times<_NFullStores>([&](auto __i) {
+               __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   _SuperImpl::_S_masked_store_nocvt(
                     _UW(__converted[__i]), __mem + __i * _UW_size,
                     _UAbi::_MaskImpl::template _S_convert<
@@ -1637,10 +1654,10 @@ template <typename _Abi, typename>
               }
           }
         else
-         _BitOps::_S_bit_iteration(
-           _MaskImpl::_S_to_bits(__k), [&](auto __i) constexpr {
-             __mem[__i] = static_cast<_Up>(__v[__i]);
-           });
+         _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
+                                   [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                                     __mem[__i] = static_cast<_Up>(__v[__i]);
+                                   });
        }
  
      // _S_complement {{{2
@@ -1932,7 +1949,9 @@ template <typename _Abi, typename>
        static _Tp _S_##__name(const _Tp& __x, const _More&... __more)           \
        {                                                                        \
         return __generate_vector<_Tp>(                                         \
-         [&](auto __i) { return __name(__x[__i], __more[__i]...); });         \
+                [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {            \
+                  return __name(__x[__i], __more[__i]...);                    \
+                });                                                           \
        }
  
  #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name)                            \
@@ -1941,23 +1960,25 @@ template <typename _Abi, typename>
                                                  const _More&... __more)       \
        {                                                                        \
         return __generate_vector<_Tp>(                                         \
-         [&](auto __i) { return __name(__x[__i], __more[__i]...); });         \
-      }
-
-#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)                   \
-    template <typename _Tp, typename... _More>                                 \
-      static auto _S_##__name(const _Tp& __x, const _More&... __more)          \
-      {                                                                        \
-       return __fixed_size_storage_t<_RetTp,                                  \
-                                     _VectorTraits<_Tp>::_S_partial_width>::  \
-         _S_generate([&](auto __meta) constexpr {                             \
-           return __meta._S_generator(                                        \
-             [&](auto __i) {                                                  \
-               return __name(__x[__meta._S_offset + __i],                     \
-                             __more[__meta._S_offset + __i]...);              \
-             },                                                               \
-             static_cast<_RetTp*>(nullptr));                                  \
-         });                                                                  \
+                [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {            \
+                  return __name(__x[__i], __more[__i]...);                    \
+                });                                                           \
+      }
+
+#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name)                          \
+    template <typename _Tp, typename... _More>                                        \
+      static auto _S_##__name(const _Tp& __x, const _More&... __more)                 \
+      {                                                                               \
+       return __fixed_size_storage_t<_RetTp,                                         \
+                                     _VectorTraits<_Tp>::_S_partial_width>::         \
+         _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+           return __meta._S_generator(                                               \
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {                      \
+               return __name(__x[__meta._S_offset + __i],                            \
+                             __more[__meta._S_offset + __i]...);                     \
+             },                                                                      \
+             static_cast<_RetTp*>(nullptr));                                         \
+         });                                                                         \
        }
  
      _GLIBCXX_SIMD_MATH_FALLBACK(acos)
@@ -2010,7 +2031,7 @@ template <typename _Abi, typename>
        _S_remquo(const _Tp __x, const _Tp __y,
                 __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
        {
-       return __generate_vector<_Tp>([&](auto __i) {
+       return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           int __tmp;
           auto __r = remquo(__x[__i], __y[__i], &__tmp);
           __z->_M_set(__i, __tmp);
@@ -2423,7 +2444,7 @@ template <typename _Abi, typename>
    #endif // _GLIBCXX_SIMD_X86INTRIN
        else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
         return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
-                                           [](auto... __l) {
+                                           [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                                               return __make_wrapper<int>(__l...);
                                             })};
        else
@@ -2554,13 +2575,13 @@ struct _MaskImplBuiltinMixin
      _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
      {
        static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
-      return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
-       auto __i) constexpr {
-       if constexpr (__i < _Np)
-         return __x[__i] ? ~_Up() : _Up();
-       else
-         return _Up();
-      });
+      return __generate_vector<__vector_type_t<_Up, _ToN>>(
+              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                if constexpr (__i < _Np)
+                  return __x[__i] ? ~_Up() : _Up();
+                else
+                  return _Up();
+              });
      }
  
    template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
@@ -2601,13 +2622,13 @@ struct _MaskImplBuiltinMixin
           -1, -1, -1, -1, -1>(__y); else
           */
           {
-           return __generate_vector<__vector_type_t<_Up, _ToN>>([&](
-             auto __i) constexpr {
-             if constexpr (__i < _Np)
-               return _Up(__x[__i.value]);
-             else
-               return _Up();
-           });
+           return __generate_vector<__vector_type_t<_Up, _ToN>>(
+                    [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                      if constexpr (__i < _Np)
+                        return _Up(__x[__i.value]);
+                      else
+                        return _Up();
+                    });
           }
         }
      }
@@ -2625,7 +2646,9 @@ struct _MaskImplBuiltinMixin
         = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
        _ULLong __r = 0;
        __execute_n_times<_Np>(
-       [&](auto __i) { __r |= _ULLong(__bools[__i.value]) << __i; });
+       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+         __r |= _ULLong(__bools[__i.value]) << __i;
+       });
        return __r;
      }
  
@@ -2677,9 +2700,10 @@ template <typename _Abi, typename>
             return __bools > 0;
           }
         else
-         return __generate_vector<_I, _S_size<_Tp>>([&](auto __i) constexpr {
-           return __mem[__i] ? ~_I() : _I();
-         });
+         return __generate_vector<_I, _S_size<_Tp>>(
+                  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                    return __mem[__i] ? ~_I() : _I();
+                  });
        }
  
      // }}}
@@ -2752,7 +2776,7 @@ template <typename _Abi, typename>
         // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
         auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
         _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
-                                 [&](auto __i) {
+                                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                                     __tmp._M_set(__i, -__mem[__i]);
                                   });
         __merge = __wrapper_bitcast<_Tp>(__tmp);
@@ -2764,7 +2788,7 @@ template <typename _Abi, typename>
        _GLIBCXX_SIMD_INTRINSIC static void _S_store(_SimdWrapper<_Tp, _Np> __v,
                                                    bool* __mem) noexcept
        {
-       __execute_n_times<_Np>([&](auto __i) constexpr {
+       __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           __mem[__i] = __v[__i];
         });
        }
@@ -2775,10 +2799,10 @@ template <typename _Abi, typename>
        _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
                       const _SimdWrapper<_Tp, _Np> __k) noexcept
        {
-       _BitOps::_S_bit_iteration(
-         _SuperImpl::_S_to_bits(__k), [&](auto __i) constexpr {
-           __mem[__i] = __v[__i];
-         });
+       _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
+                                 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                                   __mem[__i] = __v[__i];
+                                 });
        }
  
      // _S_from_bitmask{{{2
@@ -2845,7 +2869,7 @@ template <typename _Abi, typename>
               {
                 __k = __generate_from_n_evaluations<_Np,
                                                     __vector_type_t<_Tp, _Np>>(
-                 [&](auto __j) {
+                 [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     if (__i == __j)
                       return _Tp(-__x);
                     else
@@ -2890,7 +2914,8 @@ template <typename _Abi, typename>
        {
         return __call_with_subscripts(
           __data(__k), make_index_sequence<_S_size<_Tp>>(),
-         [](const auto... __ent) constexpr { return (... && !(__ent == 0)); });
+         [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+         { return (... && !(__ent == 0)); });
        }
  
      // }}}
@@ -2901,7 +2926,8 @@ template <typename _Abi, typename>
        {
         return __call_with_subscripts(
           __data(__k), make_index_sequence<_S_size<_Tp>>(),
-         [](const auto... __ent) constexpr { return (... || !(__ent == 0)); });
+         [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+         { return (... || !(__ent == 0)); });
        }
  
      // }}}
@@ -2912,7 +2938,8 @@ template <typename _Abi, typename>
        {
         return __call_with_subscripts(
           __data(__k), make_index_sequence<_S_size<_Tp>>(),
-         [](const auto... __ent) constexpr { return (... && (__ent == 0)); });
+         [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+         { return (... && (__ent == 0)); });
        }
  
      // }}}
diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h

index 00b91c099ee70244e7f81298c5aa9e2aa686e1e6..3160e25163209df5a614c295c9d023b03a69ac40 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_converter.h
+++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
@@ -121,7 +121,7 @@ template <typename _From, typename _To, int _Np>
         {
           return __call_with_subscripts(
             __x, make_index_sequence<_Np>(),
-           [](auto... __values) constexpr->_Ret {
+           [](auto... __values) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Ret {
               return __make_simd_tuple<_To, decltype((void) __values,
                                                      simd_abi::scalar())...>(
                 static_cast<_To>(__values)...);
@@ -233,7 +233,9 @@ template <typename _From, typename _To, int _Np>
           static_assert(_Ret::_FirstAbi::template _S_is_partial<_To>);
           return _Ret{__generate_from_n_evaluations<
             _Np, typename _VectorTraits<typename _Ret::_FirstType>::type>(
-           [&](auto __i) { return static_cast<_To>(__x[__i]); })};
+           [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+             return static_cast<_To>(__x[__i]);
+           })};
         }
        else
         {
@@ -241,7 +243,7 @@ template <typename _From, typename _To, int _Np>
           constexpr auto __n
             = __div_roundup(_Ret::_S_first_size, _Arg::_S_first_size);
           return __call_with_n_evaluations<__n>(
-           [&__x](auto... __uncvted) {
+           [&__x](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
               // assuming _Arg Abi tags for all __i are _Arg::_FirstAbi
               _SimdConverter<_From, typename _Arg::_FirstAbi, _To,
                              typename _Ret::_FirstAbi>
@@ -255,8 +257,9 @@ template <typename _From, typename _To, int _Np>
                     _From, simd_abi::fixed_size<_Np - _Ret::_S_first_size>, _To,
                     simd_abi::fixed_size<_Np - _Ret::_S_first_size>>()(
                     __simd_tuple_pop_front<_Ret::_S_first_size>(__x))};
-           },
-           [&__x](auto __i) { return __get_tuple_at<__i>(__x); });
+           }, [&__x](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+             return __get_tuple_at<__i>(__x);
+           });
         }
      }
    };
@@ -322,13 +325,14 @@ template <typename _From, int _Np, typename _To, typename _Ap>
         return __vector_convert<__vector_type_t<_To, _Np>>(__x.first);
        else if constexpr (_Arg::_S_is_homogeneous)
         return __call_with_n_evaluations<_Arg::_S_tuple_size>(
-         [](auto... __members) {
+         [](auto... __members) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             if constexpr ((is_convertible_v<decltype(__members), _To> && ...))
               return __vector_type_t<_To, _Np>{static_cast<_To>(__members)...};
             else
               return __vector_convert<__vector_type_t<_To, _Np>>(__members...);
-         },
-         [&](auto __i) { return __get_tuple_at<__i>(__x); });
+         }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+           return __get_tuple_at<__i>(__x);
+         });
        else if constexpr (__fixed_size_storage_t<_To, _Np>::_S_tuple_size == 1)
         {
           _SimdConverter<_From, simd_abi::fixed_size<_Np>, _To,
@@ -340,7 +344,7 @@ template <typename _From, int _Np, typename _To, typename _Ap>
         {
           const _SimdWrapper<_From, _Np> __xv
             = __generate_from_n_evaluations<_Np, __vector_type_t<_From, _Np>>(
-             [&](auto __i) { return __x[__i]; });
+               [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; });
           return __vector_convert<__vector_type_t<_To, _Np>>(__xv);
         }
      }
diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h b/libstdc++-v3/include/experimental/bits/simd_detail.h

index 8cabc504863f499264377874dcc6c5ae375f0a0f..a0ad10efe0f6adb0604fd3db367d7867601bc7e4 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_detail.h
+++ b/libstdc++-v3/include/experimental/bits/simd_detail.h
@@ -262,6 +262,7 @@
  #define _GLIBCXX_SIMD_INTRINSIC                                                \
    [[__gnu__::__always_inline__, __gnu__::__artificial__]] inline
  #define _GLIBCXX_SIMD_ALWAYS_INLINE [[__gnu__::__always_inline__]] inline
+#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA __attribute__((__always_inline__))
  #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0)
  #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1)
  
@@ -294,6 +295,8 @@
  #ifdef _GLIBCXX_SIMD_NO_ALWAYS_INLINE
  #undef _GLIBCXX_SIMD_ALWAYS_INLINE
  #define _GLIBCXX_SIMD_ALWAYS_INLINE inline
+#undef _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+#define _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
  #undef _GLIBCXX_SIMD_INTRINSIC
  #define _GLIBCXX_SIMD_INTRINSIC inline
  #endif
diff --git a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h

index 9ecc8e521caedcb588841227f2138269927212aa..3ac6eaa3f6bf5edfbf8301b689aa1ed9522ec9a1 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
+++ b/libstdc++-v3/include/experimental/bits/simd_fixed_size.h
@@ -434,14 +434,15 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
         if constexpr (is_same_v<_SimdTuple, __remove_cvref_t<_Tup>>)
           return __tup.first;
         else if (__builtin_is_constant_evaluated())
-         return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate([&](
-           auto __meta) constexpr {
-           return __meta._S_generator(
-             [&](auto __i) constexpr { return __tup[__i]; },
-             static_cast<_TupT*>(nullptr));
+         return __fixed_size_storage_t<_TupT, _S_first_size>::_S_generate(
+                  [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                    return __meta._S_generator(
+                             [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                               return __tup[__i];
+                             }, static_cast<_TupT*>(nullptr));
           });
         else
-         return [&]() {
+         return [&]() { // not always_inline; allow the compiler to decide
             __fixed_size_storage_t<_TupT, _S_first_size> __r;
             __builtin_memcpy(__r._M_as_charptr(), __tup._M_as_charptr(),
                              sizeof(__r));
@@ -515,12 +516,11 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
                          negation<is_const<remove_reference_t<_More>>>>) )
           {
             // need to write back at least one of __more after calling __fun
-           auto&& __first = [&](auto... __args) constexpr
-           {
+           auto&& __first = [&](auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
               auto __r = __fun(__tuple_element_meta<_Tp, _Abi0, 0>(), first,
                                __args...);
               [[maybe_unused]] auto&& __ignore_me = {(
-               [](auto&& __dst, const auto& __src) {
+               [](auto&& __dst, const auto& __src) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   if constexpr (is_assignable_v<decltype(__dst),
                                                 decltype(__dst)>)
                     {
@@ -530,8 +530,7 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
                 }(static_cast<_More&&>(__more), __args),
                 0)...};
               return __r;
-           }
-           (_M_extract_argument(__more)...);
+           }(_M_extract_argument(__more)...);
             if constexpr (_S_tuple_size == 1)
               return {__first};
             else
@@ -776,18 +775,18 @@ template <typename _Tp, size_t _Np, typename _V, size_t _NV, typename... _VX>
           sizeof...(_VX) == 0,
           "An array of scalars must be the last argument to __to_simd_tuple");
         return __call_with_subscripts(
-         __from,
-         make_index_sequence<_NV>(), [&](const auto... __args) constexpr {
-           return __simd_tuple_concat(
-             _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>());
-         });
+                __from, make_index_sequence<_NV>(),
+                [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __simd_tuple_concat(
+                           _SimdTuple<_Tp, simd_abi::scalar>{__args}..., _SimdTuple<_Tp>());
+                });
        }
      else
        return __call_with_subscripts(
-       __from,
-       make_index_sequence<_NV>(), [&](const auto... __args) constexpr {
-         return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...);
-       });
+              __from, make_index_sequence<_NV>(),
+              [&](const auto... __args) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                return __to_simd_tuple<_Tp, _Np>(__args..., __fromX...);
+              });
    }
  
  template <size_t, typename _Tp>
@@ -841,7 +840,7 @@ template <typename _Tp, typename _A0, typename _A1, typename... _Abis,
                        || _A0::template _S_is_partial<_Tp>)
        return {__generate_from_n_evaluations<_R::_S_first_size,
                                             typename _R::_FirstType>(
-               [&](auto __i) { return __x[__i]; }),
+               [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __x[__i]; }),
               __optimize_simd_tuple(
                 __simd_tuple_pop_front<_R::_S_first_size>(__x))};
      else if constexpr (is_same_v<_A0, _A1>
@@ -994,10 +993,11 @@ template <int _Index, int _Total, int _Combine, typename _Tp, typename _A0,
         return __as_vector(simd<_Tp, _RetAbi>(element_ptr, element_aligned));
  #else
         [[maybe_unused]] constexpr size_t __offset = __values_to_skip;
-       return __as_vector(simd<_Tp, _RetAbi>([&](auto __i) constexpr {
-         constexpr _SizeConstant<__i + __offset> __k;
-         return __x[__k];
-       }));
+       return __as_vector(simd<_Tp, _RetAbi>(
+                            [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                              constexpr _SizeConstant<__i + __offset> __k;
+                              return __x[__k];
+                            }));
  #endif
        }
  
@@ -1286,9 +1286,10 @@ template <int _Np, typename>
      template <typename _Tp>
        static constexpr inline _SimdMember<_Tp> _S_broadcast(_Tp __x) noexcept
        {
-       return _SimdMember<_Tp>::_S_generate([&](auto __meta) constexpr {
-         return __meta._S_broadcast(__x);
-       });
+       return _SimdMember<_Tp>::_S_generate(
+                [&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __meta._S_broadcast(__x);
+                });
        }
  
      // _S_generator {{{2
@@ -1296,14 +1297,15 @@ template <int _Np, typename>
        static constexpr inline _SimdMember<_Tp> _S_generator(_Fp&& __gen,
                                                             _TypeTag<_Tp>)
        {
-       return _SimdMember<_Tp>::_S_generate([&__gen](auto __meta) constexpr {
-         return __meta._S_generator(
-           [&](auto __i) constexpr {
-             return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>())
-                              : 0;
-           },
-           _TypeTag<_Tp>());
-       });
+       return _SimdMember<_Tp>::_S_generate(
+                [&__gen](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __meta._S_generator(
+                           [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                             return __i < _Np ? __gen(_SizeConstant<__meta._S_offset + __i>())
+                                              : 0;
+                           },
+                           _TypeTag<_Tp>());
+                });
        }
  
      // _S_load {{{2
@@ -1311,9 +1313,10 @@ template <int _Np, typename>
        static inline _SimdMember<_Tp> _S_load(const _Up* __mem,
                                              _TypeTag<_Tp>) noexcept
        {
-       return _SimdMember<_Tp>::_S_generate([&](auto __meta) {
-         return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>());
-       });
+       return _SimdMember<_Tp>::_S_generate(
+                [&](auto __meta) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __meta._S_load(&__mem[__meta._S_offset], _TypeTag<_Tp>());
+                });
        }
  
      // _S_masked_load {{{2
@@ -1323,7 +1326,7 @@ template <int _Np, typename>
                      const _MaskMember __bits, const _Up* __mem) noexcept
        {
         auto __merge = __old;
-       __for_each(__merge, [&](auto __meta, auto& __native) {
+       __for_each(__merge, [&](auto __meta, auto& __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           if (__meta._S_submask(__bits).any())
  #pragma GCC diagnostic push
           // __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts
@@ -1344,7 +1347,7 @@ template <int _Np, typename>
        static inline void _S_store(const _SimdMember<_Tp>& __v, _Up* __mem,
                                   _TypeTag<_Tp>) noexcept
        {
-       __for_each(__v, [&](auto __meta, auto __native) {
+       __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           __meta._S_store(__native, &__mem[__meta._S_offset], _TypeTag<_Tp>());
         });
        }
@@ -1355,7 +1358,7 @@ template <int _Np, typename>
                                          _Up* __mem,
                                          const _MaskMember __bits) noexcept
        {
-       __for_each(__v, [&](auto __meta, auto __native) {
+       __for_each(__v, [&](auto __meta, auto __native) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           if (__meta._S_submask(__bits).any())
  #pragma GCC diagnostic push
           // __mem + __mem._S_offset could be UB ([expr.add]/4.3, but it punts
@@ -1376,7 +1379,7 @@ template <int _Np, typename>
        {
         _MaskMember __bits = 0;
         __for_each(
-         __x, [&__bits](auto __meta, auto __native) constexpr {
+         __x, [&__bits](auto __meta, auto __native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __bits
               |= __meta._S_mask_to_shifted_ullong(__meta._S_negate(__native));
           });
@@ -1414,7 +1417,7 @@ template <int _Np, typename>
           {
             const auto& __x2 = __call_with_n_evaluations<
               __div_roundup(_Tup::_S_tuple_size, 2)>(
-             [](auto __first_simd, auto... __remaining) {
+             [](auto __first_simd, auto... __remaining) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                 if constexpr (sizeof...(__remaining) == 0)
                   return __first_simd;
                 else
@@ -1428,7 +1431,7 @@ template <int _Np, typename>
                       __make_simd_tuple(__first_simd, __remaining...));
                   }
               },
-             [&](auto __i) {
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                 auto __left = __tup.template _M_simd_at<2 * __i>();
                 if constexpr (2 * __i + 1 == _Tup::_S_tuple_size)
                   return __left;
@@ -1444,7 +1447,9 @@ template <int _Np, typename>
                         _GLIBCXX_SIMD_USE_CONSTEXPR_API
                         typename _LT::mask_type __k(
                           __private_init,
-                         [](auto __j) constexpr { return __j < _RT::size(); });
+                         [](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                           return __j < _RT::size();
+                         });
                         _LT __ext_right = __left;
                         where(__k, __ext_right)
                           = __proposed::resizing_simd_cast<_LT>(__right);
@@ -1464,7 +1469,7 @@ template <int _Np, typename>
              const _SimdTuple<_Tp, _As...>& __b)
        {
         return __a._M_apply_per_chunk(
-         [](auto __impl, auto __aa, auto __bb) constexpr {
+         [](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             return __impl._S_min(__aa, __bb);
           },
           __b);
@@ -1476,7 +1481,7 @@ template <int _Np, typename>
              const _SimdTuple<_Tp, _As...>& __b)
        {
         return __a._M_apply_per_chunk(
-         [](auto __impl, auto __aa, auto __bb) constexpr {
+         [](auto __impl, auto __aa, auto __bb) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             return __impl._S_max(__aa, __bb);
           },
           __b);
@@ -1487,9 +1492,10 @@ template <int _Np, typename>
        static inline constexpr _SimdTuple<_Tp, _As...>
        _S_complement(const _SimdTuple<_Tp, _As...>& __x) noexcept
        {
-       return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr {
-         return __impl._S_complement(__xx);
-       });
+       return __x._M_apply_per_chunk(
+                [](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __impl._S_complement(__xx);
+                });
        }
  
      // _S_unary_minus {{{2
@@ -1497,23 +1503,24 @@ template <int _Np, typename>
        static inline constexpr _SimdTuple<_Tp, _As...>
        _S_unary_minus(const _SimdTuple<_Tp, _As...>& __x) noexcept
        {
-       return __x._M_apply_per_chunk([](auto __impl, auto __xx) constexpr {
-         return __impl._S_unary_minus(__xx);
-       });
+       return __x._M_apply_per_chunk(
+                [](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __impl._S_unary_minus(__xx);
+                });
        }
  
      // arithmetic operators {{{2
  
-#define _GLIBCXX_SIMD_FIXED_OP(name_, op_)                                     \
-    template <typename _Tp, typename... _As>                                   \
-      static inline constexpr _SimdTuple<_Tp, _As...> name_(                   \
-       const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)\
-      {                                                                        \
-       return __x._M_apply_per_chunk(                                         \
-         [](auto __impl, auto __xx, auto __yy) constexpr {                    \
-           return __impl.name_(__xx, __yy);                                   \
-         },                                                                   \
-         __y);                                                                \
+#define _GLIBCXX_SIMD_FIXED_OP(name_, op_)                                                     \
+    template <typename _Tp, typename... _As>                                                   \
+      static inline constexpr _SimdTuple<_Tp, _As...> name_(                                   \
+       const _SimdTuple<_Tp, _As...>& __x, const _SimdTuple<_Tp, _As...>& __y)                \
+      {                                                                                        \
+       return __x._M_apply_per_chunk(                                                         \
+         [](auto __impl, auto __xx, auto __yy) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
+           return __impl.name_(__xx, __yy);                                                   \
+         },                                                                                   \
+         __y);                                                                                \
        }
  
      _GLIBCXX_SIMD_FIXED_OP(_S_plus, +)
@@ -1532,18 +1539,20 @@ template <int _Np, typename>
        static inline constexpr _SimdTuple<_Tp, _As...>
        _S_bit_shift_left(const _SimdTuple<_Tp, _As...>& __x, int __y)
        {
-       return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr {
-         return __impl._S_bit_shift_left(__xx, __y);
-       });
+       return __x._M_apply_per_chunk(
+                [__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __impl._S_bit_shift_left(__xx, __y);
+                });
        }
  
      template <typename _Tp, typename... _As>
        static inline constexpr _SimdTuple<_Tp, _As...>
        _S_bit_shift_right(const _SimdTuple<_Tp, _As...>& __x, int __y)
        {
-       return __x._M_apply_per_chunk([__y](auto __impl, auto __xx) constexpr {
-         return __impl._S_bit_shift_right(__xx, __y);
-       });
+       return __x._M_apply_per_chunk(
+                [__y](auto __impl, auto __xx) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __impl._S_bit_shift_right(__xx, __y);
+                });
        }
  
    // math {{{2
@@ -1557,35 +1566,40 @@ template <int _Np, typename>
           {                                                                    \
             if constexpr (is_same_v<_Tp, _RetTp>)                              \
               return __x._M_apply_per_chunk(                                   \
-               [](auto __impl, auto __xx) constexpr {                         \
-                 using _V = typename decltype(__impl)::simd_type;             \
-                 return __data(__name(_V(__private_init, __xx)));             \
-               });                                                            \
+                      [](auto __impl, auto __xx)                              \
+                        constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA          \
+                      {                                                       \
+                        using _V = typename decltype(__impl)::simd_type;      \
+                        return __data(__name(_V(__private_init, __xx)));      \
+                      });                                                     \
             else                                                               \
               return __optimize_simd_tuple(                                    \
-               __x.template _M_apply_r<_RetTp>([](auto __impl, auto __xx) {   \
-                 return __impl._S_##__name(__xx);                             \
-               }));                                                           \
+                      __x.template _M_apply_r<_RetTp>(                        \
+                        [](auto __impl, auto __xx)                            \
+                          _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA                  \
+                        { return __impl._S_##__name(__xx); }));               \
           }                                                                    \
         else if constexpr (                                                    \
           is_same_v<                                                           \
             _Tp,                                                               \
             _RetTp> && (... && is_same_v<_SimdTuple<_Tp, _As...>, _More>) )    \
           return __x._M_apply_per_chunk(                                       \
-           [](auto __impl, auto __xx, auto... __pack) constexpr {             \
-             using _V = typename decltype(__impl)::simd_type;                 \
-             return __data(__name(_V(__private_init, __xx),                   \
-                                  _V(__private_init, __pack)...));            \
-           },                                                                 \
-           __more...);                                                        \
+                  [](auto __impl, auto __xx, auto... __pack)                  \
+                    constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA              \
+                  {                                                           \
+                    using _V = typename decltype(__impl)::simd_type;          \
+                    return __data(__name(_V(__private_init, __xx),            \
+                                         _V(__private_init, __pack)...));     \
+                  }, __more...);                                              \
         else if constexpr (is_same_v<_Tp, _RetTp>)                             \
           return __x._M_apply_per_chunk(                                       \
-           [](auto __impl, auto __xx, auto... __pack) constexpr {             \
-             using _V = typename decltype(__impl)::simd_type;                 \
-             return __data(__name(_V(__private_init, __xx),                   \
-                                  __autocvt_to_simd(__pack)...));             \
-           },                                                                 \
-           __more...);                                                        \
+                  [](auto __impl, auto __xx, auto... __pack)                  \
+                    constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA              \
+                  {                                                           \
+                    using _V = typename decltype(__impl)::simd_type;          \
+                    return __data(__name(_V(__private_init, __xx),            \
+                                         __autocvt_to_simd(__pack)...));      \
+                  }, __more...);                                              \
         else                                                                   \
           __assert_unreachable<_Tp>();                                         \
        }
@@ -1657,10 +1671,10 @@ template <int _Np, typename>
         __fixed_size_storage_t<int, _SimdTuple<_Tp, _Abis...>::_S_size()>* __z)
        {
         return __x._M_apply_per_chunk(
-         [](auto __impl, const auto __xx, const auto __yy, auto& __zz) {
-           return __impl._S_remquo(__xx, __yy, &__zz);
-         },
-         __y, *__z);
+                [](auto __impl, const auto __xx, const auto __yy, auto& __zz)
+                  _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                { return __impl._S_remquo(__xx, __yy, &__zz); },
+                __y, *__z);
        }
  
      template <typename _Tp, typename... _As>
@@ -1669,12 +1683,10 @@ template <int _Np, typename>
                __fixed_size_storage_t<int, _Np>& __exp) noexcept
        {
         return __x._M_apply_per_chunk(
-         [](auto __impl, const auto& __a, auto& __b) {
-           return __data(
-             frexp(typename decltype(__impl)::simd_type(__private_init, __a),
-                   __autocvt_to_simd(__b)));
-         },
-         __exp);
+                [](auto __impl, const auto& __a, auto& __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __data(frexp(typename decltype(__impl)::simd_type(__private_init, __a),
+                                      __autocvt_to_simd(__b)));
+                }, __exp);
        }
  
  #define _GLIBCXX_SIMD_TEST_ON_TUPLE_(name_)                                    \
@@ -1700,7 +1712,7 @@ template <int _Np, typename>
        _S_increment(_SimdTuple<_Ts...>& __x)
        {
         __for_each(
-         __x, [](auto __meta, auto& native) constexpr {
+         __x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __meta._S_increment(native);
           });
        }
@@ -1710,7 +1722,7 @@ template <int _Np, typename>
        _S_decrement(_SimdTuple<_Ts...>& __x)
        {
         __for_each(
-         __x, [](auto __meta, auto& native) constexpr {
+         __x, [](auto __meta, auto& native) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __meta._S_decrement(native);
           });
        }
@@ -1722,11 +1734,10 @@ template <int _Np, typename>
        __cmp(const _SimdTuple<_Tp, _As...>& __x,                                \
             const _SimdTuple<_Tp, _As...>& __y)                                \
        {                                                                        \
-       return _M_test(                                                        \
-         [](auto __impl, auto __xx, auto __yy) constexpr {                    \
-           return __impl.__cmp(__xx, __yy);                                   \
-         },                                                                   \
-         __x, __y);                                                           \
+       return _M_test([](auto __impl, auto __xx, auto __yy)                   \
+                        constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA          \
+                      { return __impl.__cmp(__xx, __yy); },                   \
+                      __x, __y);                                              \
        }
  
      _GLIBCXX_SIMD_CMP_OPERATIONS(_S_equal_to)
@@ -1753,12 +1764,13 @@ template <int _Np, typename>
        _S_masked_assign(const _MaskMember __bits, _SimdTuple<_Tp, _As...>& __lhs,
                        const __type_identity_t<_SimdTuple<_Tp, _As...>>& __rhs)
        {
-       __for_each(
-         __lhs, __rhs,
-         [&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr {
-           __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
-                                   __native_rhs);
-         });
+       __for_each(__lhs, __rhs,
+                  [&](auto __meta, auto& __native_lhs, auto __native_rhs)
+                    constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                  {
+                    __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
+                                            __native_rhs);
+                  });
        }
  
      // Optimization for the case where the RHS is a scalar. No need to broadcast
@@ -1769,7 +1781,7 @@ template <int _Np, typename>
                        const __type_identity_t<_Tp> __rhs)
        {
         __for_each(
-         __lhs, [&](auto __meta, auto& __native_lhs) constexpr {
+         __lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __meta._S_masked_assign(__meta._S_make_mask(__bits), __native_lhs,
                                     __rhs);
           });
@@ -1782,12 +1794,13 @@ template <int _Np, typename>
                                            const _SimdTuple<_Tp, _As...>& __rhs,
                                            _Op __op)
        {
-       __for_each(
-         __lhs, __rhs,
-         [&](auto __meta, auto& __native_lhs, auto __native_rhs) constexpr {
-           __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
-                                             __native_lhs, __native_rhs, __op);
-         });
+       __for_each(__lhs, __rhs,
+                  [&](auto __meta, auto& __native_lhs, auto __native_rhs)
+                    constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                  {
+                    __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
+                                                      __native_lhs, __native_rhs, __op);
+                  });
        }
  
      // Optimization for the case where the RHS is a scalar. No need to broadcast
@@ -1798,7 +1811,7 @@ template <int _Np, typename>
                                            const _Tp& __rhs, _Op __op)
        {
         __for_each(
-         __lhs, [&](auto __meta, auto& __native_lhs) constexpr {
+         __lhs, [&](auto __meta, auto& __native_lhs) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
             __meta.template _S_masked_cassign(__meta._S_make_mask(__bits),
                                               __native_lhs, __rhs, __op);
           });
@@ -1899,7 +1912,7 @@ template <int _Np, typename>
        // _Np _UShort, _UInt, _ULLong, float, and double can be more efficient.
        _ULLong __r = 0;
        using _Vs = __fixed_size_storage_t<_UChar, _Np>;
-      __for_each(_Vs{}, [&](auto __meta, auto) {
+      __for_each(_Vs{}, [&](auto __meta, auto) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
         __r |= __meta._S_mask_to_shifted_ullong(
           __meta._S_mask_impl._S_load(&__mem[__meta._S_offset],
                                       _SizeConstant<__meta._S_size()>()));
@@ -1912,9 +1925,10 @@ template <int _Np, typename>
                                              _MaskMember __mask,
                                              const bool* __mem) noexcept
      {
-      _BitOps::_S_bit_iteration(__mask.to_ullong(), [&](auto __i) {
-       __merge.set(__i, __mem[__i]);
-      });
+      _BitOps::_S_bit_iteration(__mask.to_ullong(),
+                               [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                                 __merge.set(__i, __mem[__i]);
+                               });
        return __merge;
      }
  
@@ -1932,7 +1946,8 @@ template <int _Np, typename>
      static inline void _S_masked_store(const _MaskMember __v, bool* __mem,
                                        const _MaskMember __k) noexcept
      {
-      _BitOps::_S_bit_iteration(__k, [&](auto __i) { __mem[__i] = __v[__i]; });
+      _BitOps::_S_bit_iteration(
+       __k, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; });
      }
  
      // logical and bitwise operators {{{2
diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h

index 2aff8ff5fa4a5dbf38c04e4068d762acebae2b5a..c20315e4e3047a818874d49211c1c576f562ae99 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_math.h
+++ b/libstdc++-v3/include/experimental/bits/simd_math.h
@@ -788,7 +788,7 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
  
         // __exponent(__x) returns the exponent value (bias removed) as
         // simd<_Up> with integral _Up
-       auto&& __exponent = [](const _V& __v) {
+       auto&& __exponent = [](const _V& __v) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           using namespace std::experimental::__proposed;
           using _IV = rebind_simd_t<
             conditional_t<sizeof(_Tp) == sizeof(_LLong), _LLong, int>, _V>;
@@ -931,7 +931,7 @@ template <typename _R, typename _ToApply, typename _Tp, typename... _Tps>
    {
      return {__private_init,
             __data(__arg0)._M_apply_per_chunk(
-             [&](auto __impl, const auto&... __inner) {
+             [&](auto __impl, const auto&... __inner) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                 using _V = typename decltype(__impl)::simd_type;
                 return __data(__apply(_V(__private_init, __inner)...));
               },
@@ -1092,8 +1092,9 @@ _GLIBCXX_SIMD_CVTING2(hypot)
      if constexpr (__is_fixed_size_abi_v<_Abi> && _V::size() > 1)
        {
         return __fixed_size_apply<simd<_Tp, _Abi>>(
-         [](auto __a, auto __b, auto __c) { return hypot(__a, __b, __c); },
-         __x, __y, __z);
+                [](auto __a, auto __b, auto __c) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return hypot(__a, __b, __c);
+                }, __x, __y, __z);
        }
      else
        {
@@ -1380,9 +1381,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
                  const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
                  const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>([&](auto __i) {
-      return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]);
-    });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::assoc_laguerre(__n[__i], __m[__i], __x[__i]);
+          });
    }
  
  template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1391,9 +1392,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
                  const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
                  const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>([&](auto __i) {
-      return std::assoc_legendre(__n[__i], __m[__i], __x[__i]);
-    });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::assoc_legendre(__n[__i], __m[__i], __x[__i]);
+          });
    }
  
  _GLIBCXX_SIMD_MATH_CALL2_(beta, _Tp)
@@ -1414,8 +1415,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
    hermite(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
           const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>(
-      [&](auto __i) { return std::hermite(__n[__i], __x[__i]); });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::hermite(__n[__i], __x[__i]);
+          });
    }
  
  template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1423,8 +1425,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
    laguerre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
            const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>(
-      [&](auto __i) { return std::laguerre(__n[__i], __x[__i]); });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::laguerre(__n[__i], __x[__i]);
+          });
    }
  
  template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1432,8 +1435,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
    legendre(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
            const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>(
-      [&](auto __i) { return std::legendre(__n[__i], __x[__i]); });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::legendre(__n[__i], __x[__i]);
+          });
    }
  
  _GLIBCXX_SIMD_MATH_CALL_(riemann_zeta)
@@ -1443,8 +1447,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
    sph_bessel(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
              const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>(
-      [&](auto __i) { return std::sph_bessel(__n[__i], __x[__i]); });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::sph_bessel(__n[__i], __x[__i]);
+          });
    }
  
  template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1453,9 +1458,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
                const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __m,
                const simd<_Tp, _Abi>& theta)
    {
-    return simd<_Tp, _Abi>([&](auto __i) {
-      return std::assoc_legendre(__l[__i], __m[__i], theta[__i]);
-    });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::assoc_legendre(__l[__i], __m[__i], theta[__i]);
+          });
    }
  
  template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
@@ -1463,8 +1468,9 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper>
    sph_neumann(const fixed_size_simd<unsigned, simd_size_v<_Tp, _Abi>>& __n,
               const simd<_Tp, _Abi>& __x)
    {
-    return simd<_Tp, _Abi>(
-      [&](auto __i) { return std::sph_neumann(__n[__i], __x[__i]); });
+    return simd<_Tp, _Abi>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+            return std::sph_neumann(__n[__i], __x[__i]);
+          });
    }
  // }}}
  
diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h

index 8429c252196547a3d1a9b37132631477e7e4f61c..7e4cb17b205d882c890e7292370fccdeb607f32b 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_neon.h
+++ b/libstdc++-v3/include/experimental/bits/simd_neon.h
@@ -61,7 +61,7 @@ template <typename _Abi, typename>
        _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                      const _Up* __mem) noexcept
        {
-       __execute_n_times<_Np>([&](auto __i) {
+       __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           if (__k[__i] != 0)
             __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
         });
@@ -75,7 +75,7 @@ template <typename _Abi, typename>
        _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                             _MaskMember<_Tp> __k)
        {
-       __execute_n_times<_Np>([&](auto __i) {
+       __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           if (__k[__i] != 0)
             __mem[__i] = __v[__i];
         });
@@ -286,7 +286,7 @@ struct _MaskImplNeonMixin
             {
               constexpr auto __bitsel
                 = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
-                 [&](auto __i) {
+                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     return static_cast<_I>(
                       __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                   });
@@ -306,7 +306,7 @@ struct _MaskImplNeonMixin
             {
               constexpr auto __bitsel
                 = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
-                 [&](auto __i) {
+                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                   });
               __asint &= __bitsel;
@@ -322,7 +322,7 @@ struct _MaskImplNeonMixin
             {
               constexpr auto __bitsel
                 = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
-                 [&](auto __i) {
+                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                   });
               __asint &= __bitsel;
@@ -346,7 +346,7 @@ struct _MaskImplNeonMixin
             {
               constexpr auto __bitsel
                 = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
-                 [&](auto __i) {
+                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                   });
               __asint &= __bitsel;
@@ -361,7 +361,7 @@ struct _MaskImplNeonMixin
             {
               constexpr auto __bitsel
                 = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
-                 [&](auto __i) {
+                 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                   });
               __asint &= __bitsel;
diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h

index 0f4aa95e1a46f580977fc3d76f210cf75e48f802..60e80d394ba2cd36ae3cf44f08498a1b65d3bd7d 100644 (file)
--- a/libstdc++-v3/include/experimental/bits/simd_x86.h
+++ b/libstdc++-v3/include/experimental/bits/simd_x86.h
@@ -537,16 +537,17 @@ struct _CommonImplX86 : _CommonImplBuiltin
      _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem)
      {
        if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL
-       _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>([=]() constexpr {
-                       if constexpr (_Np <= 16)
-                         return _mm_movm_epi8(__x._M_to_bits());
-                       else if constexpr (_Np <= 32)
-                         return _mm256_movm_epi8(__x._M_to_bits());
-                       else if constexpr (_Np <= 64)
-                         return _mm512_movm_epi8(__x._M_to_bits());
-                       else
-                         __assert_unreachable<_SizeConstant<_Np>>();
-                     }()),
+       _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>(
+                           [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                             if constexpr (_Np <= 16)
+                               return _mm_movm_epi8(__x._M_to_bits());
+                             else if constexpr (_Np <= 32)
+                               return _mm256_movm_epi8(__x._M_to_bits());
+                             else if constexpr (_Np <= 64)
+                               return _mm512_movm_epi8(__x._M_to_bits());
+                             else
+                               __assert_unreachable<_SizeConstant<_Np>>();
+                           }()),
                       __mem);
        else if constexpr (__have_bmi2)
         {
@@ -554,7 +555,7 @@ struct _CommonImplX86 : _CommonImplBuiltin
             _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem);
           else
             __execute_n_times<__div_roundup(_Np, sizeof(size_t))>(
-             [&](auto __i) {
+             [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                 constexpr size_t __offset = __i * sizeof(size_t);
                 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset);
                 if constexpr (__todo == 1)
@@ -575,7 +576,7 @@ struct _CommonImplX86 : _CommonImplBuiltin
               });
         }
        else if constexpr (__have_sse2 && _Np > 7)
-       __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) {
+       __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
           constexpr int __offset = __i * 16;
           constexpr int __todo = std::min(16, int(_Np) - __offset);
           const int __bits = __x.template _M_extract<__offset>()._M_to_bits();
@@ -765,9 +766,10 @@ struct _CommonImplX86 : _CommonImplBuiltin
        static_assert(is_same_v<_Tp, _Tp> && __have_avx512f);
        if (__k._M_is_constprop() && __at0._M_is_constprop()
           && __at1._M_is_constprop())
-       return __generate_from_n_evaluations<_Np,
-                                            __vector_type_t<_Tp, _Np>>([&](
-         auto __i) constexpr { return __k[__i] ? __at1[__i] : __at0[__i]; });
+       return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>(
+                [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                  return __k[__i] ? __at1[__i] : __at0[__i];
+                });
        else if constexpr (sizeof(__at0) == 64
                          || (__have_avx512vl && sizeof(__at0) >= 16))
         return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data);
@@ -994,9 +996,8 @@ template <typename _Abi, typename>
               }
             else
               _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
-                                       [&](auto __i) {
-                                         __merge._M_set(__i, static_cast<_Tp>(
-                                                               __mem[__i]));
+                                       [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                                         __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
                                         });
           }
         /* Very uncertain, that the following improves anything. Needs
@@ -1417,11 +1418,12 @@ template <typename _Abi, typename>
               const auto __yf = __convert_all<_FloatV, __n_floatv>(
                 _Abi::__make_padding_nonzero(__as_vector(__y)));
               return __call_with_n_evaluations<__n_floatv>(
-               [](auto... __quotients) {
+               [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   return __vector_convert<_R>(__quotients...);
                 },
-               [&__xf,
-                &__yf](auto __i) -> _SimdWrapper<_Float, __n_intermediate> {
+               [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+                 -> _SimdWrapper<_Float, __n_intermediate>
+               {
  #if !defined __clang__ && __GCC_IEC_559 == 0
                   // If -freciprocal-math is active, using the `/` operator is
                   // incorrect because it may be translated to an imprecise
@@ -1980,7 +1982,7 @@ template <typename _Abi, typename>
               {
                 auto __mask = __vector_bitcast<_UChar>(
                   __vector_bitcast<_UShort>(__iy) << 5);
-               auto __maskl = [&]() {
+               auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8);
                 };
                 auto __xh = __vector_bitcast<short>(__ix);
@@ -2067,19 +2069,20 @@ template <typename _Abi, typename>
           }                                                      //}}}
         else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{
           {
-           [[maybe_unused]] auto __blend_0xaa = [](auto __a, auto __b) {
-             if constexpr (sizeof(__a) == 16)
-               return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
-                                      0xaa);
-             else if constexpr (sizeof(__a) == 32)
-               return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
-                                         0xaa);
-             else if constexpr (sizeof(__a) == 64)
-               return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
-                                              __to_intrin(__b));
-             else
-               __assert_unreachable<decltype(__a)>();
-           };
+           [[maybe_unused]] auto __blend_0xaa
+             = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+               if constexpr (sizeof(__a) == 16)
+                 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b),
+                                        0xaa);
+               else if constexpr (sizeof(__a) == 32)
+                 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b),
+                                           0xaa);
+               else if constexpr (sizeof(__a) == 64)
+                 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a),
+                                                __to_intrin(__b));
+               else
+                 __assert_unreachable<decltype(__a)>();
+             };
             if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16)
               return __intrin_bitcast<_V>(is_signed_v<_Up>
                                             ? _mm_srav_epi16(__ix, __iy)
@@ -2136,9 +2139,10 @@ template <typename _Abi, typename>
               {
                 auto __k = __vector_bitcast<_UShort>(__iy) << 11;
                 auto __x128 = __vector_bitcast<_Up>(__ix);
-               auto __mask = [](__vector_type16_t<_UShort> __kk) {
-                 return __vector_bitcast<short>(__kk) < 0;
-               };
+               auto __mask
+                 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                   return __vector_bitcast<short>(__kk) < 0;
+                 };
                 // do __x128 = 0 where __y[4] is set
                 __x128 = __mask(__k) ? decltype(__x128)() : __x128;
                 // do __x128 =>> 8 where __y[3] is set
@@ -2178,7 +2182,7 @@ template <typename _Abi, typename>
               }
             else
               {
-               auto __shift = [](auto __a, auto __b) {
+               auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   if constexpr (is_signed_v<_Up>)
                     return _mm_sra_epi32(__a, __b);
                   else
@@ -3492,7 +3496,7 @@ struct _MaskImplX86Mixin
         return _S_to_maskvector<_Up, _ToN>(__k);
        else if (__x._M_is_constprop() || __builtin_is_constant_evaluated())
         return __generate_from_n_evaluations<std::min(_ToN, _Np), _UV>(
-         [&](auto __i) -> _Up { return -__x[__i.value]; });
+         [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; });
        else if constexpr (sizeof(_Up) == 1)
         {
           if constexpr (sizeof(_UI) == 16)
@@ -3737,9 +3741,9 @@ struct _MaskImplX86Mixin
        else if constexpr (__bits_per_element >= _ToN)
         {
           constexpr auto __bitmask
-           = __generate_vector<_V>([](auto __i) constexpr->_UpUInt {
-               return __i < _ToN ? 1ull << __i : 0;
-             });
+           = __generate_vector<_V>([](auto __i)
+                                   constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt
+                                   { return __i < _ToN ? 1ull << __i : 0; });
           const auto __bits
             = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask;
           if constexpr (__bits_per_element > _ToN)
@@ -3750,11 +3754,11 @@ struct _MaskImplX86Mixin
        else
         {
           const _V __tmp
-           = __generate_vector<_V>([&](auto __i) constexpr {
+           = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                 return static_cast<_UpUInt>(
                   __k >> (__bits_per_element * (__i / __bits_per_element)));
               })
-             & __generate_vector<_V>([](auto __i) constexpr {
+             & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   return static_cast<_UpUInt>(1ull
                                               << (__i % __bits_per_element));
                 }); // mask bit index
@@ -3790,7 +3794,7 @@ struct _MaskImplX86Mixin
               const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x);
               return __generate_from_n_evaluations<std::min(_ToN, _Np),
                                                    __vector_type_t<_Up, _ToN>>(
-               [&](auto __i) -> _Up { return __y[__i.value]; });
+               [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; });
             }
           using _To = __vector_type_t<_Up, _ToN>;
           [[maybe_unused]] constexpr size_t _FromN = _Np;
@@ -4125,8 +4129,11 @@ struct _MaskImplX86Mixin
             {
               const auto __bools = -__x._M_data;
               const _ULLong __k = __call_with_n_evaluations<_Np>(
-               [](auto... __bits) { return (__bits | ...); },
-               [&](auto __i) { return _ULLong(__bools[+__i]) << __i; });
+               [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                 return (__bits | ...);
+               }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+                 return _ULLong(__bools[+__i]) << __i;
+               });
               if (__builtin_is_constant_evaluated()
                   || __builtin_constant_p(__k))
                 return __k;
@@ -4282,13 +4289,14 @@ template <typename _Abi, typename>
         static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
         if constexpr (__have_avx512bw)
           {
-           const auto __to_vec_or_bits = [](auto __bits) -> decltype(auto) {
-             if constexpr (__is_avx512_abi<_Abi>())
-               return __bits;
-             else
-               return _S_to_maskvector<_Tp>(
-                 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
-           };
+           const auto __to_vec_or_bits
+             = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) {
+               if constexpr (__is_avx512_abi<_Abi>())
+                 return __bits;
+               else
+                 return _S_to_maskvector<_Tp>(
+                          _BitMask<_S_size<_Tp>>(__bits)._M_sanitized());
+             };
  
             if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl)
               {
@@ -4475,7 +4483,7 @@ template <typename _Abi, typename>
               }
             else
               {
-               _BitOps::_S_bit_iteration(__mask, [&](auto __i) {
+               _BitOps::_S_bit_iteration(__mask, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   __merge._M_set(__i, __mem[__i]);
                 });
                 return __merge;
@@ -4554,7 +4562,7 @@ template <typename _Abi, typename>
           {
             if constexpr (__have_avx512bw_vl)
               _CommonImplX86::_S_store<_Np>(
-               __vector_bitcast<char>([](auto __data) {
+               __vector_bitcast<char>([](auto __data) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                   if constexpr (_Np <= 16)
                     return _mm_maskz_set1_epi8(__data, 1);
                   else if constexpr (_Np <= 32)
author	Matthias Kretz <m.kretz@gsi.de>
	Sat, 14 Jan 2023 16:07:59 +0000 (17:07 +0100)
committer	Matthias Kretz <m.kretz@gsi.de>
	Thu, 16 Feb 2023 14:58:33 +0000 (15:58 +0100)
libstdc++-v3/include/experimental/bits/simd.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_builtin.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_converter.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_detail.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_fixed_size.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_math.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_neon.h		patch \| blob \| history
libstdc++-v3/include/experimental/bits/simd_x86.h		patch \| blob \| history