}
#endif
-/* vec_abss */
+/* vec_abss */
#define __builtin_altivec_abss_v16qi vec_abss
#define __builtin_altivec_abss_v8hi vec_abss
#define __builtin_altivec_abss_v4si vec_abss
static vector bool int __ATTRS_o_ai
vec_mergee(vector bool int __a, vector bool int __b) {
return vec_perm(__a, __b, (vector unsigned char)
- (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
}
static vector signed int __ATTRS_o_ai
vec_mergee(vector signed int __a, vector signed int __b) {
return vec_perm(__a, __b, (vector unsigned char)
- (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
}
static vector unsigned int __ATTRS_o_ai
vec_mergee(vector unsigned int __a, vector unsigned int __b) {
return vec_perm(__a, __b, (vector unsigned char)
- (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ (0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B));
}
///
/// \headerfile <x86intrin.h>
///
-/// \code
+/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
-/// \endcode
+/// \endcode
///
-/// \code
+/// \code
/// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode
+/// \endcode
///
/// \param x
/// The value from which bits are extracted.
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
-/// Bits [5:0] specify the index of the least significant bit; the other
-/// bits are ignored. If the sum of the index and length is greater than
-/// 64, the result is undefined. If the length and index are both zero,
-/// bits [63:0] of parameter x are extracted. If the length is zero
+/// Bits [5:0] specify the index of the least significant bit; the other
+/// bits are ignored. If the sum of the index and length is greater than
+/// 64, the result is undefined. If the length and index are both zero,
+/// bits [63:0] of parameter x are extracted. If the length is zero
/// but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
///
/// \headerfile <x86intrin.h>
///
-/// \code
+/// \code
/// This intrinsic corresponds to the \c EXTRQ instruction.
-/// \endcode
+/// \endcode
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
-/// Specifies the index of the least significant bit at [13:8]
-/// and the length at [5:0]; all other bits are ignored.
+/// Specifies the index of the least significant bit at [13:8]
+/// and the length at [5:0]; all other bits are ignored.
/// If bits [5:0] are zero, the length is interpreted as 64.
-/// If the sum of the index and length is greater than 64, the result is
-/// undefined. If the length and index are both zero, bits [63:0] of
-/// parameter __x are extracted. If the length is zero but the index is
-/// non-zero, the result is undefined.
-/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+/// If the sum of the index and length is greater than 64, the result is
+/// undefined. If the length and index are both zero, bits [63:0] of
+/// parameter __x are extracted. If the length is zero but the index is
+/// non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
-/// \brief Inserts bits of a specified length from the source integer vector
-/// y into the lower 64 bits of the destination integer vector x at the
+/// \brief Inserts bits of a specified length from the source integer vector
+/// y into the lower 64 bits of the destination integer vector x at the
/// index idx and of the length len.
///
/// \headerfile <x86intrin.h>
///
-/// \code
+/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
-/// \endcode
+/// \endcode
///
-/// \code
+/// \code
/// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode
+/// \endcode
///
/// \param x
-/// The destination operand where bits will be inserted. The inserted bits
-/// are defined by the length len and by the index idx specifying the least
+/// The destination operand where bits will be inserted. The inserted bits
+/// are defined by the length len and by the index idx specifying the least
/// significant bit.
/// \param y
-/// The source operand containing the bits to be extracted. The extracted
+/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand y of length len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
-/// Bits [5:0] specify the index of the least significant bit; the other
-/// bits are ignored. If the sum of the index and length is greater than
-/// 64, the result is undefined. If the length and index are both zero,
-/// bits [63:0] of parameter y are inserted into parameter x. If the
+/// Bits [5:0] specify the index of the least significant bit; the other
+/// bits are ignored. If the sum of the index and length is greater than
+/// 64, the result is undefined. If the length and index are both zero,
+/// bits [63:0] of parameter y are inserted into parameter x. If the
/// length is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits
+/// \returns A 128-bit integer vector containing the original lower 64-bits
/// of destination operand x with the specified bitfields replaced by the
-/// lower bits of source operand y. The upper 64 bits of the return value
+/// lower bits of source operand y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
-/// \brief Inserts bits of a specified length from the source integer vector
-/// __y into the lower 64 bits of the destination integer vector __x at
+/// \brief Inserts bits of a specified length from the source integer vector
+/// __y into the lower 64 bits of the destination integer vector __x at
/// the index and of the length specified by __y.
///
/// \headerfile <x86intrin.h>
///
-/// \code
+/// \code
/// This intrinsic corresponds to the \c INSERTQ instruction.
-/// \endcode
+/// \endcode
///
/// \param __x
-/// The destination operand where bits will be inserted. The inserted bits
-/// are defined by the length and by the index of the least significant bit
+/// The destination operand where bits will be inserted. The inserted bits
+/// are defined by the length and by the index of the least significant bit
/// specified by operand __y.
/// \param __y
-/// The source operand containing the bits to be extracted. The extracted
+/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand __y with length specified
-/// by bits [69:64]. These are inserted into the destination at the index
+/// by bits [69:64]. These are inserted into the destination at the index
/// specified by bits [77:72]; all other bits are ignored.
/// If bits [69:64] are zero, the length is interpreted as 64.
-/// If the sum of the index and length is greater than 64, the result is
-/// undefined. If the length and index are both zero, bits [63:0] of
+/// If the sum of the index and length is greater than 64, the result is
+/// undefined. If the length and index are both zero, bits [63:0] of
/// parameter __y are inserted into parameter __x. If the length
-/// is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits
+/// is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits
/// of destination operand __x with the specified bitfields replaced by the
-/// lower bits of source operand __y. The upper 64 bits of the return value
+/// lower bits of source operand __y. The upper 64 bits of the return value
/// are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
-/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
-/// \code
+/// \code
/// This intrinsic corresponds to the \c MOVNTSD instruction.
-/// \endcode
+/// \endcode
///
/// \param __p
/// The 64-bit memory location used to store the register value.
///
/// \headerfile <x86intrin.h>
///
-/// \code
+/// \code
/// This intrinsic corresponds to the \c MOVNTSS instruction.
-/// \endcode
+/// \endcode
///
/// \param __p
/// The 32-bit memory location used to store the register value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
- (__v16qi) __O,
+ (__v16qi) __O,
__M);
}
*
*===-----------------------------------------------------------------------===
*/
-
+
#ifndef __IMMINTRIN_H
#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
#endif
return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
}
-/*
+/*
Vector insert.
We use macros rather than inlines because we only want to accept
invocations where the immediate M is a constant expression.
(((M) & 1) ? 4 : 2), \
(((M) & 1) ? 5 : 3) );})
-/*
+/*
Vector extract.
We use macros rather than inlines because we only want to accept
invocations where the immediate M is a constant expression.
struct __loadu_pd {
__m128d __v;
} __attribute__((__packed__, __may_alias__));
-
+
__m256d __v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)__addr_lo)->__v);
return _mm256_insertf128_pd(__v256, ((struct __loadu_pd*)__addr_hi)->__v, 1);
}
typedef char TM_buff_type[16];
-/* This macro can be used to determine whether a transaction was successfully
+/* This macro can be used to determine whether a transaction was successfully
started from the __TM_begin() and __TM_simple_begin() intrinsic functions
below. */
#define _HTM_TBEGIN_STARTED 1
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pi8(__m64 __m1, __m64 __m2)
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pu8(__m64 __m1, __m64 __m2)
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
}
-
+
static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pu16(__m64 __m1, __m64 __m2)
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
}
{
return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
}
-
+
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
}
-
+
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
{
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
}
-
+
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
{
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
}
-
+
static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_pi16(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+ return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_slli_si64(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psllqi(__m, __count);
+ return (__m64)__builtin_ia32_psllqi(__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sra_pi16(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+ return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sra_pi32(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+ return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srl_pi16(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+ return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srli_pi16(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+ return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srl_pi32(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+ return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srl_si64(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrlq(__m, __count);
+ return (__m64)__builtin_ia32_psrlq(__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_srli_si64(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psrlqi(__m, __count);
+ return (__m64)__builtin_ia32_psrlqi(__m, __count);
}
static __inline__ __m64 __DEFAULT_FN_ATTRS
*
*===-----------------------------------------------------------------------===
*/
-
+
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
(D) = __a[N]; }))
-
+
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
-
+
/* Extract a float from X at index N into the first index of the return. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
-
+
/* Insert int into packed integer array at index. */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
__a[(N) & 15] = (I); \
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
- return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
+ return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
__builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
#define _mm_cmpestri(A, LA, B, LB, M) \
__builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
-
+
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
__builtin_ia32_pcmpistria128((A), (B), (M))
* C99 7.18.1.2 Minimum-width integer types.
* C99 7.18.1.3 Fastest minimum-width integer types.
*
- * The standard requires that exact-width type be defined for 8-, 16-, 32-, and
+ * The standard requires that exact-width type be defined for 8-, 16-, 32-, and
* 64-bit types if they are implemented. Other exact width types are optional.
* This implementation defines an exact-width types for every integer width
* that is represented in the standard integer types.
*
* The standard also requires minimum-width types be defined for 8-, 16-, 32-,
* and 64-bit widths regardless of whether there are corresponding exact-width
- * types.
+ * types.
*
* To accommodate targets that are missing types that are exactly 8, 16, 32, or
* 64 bits wide, this implementation takes an approach of cascading
* suboptimal.
*
* In violation of the standard, some targets do not implement a type that is
- * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
* To accommodate these targets, a required minimum-width type is only
* defined if there exists an exact-width type of equal or greater width.
*/
#endif /* __int_least8_t */
/* prevent glibc sys/types.h from defining conflicting types */
-#ifndef __int8_t_defined
+#ifndef __int8_t_defined
# define __int8_t_defined
#endif /* __int8_t_defined */
*
* The standard requires that integer constant macros be defined for all the
* minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
- * types are required, the corresponding integer constant macros are defined
+ * types are required, the corresponding integer constant macros are defined
* here. This implementation also defines minimum-width types for every other
- * integer width that the target implements, so corresponding macros are
+ * integer width that the target implements, so corresponding macros are
* defined below, too.
*
* These macros are defined using the same successive-shrinking approach as
#endif /* __int_least8_t */
-/* C99 7.18.2.1 Limits of exact-width integer types.
+/* C99 7.18.2.1 Limits of exact-width integer types.
* C99 7.18.2.2 Limits of minimum-width integer types.
* C99 7.18.2.3 Limits of fastest minimum-width integer types.
*
static long double _Complex
_TG_ATTRS
- __tg_pow(long double _Complex __x, long double _Complex __y)
+ __tg_pow(long double _Complex __x, long double _Complex __y)
{return cpowl(__x, __y);}
#undef pow
*
*===-----------------------------------------------------------------------===
*/
-
+
#ifndef __TMMINTRIN_H
#define __TMMINTRIN_H
*
*===-----------------------------------------------------------------------===
*/
-
+
#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H
-
+
#include <mmintrin.h>
typedef int __v4si __attribute__((__vector_size__(16)));
_mm_cvtpi8_ps(__m64 __a)
{
__m64 __b;
-
+
__b = _mm_setzero_si64();
__b = _mm_cmpgt_pi8(__b, __a);
__b = _mm_unpacklo_pi8(__a, __b);
_mm_cvtpu8_ps(__m64 __a)
{
__m64 __b;
-
+
__b = _mm_setzero_si64();
__b = _mm_unpacklo_pi8(__a, __b);
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
__m128 __c;
-
+
__c = _mm_setzero_ps();
__c = _mm_cvtpi32_ps(__c, __b);
__c = _mm_movelh_ps(__c, __c);
_mm_cvtps_pi16(__m128 __a)
{
__m64 __b, __c;
-
+
__b = _mm_cvtps_pi32(__a);
__a = _mm_movehl_ps(__a, __a);
__c = _mm_cvtps_pi32(__a);
-
+
return _mm_packs_pi32(__b, __c);
}
_mm_cvtps_pi8(__m128 __a)
{
__m64 __b, __c;
-
+
__b = _mm_cvtps_pi16(__a);
__c = _mm_setzero_si64();
-
+
return _mm_packs_pi16(__b, __c);
}