From df08b3493869540bad5d4b040dae814e078b411d Mon Sep 17 00:00:00 2001
From: Warren Ristow <warren.ristow@sony.com>
Date: Tue, 26 Apr 2022 14:33:14 -0700
Subject: [PATCH] [NFC] Cleanup miscellaneous header items

- Explain the use of the _MM_SHUFFLE and _MM_SHUFFLE2 macros
- Update some doxygen parameter descriptions to match the implementations
- Add "see also" doxygen tags to some intrinsics
- Minor clang-format changes

Reviewers: RKSimon

Differential Revision: https://reviews.llvm.org/D124469
---
 clang/lib/Headers/__wmmintrin_pclmul.h | 20 +++---
 clang/lib/Headers/avxintrin.h          | 93 ++++++++++++++++++--------
 clang/lib/Headers/bmiintrin.h          |  4 ++
 clang/lib/Headers/emmintrin.h          | 56 +++++++++++-----
 clang/lib/Headers/smmintrin.h          |  4 +-
 clang/lib/Headers/xmmintrin.h          | 12 +++-
 6 files changed, 127 insertions(+), 62 deletions(-)
diff --git a/clang/lib/Headers/__wmmintrin_pclmul.h b/clang/lib/Headers/__wmmintrin_pclmul.h
index fef4b93dbb43..c9a6d50bdc89 100644
--- a/clang/lib/Headers/__wmmintrin_pclmul.h
+++ b/clang/lib/Headers/__wmmintrin_pclmul.h
@@ -22,23 +22,23 @@
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
 ///
-/// \param __X
+/// \param X
 ///    A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __Y
+/// \param Y
 ///    A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __I
+/// \param I
 ///    An immediate value specifying which 64-bit values to select from the
-///    operands. Bit 0 is used to select a value from operand \a __X, and bit
-///    4 is used to select a value from operand \a __Y: \n
-///    Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
-///    Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
-///    Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
-///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+///    operands. Bit 0 is used to select a value from operand \a X, and bit
+///    4 is used to select a value from operand \a Y: \n
+///    Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
+///    Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
+///    Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
+///    Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
 /// \returns The 128-bit integer vector containing the result of the carry-less
 ///    multiplication of the selected 64-bit values.
 #define _mm_clmulepi64_si128(X, Y, I) \
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index df2d1a2690d8..a8f953c260c2 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -1504,7 +1504,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
 ///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
 ///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
-///    11: Bits [127:96] and [255:224] are copied from the selected operand.
+///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
 #define _mm256_shuffle_ps(a, b, mask) \
   ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
@@ -1953,12 +1956,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// int _mm256_extract_epi32(__m256i X, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A 256-bit vector of [8 x i32].
-/// \param __imm
+/// \param N
 ///    An immediate integer operand with bits [2:0] determining which vector
 ///    element is extracted and returned.
 /// \returns A 32-bit integer containing the extracted 32 bits of extended
@@ -1971,12 +1978,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// int _mm256_extract_epi16(__m256i X, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A 256-bit integer vector of [16 x i16].
-/// \param __imm
+/// \param N
 ///    An immediate integer operand with bits [3:0] determining which vector
 ///    element is extracted and returned.
 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended
@@ -1990,12 +2001,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// int _mm256_extract_epi8(__m256i X, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A 256-bit integer vector of [32 x i8].
-/// \param __imm
+/// \param N
 ///    An immediate integer operand with bits [4:0] determining which vector
 ///    element is extracted and returned.
 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended
@@ -2010,12 +2025,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// long long _mm256_extract_epi64(__m256i X, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A 256-bit integer vector of [4 x i64].
-/// \param __imm
+/// \param N
 ///    An immediate integer operand with bits [1:0] determining which vector
 ///    element is extracted and returned.
 /// \returns A 64-bit integer containing the extracted 64 bits of extended
@@ -2030,18 +2049,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A vector of [8 x i32] to be used by the insert operation.
-/// \param __b
+/// \param I
 ///    An integer value. The replacement value for the insert operation.
-/// \param __imm
+/// \param N
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-///    \a __imm with \a __b.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+///    \a N with \a I.
 #define _mm256_insert_epi32(X, I, N) \
   ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                         (int)(I), (int)(N)))
@@ -2053,18 +2076,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A vector of [16 x i16] to be used by the insert operation.
-/// \param __b
+/// \param I
 ///    An i16 integer value. The replacement value for the insert operation.
-/// \param __imm
+/// \param N
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-///    \a __imm with \a __b.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+///    \a N with \a I.
 #define _mm256_insert_epi16(X, I, N) \
   ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                          (int)(I), (int)(N)))
@@ -2075,18 +2102,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A vector of [32 x i8] to be used by the insert operation.
-/// \param __b
+/// \param I
 ///    An i8 integer value. The replacement value for the insert operation.
-/// \param __imm
+/// \param N
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-///    \a __imm with \a __b.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+///    \a N with \a I.
 #define _mm256_insert_epi8(X, I, N) \
   ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                          (int)(I), (int)(N)))
@@ -2098,18 +2129,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
 ///   instruction.
 ///
-/// \param __a
+/// \param X
 ///    A vector of [4 x i64] to be used by the insert operation.
-/// \param __b
+/// \param I
 ///    A 64-bit integer value. The replacement value for the insert operation.
-/// \param __imm
+/// \param N
 ///    An immediate integer specifying the index of the vector element to be
 ///    replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-///     \a __imm with \a __b.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+///     \a N with \a I.
 #define _mm256_insert_epi64(X, I, N) \
   ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                         (long long)(I), (int)(N)))
diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h
index f583c215f919..0db8ddfa0cbf 100644
--- a/clang/lib/Headers/bmiintrin.h
+++ b/clang/lib/Headers/bmiintrin.h
@@ -47,6 +47,7 @@ __tzcnt_u16(unsigned short __X)
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 32-bit integer containing the number of trailing zero
 ///    bits in the operand.
+/// \see _mm_tzcnt_32
 static __inline__ unsigned int __RELAXED_FN_ATTRS
 __tzcnt_u32(unsigned int __X)
 {
@@ -63,6 +64,7 @@ __tzcnt_u32(unsigned int __X)
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
 /// \returns An 32-bit integer containing the number of trailing zero bits in
 ///    the operand.
+/// \see __tzcnt_u32
 static __inline__ int __RELAXED_FN_ATTRS
 _mm_tzcnt_32(unsigned int __X)
 {
@@ -83,6 +85,7 @@ _mm_tzcnt_32(unsigned int __X)
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 64-bit integer containing the number of trailing zero
 ///    bits in the operand.
+/// \see _mm_tzcnt_64
 static __inline__ unsigned long long __RELAXED_FN_ATTRS
 __tzcnt_u64(unsigned long long __X)
 {
@@ -99,6 +102,7 @@ __tzcnt_u64(unsigned long long __X)
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
 /// \returns An 64-bit integer containing the number of trailing zero bits in
 ///    the operand.
+/// \see __tzcnt_u64
 static __inline__ long long __RELAXED_FN_ATTRS
 _mm_tzcnt_64(unsigned long long __X)
 {
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 2078c7f0c11a..c1e2915b6cb2 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4126,21 +4126,25 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// int _mm_extract_epi16(__m128i a, const int imm);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
 ///
-/// \param __a
+/// \param a
 ///    A 128-bit integer vector.
-/// \param __imm
-///    An immediate value. Bits [2:0] selects values from \a __a to be assigned
+/// \param imm
+///    An immediate value. Bits [2:0] select values from \a a to be assigned
 ///    to bits[15:0] of the result. \n
-///    000: assign values from bits [15:0] of \a __a. \n
-///    001: assign values from bits [31:16] of \a __a. \n
-///    010: assign values from bits [47:32] of \a __a. \n
-///    011: assign values from bits [63:48] of \a __a. \n
-///    100: assign values from bits [79:64] of \a __a. \n
-///    101: assign values from bits [95:80] of \a __a. \n
-///    110: assign values from bits [111:96] of \a __a. \n
-///    111: assign values from bits [127:112] of \a __a.
+///    000: assign values from bits [15:0] of \a a. \n
+///    001: assign values from bits [31:16] of \a a. \n
+///    010: assign values from bits [47:32] of \a a. \n
+///    011: assign values from bits [63:48] of \a a. \n
+///    100: assign values from bits [79:64] of \a a. \n
+///    101: assign values from bits [95:80] of \a a. \n
+///    110: assign values from bits [111:96] of \a a. \n
+///    111: assign values from bits [127:112] of \a a.
 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
 #define _mm_extract_epi16(a, imm)                                              \
@@ -4154,18 +4158,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
 ///
 /// \headerfile <x86intrin.h>
 ///
+/// \code
+/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
+/// \endcode
+///
 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
 ///
-/// \param __a
+/// \param a
 ///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
 ///    result and then one of the eight elements in the result is replaced by
-///    the lower 16 bits of \a __b.
-/// \param __b
+///    the lower 16 bits of \a b.
+/// \param b
 ///    An integer. The lower 16 bits of this parameter are written to the
-///    result beginning at an offset specified by \a __imm.
-/// \param __imm
+///    result beginning at an offset specified by \a imm.
+/// \param imm
 ///    An immediate value specifying the bit offset in the result at which the
-///    lower 16 bits of \a __b are written.
+///    lower 16 bits of \a b are written.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi16(a, b, imm)                                            \
   ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
@@ -4213,7 +4221,10 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
 ///    00: assign values from bits [31:0] of \a a. \n
 ///    01: assign values from bits [63:32] of \a a. \n
 ///    10: assign values from bits [95:64] of \a a. \n
-///    11: assign values from bits [127:96] of \a a.
+///    11: assign values from bits [127:96] of \a a. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shuffle_epi32(a, imm)                                              \
   ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
@@ -4244,6 +4255,9 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
 ///    01: assign values from bits [31:16] of \a a. \n
 ///    10: assign values from bits [47:32] of \a a. \n
 ///    11: assign values from bits [63:48] of \a a. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflelo_epi16(a, imm)                                            \
   ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
@@ -4274,6 +4288,9 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
 ///    01: assign values from bits [95:80] of \a a. \n
 ///    10: assign values from bits [111:96] of \a a. \n
 ///    11: assign values from bits [127:112] of \a a. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflehi_epi16(a, imm)                                            \
   ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
@@ -4617,6 +4634,9 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
 ///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
 ///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
 ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
+///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
+///    <c>[b1, b0]</c>.
 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
 #define _mm_shuffle_pd(a, b, i)                                                \
   ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 52b2f6f15bbc..46fb7bcd4e09 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1213,8 +1213,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
-///    extended to 16-bit values.
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+///    sign-extended to 16-bit values.
 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 1612d3d2773d..4aa70d6e55a6 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2086,7 +2086,7 @@ _mm_storer_ps(float *__p, __m128 __a)
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// void _mm_prefetch(const void * a, const int sel);
+/// void _mm_prefetch(const void *a, const int sel);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
@@ -2360,7 +2360,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    00: assigned from bits [15:0] of \a a. \n
 ///    01: assigned from bits [31:16] of \a a. \n
 ///    10: assigned from bits [47:32] of \a a. \n
-///    11: assigned from bits [63:48] of \a a.
+///    11: assigned from bits [63:48] of \a a. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) \
   ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
@@ -2602,7 +2605,10 @@ void _mm_setcsr(unsigned int __i);
 ///    00: Bits [31:0] copied from the specified operand. \n
 ///    01: Bits [63:32] copied from the specified operand. \n
 ///    10: Bits [95:64] copied from the specified operand. \n
-///    11: Bits [127:96] copied from the specified operand.
+///    11: Bits [127:96] copied from the specified operand. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) \
   ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-- 
2.34.1