From 96d02f550397c46278924dd1081b5b33da270a05 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 29 Apr 2017 17:17:06 +0000
Subject: [PATCH] [X86][AVX] Added support for _mm256_zext* helper intrinsics
 (PR32839)

llvm-svn: 301749
---
 clang/lib/Headers/avx512fintrin.h     | 110 ++++++++++++++++++++++++++++++++++
 clang/lib/Headers/avxintrin.h         |  55 +++++++++++++++++
 clang/test/CodeGen/avx-builtins.c     |  21 +++++++
 clang/test/CodeGen/avx512f-builtins.c |  41 +++++++++++++
 4 files changed, 227 insertions(+)

diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index d8535f7..b556d04 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -528,6 +528,116 @@ _mm512_mask2int(__mmask16 __a)
   return (int)__a;
 }
 
+/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The upper 384 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_zextpd128_pd512(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
+///    256-bit floating-point vector of [4 x double]. The lower 256 bits
+///    contain the value of the source vector. The upper 256 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_zextpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The upper 384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_zextps128_ps512(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
+///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
+///    the value of the source vector. The upper 256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_zextps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// \brief Constructs a 512-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The upper
+///    384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The upper 384 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_zextsi128_si512(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// \brief Constructs a 512-bit integer vector from a 256-bit integer vector.
+///    The lower 256 bits contain the value of the source vector. The upper
+///    256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
+///    the parameter. The upper 256 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_zextsi256_si512(__m256i __a)
+{
+  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
 /* Bitwise operators */
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi32(__m512i __a, __m512i __b)
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 5381878..cdb7aa4 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -4523,6 +4523,61 @@ _mm256_castsi128_si256(__m128i __a)
   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
 }
 
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The upper 128 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_zextpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The upper 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_zextps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The upper
+///    128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The upper 128 bits are set to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_zextsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
+}
+
 /*
    Vector insert.
    We use macros rather than inlines because we only want to accept
diff --git a/clang/test/CodeGen/avx-builtins.c b/clang/test/CodeGen/avx-builtins.c
index 4832664..90f428e 100644
--- a/clang/test/CodeGen/avx-builtins.c
+++ b/clang/test/CodeGen/avx-builtins.c
@@ -1386,6 +1386,27 @@ void test_mm256_zeroupper() {
   return _mm256_zeroupper();
 }
 
+__m256d test_mm256_zextpd128_pd256(__m128d A) {
+  // CHECK-LABEL: test_mm256_zextpd128_pd256
+  // CHECK: store <2 x double> zeroinitializer
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm256_zextpd128_pd256(A);
+}
+
+__m256 test_mm256_zextps128_ps256(__m128 A) {
+  // CHECK-LABEL: test_mm256_zextps128_ps256
+  // CHECK: store <4 x float> zeroinitializer
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_zextps128_ps256(A);
+}
+
+__m256i test_mm256_zextsi128_si256(__m128i A) {
+  // CHECK-LABEL: test_mm256_zextsi128_si256
+  // CHECK: store <2 x i64> zeroinitializer
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm256_zextsi128_si256(A);
+}
+
 double test_mm256_cvtsd_f64(__m256d __a)
 {
  // CHECK-LABEL: @test_mm256_cvtsd_f64
diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c
index 3ae8014..c66d836 100644
--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/avx512f-builtins.c
@@ -8387,3 +8387,44 @@ __m512 test_mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A){
   return _mm512_mask_abs_ps( __W, __U, __A);
 }
 
+__m512d test_mm512_zextpd128_pd512(__m128d A) {
+  // CHECK-LABEL: test_mm512_zextpd128_pd512
+  // CHECK: store <2 x double> zeroinitializer
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  return _mm512_zextpd128_pd512(A);
+}
+
+__m512d test_mm512_zextpd256_pd512(__m256d A) {
+  // CHECK-LABEL: test_mm512_zextpd256_pd512
+  // CHECK: store <4 x double> zeroinitializer
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm512_zextpd256_pd512(A);
+}
+
+__m512 test_mm512_zextps128_ps512(__m128 A) {
+  // CHECK-LABEL: test_mm512_zextps128_ps512
+  // CHECK: store <4 x float> zeroinitializer
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  return _mm512_zextps128_ps512(A);
+}
+
+__m512 test_mm512_zextps256_ps512(__m256 A) {
+  // CHECK-LABEL: test_mm512_zextps256_ps512
+  // CHECK: store <8 x float> zeroinitializer
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm512_zextps256_ps512(A);
+}
+
+__m512i test_mm512_zextsi128_si512(__m128i A) {
+  // CHECK-LABEL: test_mm512_zextsi128_si512
+  // CHECK: store <2 x i64> zeroinitializer
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  return _mm512_zextsi128_si512(A);
+}
+
+__m512i test_mm512_zextsi256_si512(__m256i A) {
+  // CHECK-LABEL: test_mm512_zextsi256_si512
+  // CHECK: store <4 x i64> zeroinitializer
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm512_zextsi256_si512(A);
+}
-- 
2.7.4