From 7e9747b50bcb1be28d4a3236571e8050835497a6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 29 Feb 2020 18:56:53 +0000
Subject: [PATCH] [X86][F16C] Remove cvtph2ps intrinsics and use generic
 half2float conversion (PR37554)

This removes everything except int_x86_avx512_mask_vcvtph2ps_512, which
provides the SAE variant, but even this can use the generic fpext if the
rounding control is the default.

Differential Revision: https://reviews.llvm.org/D75162
---
 clang/lib/CodeGen/CGBuiltin.cpp | 48 +++
 clang/test/CodeGen/avx512f-builtins-constrained.c | 17 +-
 clang/test/CodeGen/avx512f-builtins.c | 14 +-
 clang/test/CodeGen/avx512vl-builtins-constrained.c | 26 +-
 clang/test/CodeGen/avx512vl-builtins.c | 22 +-
 clang/test/CodeGen/f16c-builtins-constrained.c | 13 +-
 clang/test/CodeGen/f16c-builtins.c | 13 +-
 llvm/include/llvm/IR/IntrinsicsX86.td | 12 +-
 llvm/lib/IR/AutoUpgrade.cpp | 20 ++
 llvm/lib/Target/X86/X86IntrinsicsInfo.h | 6 -
 .../Transforms/InstCombine/InstCombineCalls.cpp | 44 ---
 .../CodeGen/X86/avx512-intrinsics-fast-isel.ll | 56 ++++
 llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 82 +++++-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll | 70 -----
 .../CodeGen/X86/avx512vl-intrinsics-fast-isel.ll | 92 ++++++
 .../CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 94 ++++++
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 95 ------
 llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll | 108 +++----
 llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll | 148 ++++++++++
 llvm/test/CodeGen/X86/f16c-intrinsics.ll | 324 +++++----------------
 llvm/test/Transforms/InstCombine/X86/x86-f16c.ll | 13 +-
 21 files changed, 744 insertions(+), 573 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 47b3abd..ba3b14c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10327,6 +10327,46 @@ Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
   return EmitX86CpuIs(CPUStr);
 }
 
+// Convert F16 halfs to floats.
+static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
+                                       ArrayRef<Value *> Ops,
+                                       llvm::Type *DstTy) {
+  assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
+         "Unknown cvtph2ps intrinsic");
+
+  // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
+  if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
+    Intrinsic::ID IID = Intrinsic::x86_avx512_mask_vcvtph2ps_512;
+    Function *F =
+        CGF.CGM.getIntrinsic(IID, {DstTy, Ops[0]->getType(), Ops[1]->getType(),
+                                   Ops[2]->getType(), Ops[3]->getType()});
+    return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
+  }
+
+  unsigned NumDstElts = DstTy->getVectorNumElements();
+  Value *Src = Ops[0];
+
+  // Extract the subvector.
+  if (NumDstElts != Src->getType()->getVectorNumElements()) {
+    assert(NumDstElts == 4 && "Unexpected vector size");
+    uint32_t ShuffleMask[4] = {0, 1, 2, 3};
+    Src = CGF.Builder.CreateShuffleVector(Src, UndefValue::get(Src->getType()),
+                                          ShuffleMask);
+  }
+
+  // Bitcast from vXi16 to vXf16.
+  llvm::Type *HalfTy = llvm::VectorType::get(
+      llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
+  Src = CGF.Builder.CreateBitCast(Src, HalfTy);
+
+  // Perform the fp-extension.
+  Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
+
+  if (Ops.size() >= 3)
+    Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
+  return Res;
+}
+
 // Convert a BF16 to a float.
static Value *EmitX86CvtBF16ToFloatExpr(CodeGenFunction &CGF, const CallExpr *E, @@ -12531,6 +12571,14 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cmpordsd: return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7); + // f16c half2float intrinsics + case X86::BI__builtin_ia32_vcvtph2ps: + case X86::BI__builtin_ia32_vcvtph2ps256: + case X86::BI__builtin_ia32_vcvtph2ps_mask: + case X86::BI__builtin_ia32_vcvtph2ps256_mask: + case X86::BI__builtin_ia32_vcvtph2ps512_mask: + return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType())); + // AVX512 bf16 intrinsics case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue(*this, Ops[2], diff --git a/clang/test/CodeGen/avx512f-builtins-constrained.c b/clang/test/CodeGen/avx512f-builtins-constrained.c index dcddd24..1ccc234 100644 --- a/clang/test/CodeGen/avx512f-builtins-constrained.c +++ b/clang/test/CodeGen/avx512f-builtins-constrained.c @@ -171,21 +171,32 @@ __m128 test_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B){ __m512 test_mm512_cvtph_ps (__m256i __A) { // COMMON-LABEL: test_mm512_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.512 + // COMMONIR: bitcast <4 x i64> %{{.*}} to <16 x i16> + // COMMONIR: bitcast <16 x i16> %{{.*}} to <16 x half> + // UNCONSTRAINED: fpext <16 x half> %{{.*}} to <16 x float> + // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %{{.*}}, metadata !"fpexcept.strict") return _mm512_cvtph_ps (__A); } __m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) { // COMMON-LABEL: test_mm512_mask_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.512 + // COMMONIR: bitcast <4 x i64> %{{.*}} to <16 x i16> + // COMMONIR: bitcast <16 x i16> %{{.*}} to <16 x half> + // UNCONSTRAINED: fpext <16 x half> %{{.*}} to <16 x float> + // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_cvtph_ps (__W,__U,__A); } __m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) { // COMMON-LABEL: test_mm512_maskz_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.512 + // COMMONIR: bitcast <4 x i64> %{{.*}} to <16 x i16> + // COMMONIR: bitcast <16 x i16> %{{.*}} to <16 x half> + // UNCONSTRAINED: fpext <16 x half> %{{.*}} to <16 x float> + // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_cvtph_ps (__U,__A); } diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c index 390ea14f..c193e7d 100644 --- a/clang/test/CodeGen/avx512f-builtins.c +++ b/clang/test/CodeGen/avx512f-builtins.c @@ -9463,21 +9463,29 @@ __m256 test_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) __m512 test_mm512_cvtph_ps (__m256i __A) { // CHECK-LABEL: @test_mm512_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512 + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half> + // CHECK: fpext <16 x half> %{{.*}} to <16 x float> return _mm512_cvtph_ps (__A); } __m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_mask_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512 + // CHECK: bitcast <4 x i64> 
%{{.*}} to <16 x i16> + // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half> + // CHECK: fpext <16 x half> %{{.*}} to <16 x float> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_cvtph_ps (__W,__U,__A); } __m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) { // CHECK-LABEL: @test_mm512_maskz_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512 + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half> + // CHECK: fpext <16 x half> %{{.*}} to <16 x float> + // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_cvtph_ps (__U,__A); } diff --git a/clang/test/CodeGen/avx512vl-builtins-constrained.c b/clang/test/CodeGen/avx512vl-builtins-constrained.c index 0e2aa4e..bc59510 100644 --- a/clang/test/CodeGen/avx512vl-builtins-constrained.c +++ b/clang/test/CodeGen/avx512vl-builtins-constrained.c @@ -8,25 +8,43 @@ __m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm_mask_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.128 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // COMMONIR: bitcast <4 x i16> %{{.*}} to <4 x half> + // UNCONSTRAINED: fpext <4 x half> %{{.*}} to <4 x float> + // CONSTRAINED: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_cvtph_ps(__W, __U, __A); } __m128 test_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm_maskz_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.128 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // COMMONIR: bitcast <4 x i16> %{{.*}} to <4 x half> + // UNCONSTRAINED: fpext <4 x half> %{{.*}} to <4 x float> + // CONSTRAINED: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_cvtph_ps(__U, __A); } __m256 test_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm256_mask_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.256 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: bitcast <8 x i16> %{{.*}} to <8 x half> + // UNCONSTRAINED: fpext <8 x half> %{{.*}} to <8 x float> + // CONSTRAINED: call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_cvtph_ps(__W, __U, __A); } __m256 test_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // COMMON-LABEL: @test_mm256_maskz_cvtph_ps - // COMMONIR: @llvm.x86.avx512.mask.vcvtph2ps.256 + // COMMONIR: bitcast <2 x i64> %{{.*}} to <8 x i16> + // COMMONIR: bitcast <8 x i16> %{{.*}} to <8 x half> + // UNCONSTRAINED: fpext <8 x half> %{{.*}} to <8 x float> + // CONSTRAINED: call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %{{.*}}, metadata !"fpexcept.strict") + // COMMONIR: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_cvtph_ps(__U, __A); } diff --git a/clang/test/CodeGen/avx512vl-builtins.c b/clang/test/CodeGen/avx512vl-builtins.c 
index 5bed16e..8d8fded 100644 --- a/clang/test/CodeGen/avx512vl-builtins.c +++ b/clang/test/CodeGen/avx512vl-builtins.c @@ -9692,25 +9692,39 @@ __m256 test_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { __m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_mask_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_cvtph_ps(__W, __U, __A); } __m128 test_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm_maskz_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> + // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_cvtph_ps(__U, __A); } __m256 test_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_mask_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: fpext <8 x half> %{{.*}} to <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_cvtph_ps(__W, __U, __A); } __m256 test_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { // CHECK-LABEL: @test_mm256_maskz_cvtph_ps - // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256 + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: fpext <8 x half> %{{.*}} to <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_cvtph_ps(__U, __A); } diff --git a/clang/test/CodeGen/f16c-builtins-constrained.c b/clang/test/CodeGen/f16c-builtins-constrained.c index 74cf3d1..ce84155 100644 --- a/clang/test/CodeGen/f16c-builtins-constrained.c +++ b/clang/test/CodeGen/f16c-builtins-constrained.c @@ -13,7 +13,9 @@ float test_cvtsh_ss(unsigned short a) { // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7 - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") // CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _cvtsh_ss(a); } @@ -34,13 +36,18 @@ unsigned short test_cvtss_sh(float a) { __m128 test_mm_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm_cvtph_ps - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict") return _mm_cvtph_ps(a); } __m256 test_mm256_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm256_cvtph_ps - // 
CHECK: call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: call <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half> %{{.*}}, metadata !"fpexcept.strict") return _mm256_cvtph_ps(a); } diff --git a/clang/test/CodeGen/f16c-builtins.c b/clang/test/CodeGen/f16c-builtins.c index ce14187..1616cfb 100644 --- a/clang/test/CodeGen/f16c-builtins.c +++ b/clang/test/CodeGen/f16c-builtins.c @@ -13,7 +13,9 @@ float test_cvtsh_ss(unsigned short a) { // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6 // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7 - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> // CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _cvtsh_ss(a); } @@ -31,13 +33,18 @@ unsigned short test_cvtss_sh(float a) { __m128 test_mm_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm_cvtph_ps - // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <4 x i32> + // CHECK: bitcast <4 x i16> %{{.*}} to <4 x half> + // CHECK: fpext <4 x half> %{{.*}} to <4 x float> return _mm_cvtph_ps(a); } __m256 test_mm256_cvtph_ps(__m128i a) { // CHECK-LABEL: test_mm256_cvtph_ps - // CHECK: call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %{{.*}}) + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: bitcast <8 x i16> %{{.*}} to <8 x half> + // CHECK: fpext <8 x half> %{{.*}} to <8 x float> return _mm256_cvtph_ps(a); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 94a1ac9..4d353c7 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2546,26 +2546,16 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Half float conversion let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
- def int_x86_vcvtph2ps_128 : GCCBuiltin<"__builtin_ia32_vcvtph2ps">, - Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_vcvtph2ps_256 : GCCBuiltin<"__builtin_ia32_vcvtph2ps256">, - Intrinsic<[llvm_v8f32_ty], [llvm_v8i16_ty], [IntrNoMem]>; def int_x86_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>; def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">, Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<1>]>; - def int_x86_avx512_mask_vcvtph2ps_512 : GCCBuiltin<"__builtin_ia32_vcvtph2ps512_mask">, + def int_x86_avx512_mask_vcvtph2ps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg<3>]>; - def int_x86_avx512_mask_vcvtph2ps_256 : GCCBuiltin<"__builtin_ia32_vcvtph2ps256_mask">, - Intrinsic<[llvm_v8f32_ty], [llvm_v8i16_ty, llvm_v8f32_ty, - llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vcvtph2ps_128 : GCCBuiltin<"__builtin_ia32_vcvtph2ps_mask">, - Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty, llvm_v4f32_ty, - llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16i16_ty, llvm_i16_ty], diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index f6e4a6b..28bc5a4 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -204,6 +204,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("avx512.mask.cvtqq2pd.") || // Added in 7.0 updated 9.0 Name.startswith("avx512.mask.cvtuqq2pd.") || // Added in 7.0 updated 9.0 Name.startswith("avx512.mask.cvtdq2ps.") || // Added in 7.0 updated 9.0 + Name == "avx512.mask.vcvtph2ps.128" || // Added in 11.0 + Name == "avx512.mask.vcvtph2ps.256" || // Added in 11.0 Name == "avx512.mask.cvtqq2ps.256" || // Added in 9.0 Name == "avx512.mask.cvtqq2ps.512" || // Added in 9.0 Name == "avx512.mask.cvtuqq2ps.256" || // Added in 9.0 @@ -316,6 +318,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name == "avx.cvtdq2.pd.256" || // Added in 3.9 Name == "avx.cvtdq2.ps.256" || // Added in 7.0 Name == "avx.cvt.ps2.pd.256" || // Added in 3.9 + Name.startswith("vcvtph2ps.") || // Added in 11.0 Name.startswith("avx.vinsertf128.") || // Added in 3.7 Name == "avx2.vinserti128" || // Added in 3.7 Name.startswith("avx512.mask.insert") || // Added in 4.0 @@ -2135,6 +2138,23 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (CI->getNumArgOperands() >= 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (IsX86 && (Name.startswith("avx512.mask.vcvtph2ps.") || + Name.startswith("vcvtph2ps."))) { + Type *DstTy = CI->getType(); + Rep = CI->getArgOperand(0); + Type *SrcTy = Rep->getType(); + unsigned NumDstElts = DstTy->getVectorNumElements(); + if (NumDstElts != SrcTy->getVectorNumElements()) { + assert(NumDstElts == 4 && "Unexpected vector size"); + uint32_t ShuffleMask[4] = {0, 1, 2, 3}; + Rep = Builder.CreateShuffleVector(Rep, Rep, ShuffleMask); + } + Rep = Builder.CreateBitCast( + Rep, VectorType::get(Type::getHalfTy(C), NumDstElts)); + Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps"); + if (CI->getNumArgOperands() >= 3) + Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); } else if (IsX86 && (Name.startswith("avx512.mask.loadu."))) { Rep = 
UpgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 669b688..6dc4860 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -783,10 +783,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS, X86ISD::FSUBS_RND), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK, X86ISD::FSUBS, X86ISD::FSUBS_RND), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE, X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK, @@ -1108,8 +1104,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0), - X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bbc84b9..1fcd853 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2543,50 +2543,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; - case Intrinsic::x86_vcvtph2ps_128: - case Intrinsic::x86_vcvtph2ps_256: { - auto Arg = II->getArgOperand(0); - auto ArgType = cast(Arg->getType()); - auto RetType = cast(II->getType()); - unsigned ArgWidth = ArgType->getNumElements(); - unsigned RetWidth = RetType->getNumElements(); - assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); - assert(ArgType->isIntOrIntVectorTy() && - ArgType->getScalarSizeInBits() == 16 && - "CVTPH2PS input type should be 16-bit integer vector"); - assert(RetType->getScalarType()->isFloatTy() && - "CVTPH2PS output type should be 32-bit float vector"); - - // Constant folding: Convert to generic half to single conversion. - if (isa(Arg)) - return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); - - if (isa(Arg)) { - auto VectorHalfAsShorts = Arg; - if (RetWidth < ArgWidth) { - SmallVector SubVecMask; - for (unsigned i = 0; i != RetWidth; ++i) - SubVecMask.push_back((int)i); - VectorHalfAsShorts = Builder.CreateShuffleVector( - Arg, UndefValue::get(ArgType), SubVecMask); - } - - auto VectorHalfType = - VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); - auto VectorHalfs = - Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType); - auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType); - return replaceInstUsesWith(*II, VectorFloats); - } - - // We only use the lowest lanes of the argument. 
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { - II->setArgOperand(0, V); - return II; - } - break; - } - case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: case Intrinsic::x86_sse_cvttss2si: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index a40273d..1b72205 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -1907,6 +1907,62 @@ entry: ret <4 x float> %vecins.i } +define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) { +; CHECK-LABEL: test_mm512_cvtph_ps: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} +entry: + %0 = bitcast <4 x i64> %__A to <16 x i16> + %1 = bitcast <16 x i16> %0 to <16 x half> + %2 = fpext <16 x half> %1 to <16 x float> + ret <16 x float> %2 +} + +define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) { +; X86-LABEL: test_mm512_mask_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_mask_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__A to <16 x i16> + %1 = bitcast <16 x i16> %0 to <16 x half> + %2 = bitcast i16 %__U to <16 x i1> + %3 = fpext <16 x half> %1 to <16 x float> + %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> %__W + ret <16 x float> %4 +} + +define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) { +; X86-LABEL: test_mm512_maskz_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm512_maskz_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__A to <16 x i16> + %1 = bitcast <16 x i16> %0 to <16 x half> + %2 = bitcast i16 %__U to <16 x i1> + %3 = fpext <16 x half> %1 to <16 x float> + %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> zeroinitializer + ret <16 x float> %4 +} + define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) { ; CHECK-LABEL: test_mm512_cvtps_pd: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a4ab1b0..888829e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -4512,6 +4512,76 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x ret <8 x double> %res2 } +define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_512_sae: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## 
encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_512_rrk: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x13,0xc8] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_512_rrk: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x13,0xc8] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x13,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x13,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_512_rrkz: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x13,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_512_rrkz: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x13,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly + define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: test_valign_q: ; CHECK: ## %bb.0: @@ -4633,14 +4703,14 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI211_0, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_0, kind: FK_Data_4 ; X86-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 
6, value: LCPI211_1, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_1, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X86-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI211_2, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_2, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -4649,14 +4719,14 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI211_0-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_0-4, kind: reloc_riprel_4byte ; X64-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI211_1-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_1-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI211_2-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_2-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index e94f16a..fcc8a15 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1007,76 +1007,6 @@ define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { } declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone -define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_512: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_512_sae: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) { -; X64-LABEL: test_x86_vcvtph2ps_512_rrk: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} -; X64-NEXT: vmovaps %zmm1, %zmm0 -; X64-NEXT: retq -; -; X86-LABEL: test_x86_vcvtph2ps_512_rrk: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvtph2ps %ymm0, %zmm1 
{%k1} -; X86-NEXT: vmovaps %zmm1, %zmm0 -; X86-NEXT: retl - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) { -; X64-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} -; X64-NEXT: retq -; -; X86-LABEL: test_x86_vcvtph2ps_512_sae_rrkz: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z} -; X86-NEXT: retl - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) { -; X64-LABEL: test_x86_vcvtph2ps_512_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} -; X64-NEXT: retq -; -; X86-LABEL: test_x86_vcvtph2ps_512_rrkz: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z} -; X86-NEXT: retl - %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly - define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) { ; X64-LABEL: test_x86_vcvtps2ph_256: ; X64: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 82a19ba..3386e40 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -365,6 +365,98 @@ entry: ret <2 x i64> %1 } +define <4 x float> @test_mm_mask_cvtph_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm_mask_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm_mask_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> + %2 = bitcast <4 x i16> %1 to <4 x half> + %3 = bitcast i8 %__U to <8 x i1> + %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %5 = fpext <4 x half> %2 to <4 x float> + %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> %__W + ret <4 x float> %6 +} + +define <4 x float> @test_mm_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm_maskz_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm_maskz_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> + %2 = bitcast <4 x i16> %1 to <4 x half> + %3 = bitcast i8 %__U to <8 x i1> + %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %5 = fpext <4 x half> %2 to <4 x float> + %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> 
zeroinitializer + ret <4 x float> %6 +} + +define <8 x float> @test_mm256_mask_cvtph_ps(<8 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm256_mask_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1} +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_mask_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = bitcast <8 x i16> %0 to <8 x half> + %2 = bitcast i8 %__U to <8 x i1> + %3 = fpext <8 x half> %1 to <8 x float> + %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> %__W + ret <8 x float> %4 +} + +define <8 x float> @test_mm256_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) { +; X86-LABEL: test_mm256_maskz_cvtph_ps: +; X86: # %bb.0: # %entry +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} +; X86-NEXT: retl +; +; X64-LABEL: test_mm256_maskz_cvtph_ps: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__A to <8 x i16> + %1 = bitcast <8 x i16> %0 to <8 x half> + %2 = bitcast i8 %__U to <8 x i1> + %3 = fpext <8 x half> %1 to <8 x float> + %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> zeroinitializer + ret <8 x float> %4 +} + define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) { ; X86-LABEL: test_mm_mask_cvtps_epi32: ; X86: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index d339e21..9ea5000 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -10247,6 +10247,100 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x fl ret <8 x float> %res2 } +define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_128_rrk: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_rrk: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_128_rrkz: +; X86: # %bb.0: +; 
X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_rrkz: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly + +define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_vcvtph2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_256_rrk: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_256_rrk: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) { +; X86-LABEL: test_x86_vcvtph2ps_256_rrkz: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_256_rrkz: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly + declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8) define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 3c545c8..0512459 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4214,101 +4214,6 @@ define <4 x 
i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 ret <4 x i64> %res2 } -define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1) - ret <4 x float> %res -} - -define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_128_rrk: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_128_rrk: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) - ret <4 x float> %res -} - - -define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_128_rrkz: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_128_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly - -define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_vcvtph2ps_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1) - ret <8 x float> %res -} - -define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_256_rrk: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_256_rrk: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; X64-NEXT: vmovaps %ymm1, 
%ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) - ret <8 x float> %res -} - -define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) { -; X86-LABEL: test_x86_vcvtph2ps_256_rrkz: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_256_rrkz: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask) - ret <8 x float> %res -} - -declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly - define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) { ; X86-LABEL: test_x86_vcvtps2ph_128: ; X86: # %bb.0: diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll index d403fee..e114c20 100644 --- a/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefixes=CHECK,X64 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/f16c-builtins.c define float @test_cvtsh_ss(i16 %a0) nounwind { -; X32-LABEL: test_cvtsh_ss: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovd %eax, %xmm0 -; X32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X32-NEXT: vmovss %xmm0, (%esp) -; X32-NEXT: flds (%esp) -; X32-NEXT: popl %eax -; X32-NEXT: retl +; X86-LABEL: test_cvtsh_ss: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl ; ; X64-LABEL: test_cvtsh_ss: ; X64: # %bb.0: @@ -30,21 +30,23 @@ define float @test_cvtsh_ss(i16 %a0) nounwind { %ins5 = insertelement <8 x i16> %ins4, i16 0, i32 5 %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6 %ins7 = insertelement <8 x i16> %ins6, i16 0, i32 7 - %cvt = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %ins7) + %shuffle = shufflevector <8 x i16> %ins7, <8 x i16> undef, <4 x i32> + %bc = bitcast <4 x i16> %shuffle to <4 x half> + %cvt = fpext <4 x half> %bc to <4 x float> %res = extractelement <4 x float> %cvt, i32 0 ret float %res } define i16 @test_cvtss_sh(float %a0) nounwind { -; X32-LABEL: test_cvtss_sh: -; X32: # %bb.0: -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vxorps %xmm1, %xmm1, 
%xmm1 -; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; X32-NEXT: vmovd %xmm0, %eax -; X32-NEXT: # kill: def $ax killed $ax killed $eax -; X32-NEXT: retl +; X86-LABEL: test_cvtss_sh: +; X86: # %bb.0: +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X86-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl ; ; X64-LABEL: test_cvtss_sh: ; X64: # %bb.0: @@ -64,62 +66,44 @@ define i16 @test_cvtss_sh(float %a0) nounwind { } define <4 x float> @test_mm_cvtph_ps(<2 x i64> %a0) nounwind { -; X32-LABEL: test_mm_cvtph_ps: -; X32: # %bb.0: -; X32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cvtph_ps: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-NEXT: retq +; CHECK-LABEL: test_mm_cvtph_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %arg0) + %shuffle = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> + %bc = bitcast <4 x i16> %shuffle to <4 x half> + %res = fpext <4 x half> %bc to <4 x float> ret <4 x float> %res } define <8 x float> @test_mm256_cvtph_ps(<2 x i64> %a0) nounwind { -; X32-LABEL: test_mm256_cvtph_ps: -; X32: # %bb.0: -; X32-NEXT: vcvtph2ps %xmm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtph_ps: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps %xmm0, %ymm0 -; X64-NEXT: retq +; CHECK-LABEL: test_mm256_cvtph_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %arg0) + %bc = bitcast <8 x i16> %arg0 to <8 x half> + %res = fpext <8 x half> %bc to <8 x float> ret <8 x float> %res } define <2 x i64> @test_mm_cvtps_ph(<4 x float> %a0) nounwind { -; X32-LABEL: test_mm_cvtps_ph: -; X32: # %bb.0: -; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_cvtps_ph: -; X64: # %bb.0: -; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; X64-NEXT: retq +; CHECK-LABEL: test_mm_cvtps_ph: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) %res = bitcast <8 x i16> %cvt to <2 x i64> ret <2 x i64> %res } define <2 x i64> @test_mm256_cvtps_ph(<8 x float> %a0) nounwind { -; X32-LABEL: test_mm256_cvtps_ph: -; X32: # %bb.0: -; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm256_cvtps_ph: -; X64: # %bb.0: -; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; CHECK-LABEL: test_mm256_cvtps_ph: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2ph $0, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) %res = bitcast <8 x i16> %cvt to <2 x i64> ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll new file mode 100644 index 0000000..0d81038 --- /dev/null +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding 
-disable-peephole | FileCheck %s --check-prefixes=AVX,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX,X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX512VL,X86-AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX512VL,X64-AVX512VL + +define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { +; AVX-LABEL: test_x86_vcvtph2ps_128: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_vcvtph2ps_128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly + +define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) { +; X86-LABEL: test_x86_vcvtph2ps_128_m: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_m: +; X64: # %bb.0: +; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] + %load = load <8 x i16>, <8 x i16>* %a + %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} + +define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { +; AVX-LABEL: test_x86_vcvtph2ps_256: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_vcvtph2ps_256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly + +define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind { +; X86-LABEL: test_x86_vcvtph2ps_256_m: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_256_m: +; X64: # %bb.0: +; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-AVX512VL-LABEL: 
test_x86_vcvtph2ps_256_m: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] + %load = load <8 x i16>, <8 x i16>* %a + %res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load) + ret <8 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_scalar(i64* %ptr) { +; X86-LABEL: test_x86_vcvtph2ps_128_scalar: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_scalar: +; X64: # %bb.0: +; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] + %load = load i64, i64* %ptr + %ins1 = insertelement <2 x i64> undef, i64 %load, i32 0 + %ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1 + %bc = bitcast <2 x i64> %ins2 to <8 x i16> + %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc) #2 + ret <4 x float> %res +} + +define <4 x float> @test_x86_vcvtph2ps_128_scalar2(i64* %ptr) { +; X86-LABEL: test_x86_vcvtph2ps_128_scalar2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_scalar2: +; X64: # %bb.0: +; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] + %load = load i64, i64* %ptr + %ins = insertelement <2 x i64> undef, i64 %load, i32 0 + %bc = bitcast <2 x i64> %ins to <8 x i16> + %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc) + ret <4 x float> %res +} diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics.ll b/llvm/test/CodeGen/X86/f16c-intrinsics.ll index 553cf31..9f27833 100644 --- a/llvm/test/CodeGen/X86/f16c-intrinsics.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics.ll @@ -1,233 +1,49 @@ ; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32-AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64-AVX512VL - -define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { -; X32-LABEL: test_x86_vcvtph2ps_128: -; X32: # %bb.0: -; X32-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_128: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly - -define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) { -; X32-LABEL: test_x86_vcvtph2ps_128_m: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_128_m: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] - %load = load <8 x i16>, <8 x i16>* %a - %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { -; X32-LABEL: test_x86_vcvtph2ps_256: -; X32: # %bb.0: -; X32-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_256: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256: -; 
X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1] - ret <8 x float> %res -} -declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly - -define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind { -; X32-LABEL: test_x86_vcvtph2ps_256_m: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtph2ps_256_m: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] - %load = load <8 x i16>, <8 x i16>* %a - %res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load) - ret <8 x float> %res -} +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX,X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX512VL,X86-AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefixes=AVX512VL,X64-AVX512VL define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) { -; X32-LABEL: test_x86_vcvtps2ph_128: -; X32: # %bb.0: -; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtps2ph_128: -; X64: # %bb.0: -; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] +; AVX-LABEL: test_x86_vcvtps2ph_128: +; AVX: # %bb.0: +; AVX-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_vcvtps2ph_128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1] 
ret <8 x i16> %res } declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) { -; X32-LABEL: test_x86_vcvtps2ph_256: -; X32: # %bb.0: -; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] -; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtps2ph_256: -; X64: # %bb.0: -; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] -; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] -; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] -; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] +; AVX-LABEL: test_x86_vcvtps2ph_256: +; AVX: # %bb.0: +; AVX-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] +; AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_vcvtps2ph_256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] +; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly -define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) { -; X32-LABEL: test_x86_vcvtps2ph_128_scalar: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtps2ph_128_scalar: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] - %load = load i64, i64* %ptr - %ins1 = insertelement <2 x i64> undef, i64 %load, i32 0 - %ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1 - %bc = bitcast <2 x i64> %ins2 to <8 x i16> - %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc) #2 - ret <4 x float> %res -} - -define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) { -; X32-LABEL: test_x86_vcvtps2ph_128_scalar2: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: 
[0xc4,0xe2,0x79,0x13,0x00] -; X32-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_x86_vcvtps2ph_128_scalar2: -; X64: # %bb.0: -; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] -; X64-NEXT: retq # encoding: [0xc3] -; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2: -; X32-AVX512VL: # %bb.0: -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2: -; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] -; X64-AVX512VL-NEXT: retq # encoding: [0xc3] - %load = load i64, i64* %ptr - %ins = insertelement <2 x i64> undef, i64 %load, i32 0 - %bc = bitcast <2 x i64> %ins to <8 x i16> - %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc) - ret <4 x float> %res -} define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind { -; X32-LABEL: test_x86_vcvtps2ph_256_m: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03] -; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X32-NEXT: retl # encoding: [0xc3] +; X86-LABEL: test_x86_vcvtps2ph_256_m: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_256_m: ; X64: # %bb.0: # %entry @@ -235,12 +51,12 @@ define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) n ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] ; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m: -; X32-AVX512VL: # %bb.0: # %entry -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03] -; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; X86-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m: +; X86-AVX512VL: # %bb.0: # %entry +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03] +; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m: ; X64-AVX512VL: # %bb.0: # %entry @@ -254,22 +70,22 @@ entry: } define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) nounwind { -; X32-LABEL: test_x86_vcvtps2ph_128_m: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] -; X32-NEXT: retl # encoding: [0xc3] +; X86-LABEL: test_x86_vcvtps2ph_128_m: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_x86_vcvtps2ph_128_m: ; X64: # %bb.0: # %entry ; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] ; X64-NEXT: retq # encoding: [0xc3] ; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: -; X32-AVX512VL: # %bb.0: # %entry -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; X86-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: +; X86-AVX512VL: # %bb.0: # %entry +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: ; X64-AVX512VL: # %bb.0: # %entry @@ -282,60 +98,60 @@ entry: ret void } -define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4x32) #0 { -; X32-LABEL: test_x86_vcvtps2ph_128_m2: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] -; X32-NEXT: retl # encoding: [0xc3] +define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4X86) #0 { +; X86-LABEL: test_x86_vcvtps2ph_128_m2: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_m2: ; X64: # %bb.0: # %entry ; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] ; X64-NEXT: retq # encoding: [0xc3] ; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2: -; X32-AVX512VL: # %bb.0: # %entry -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; X86-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2: +; X86-AVX512VL: # %bb.0: # %entry +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2: ; X64-AVX512VL: # %bb.0: # %entry ; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3) + %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4X86, i32 3) %1 = bitcast <8 x i16> %0 to <2 x double> %vecext = extractelement <2 x double> %1, i32 0 store double %vecext, double* %hf4x16, align 8 ret void } -define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4x32) #0 { -; X32-LABEL: test_x86_vcvtps2ph_128_m3: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] -; X32-NEXT: retl # encoding: [0xc3] +define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4X86) #0 { +; X86-LABEL: 
test_x86_vcvtps2ph_128_m3: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_m3: ; X64: # %bb.0: # %entry ; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] ; X64-NEXT: retq # encoding: [0xc3] ; -; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3: -; X32-AVX512VL: # %bb.0: # %entry -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] -; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; X86-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3: +; X86-AVX512VL: # %bb.0: # %entry +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3: ; X64-AVX512VL: # %bb.0: # %entry ; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3) + %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4X86, i32 3) %1 = bitcast <8 x i16> %0 to <2 x i64> %vecext = extractelement <2 x i64> %1, i32 0 store i64 %vecext, i64* %hf4x16, align 8 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll index dc0f3e4..19a850c 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-f16c.ll @@ -5,14 +5,16 @@ declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) ; -; Vector Demanded Bits +; Vector Demanded Elts ; ; Only bottom 4 elements required. define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) { ; CHECK-LABEL: @demand_vcvtph2ps_128( -; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> [[A:%.*]]) -; CHECK-NEXT: ret <4 x float> [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <4 x half> +; CHECK-NEXT: [[CVTPH2PS:%.*]] = fpext <4 x half> [[TMP2]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[CVTPH2PS]] ; %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> %2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) @@ -23,8 +25,9 @@ define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) { define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) { ; CHECK-LABEL: @demand_vcvtph2ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> [[TMP1]]) -; CHECK-NEXT: ret <8 x float> [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half> +; CHECK-NEXT: [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float> +; CHECK-NEXT: ret <8 x float> [[CVTPH2PS]] ; %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1) -- 2.7.4
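
For readers skimming the updated CHECK lines, the following is a minimal, self-contained LLVM IR sketch (illustrative only, not part of the patch; the function name is made up) of the generic form the upgraded 128-bit conversion takes: a low-subvector shuffle, a bitcast to <4 x half>, and an fpext. This is the pattern the fast-isel, auto-upgrade and InstCombine tests above now expect in place of the legacy intrinsic call.

; Illustrative sketch only (hypothetical function name): the generic-IR
; equivalent of the legacy llvm.x86.vcvtph2ps.128 intrinsic.
define <4 x float> @example_cvtph2ps_128(<8 x i16> %src) {
  ; Keep only the low four 16-bit lanes that the conversion actually reads.
  %lo = shufflevector <8 x i16> %src, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Reinterpret those lanes as IEEE half values.
  %halves = bitcast <4 x i16> %lo to <4 x half>
  ; Widen to single precision.
  %cvt = fpext <4 x half> %halves to <4 x float>
  ret <4 x float> %cvt
}

Feeding this through llc with -mattr=+avx,+f16c (or -mattr=+avx512vl) should select the same single vcvtph2ps instruction that the codegen tests above check for.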