From a17d627abbf9bf3825770a7d0afc89785954b6f5 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 11 May 2018 21:59:34 +0000
Subject: [PATCH] [X86] Remove and autoupgrade a bunch of FMA intrinsics that
 are no longer used by clang.

llvm-svn: 332146
---
 llvm/include/llvm/IR/IntrinsicsX86.td              |  88 --------
 llvm/lib/IR/AutoUpgrade.cpp                        |  83 ++++++++
 llvm/lib/Target/X86/X86IntrinsicsInfo.h            |  22 --
 .../Transforms/InstCombine/InstCombineCalls.cpp    |   6 -
 .../InstCombine/InstCombineSimplifyDemanded.cpp    |   6 -
 llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll     |  20 +-
 llvm/test/CodeGen/X86/fma-fneg-combine.ll          |  16 +-
 llvm/test/Transforms/InstCombine/X86/x86-fma.ll    | 233 ---------------------
 8 files changed, 95 insertions(+), 379 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 2cdb49e8..558bec0 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2200,78 +2200,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
                 [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                 [IntrNoMem]>;
-  def int_x86_fma_vfmsub_ss : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsub_sd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsub_ps : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsub_pd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsub_ps_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v8f32_ty],
-                [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsub_pd_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f64_ty],
-                [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_ss : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_sd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_ps : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_pd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_ps_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v8f32_ty],
-                [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_pd_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f64_ty],
-                [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_ss : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_sd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_ps : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_pd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_ps_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v8f32_ty],
-                [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_pd_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f64_ty],
-                [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
-                [IntrNoMem]>;
   def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
       Intrinsic<[llvm_v4f32_ty],
                 [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
                 [IntrNoMem]>;
@@ -2290,22 +2218,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
       Intrinsic<[llvm_v4f64_ty],
                 [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                 [IntrNoMem]>;
-  def int_x86_fma_vfmsubadd_ps : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsubadd_pd : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsubadd_ps_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v8f32_ty],
-                [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma_vfmsubadd_pd_256 : // TODO: remove this intrinsic
-      Intrinsic<[llvm_v4f64_ty],
-                [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
-                [IntrNoMem]>;
 
   def int_x86_avx512_mask_vfmadd_pd_128 :
           GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask">,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index bd1bbd3..a2bc447 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -87,6 +87,10 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   if (Name=="ssse3.pabs.b.128" || // Added in 6.0
       Name=="ssse3.pabs.w.128" || // Added in 6.0
       Name=="ssse3.pabs.d.128" || // Added in 6.0
+      Name.startswith("fma.vfmsub.") || // Added in 7.0
+      Name.startswith("fma.vfmsubadd.") || // Added in 7.0
+      Name.startswith("fma.vfnmadd.") || // Added in 7.0
+      Name.startswith("fma.vfnmsub.") || // Added in 7.0
       Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
       Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
       Name.startswith("avx512.kunpck") || //added in 6.0
@@ -2360,6 +2364,85 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                             CI->getArgOperand(2));
     }
+  } else if (IsX86 && Name.startswith("fma.vfmsub")) {
+    // Handle FMSUB and FMSUBADD.
+    unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+    unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+    Intrinsic::ID IID;
+    if (Name[10] == '.' && Name[11] == 'p') {
+      // Packed FMSUB
+      if (VecWidth == 128 && EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmadd_ps;
+      else if (VecWidth == 128 && EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmadd_pd;
+      else if (VecWidth == 256 && EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmadd_ps_256;
+      else if (VecWidth == 256 && EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmadd_pd_256;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+    } else if (Name[10] == '.' && Name[11] == 's') {
+      // Scalar FMSUB
+      if (EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmadd_ss;
+      else if (EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmadd_sd;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+    } else {
+      // FMSUBADD
+      if (VecWidth == 128 && EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmaddsub_ps;
+      else if (VecWidth == 128 && EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmaddsub_pd;
+      else if (VecWidth == 256 && EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmaddsub_ps_256;
+      else if (VecWidth == 256 && EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmaddsub_pd_256;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+    }
+    Value *Arg2 = Builder.CreateFNeg(CI->getArgOperand(2));
+    Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), Arg2 };
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+                             Ops);
+  } else if (IsX86 && (Name.startswith("fma.vfnmadd.") ||
+                       Name.startswith("fma.vfnmsub."))) {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+    Value *Arg2 = CI->getArgOperand(2);
+    unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+    unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+    Intrinsic::ID IID;
+    if (Name[12] == 'p') {
+      // Packed FNMADD/FNMSUB
+      Arg0 = Builder.CreateFNeg(Arg0);
+      if (VecWidth == 128 && EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmadd_ps;
+      else if (VecWidth == 128 && EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmadd_pd;
+      else if (VecWidth == 256 && EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmadd_ps_256;
+      else if (VecWidth == 256 && EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmadd_pd_256;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+    } else {
+      // Scalar FNMADD/FNMSUB
+      Arg1 = Builder.CreateFNeg(Arg1); // Arg0 is passthru so invert Arg1.
+      if (EltWidth == 32)
+        IID = Intrinsic::x86_fma_vfmadd_ss;
+      else if (EltWidth == 64)
+        IID = Intrinsic::x86_fma_vfmadd_sd;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+    }
+    // Invert for FNMSUB.
+    if (Name[8] == 's')
+      Arg2 = Builder.CreateFNeg(Arg2);
+    Value *Ops[] = { Arg0, Arg1, Arg2 };
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+                             Ops);
   } else if (IsX86 && Name.startswith("avx512.mask.") &&
              upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
     // Rep will be updated by the call in the condition.
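
Note: the two new upgrade paths above express every removed FMSUB/FMSUBADD/FNMADD/FNMSUB intrinsic in terms of the surviving FMADD/FMADDSUB intrinsics plus explicit negations. As a rough sketch of the resulting IR (the value names are illustrative, and CreateFNeg at this point in time materializes as an fsub from -0.0), a packed call such as

  %r = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c)

is rewritten to approximately

  ; negate the addend so that a*b - c becomes a*b + (-c)
  %neg.c = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %r = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %neg.c)

while a scalar call such as @llvm.x86.fma.vfnmadd.ss negates the second multiplicand rather than the first, because the upper elements of the result pass through from the first operand:

  ; -(a[0]*b[0]) + c[0] == a[0]*(-b[0]) + c[0], upper lanes taken from %a
  %neg.b = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
  %r = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a, <4 x float> %neg.b, <4 x float> %c)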
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index ad95d4e..2ffa8df 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1536,28 +1536,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
-  X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
-  X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
-  X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
-  X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
-  X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
   X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
   X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 82a26f3..c323bfc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2402,13 +2402,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
   case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
   case Intrinsic::x86_fma_vfmadd_ss:
-  case Intrinsic::x86_fma_vfmsub_ss:
-  case Intrinsic::x86_fma_vfnmadd_ss:
-  case Intrinsic::x86_fma_vfnmsub_ss:
   case Intrinsic::x86_fma_vfmadd_sd:
-  case Intrinsic::x86_fma_vfmsub_sd:
-  case Intrinsic::x86_fma_vfnmadd_sd:
-  case Intrinsic::x86_fma_vfnmsub_sd:
   case Intrinsic::x86_sse_cmp_ss:
   case Intrinsic::x86_sse_min_ss:
   case Intrinsic::x86_sse_max_ss:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a475a27..7640785 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1387,13 +1387,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
   case Intrinsic::x86_avx512_mask_max_sd_round:
   case Intrinsic::x86_avx512_mask_min_sd_round:
   case Intrinsic::x86_fma_vfmadd_ss:
-  case Intrinsic::x86_fma_vfmsub_ss:
-  case Intrinsic::x86_fma_vfnmadd_ss:
-  case Intrinsic::x86_fma_vfnmsub_ss:
   case Intrinsic::x86_fma_vfmadd_sd:
-  case Intrinsic::x86_fma_vfmsub_sd:
-  case Intrinsic::x86_fma_vfnmadd_sd:
-  case Intrinsic::x86_fma_vfnmsub_sd:
   case Intrinsic::x86_avx512_mask_vfmadd_ss:
   case Intrinsic::x86_avx512_mask_vfmadd_sd:
   case Intrinsic::x86_avx512_maskz_vfmadd_ss:
diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
index 212a3ac..a6e3461 100644
--- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -43,16 +43,16 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; X32-LABEL: test3:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
-; X32-NEXT:    vbroadcastss {{\.LCPI.*}}, %xmm1
-; X32-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; X32-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; X32-NEXT:    vxorps %xmm3, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
-; X64-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
-; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
+; X64-NEXT:    vxorps %xmm3, %xmm0, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
@@ -81,16 +81,12 @@ entry:
 define <8 x float> @test5(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
 ; X32-LABEL: test5:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    vbroadcastss {{\.LCPI.*}}, %ymm3
-; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm2
-; X32-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; X32-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test5:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
-; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm2
-; X64-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
+; X64-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
 ; X64-NEXT:    retq
 entry:
   %sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine.ll b/llvm/test/CodeGen/X86/fma-fneg-combine.ll
index a0e919d..6074406 100644
--- a/llvm/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine.ll
@@ -89,18 +89,10 @@ entry:
 }
 
 define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
-; SKX-LABEL: test8:
-; SKX:       # %bb.0: # %entry
-; SKX-NEXT:    vxorps {{.*}}(%rip){1to8}, %ymm2, %ymm2
-; SKX-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
-; SKX-NEXT:    retq
-;
-; KNL-LABEL: test8:
-; KNL:       # %bb.0: # %entry
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
-; KNL-NEXT:    vxorps %ymm3, %ymm2, %ymm2
-; KNL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT:    retq
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
+; CHECK-NEXT:    retq
 entry:
   %sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
   %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2
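
Note: the simplified CHECK lines in the two CodeGen tests above follow from the autoupgrade rather than from any backend change. In test8, for instance, @llvm.x86.fma.vfmsub.ps.256 is upgraded to @llvm.x86.fma.vfmadd.ps.256 with a negated third operand; since the test already negates %c itself, the IR reaching the backend looks roughly like the sketch below (the %neg name is illustrative), and the double negation folds away during SelectionDAG combining, leaving the plain vfmadd213ps on %c that the new checks expect:

  %sub.c = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c        ; negation written by the test
  %neg = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %sub.c      ; negation inserted by the upgrade
  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %neg)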
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-fma.ll b/llvm/test/Transforms/InstCombine/X86/x86-fma.ll
index 0d27d32..736eca3 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-fma.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-fma.ll
@@ -80,236 +80,3 @@ define double @test_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %
   ret double %3
 }
 
-declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
-
-define <4 x float> @test_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfmsub_ss(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
-;
-  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
-  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
-  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
-  %res = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %3, <4 x float> %6)
-  ret <4 x float> %res
-}
-
-define float @test_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfmsub_ss_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    ret float [[TMP2]]
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
-  %5 = extractelement <4 x float> %4, i32 0
-  ret float %5
-}
-
-define float @test_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfmsub_ss_1(
-; CHECK-NEXT:    ret float 1.000000e+00
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = tail call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
-  %5 = extractelement <4 x float> %4, i32 1
-  ret float %5
-}
-
-declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
-
-define <2 x double> @test_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfmsub_sd(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
-; CHECK-NEXT:    ret <2 x double> [[TMP1]]
-;
-  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
-  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
-  %res = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a, <2 x double> %1, <2 x double> %2)
-  ret <2 x double> %res
-}
-
-define double @test_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfmsub_sd_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    ret double [[TMP2]]
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
-  %3 = extractelement <2 x double> %2, i32 0
-  ret double %3
-}
-
-define double @test_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfmsub_sd_1(
-; CHECK-NEXT:    ret double 1.000000e+00
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = tail call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
-  %3 = extractelement <2 x double> %2, i32 1
-  ret double %3
-}
-
-declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
-
-define <4 x float> @test_vfnmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfnmadd_ss(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
-;
-  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
-  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
-  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
-  %res = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %3, <4 x float> %6)
-  ret <4 x float> %res
-}
-
-define float @test_vfnmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfnmadd_ss_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    ret float [[TMP2]]
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
-  %5 = extractelement <4 x float> %4, i32 0
-  ret float %5
-}
-
-define float @test_vfnmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfnmadd_ss_1(
-; CHECK-NEXT:    ret float 1.000000e+00
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = tail call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
-  %5 = extractelement <4 x float> %4, i32 1
-  ret float %5
-}
-
-declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
-
-define <2 x double> @test_vfnmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfnmadd_sd(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
-; CHECK-NEXT:    ret <2 x double> [[TMP1]]
-;
-  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
-  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
-  %res = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a, <2 x double> %1, <2 x double> %2)
-  ret <2 x double> %res
-}
-
-define double @test_vfnmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfnmadd_sd_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    ret double [[TMP2]]
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
-  %3 = extractelement <2 x double> %2, i32 0
-  ret double %3
-}
-
-define double @test_vfnmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfnmadd_sd_1(
-; CHECK-NEXT:    ret double 1.000000e+00
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = tail call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
-  %3 = extractelement <2 x double> %2, i32 1
-  ret double %3
-}
-
-declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
-
-define <4 x float> @test_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfnmsub_ss(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
-;
-  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
-  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
-  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
-  %res = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %3, <4 x float> %6)
-  ret <4 x float> %res
-}
-
-define float @test_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfnmsub_ss_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    ret float [[TMP2]]
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
-  %5 = extractelement <4 x float> %4, i32 0
-  ret float %5
-}
-
-define float @test_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
-; CHECK-LABEL: @test_vfnmsub_ss_1(
-; CHECK-NEXT:    ret float 1.000000e+00
-;
-  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
-  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
-  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
-  %4 = tail call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %3, <4 x float> %b, <4 x float> %c)
-  %5 = extractelement <4 x float> %4, i32 1
-  ret float %5
-}
-
-declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
-
-define <2 x double> @test_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfnmsub_sd(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
-; CHECK-NEXT:    ret <2 x double> [[TMP1]]
-;
-  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
-  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
-  %res = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %1, <2 x double> %2)
-  ret <2 x double> %res
-}
-
-define double @test_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfnmsub_sd_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    ret double [[TMP2]]
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
-  %3 = extractelement <2 x double> %2, i32 0
-  ret double %3
-}
-
-define double @test_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-; CHECK-LABEL: @test_vfnmsub_sd_1(
-; CHECK-NEXT:    ret double 1.000000e+00
-;
-  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
-  %2 = tail call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %1, <2 x double> %b, <2 x double> %c)
-  %3 = extractelement <2 x double> %2, i32 1
-  ret double %3
-}
-- 
2.7.4