From 3e6b904f0a5075a3f33683ce38b5a4fd18280e5e Mon Sep 17 00:00:00 2001
From: gpei-dev
Date: Tue, 19 Apr 2022 13:44:04 +0800
Subject: [PATCH] Force insert zero-idiom and break false dependency of dest
 register for several instructions.

The related instructions are:

VPERMD/Q/PS/PD
VRANGEPD/PS/SD/SS
VGETMANTSS/SD/SH
VGETMANTPS/PD - mem version only
VPMULLQ
VFMULCSH/PH
VFCMULCSH/PH

Differential Revision: https://reviews.llvm.org/D116072
---
 llvm/lib/Target/X86/X86.td                      |   31 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp            |  271 ++++
 llvm/lib/Target/X86/X86TargetTransformInfo.h    |    5 +
 llvm/test/CodeGen/X86/getmant-false-deps.ll     |  589 ++++++++++
 llvm/test/CodeGen/X86/mulc-false-deps.ll        |  872 +++++++++++++++
 llvm/test/CodeGen/X86/perm.avx2-false-deps.ll   |  306 ++++++
 llvm/test/CodeGen/X86/perm.avx512-false-deps.ll | 1161 +++++++++++++++++++++
 llvm/test/CodeGen/X86/pmullq-false-deps.ll      |  363 +++++++
 llvm/test/CodeGen/X86/range-false-deps.ll       |  984 +++++++++++++++
 9 files changed, 4580 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/getmant-false-deps.ll
 create mode 100644 llvm/test/CodeGen/X86/mulc-false-deps.ll
 create mode 100644 llvm/test/CodeGen/X86/perm.avx2-false-deps.ll
 create mode 100644 llvm/test/CodeGen/X86/perm.avx512-false-deps.ll
 create mode 100644 llvm/test/CodeGen/X86/pmullq-false-deps.ll
 create mode 100644 llvm/test/CodeGen/X86/range-false-deps.ll

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 60c6625..0634194 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -457,6 +457,27 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
                                     "HasLZCNTFalseDeps", "true",
                                     "LZCNT/TZCNT have a false dependency on dest register">;
 
+def TuningMULCFalseDeps : SubtargetFeature<"false-deps-mulc",
+                                    "HasMULCFalseDeps", "true",
+                                    "VF[C]MULCPH/SH has a false dependency on dest register">;
+
+def TuningPERMFalseDeps : SubtargetFeature<"false-deps-perm",
+                                    "HasPERMFalseDeps", "true",
+                                    "VPERMD/Q/PS/PD has a false dependency on dest register">;
+
+def TuningRANGEFalseDeps : SubtargetFeature<"false-deps-range",
+                                    "HasRANGEFalseDeps", "true",
+                                    "VRANGEPD/PS/SD/SS has a false dependency on dest register">;
+
+def TuningGETMANTFalseDeps : SubtargetFeature<"false-deps-getmant",
+                                    "HasGETMANTFalseDeps", "true",
+                                    "VGETMANTSS/SD/SH and VGETMANTPS/PD (memory version) have a"
+                                    " false dependency on dest register">;
+
+def TuningMULLQFalseDeps : SubtargetFeature<"false-deps-mullq",
+                                    "HasMULLQFalseDeps", "true",
+                                    "VPMULLQ has a false dependency on dest register">;
+
 def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking",
                                     "HasSBBDepBreaking", "true",
                                     "SBB with same register has no source dependency">;
@@ -879,7 +900,12 @@ def ProcessorFeatures {
                                                 FeatureMOVDIRI,
                                                 FeatureMOVDIR64B,
                                                 FeatureUINTR];
-  list<SubtargetFeature> SPRTuning = ICXTuning;
+  list<SubtargetFeature> SPRAdditionalTuning = [TuningMULCFalseDeps,
+                                                TuningPERMFalseDeps,
+                                                TuningRANGEFalseDeps,
+                                                TuningGETMANTFalseDeps,
+                                                TuningMULLQFalseDeps];
+  list<SubtargetFeature> SPRTuning = !listconcat(ICXTuning, SPRAdditionalTuning);
   list<SubtargetFeature> SPRFeatures =
     !listconcat(ICXFeatures, SPRAdditionalFeatures);
 
@@ -985,7 +1011,8 @@ def ProcessorFeatures {
                                                 FeatureMOVDIRI,
                                                 FeatureMOVDIR64B,
                                                 FeatureWAITPKG];
-  list<SubtargetFeature> ADLTuning = SKLTuning;
+  list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps];
+  list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
   list<SubtargetFeature> ADLFeatures =
     !listconcat(TRMFeatures, ADLAdditionalFeatures);
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 705301e..2a5f01a 100644
--- 
a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4939,6 +4939,255 @@ static bool hasPartialRegUpdate(unsigned Opcode, case X86::SQRTSDr_Int: case X86::SQRTSDm_Int: return true; + case X86::VFCMULCPHZ128rm: + case X86::VFCMULCPHZ128rmb: + case X86::VFCMULCPHZ128rmbkz: + case X86::VFCMULCPHZ128rmkz: + case X86::VFCMULCPHZ128rr: + case X86::VFCMULCPHZ128rrkz: + case X86::VFCMULCPHZ256rm: + case X86::VFCMULCPHZ256rmb: + case X86::VFCMULCPHZ256rmbkz: + case X86::VFCMULCPHZ256rmkz: + case X86::VFCMULCPHZ256rr: + case X86::VFCMULCPHZ256rrkz: + case X86::VFCMULCPHZrm: + case X86::VFCMULCPHZrmb: + case X86::VFCMULCPHZrmbkz: + case X86::VFCMULCPHZrmkz: + case X86::VFCMULCPHZrr: + case X86::VFCMULCPHZrrb: + case X86::VFCMULCPHZrrbkz: + case X86::VFCMULCPHZrrkz: + case X86::VFMULCPHZ128rm: + case X86::VFMULCPHZ128rmb: + case X86::VFMULCPHZ128rmbkz: + case X86::VFMULCPHZ128rmkz: + case X86::VFMULCPHZ128rr: + case X86::VFMULCPHZ128rrkz: + case X86::VFMULCPHZ256rm: + case X86::VFMULCPHZ256rmb: + case X86::VFMULCPHZ256rmbkz: + case X86::VFMULCPHZ256rmkz: + case X86::VFMULCPHZ256rr: + case X86::VFMULCPHZ256rrkz: + case X86::VFMULCPHZrm: + case X86::VFMULCPHZrmb: + case X86::VFMULCPHZrmbkz: + case X86::VFMULCPHZrmkz: + case X86::VFMULCPHZrr: + case X86::VFMULCPHZrrb: + case X86::VFMULCPHZrrbkz: + case X86::VFMULCPHZrrkz: + case X86::VFCMULCSHZrm: + case X86::VFCMULCSHZrmkz: + case X86::VFCMULCSHZrr: + case X86::VFCMULCSHZrrb: + case X86::VFCMULCSHZrrbkz: + case X86::VFCMULCSHZrrkz: + case X86::VFMULCSHZrm: + case X86::VFMULCSHZrmkz: + case X86::VFMULCSHZrr: + case X86::VFMULCSHZrrb: + case X86::VFMULCSHZrrbkz: + case X86::VFMULCSHZrrkz: + return Subtarget.hasMULCFalseDeps(); + case X86::VPERMDYrm: + case X86::VPERMDYrr: + case X86::VPERMQYmi: + case X86::VPERMQYri: + case X86::VPERMPSYrm: + case X86::VPERMPSYrr: + case X86::VPERMPDYmi: + case X86::VPERMPDYri: + case X86::VPERMDZ256rm: + case X86::VPERMDZ256rmb: + case X86::VPERMDZ256rmbkz: + case X86::VPERMDZ256rmkz: + case X86::VPERMDZ256rr: + case X86::VPERMDZ256rrkz: + case X86::VPERMDZrm: + case X86::VPERMDZrmb: + case X86::VPERMDZrmbkz: + case X86::VPERMDZrmkz: + case X86::VPERMDZrr: + case X86::VPERMDZrrkz: + case X86::VPERMQZ256mbi: + case X86::VPERMQZ256mbikz: + case X86::VPERMQZ256mi: + case X86::VPERMQZ256mikz: + case X86::VPERMQZ256ri: + case X86::VPERMQZ256rikz: + case X86::VPERMQZ256rm: + case X86::VPERMQZ256rmb: + case X86::VPERMQZ256rmbkz: + case X86::VPERMQZ256rmkz: + case X86::VPERMQZ256rr: + case X86::VPERMQZ256rrkz: + case X86::VPERMQZmbi: + case X86::VPERMQZmbikz: + case X86::VPERMQZmi: + case X86::VPERMQZmikz: + case X86::VPERMQZri: + case X86::VPERMQZrikz: + case X86::VPERMQZrm: + case X86::VPERMQZrmb: + case X86::VPERMQZrmbkz: + case X86::VPERMQZrmkz: + case X86::VPERMQZrr: + case X86::VPERMQZrrkz: + case X86::VPERMPSZ256rm: + case X86::VPERMPSZ256rmb: + case X86::VPERMPSZ256rmbkz: + case X86::VPERMPSZ256rmkz: + case X86::VPERMPSZ256rr: + case X86::VPERMPSZ256rrkz: + case X86::VPERMPSZrm: + case X86::VPERMPSZrmb: + case X86::VPERMPSZrmbkz: + case X86::VPERMPSZrmkz: + case X86::VPERMPSZrr: + case X86::VPERMPSZrrkz: + case X86::VPERMPDZ256mbi: + case X86::VPERMPDZ256mbikz: + case X86::VPERMPDZ256mi: + case X86::VPERMPDZ256mikz: + case X86::VPERMPDZ256ri: + case X86::VPERMPDZ256rikz: + case X86::VPERMPDZ256rm: + case X86::VPERMPDZ256rmb: + case X86::VPERMPDZ256rmbkz: + case X86::VPERMPDZ256rmkz: + case X86::VPERMPDZ256rr: + case X86::VPERMPDZ256rrkz: + case X86::VPERMPDZmbi: + case X86::VPERMPDZmbikz: 
+ case X86::VPERMPDZmi: + case X86::VPERMPDZmikz: + case X86::VPERMPDZri: + case X86::VPERMPDZrikz: + case X86::VPERMPDZrm: + case X86::VPERMPDZrmb: + case X86::VPERMPDZrmbkz: + case X86::VPERMPDZrmkz: + case X86::VPERMPDZrr: + case X86::VPERMPDZrrkz: + return Subtarget.hasPERMFalseDeps(); + case X86::VRANGEPDZ128rmbi: + case X86::VRANGEPDZ128rmbikz: + case X86::VRANGEPDZ128rmi: + case X86::VRANGEPDZ128rmikz: + case X86::VRANGEPDZ128rri: + case X86::VRANGEPDZ128rrikz: + case X86::VRANGEPDZ256rmbi: + case X86::VRANGEPDZ256rmbikz: + case X86::VRANGEPDZ256rmi: + case X86::VRANGEPDZ256rmikz: + case X86::VRANGEPDZ256rri: + case X86::VRANGEPDZ256rrikz: + case X86::VRANGEPDZrmbi: + case X86::VRANGEPDZrmbikz: + case X86::VRANGEPDZrmi: + case X86::VRANGEPDZrmikz: + case X86::VRANGEPDZrri: + case X86::VRANGEPDZrrib: + case X86::VRANGEPDZrribkz: + case X86::VRANGEPDZrrikz: + case X86::VRANGEPSZ128rmbi: + case X86::VRANGEPSZ128rmbikz: + case X86::VRANGEPSZ128rmi: + case X86::VRANGEPSZ128rmikz: + case X86::VRANGEPSZ128rri: + case X86::VRANGEPSZ128rrikz: + case X86::VRANGEPSZ256rmbi: + case X86::VRANGEPSZ256rmbikz: + case X86::VRANGEPSZ256rmi: + case X86::VRANGEPSZ256rmikz: + case X86::VRANGEPSZ256rri: + case X86::VRANGEPSZ256rrikz: + case X86::VRANGEPSZrmbi: + case X86::VRANGEPSZrmbikz: + case X86::VRANGEPSZrmi: + case X86::VRANGEPSZrmikz: + case X86::VRANGEPSZrri: + case X86::VRANGEPSZrrib: + case X86::VRANGEPSZrribkz: + case X86::VRANGEPSZrrikz: + case X86::VRANGESDZrmi: + case X86::VRANGESDZrmikz: + case X86::VRANGESDZrri: + case X86::VRANGESDZrrib: + case X86::VRANGESDZrribkz: + case X86::VRANGESDZrrikz: + case X86::VRANGESSZrmi: + case X86::VRANGESSZrmikz: + case X86::VRANGESSZrri: + case X86::VRANGESSZrrib: + case X86::VRANGESSZrribkz: + case X86::VRANGESSZrrikz: + return Subtarget.hasRANGEFalseDeps(); + case X86::VGETMANTSSZrmi: + case X86::VGETMANTSSZrmikz: + case X86::VGETMANTSSZrri: + case X86::VGETMANTSSZrrib: + case X86::VGETMANTSSZrribkz: + case X86::VGETMANTSSZrrikz: + case X86::VGETMANTSDZrmi: + case X86::VGETMANTSDZrmikz: + case X86::VGETMANTSDZrri: + case X86::VGETMANTSDZrrib: + case X86::VGETMANTSDZrribkz: + case X86::VGETMANTSDZrrikz: + case X86::VGETMANTSHZrmi: + case X86::VGETMANTSHZrmikz: + case X86::VGETMANTSHZrri: + case X86::VGETMANTSHZrrib: + case X86::VGETMANTSHZrribkz: + case X86::VGETMANTSHZrrikz: + case X86::VGETMANTPSZ128rmbi: + case X86::VGETMANTPSZ128rmbikz: + case X86::VGETMANTPSZ128rmi: + case X86::VGETMANTPSZ128rmikz: + case X86::VGETMANTPSZ256rmbi: + case X86::VGETMANTPSZ256rmbikz: + case X86::VGETMANTPSZ256rmi: + case X86::VGETMANTPSZ256rmikz: + case X86::VGETMANTPSZrmbi: + case X86::VGETMANTPSZrmbikz: + case X86::VGETMANTPSZrmi: + case X86::VGETMANTPSZrmikz: + case X86::VGETMANTPDZ128rmbi: + case X86::VGETMANTPDZ128rmbikz: + case X86::VGETMANTPDZ128rmi: + case X86::VGETMANTPDZ128rmikz: + case X86::VGETMANTPDZ256rmbi: + case X86::VGETMANTPDZ256rmbikz: + case X86::VGETMANTPDZ256rmi: + case X86::VGETMANTPDZ256rmikz: + case X86::VGETMANTPDZrmbi: + case X86::VGETMANTPDZrmbikz: + case X86::VGETMANTPDZrmi: + case X86::VGETMANTPDZrmikz: + return Subtarget.hasGETMANTFalseDeps(); + case X86::VPMULLQZ128rm: + case X86::VPMULLQZ128rmb: + case X86::VPMULLQZ128rmbkz: + case X86::VPMULLQZ128rmkz: + case X86::VPMULLQZ128rr: + case X86::VPMULLQZ128rrkz: + case X86::VPMULLQZ256rm: + case X86::VPMULLQZ256rmb: + case X86::VPMULLQZ256rmbkz: + case X86::VPMULLQZ256rmkz: + case X86::VPMULLQZ256rr: + case X86::VPMULLQZ256rrkz: + case X86::VPMULLQZrm: + case X86::VPMULLQZrmb: + case 
X86::VPMULLQZrmbkz: + case X86::VPMULLQZrmkz: + case X86::VPMULLQZrr: + case X86::VPMULLQZrrkz: + return Subtarget.hasMULLQFalseDeps(); // GPR case X86::POPCNT32rm: case X86::POPCNT32rr: @@ -5365,6 +5614,28 @@ void X86InstrInfo::breakPartialRegDependency( .addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR128XRegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Since vxorps requires AVX512DQ, vpxord should be the best choice. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256XRegClass.contains(Reg) || + X86::VR512RegClass.contains(Reg)) { + // Only handle VLX targets. + if (!Subtarget.hasVLX()) + return; + // Use vpxord to clear the full ymm/zmm register. + // It wants to read and write the xmm sub-register. + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index d262835..4f87478 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -68,6 +68,11 @@ class X86TTIImpl : public BasicTTIImplBase { X86::TuningMacroFusion, X86::TuningPadShortFunctions, X86::TuningPOPCNTFalseDeps, + X86::TuningMULCFalseDeps, + X86::TuningPERMFalseDeps, + X86::TuningRANGEFalseDeps, + X86::TuningGETMANTFalseDeps, + X86::TuningMULLQFalseDeps, X86::TuningSlow3OpsLEA, X86::TuningSlowDivide32, X86::TuningSlowDivide64, diff --git a/llvm/test/CodeGen/X86/getmant-false-deps.ll b/llvm/test/CodeGen/X86/getmant-false-deps.ll new file mode 100644 index 0000000..8880ae0 --- /dev/null +++ b/llvm/test/CodeGen/X86/getmant-false-deps.ll @@ -0,0 +1,589 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-getmant -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-getmant -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE + +define <4 x float> @getmantps_mem_128(<4 x float>* %p0) { +; ENABLE-LABEL: getmantps_mem_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantps $88, (%rdi), %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantps_mem_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantps $88, (%rdi), %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x float>, <4 x float>* %p0, align 64 + %2 = call <4 x float> 
@llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %a0, i32 88, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @getmantps_broadcast_128(float* %p0) { +; ENABLE-LABEL: getmantps_broadcast_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantps $88, (%rdi){1to4}, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantps_broadcast_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantps $88, (%rdi){1to4}, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load float, float* %p0, align 4 + %t0 = insertelement <4 x float> undef, float %v0, i64 0 + %a0 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer + %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %a0, i32 88, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x float>, i8) + +define <8 x float> @getmantps_mem_256(<8 x float>* %p0) { +; ENABLE-LABEL: getmantps_mem_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantps $88, (%rdi), %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantps_mem_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantps $88, (%rdi), %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x float>, <8 x float>* %p0, align 64 + %2 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %a0, i32 88, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @getmantps_broadcast_256(float* %p0) { +; ENABLE-LABEL: getmantps_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantps $88, (%rdi){1to8}, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantps_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantps $88, (%rdi){1to8}, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load float, float* %p0, align 4 + %t0 = insertelement <8 x float> undef, float %v0, i64 0 + %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer + %2 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %a0, i32 88, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} 
+ +declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x float>, i8) + +define <16 x float> @getmantps_mem_512(<16 x float>* %p0) { +; ENABLE-LABEL: getmantps_mem_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantps $88, (%rdi), %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantps_mem_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantps $88, (%rdi), %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x float>, <16 x float>* %p0, align 64 + %2 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %a0, i32 88, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @getmantps_broadcast_512(float* %p0) { +; ENABLE-LABEL: getmantps_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantps $88, (%rdi){1to16}, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantps_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantps $88, (%rdi){1to16}, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load float, float* %p0, align 4 + %t0 = insertelement <16 x float> undef, float %v0, i64 0 + %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer + %2 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %a0, i32 88, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32) + + +define <2 x double> @getmantpd_mem_128(<2 x double>* %p0) { +; ENABLE-LABEL: getmantpd_mem_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantpd $88, (%rdi), %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantpd_mem_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantpd $88, (%rdi), %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <2 x double>, <2 x double>* %p0, align 64 + %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %a0, i32 88, <2 x double> undef, i8 -1) + ret <2 x double> %2 +} + +define <2 x double> @getmantpd_broadcast_128(double* %p0) { +; ENABLE-LABEL: getmantpd_broadcast_128: +; ENABLE: # 
%bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantpd $88, (%rdi){1to2}, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantpd_broadcast_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantpd $88, (%rdi){1to2}, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load double, double* %p0, align 4 + %t0 = insertelement <2 x double> undef, double %v0, i64 0 + %a0 = shufflevector <2 x double> %t0, <2 x double> undef, <2 x i32> zeroinitializer + %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %a0, i32 88, <2 x double> undef, i8 -1) + ret <2 x double> %2 +} + +declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8) + +define <4 x double> @getmantpd_mem_256(<4 x double>* %p0) { +; ENABLE-LABEL: getmantpd_mem_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantpd $88, (%rdi), %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantpd_mem_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantpd $88, (%rdi), %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x double>, <4 x double>* %p0, align 64 + %2 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %a0, i32 88, <4 x double> undef, i8 -1) + ret <4 x double> %2 +} + +define <4 x double> @getmantpd_broadcast_256(double* %p0) { +; ENABLE-LABEL: getmantpd_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantpd $88, (%rdi){1to4}, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantpd_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantpd $88, (%rdi){1to4}, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load double, double* %p0, align 4 + %t0 = insertelement <4 x double> undef, double %v0, i64 0 + %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer + %2 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %a0, i32 88, <4 x double> undef, i8 -1) + ret <4 x double> %2 +} + +declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8) + +define <8 x double> @getmantpd_mem_512(<8 x double>* %p0) { +; ENABLE-LABEL: 
getmantpd_mem_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantpd $88, (%rdi), %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantpd_mem_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantpd $88, (%rdi), %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x double>, <8 x double>* %p0, align 64 + %2 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %a0, i32 88, <8 x double> undef, i8 -1, i32 4) + ret <8 x double> %2 +} + +define <8 x double> @getmantpd_broadcast_512(double* %p0) { +; ENABLE-LABEL: getmantpd_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantpd $88, (%rdi){1to8}, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantpd_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vgetmantpd $88, (%rdi){1to8}, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load double, double* %p0, align 4 + %t0 = insertelement <8 x double> undef, double %v0, i64 0 + %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer + %2 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %a0, i32 88, <8 x double> undef, i8 -1, i32 4) + ret <8 x double> %2 +} + +declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <8 x half> @getmantsh(<8 x half> %a0, <8 x half> %a1) { +; ENABLE-LABEL: getmantsh: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vgetmantsh $11, %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vaddph %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantsh: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vgetmantsh $11, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddph %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> 
@llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 11, <8 x half> undef, i8 -1, i32 4) + %t = fadd <8 x half> %a0, %a1 + %res = fadd <8 x half> %2, %t + ret <8 x half> %res +} + +define <8 x half> @getmantsh_mem(<8 x half> %a0, <8 x half>* %p1) { +; ENABLE-LABEL: getmantsh_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantsh $11, (%rdi), %xmm1, %xmm0 +; ENABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantsh_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vgetmantsh $11, (%rdi), %xmm1, %xmm0 +; DISABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <8 x half>, <8 x half>* %p1, align 64 + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 11, <8 x half> undef, i8 -1, i32 4) + %res = fadd <8 x half> %2, %a0 + ret <8 x half> %res +} + +define <8 x half> @getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { +; ENABLE-LABEL: getmantsh_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: vaddph %xmm0, %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantsh_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm2 {%k1} {z} +; DISABLE-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: vaddph %xmm0, %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 11, <8 x half> zeroinitializer, i8 %2, i32 4) + %t = fadd <8 x half> %a0, %a1 + %res = fadd <8 x half> %3, %t + ret <8 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32) + +define <4 x float> @getmantss(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: getmantss: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vgetmantss $11, %xmm2, %xmm0, %xmm1 +; 
ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantss: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vgetmantss $11, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %a0, <4 x float> %a1, i32 11, <4 x float> undef, i8 -1, i32 4) + %t = fadd <4 x float> %a0, %a1 + %res = fadd <4 x float> %2, %t + ret <4 x float> %res +} + +define <4 x float> @getmantss_mem(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: getmantss_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantss $11, (%rdi), %xmm1, %xmm0 +; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantss_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vgetmantss $11, (%rdi), %xmm1, %xmm0 +; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %a0, <4 x float> %a1, i32 11, <4 x float> undef, i8 -1, i32 4) + %res = fadd <4 x float> %2, %a0 + ret <4 x float> %res +} + +define <4 x float> @getmantss_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { +; ENABLE-LABEL: getmantss_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantss_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} {z} +; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %a0, <4 x float> %a1, i32 11, <4 x float> zeroinitializer, i8 %2, i32 4) + %t = fadd <4 x float> %a0, %a1 + %res = fadd <4 x float> %3, %t + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32) + +define <2 x double> @getmantsd(<2 x double> %a0, <2 x double> %a1) { +; ENABLE-LABEL: getmantsd: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vgetmantsd $11, %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantsd: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vgetmantsd $11, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %a0, <2 x double> %a1, i32 11, <2 x double> undef, i8 -1, i32 4) + %t = fadd <2 x double> %a0, %a1 + %res = fadd <2 x double> %2, %t + ret <2 x double> %res +} + +define <2 x double> @getmantsd_mem(<2 x double> %a0, <2 x double>* %p1) { +; ENABLE-LABEL: getmantsd_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vgetmantsd $11, (%rdi), %xmm1, %xmm0 +; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantsd_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vgetmantsd $11, (%rdi), %xmm1, %xmm0 +; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <2 x double>, <2 x double>* %p1, align 64 + %2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %a0, <2 x double> %a1, i32 11, <2 x 
double> undef, i8 -1, i32 4) + %res = fadd <2 x double> %2, %a0 + ret <2 x double> %res +} + +define <2 x double> @getmantsd_maskz(<2 x double> %a0, <2 x double> %a1, i8* %mask) { +; ENABLE-LABEL: getmantsd_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: getmantsd_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm2 {%k1} {z} +; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %a0, <2 x double> %a1, i32 11, <2 x double> zeroinitializer, i8 %2, i32 4) + %t = fadd <2 x double> %a0, %a1 + %res = fadd <2 x double> %3, %t + ret <2 x double> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32) diff --git a/llvm/test/CodeGen/X86/mulc-false-deps.ll b/llvm/test/CodeGen/X86/mulc-false-deps.ll new file mode 100644 index 0000000..f4f1563 --- /dev/null +++ b/llvm/test/CodeGen/X86/mulc-false-deps.ll @@ -0,0 +1,872 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mulc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE + +define <16 x float> @fmulcph(<16 x float> %a0, <16 x float> %a1) { +; ENABLE-LABEL: fmulcph: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; ENABLE-NEXT: vmovaps %zmm2, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcph: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; DISABLE-NEXT: vmovaps %zmm2, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @fmulcph_mem(<16 x float> %a0, <16 x 
float>* %p1) { +; ENABLE-LABEL: fmulcph_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcph (%rdi), %zmm0, %zmm1 +; ENABLE-NEXT: vmovaps %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcph_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; DISABLE-NEXT: vfmulcph (%rdi), %zmm0, %zmm1 +; DISABLE-NEXT: vmovaps %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <16 x float>, <16 x float>* %p1, align 64 + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @fmulcph_broadcast(<16 x float> %a0, float* %p1) { +; ENABLE-LABEL: fmulcph_broadcast: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcph (%rdi){1to16}, %zmm0, %zmm1 +; ENABLE-NEXT: vmovaps %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcph_broadcast: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; DISABLE-NEXT: vfmulcph (%rdi){1to16}, %zmm0, %zmm1 +; DISABLE-NEXT: vmovaps %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <16 x float> undef, float %v1, i64 0 + %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) { +; ENABLE-LABEL: fmulcph_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovw (%rdi), %k1 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %zmm2, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcph_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovw (%rdi), %k1 +; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload +; DISABLE-NEXT: vmovaps %zmm2, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4) + ret <16 x float> %3 +} + +define <16 x float> @fcmulcph(<16 x float> %a0, <16 x float> %a1) { +; ENABLE-LABEL: fcmulcph: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 +; ENABLE-NEXT: vmovaps %zmm2, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcph: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; DISABLE-NEXT: vmovaps %zmm2, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @fcmulcph_mem(<16 x float> %a0, <16 x float>* %p1) { +; ENABLE-LABEL: fcmulcph_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcph (%rdi), %zmm0, %zmm1 +; ENABLE-NEXT: vmovaps %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcph_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; DISABLE-NEXT: vfcmulcph (%rdi), %zmm0, %zmm1 +; DISABLE-NEXT: vmovaps %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <16 x float>, <16 x float>* %p1, align 64 + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + 
+define <16 x float> @fcmulcph_broadcast(<16 x float> %a0, float* %p1) { +; ENABLE-LABEL: fcmulcph_broadcast: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcph (%rdi){1to16}, %zmm0, %zmm1 +; ENABLE-NEXT: vmovaps %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcph_broadcast: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; DISABLE-NEXT: vfcmulcph (%rdi){1to16}, %zmm0, %zmm1 +; DISABLE-NEXT: vmovaps %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <16 x float> undef, float %v1, i64 0 + %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer + %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %2 +} + +define <16 x float> @fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, i16* %mask) { +; ENABLE-LABEL: fcmulcph_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovw (%rdi), %k1 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcph %zmm1, %zmm0, %zmm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %zmm2, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcph_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovw (%rdi), %k1 +; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload +; DISABLE-NEXT: vmovaps %zmm2, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4) + ret <16 x float> %3 +} + +define <4 x float> @fmulc(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: fmulc: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc: +; DISABLE: # %bb.0: +; 
DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @fmulc_mem(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: fmulc_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcph (%rdi), %xmm0, %xmm1 +; ENABLE-NEXT: vmovaps %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; DISABLE-NEXT: vfmulcph (%rdi), %xmm0, %xmm1 +; DISABLE-NEXT: vmovaps %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @fmulc_broadcast(<4 x float> %a0, float* %p1) { +; ENABLE-LABEL: fmulc_broadcast: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcph (%rdi){1to4}, %xmm0, %xmm1 +; ENABLE-NEXT: vmovaps %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_broadcast: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; DISABLE-NEXT: vfmulcph (%rdi){1to4}, %xmm0, %xmm1 +; DISABLE-NEXT: vmovaps %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <4 x float> undef, float %v1, i64 0 + %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer + %2 = call <4 
x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @fmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { +; ENABLE-LABEL: fmulc_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2) + ret <4 x float> %3 +} + +define <4 x float> @fcmulc(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: fcmulc: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @fcmulc_mem(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: fcmulc_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcph (%rdi), %xmm0, %xmm1 +; ENABLE-NEXT: vmovaps %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; DISABLE-NEXT: vfcmulcph (%rdi), %xmm0, %xmm1 +; DISABLE-NEXT: vmovaps %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @fcmulc_broadcast(<4 x float> %a0, float* %p1) { +; ENABLE-LABEL: fcmulc_broadcast: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcph (%rdi){1to4}, %xmm0, %xmm1 +; ENABLE-NEXT: vmovaps %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_broadcast: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; DISABLE-NEXT: vfcmulcph (%rdi){1to4}, %xmm0, %xmm1 +; DISABLE-NEXT: vmovaps %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <4 x float> undef, float %v1, i64 0 + %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1) + ret <4 x float> %2 +} + +define <4 x float> @fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { +; ENABLE-LABEL: fcmulc_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcph %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> 
@llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2) + ret <4 x float> %3 +} + +define <8 x float> @fmulc_ymm(<8 x float> %a0, <8 x float> %a1) { +; ENABLE-LABEL: fmulc_ymm: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 +; ENABLE-NEXT: vmovaps %ymm2, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_ymm: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; DISABLE-NEXT: vmovaps %ymm2, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @fmulc_ymm_mem(<8 x float> %a0, <8 x float>* %p1) { +; ENABLE-LABEL: fmulc_ymm_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcph (%rdi), %ymm0, %ymm1 +; ENABLE-NEXT: vmovaps %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_ymm_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfmulcph (%rdi), %ymm0, %ymm1 +; DISABLE-NEXT: vmovaps %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <8 x float>, <8 x float>* %p1, align 64 + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @fmulc_ymm_broadcast(<8 x float> %a0, float* %p1) { +; ENABLE-LABEL: fmulc_ymm_broadcast: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcph (%rdi){1to8}, %ymm0, %ymm1 +; ENABLE-NEXT: vmovaps %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_ymm_broadcast: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfmulcph (%rdi){1to8}, %ymm0, %ymm1 +; DISABLE-NEXT: vmovaps %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <8 x float> undef, float %v1, i64 0 + %a1 = 
shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) { +; ENABLE-LABEL: fmulc_maskz_ymm: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %ymm2, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulc_maskz_ymm: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload +; DISABLE-NEXT: vmovaps %ymm2, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2) + ret <8 x float> %3 +} + +define <8 x float> @fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) { +; ENABLE-LABEL: fcmulc_ymm: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 +; ENABLE-NEXT: vmovaps %ymm2, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_ymm: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; DISABLE-NEXT: vmovaps %ymm2, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @fcmulc_ymm_mem(<8 x float> %a0, <8 x float>* %p1) { +; ENABLE-LABEL: fcmulc_ymm_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcph (%rdi), %ymm0, %ymm1 +; ENABLE-NEXT: vmovaps %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_ymm_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfcmulcph (%rdi), %ymm0, %ymm1 +; DISABLE-NEXT: vmovaps %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail 
call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <8 x float>, <8 x float>* %p1, align 64 + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @fcmulc_ymm_broadcast(<8 x float> %a0, float* %p1) { +; ENABLE-LABEL: fcmulc_ymm_broadcast: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcph (%rdi){1to8}, %ymm0, %ymm1 +; ENABLE-NEXT: vmovaps %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_ymm_broadcast: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfcmulcph (%rdi){1to8}, %ymm0, %ymm1 +; DISABLE-NEXT: vmovaps %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <8 x float> undef, float %v1, i64 0 + %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer + %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1) + ret <8 x float> %2 +} + +define <8 x float> @fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i8* %mask) { +; ENABLE-LABEL: fcmulc_maskz_ymm: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcph %ymm1, %ymm0, %ymm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %ymm2, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulc_maskz_ymm: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload +; DISABLE-NEXT: vmovaps %ymm2, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2) + ret <8 x float> %3 +} + +define <4 x float> @fmulcsh(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: fmulcsh: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcsh: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %2 +} + +define <4 x float> @fmulcsh_mem(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: fmulcsh_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfmulcsh (%rdi), %xmm0, %xmm1 +; ENABLE-NEXT: vmovaps %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcsh_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; DISABLE-NEXT: vfmulcsh (%rdi), %xmm0, %xmm1 +; DISABLE-NEXT: vmovaps %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %2 +} + +define <4 x float> @fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { +; ENABLE-LABEL: fmulcsh_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fmulcsh_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4) + ret <4 x float> %3 +} + +define <4 x float> @fcmulcsh(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: fcmulcsh: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcsh: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %2 +} + +define <4 x float> @fcmulcsh_mem(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: fcmulcsh_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vfcmulcsh (%rdi), %xmm0, %xmm1 +; ENABLE-NEXT: vmovaps %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcsh_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; DISABLE-NEXT: vfcmulcsh (%rdi), %xmm0, %xmm1 +; DISABLE-NEXT: vmovaps %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) + ret <4 x float> %2 +} + +define <4 x float> @fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, i8* %mask) { +; ENABLE-LABEL: fcmulcsh_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vfcmulcsh %xmm1, %xmm0, %xmm2 {%k1} {z} +; ENABLE-NEXT: vmovaps %xmm2, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: fcmulcsh_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload +; DISABLE-NEXT: vmovaps %xmm2, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4) + ret <4 x float> %3 +} + +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32) +declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) +declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) + diff --git a/llvm/test/CodeGen/X86/perm.avx2-false-deps.ll b/llvm/test/CodeGen/X86/perm.avx2-false-deps.ll new file mode 100644 index 0000000..33bc951 --- /dev/null +++ b/llvm/test/CodeGen/X86/perm.avx2-false-deps.ll @@ -0,0 +1,306 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-ADL +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-SPR +; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-ADL +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-SPR + +define <8 x i32> @permd(<8 x i32> %a0, <8 x i32> %a1) { +; ENABLE-ADL-LABEL: permd: +; ENABLE-ADL: # %bb.0: +; ENABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-ADL-NEXT: #APP +; ENABLE-ADL-NEXT: nop +; ENABLE-ADL-NEXT: #NO_APP +; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0 +; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; ENABLE-ADL-NEXT: 
vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-ADL-NEXT: retq +; +; ENABLE-SPR-LABEL: permd: +; ENABLE-SPR: # %bb.0: +; ENABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16 +; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17 +; ENABLE-SPR-NEXT: #APP +; ENABLE-SPR-NEXT: nop +; ENABLE-SPR-NEXT: #NO_APP +; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0 +; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1 +; ENABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-SPR-NEXT: retq +; +; DISABLE-ADL-LABEL: permd: +; DISABLE-ADL: # %bb.0: +; DISABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-ADL-NEXT: #APP +; DISABLE-ADL-NEXT: nop +; DISABLE-ADL-NEXT: #NO_APP +; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0 +; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-ADL-NEXT: retq +; +; DISABLE-SPR-LABEL: permd: +; DISABLE-SPR: # %bb.0: +; DISABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16 +; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17 +; DISABLE-SPR-NEXT: #APP +; DISABLE-SPR-NEXT: nop +; DISABLE-SPR-NEXT: #NO_APP +; DISABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0 +; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1 +; DISABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-SPR-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) + %3 = add <8 x i32> %a0, %a1 + %res = add <8 x i32> %2, %3 + ret <8 x i32> %res +} + +define <8 x i32> @permd_mem(<8 x i32>* %p0, <8 x i32> %a1) { +; ENABLE-ADL-LABEL: permd_mem: +; ENABLE-ADL: # %bb.0: +; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-ADL-NEXT: #APP +; ENABLE-ADL-NEXT: nop +; ENABLE-ADL-NEXT: #NO_APP +; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0 +; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-ADL-NEXT: retq +; +; ENABLE-SPR-LABEL: permd_mem: +; ENABLE-SPR: # %bb.0: +; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16 +; ENABLE-SPR-NEXT: #APP +; ENABLE-SPR-NEXT: nop +; ENABLE-SPR-NEXT: #NO_APP +; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0 +; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0 +; ENABLE-SPR-NEXT: retq +; +; DISABLE-ADL-LABEL: permd_mem: +; DISABLE-ADL: # %bb.0: +; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-ADL-NEXT: #APP +; DISABLE-ADL-NEXT: nop +; DISABLE-ADL-NEXT: #NO_APP +; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0 +; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-ADL-NEXT: retq +; +; DISABLE-SPR-LABEL: permd_mem: +; DISABLE-SPR: # %bb.0: +; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16 +; DISABLE-SPR-NEXT: #APP +; DISABLE-SPR-NEXT: nop +; DISABLE-SPR-NEXT: #NO_APP +; DISABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0 +; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0 +; DISABLE-SPR-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %a0 = load <8 x i32>, <8 x i32>* %p0, align 64 + %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) + %res = add <8 x i32> %2, %a1 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly + +define <4 x i64> @permq(<4 x i64> %a0) { +; ENABLE-LABEL: permq: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0] +; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0] +; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + %res = add <4 x i64> %2, %a0 + ret <4 x i64> %res +} + +define <4 x i64> @permq_mem(<4 x i64>* %p0) { +; ENABLE-LABEL: permq_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0] +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0] +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %a0 = load <4 x i64>, <4 x i64>* %p0, align 64 + %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + ret <4 x i64> %2 +} + +define <8 x float> @permps(<8 x float> %a0, <8 x i32> %a1) { +; ENABLE-ADL-LABEL: permps: +; ENABLE-ADL: # %bb.0: +; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-ADL-NEXT: #APP +; ENABLE-ADL-NEXT: nop +; ENABLE-ADL-NEXT: #NO_APP +; ENABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; ENABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; ENABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; ENABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; ENABLE-ADL-NEXT: retq +; +; ENABLE-SPR-LABEL: permps: +; ENABLE-SPR: # %bb.0: +; ENABLE-SPR-NEXT: vmovaps %ymm0, %ymm16 +; ENABLE-SPR-NEXT: #APP +; ENABLE-SPR-NEXT: nop +; ENABLE-SPR-NEXT: #NO_APP +; ENABLE-SPR-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1 +; ENABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0 +; ENABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0 +; ENABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; ENABLE-SPR-NEXT: retq +; +; DISABLE-ADL-LABEL: permps: +; DISABLE-ADL: # %bb.0: +; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-ADL-NEXT: #APP +; DISABLE-ADL-NEXT: nop +; DISABLE-ADL-NEXT: #NO_APP +; DISABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; DISABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; DISABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; DISABLE-ADL-NEXT: vaddps 
%ymm0, %ymm1, %ymm0 +; DISABLE-ADL-NEXT: retq +; +; DISABLE-SPR-LABEL: permps: +; DISABLE-SPR: # %bb.0: +; DISABLE-SPR-NEXT: vmovaps %ymm0, %ymm16 +; DISABLE-SPR-NEXT: #APP +; DISABLE-SPR-NEXT: nop +; DISABLE-SPR-NEXT: #NO_APP +; DISABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1 +; DISABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0 +; DISABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0 +; DISABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; DISABLE-SPR-NEXT: retq + %1 = tail call <8 x i32> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1) + %t = sitofp <8 x i32> %1 to <8 x float> + %3 = fadd <8 x float> %t, %a0 + %res = fadd <8 x float> %2, %3 + ret <8 x float> %res +} + +define <8 x float> @permps_mem(<8 x float>* %p0, <8 x i32> %a1) { +; ENABLE-LABEL: permps_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1 +; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1 +; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %a0 = load <8 x float>, <8 x float>* %p0, align 64 + %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) + %t = sitofp <8 x i32> %a1 to <8 x float> + %res = fadd <8 x float> %2, %t + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly + +define <4 x double> @permpd(<4 x double> %a0) { +; ENABLE-LABEL: permpd: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0] +; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0] +; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> + %res = fadd <4 x double> %2, %a0 + ret <4 x double> %res +} + +define <4 x double> @permpd_mem(<4 x double>* %p0) { +; ENABLE-LABEL: permpd_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0] +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0] +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %a0 = 
load <4 x double>, <4 x double>* %p0, align 64 + %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> + ret <4 x double> %2 +} diff --git a/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll b/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll new file mode 100644 index 0000000..5accc99 --- /dev/null +++ b/llvm/test/CodeGen/X86/perm.avx512-false-deps.ll @@ -0,0 +1,1161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE + +define <4 x i64> @permq_ri_256(<4 x i64> %a0) { +; ENABLE-LABEL: permq_ri_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0] +; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_ri_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0] +; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + %res = add <4 x i64> %2, %a0 + ret <4 x i64> %res +} + +define <4 x i64> @permq_rr_256(<4 x i64> %a0, <4 x i64> %idx) { +; ENABLE-LABEL: permq_rr_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1 +; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_rr_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1 +; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx) + %t = add <4 x i64> %a0, %idx + %res = add <4 x i64> %t, %2 + ret <4 x i64> %res +} + +define <4 x i64> @permq_rm_256(<4 x i64>* %p0, <4 x i64> %idx) { +; ENABLE-LABEL: permq_rm_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1 +; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; 
ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_rm_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1 +; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x i64>, <4 x i64>* %p0, align 64 + %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx) + %res = add <4 x i64> %idx, %2 + ret <4 x i64> %res +} + +define <4 x i64> @permq_mi_256(<4 x i64>* %p0) { +; ENABLE-LABEL: permq_mi_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_mi_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x i64>, <4 x i64>* %p0, align 64 + %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + ret <4 x i64> %2 +} + +define <4 x i64> @permq_broadcast_256(i64* %p0, <4 x i64> %idx) { +; ENABLE-LABEL: permq_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0 +; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0 +; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load i64, i64* %p0, align 4 + %t0 = insertelement <4 x i64> undef, i64 %v0, i64 0 + %a0 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer + %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx) + %res = add <4 x i64> %2, %idx + ret <4 x i64> %res +} + +define <4 x i64> @permq_maskz_256(<4 x i64> %a0, <4 x i64> %idx, i8* %mask) { +; ENABLE-LABEL: permq_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2 +; 
ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1} +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2 +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1} +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx, <4 x i64> zeroinitializer, i8 %2) + %t = add <4 x i64> %a0, %idx + %res = add <4 x i64> %3, %t + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) +declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) + +define <8 x i64> @permq_rr_512(<8 x i64> %a0, <8 x i64> %idx) { +; ENABLE-LABEL: permq_rr_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1 +; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_rr_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1 +; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx) + %t = add <8 x i64> %a0, %idx + %res = add <8 x i64> %t, %2 + ret <8 x i64> %res +} + +define <8 x i64> @permq_rm_512(<8 x i64>* %p0, <8 x i64> %idx) { +; ENABLE-LABEL: permq_rm_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1 +; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_rm_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1 +; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x i64>, 
<8 x i64>* %p0, align 64 + %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx) + %res = add <8 x i64> %idx, %2 + ret <8 x i64> %res +} + +define <8 x i64> @permq_broadcast_512(i64* %p0, <8 x i64> %idx) { +; ENABLE-LABEL: permq_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0 +; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0 +; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load i64, i64* %p0, align 4 + %t0 = insertelement <8 x i64> undef, i64 %v0, i64 0 + %a0 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer + %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx) + %res = add <8 x i64> %2, %idx + ret <8 x i64> %res +} + +define <8 x i64> @permq_maskz_512(<8 x i64> %a0, <8 x i64> %idx, i8* %mask) { +; ENABLE-LABEL: permq_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2 +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1} +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permq_maskz_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2 +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1} +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx, <8 x i64> zeroinitializer, i8 %2) + %t = add <8 x i64> %a0, %idx + %res = add <8 x i64> %3, %t + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) +declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i32> @permd_rr_256(<8 x i32> %a0, <8 x i32> %idx) { +; ENABLE-LABEL: permd_rr_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; 
ENABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1 +; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_rr_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1 +; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1) + %t = add <8 x i32> %a0, %idx + %res = add <8 x i32> %t, %2 + ret <8 x i32> %res +} + +define <8 x i32> @permd_rm_256(<8 x i32>* %p0, <8 x i32> %idx) { +; ENABLE-LABEL: permd_rm_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1 +; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_rm_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1 +; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x i32>, <8 x i32>* %p0, align 64 + %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1) + %res = add <8 x i32> %idx, %2 + ret <8 x i32> %res +} + +define <8 x i32> @permd_broadcast_256(i32* %p0, <8 x i32> %idx) { +; ENABLE-LABEL: permd_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0 +; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0 +; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load i32, i32* %p0, align 4 + %t0 = insertelement <8 x i32> undef, i32 %v0, i32 0 + %a0 = shufflevector <8 x 
i32> %t0, <8 x i32> undef, <8 x i32> zeroinitializer + %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 -1) + %res = add <8 x i32> %2, %idx + ret <8 x i32> %res +} + +define <8 x i32> @permd_maskz_256(<8 x i32> %a0, <8 x i32> %idx, i8* %mask) { +; ENABLE-LABEL: permd_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2 +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1} +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2 +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1} +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 %2) + %t = add <8 x i32> %a0, %idx + %res = add <8 x i32> %3, %t + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <16 x i32> @permd_rr_512(<16 x i32> %a0, <16 x i32> %idx) { +; ENABLE-LABEL: permd_rr_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1 +; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_rr_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1 +; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1) + %t = add <16 x i32> %a0, %idx + %res = add <16 x i32> %t, %2 + ret <16 x i32> %res +} + +define <16 x i32> @permd_rm_512(<16 x i32>* %p0, <16 x i32> %idx) { +; ENABLE-LABEL: permd_rm_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1 +; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_rm_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: 
nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1 +; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x i32>, <16 x i32>* %p0, align 64 + %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1) + %res = add <16 x i32> %idx, %2 + ret <16 x i32> %res +} + +define <16 x i32> @permd_broadcast_512(i32* %p0, <16 x i32> %idx) { +; ENABLE-LABEL: permd_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0 +; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0 +; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load i32, i32* %p0, align 4 + %t0 = insertelement <16 x i32> undef, i32 %v0, i32 0 + %a0 = shufflevector <16 x i32> %t0, <16 x i32> undef, <16 x i32> zeroinitializer + %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1) + %res = add <16 x i32> %2, %idx + ret <16 x i32> %res +} + +define <16 x i32> @permd_maskz_512(<16 x i32> %a0, <16 x i32> %idx, i16* %mask) { +; ENABLE-LABEL: permd_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; ENABLE-NEXT: kmovw (%rdi), %k1 +; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1} +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permd_maskz_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; DISABLE-NEXT: kmovw (%rdi), %k1 +; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1} +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> zeroinitializer, i16 %2) + %t = add <16 x i32> %a0, %idx + %res = 
add <16 x i32> %3, %t + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <4 x double> @permpd_ri_256(<4 x double> %a0) { +; ENABLE-LABEL: permpd_ri_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0] +; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_ri_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0] +; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0> + %res = fadd <4 x double> %2, %a0 + ret <4 x double> %res +} + +define <4 x double> @permpd_rr_256(<4 x double> %a0, <4 x i64> %idx) { +; ENABLE-LABEL: permpd_rr_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd %ymm0, %ymm2 +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1 +; ENABLE-NEXT: vcvtqq2pd %ymm0, %ymm0 +; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_rr_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd %ymm0, %ymm2 +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; DISABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1 +; DISABLE-NEXT: vcvtqq2pd %ymm0, %ymm0 +; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <4 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %1, <4 x i64> %idx) + %a1 = sitofp <4 x i64> %idx to <4 x double> + %t = fadd <4 x double> %1, %a1 + %res = fadd <4 x double> %2, %t + ret <4 x double> %res +} + +define <4 x double> @permpd_rm_256(<4 x double>* %p0, <4 x i64> %idx) { +; ENABLE-LABEL: permpd_rm_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0 +; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 +; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_rm_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +;
DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0 +; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 +; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x double>, <4 x double>* %p0, align 64 + %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx) + %a1 = sitofp <4 x i64> %idx to <4 x double> + %res = fadd <4 x double> %2, %a1 + ret <4 x double> %res +} + +define <4 x double> @permpd_mi_256(<4 x double>* %p0) { +; ENABLE-LABEL: permpd_mi_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_mi_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0] +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <4 x double>, <4 x double>* %p0, align 64 + %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0> + ret <4 x double> %2 +} + +define <4 x double> @permpd_broadcast_256(double* %p0, <4 x i64> %idx) { +; ENABLE-LABEL: permpd_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0 +; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 +; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0 +; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 +; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load double, double* %p0, align 4 + %t0 = insertelement <4 x double> undef, double %v0, i64 0 + %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer + %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx) + %a1 = sitofp <4 x i64> %idx to <4 x double> + %res = fadd <4 x double> %2, %a1 + ret <4 x double> %res +} + +define <4 x
double> @permpd_maskz_256(<4 x double> %a0, <4 x i64> %idx, i8* %mask) { +; ENABLE-LABEL: permpd_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z} +; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 +; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z} +; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1 +; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> %idx, <4 x double> zeroinitializer, i8 %2) + %a1 = sitofp <4 x i64> %idx to <4 x double> + %t = fadd <4 x double> %a0, %a1 + %res = fadd <4 x double> %3, %t + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) +declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) + +define <8 x double> @permpd_rr_512(<8 x double> %a0, <8 x i64> %idx) { +; ENABLE-LABEL: permpd_rr_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd %zmm0, %zmm2 +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1 +; ENABLE-NEXT: vcvtqq2pd %zmm0, %zmm0 +; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_rr_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd %zmm0, %zmm2 +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; DISABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1 +; DISABLE-NEXT: vcvtqq2pd %zmm0, %zmm0 +; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> %idx) + %a1 = sitofp <8 x i64> %idx to <8 x double> + %t = fadd <8 x double> %1, %a1 + %res = fadd <8 x double> %2, %t + ret <8 x double> %res +} + +define <8 x double> @permpd_rm_512(<8 x double>* %p0, <8 x i64> %idx) { +; ENABLE-LABEL: permpd_rm_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: 
nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0 +; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 +; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_rm_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0 +; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 +; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x double>, <8 x double>* %p0, align 64 + %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx) + %a1 = sitofp <8 x i64> %idx to <8 x double> + %res = fadd <8 x double> %2, %a1 + ret <8 x double> %res +} + +define <8 x double> @permpd_broadcast_512(double* %p0, <8 x i64> %idx) { +; ENABLE-LABEL: permpd_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0 +; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 +; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0 +; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 +; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load double, double* %p0, align 4 + %t0 = insertelement <8 x double> undef, double %v0, i64 0 + %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer + %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx) + %a1 = sitofp <8 x i64> %idx to <8 x double> + %res = fadd <8 x double> %2, %a1 + ret <8 x double> %res +} + +define <8 x double> @permpd_maskz_512(<8 x double> %a0, <8 x i64> %idx, i8* %mask) { +; ENABLE-LABEL: permpd_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z} +; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 +; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permpd_maskz_512: +; DISABLE: # %bb.0: +; 
DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z} +; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1 +; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> %idx, <8 x double> zeroinitializer, i8 %2) + %a1 = sitofp <8 x i64> %idx to <8 x double> + %t = fadd <8 x double> %a0, %a1 + %res = fadd <8 x double> %3, %t + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) +declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) + + +define <8 x float> @permps_rr_256(<8 x float> %a0, <8 x i32> %idx) { +; ENABLE-LABEL: permps_rr_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps %ymm0, %ymm2 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_rr_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps %ymm0, %ymm2 +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; DISABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <8 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %1, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1) + %a1 = sitofp <8 x i32> %idx to <8 x float> + %t = fadd <8 x float> %1, %a1 + %res = fadd <8 x float> %2, %t + ret <8 x float> %res +} + +define <8 x float> @permps_rm_256(<8 x float>* %p0, <8 x i32> %idx) { +; ENABLE-LABEL: permps_rm_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0 +; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 +; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_rm_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; 
DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0 +; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 +; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <8 x float>, <8 x float>* %p0, align 64 + %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1) + %a1 = sitofp <8 x i32> %idx to <8 x float> + %res = fadd <8 x float> %2, %a1 + ret <8 x float> %res +} + +define <8 x float> @permps_broadcast_256(float* %p0, <8 x i32> %idx) { +; ENABLE-LABEL: permps_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0 +; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 +; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0 +; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 +; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load float, float* %p0, align 4 + %t0 = insertelement <8 x float> undef, float %v0, i32 0 + %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer + %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1) + %a1 = sitofp <8 x i32> %idx to <8 x float> + %res = fadd <8 x float> %2, %a1 + ret <8 x float> %res +} + +define <8 x float> @permps_maskz_256(<8 x float> %a0, <8 x i32> %idx, i8* %mask) { +; ENABLE-LABEL: permps_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z} +; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 +; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z} +; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1 +; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 %2) + %a1 = sitofp <8 x i32> %idx to <8 x float> + %t = fadd <8 x float> %a0, %a1 + %res = fadd <8 x float> %3, %t + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8) + +define <16 x float> @permps_rr_512(<16 x float> %a0, <16 x i32> %idx) { +; ENABLE-LABEL: permps_rr_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps %zmm0, %zmm2 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1 +; ENABLE-NEXT: vcvtdq2ps %zmm0, %zmm0 +; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_rr_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps %zmm0, %zmm2 +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; DISABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1 +; DISABLE-NEXT: vcvtdq2ps %zmm0, %zmm0 +; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <16 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %1, <16 x i32> %idx) + %a1 = sitofp <16 x i32> %idx to <16 x float> + %t = fadd <16 x float> %1, %a1 + %res = fadd <16 x float> %2, %t + ret <16 x float> %res +} + +define <16 x float> @permps_rm_512(<16 x float>* %p0, <16 x i32> %idx) { +; ENABLE-LABEL: permps_rm_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0 +; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 +; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_rm_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0 +; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 +; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a0 = load <16 x float>, <16 x float>* %p0, align 64 + %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx) + %a1 = sitofp <16 x i32> %idx to <16 x float> + %res = fadd <16 x float> %2, %a1 + ret <16 x float> %res +} + +define <16 x float> @permps_broadcast_512(float* %p0, <16 x i32> %idx) { +; ENABLE-LABEL: permps_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0 +; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 +; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0 +; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 +; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v0 = load float, float* %p0, align 4 + %t0 = insertelement <16 x float> undef, float %v0, i32 0 + %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer + %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx) + %a1 = sitofp <16 x i32> %idx to <16 x float> + %res = fadd <16 x float> %2, %a1 + ret <16 x float> %res +} + +define <16 x float> @permps_maskz_512(<16 x float> %a0, <16 x i32> %idx, i16* %mask) { +; ENABLE-LABEL: permps_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovw (%rdi), %k1 +; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; ENABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z} +; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 +; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: permps_maskz_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovw (%rdi), %k1 +; DISABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z} +; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1 +; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx, <16 x float> 
zeroinitializer, i16 %2) + %a1 = sitofp <16 x i32> %idx to <16 x float> + %t = fadd <16 x float> %a0, %a1 + %res = fadd <16 x float> %3, %t + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) +declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) diff --git a/llvm/test/CodeGen/X86/pmullq-false-deps.ll b/llvm/test/CodeGen/X86/pmullq-false-deps.ll new file mode 100644 index 0000000..7dd4d50 --- /dev/null +++ b/llvm/test/CodeGen/X86/pmullq-false-deps.ll @@ -0,0 +1,363 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE + +define <2 x i64> @pmullq_128(<2 x i64> %a0, <2 x i64> %a1) { +; ENABLE-LABEL: pmullq_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vpmullq %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1) + %3 = add <2 x i64> %a0, %a1 + %res = add <2 x i64> %2, %3 + ret <2 x i64> %res +} + +define <2 x i64> @pmullq_mem_128(<2 x i64> %a0, <2 x i64>* %p1) { +; ENABLE-LABEL: pmullq_mem_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq (%rdi), %xmm0, %xmm1 +; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_mem_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpmullq (%rdi), %xmm0, %xmm1 +; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <2 x i64>, <2 x i64>* %p1, align 64 + %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1) + %res = add <2 x i64> %2, %a0 + ret <2 x i64> %res +} + 
+define <2 x i64> @pmullq_broadcast_128(<2 x i64> %a0, i64* %p1) { +; ENABLE-LABEL: pmullq_broadcast_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1 +; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_broadcast_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1 +; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load i64, i64* %p1, align 4 + %t0 = insertelement <2 x i64> undef, i64 %v1, i64 0 + %a1 = shufflevector <2 x i64> %t0, <2 x i64> undef, <2 x i32> zeroinitializer + %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1) + %res = add <2 x i64> %2, %a0 + ret <2 x i64> %res +} + +define <2 x i64> @pmullq_maskz_128(<2 x i64> %a0, <2 x i64> %a1, i8* %pmask) { +; ENABLE-LABEL: pmullq_maskz_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vpmullq %xmm1, %xmm0, %xmm2 +; ENABLE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_maskz_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vpmullq %xmm1, %xmm0, %xmm2 +; DISABLE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask) + %3 = add <2 x i64> %a0, %a1 + %res = add <2 x i64> %2, %3 + ret <2 x i64> %res +} + +declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) + +define <4 x i64> @pmullq_256(<4 x i64> %a0, <4 x i64> %a1) { +; ENABLE-LABEL: pmullq_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq %ymm2, %ymm0, %ymm1 +; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; 
DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vpmullq %ymm2, %ymm0, %ymm1 +; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1) + %3 = add <4 x i64> %a0, %a1 + %res = add <4 x i64> %2, %3 + ret <4 x i64> %res +} + +define <4 x i64> @pmullq_mem_256(<4 x i64> %a0, <4 x i64>* %p1) { +; ENABLE-LABEL: pmullq_mem_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq (%rdi), %ymm0, %ymm1 +; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_mem_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpmullq (%rdi), %ymm0, %ymm1 +; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x i64>, <4 x i64>* %p1, align 64 + %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1) + %res = add <4 x i64> %2, %a0 + ret <4 x i64> %res +} + +define <4 x i64> @pmullq_broadcast_256(<4 x i64> %a0, i64* %p1) { +; ENABLE-LABEL: pmullq_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1 +; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1 +; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load i64, i64* %p1, align 4 + %t0 = insertelement <4 x i64> undef, i64 %v1, i64 0 + %a1 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer + %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1) + %res = add <4 x i64> %2, %a0 + ret <4 x i64> %res +} + +define <4 x i64> @pmullq_maskz_256(<4 x i64> %a0, <4 x i64> %a1, i8* %pmask) { +; ENABLE-LABEL: pmullq_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vpmullq %ymm1, %ymm0, %ymm2 +; ENABLE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpaddq 
%ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vpmullq %ymm1, %ymm0, %ymm2 +; DISABLE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask) + %3 = add <4 x i64> %a0, %a1 + %res = add <4 x i64> %2, %3 + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) + +define <8 x i64> @pmullq_512(<8 x i64> %a0, <8 x i64> %a1) { +; ENABLE-LABEL: pmullq_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq %zmm2, %zmm0, %zmm1 +; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vpmullq %zmm2, %zmm0, %zmm1 +; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1) + %3 = add <8 x i64> %a0, %a1 + %res = add <8 x i64> %2, %3 + ret <8 x i64> %res +} + +define <8 x i64> @pmullq_mem_512(<8 x i64> %a0, <8 x i64>* %p1) { +; ENABLE-LABEL: pmullq_mem_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq (%rdi), %zmm0, %zmm1 +; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_mem_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpmullq (%rdi), %zmm0, %zmm1 +; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <8 x i64>, <8 x i64>* %p1, align 64 + %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1) + %res = add <8 x i64> %2, %a0 + ret <8 x i64> %res +} + +define <8 x i64> @pmullq_broadcast_512(<8 x i64> %a0, i64* %p1) { +; ENABLE-LABEL: pmullq_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 +; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 +; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load i64, i64* %p1, align 4 + %t0 = insertelement <8 x i64> undef, i64 %v1, i64 0 + %a1 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer + %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1) + %res = add <8 x i64> %2, %a0 + ret <8 x i64> %res +} + +define <8 x i64> @pmullq_maskz_512(<8 x i64> %a0, <8 x i64> %a1, i8* %pmask) { +; ENABLE-LABEL: pmullq_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vpmullq %zmm1, %zmm0, %zmm2 +; ENABLE-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: pmullq_maskz_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vpmullq %zmm1, %zmm0, %zmm2 +; DISABLE-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + %3 = add <8 x i64> %a0, %a1 + %res = add <8 x i64> %2, %3 + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) diff --git a/llvm/test/CodeGen/X86/range-false-deps.ll b/llvm/test/CodeGen/X86/range-false-deps.ll new file mode 100644 index 0000000..e211fb3 --- /dev/null +++ 
b/llvm/test/CodeGen/X86/range-false-deps.ll @@ -0,0 +1,984 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-range -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE +; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-range -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE + +define <4 x float> @rangeps_128(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: rangeps_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 -1) + %3 = fadd <4 x float> %a0, %a1 + %res = fadd <4 x float> %2, %3 + ret <4 x float> %res +} + +define <4 x float> @rangeps_mem_128(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: rangeps_mem_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangeps $88, (%rdi), %xmm1, %xmm0 +; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_mem_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vrangeps $88, (%rdi), %xmm1, %xmm0 +; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 -1) + %res = fadd <4 x float> %2, %a0 + ret <4 x float> %res +} + +define <4 x float> @rangeps_broadcast_128(<4 x float> %a0, float* %p1) { +; ENABLE-LABEL: 
rangeps_broadcast_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangeps $88, (%rdi){1to4}, %xmm1, %xmm0 +; ENABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_broadcast_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vrangeps $88, (%rdi){1to4}, %xmm1, %xmm0 +; DISABLE-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <4 x float> undef, float %v1, i64 0 + %a1 = shufflevector <4 x float> %t0, <4 x float> undef, <4 x i32> zeroinitializer + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 -1) + %res = fadd <4 x float> %2, %a0 + ret <4 x float> %res +} + +define <4 x float> @rangeps_maskz_128(<4 x float> %a0, <4 x float> %a1, i8* %pmask) { +; ENABLE-LABEL: rangeps_maskz_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1 {%k1} {z} +; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_maskz_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangeps $88, %xmm2, %xmm0, %xmm1 {%k1} {z} +; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %a0, <4 x float> %a1, i32 88, <4 x float> undef, i8 %mask) + %3 = fadd <4 x float> %a0, %a1 + %res = fadd <4 x float> %2, %3 + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8) nounwind readnone + +define <8 x float> @rangeps_256(<8 x float> %a0, <8 x float> %a1) { +; ENABLE-LABEL: rangeps_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; 
ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangeps $88, %ymm2, %ymm0, %ymm1 +; ENABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vrangeps $88, %ymm2, %ymm0, %ymm1 +; DISABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 88, <8 x float> undef, i8 -1) + %3 = fadd <8 x float> %a0, %a1 + %res = fadd <8 x float> %2, %3 + ret <8 x float> %res +} + +define <8 x float> @rangeps_mem_256(<8 x float> %a0, <8 x float>* %p1) { +; ENABLE-LABEL: rangeps_mem_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangeps $88, (%rdi), %ymm1, %ymm0 +; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_mem_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vrangeps $88, (%rdi), %ymm1, %ymm0 +; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <8 x float>, <8 x float>* %p1, align 64 + %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 88, <8 x float> undef, i8 -1) + %res = fadd <8 x float> %2, %a0 + ret <8 x float> %res +} + +define <8 x float> @rangeps_broadcast_256(<8 x float> %a0, float* %p1) { +; ENABLE-LABEL: rangeps_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangeps $88, (%rdi){1to8}, %ymm1, %ymm0 +; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vrangeps 
$88, (%rdi){1to8}, %ymm1, %ymm0 +; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <8 x float> undef, float %v1, i64 0 + %a1 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer + %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 88, <8 x float> undef, i8 -1) + %res = fadd <8 x float> %2, %a0 + ret <8 x float> %res +} + +define <8 x float> @rangeps_maskz_256(<8 x float> %a0, <8 x float> %a1, i8* %pmask) { +; ENABLE-LABEL: rangeps_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangeps $44, %ymm2, %ymm0, %ymm1 {%k1} {z} +; ENABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vrangeps $44, %ymm2, %ymm0, %ymm1 {%k1} {z} +; DISABLE-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %a0, <8 x float> %a1, i32 44, <8 x float> undef, i8 %mask) + %3 = fadd <8 x float> %a0, %a1 + %res = fadd <8 x float> %2, %3 + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) nounwind readnone + +define <16 x float> @rangeps_512(<16 x float> %a0, <16 x float> %a1) { +; ENABLE-LABEL: rangeps_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1 +; ENABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1 +; DISABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 
= tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 -1, i32 4) + %3 = fadd <16 x float> %a0, %a1 + %res = fadd <16 x float> %2, %3 + ret <16 x float> %res +} + +define <16 x float> @rangeps_mem_512(<16 x float> %a0, <16 x float>* %p1) { +; ENABLE-LABEL: rangeps_mem_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangeps $88, (%rdi), %zmm1, %zmm0 +; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_mem_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vrangeps $88, (%rdi), %zmm1, %zmm0 +; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <16 x float>, <16 x float>* %p1, align 64 + %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 -1, i32 4) + %res = fadd <16 x float> %2, %a0 + ret <16 x float> %res +} + +define <16 x float> @rangeps_broadcast_512(<16 x float> %a0, float* %p1) { +; ENABLE-LABEL: rangeps_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangeps $88, (%rdi){1to16}, %zmm1, %zmm0 +; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vrangeps $88, (%rdi){1to16}, %zmm1, %zmm0 +; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load float, float* %p1, align 4 + %t0 = insertelement <16 x float> undef, float %v1, i64 0 + %a1 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer + %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, 
<16 x float> %a1, i32 88, <16 x float> undef, i16 -1, i32 4) + %res = fadd <16 x float> %2, %a0 + ret <16 x float> %res +} + +define <16 x float> @rangeps_maskz_512(<16 x float> %a0, <16 x float> %a1, i16* %pmask) { +; ENABLE-LABEL: rangeps_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovw (%rdi), %k1 +; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1 {%k1} {z} +; ENABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangeps_maskz_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovw (%rdi), %k1 +; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vrangeps $88, %zmm2, %zmm0, %zmm1 {%k1} {z} +; DISABLE-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i16, i16* %pmask + %2 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %a0, <16 x float> %a1, i32 88, <16 x float> undef, i16 %mask, i32 4) + %3 = fadd <16 x float> %a0, %a1 + %res = fadd <16 x float> %2, %3 + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16, i32) nounwind readnone + + +define <2 x double> @rangepd_128(<2 x double> %a0, <2 x double> %a1) { +; ENABLE-LABEL: rangepd_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 -1) + %3 = fadd <2 x double> %a0, %a1 + %res = fadd <2 x double> %2, %3 + ret <2 x double> %res +} + +define <2 x double> @rangepd_mem_128(<2 x double> %a0, <2 x double>* %p1) { +; ENABLE-LABEL: 
rangepd_mem_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangepd $88, (%rdi), %xmm1, %xmm0 +; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_mem_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vrangepd $88, (%rdi), %xmm1, %xmm0 +; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <2 x double>, <2 x double>* %p1, align 64 + %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 -1) + %res = fadd <2 x double> %2, %a0 + ret <2 x double> %res +} + +define <2 x double> @rangepd_broadcast_128(<2 x double> %a0, double* %p1) { +; ENABLE-LABEL: rangepd_broadcast_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangepd $88, (%rdi){1to2}, %xmm1, %xmm0 +; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_broadcast_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vrangepd $88, (%rdi){1to2}, %xmm1, %xmm0 +; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load double, double* %p1, align 4 + %t0 = insertelement <2 x double> undef, double %v1, i64 0 + %a1 = shufflevector <2 x double> %t0, <2 x double> undef, <2 x i32> zeroinitializer + %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 -1) + %res = fadd <2 x double> %2, %a0 + ret <2 x double> %res +} + +define <2 x double> @rangepd_maskz_128(<2 x double> %a0, <2 x double> %a1, i8* %pmask) { +; ENABLE-LABEL: rangepd_maskz_128: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1 {%k1} {z} +; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; 
ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_maskz_128: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangepd $88, %xmm2, %xmm0, %xmm1 {%k1} {z} +; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %a0, <2 x double> %a1, i32 88, <2 x double> undef, i8 %mask) + %3 = fadd <2 x double> %a0, %a1 + %res = fadd <2 x double> %2, %3 + ret <2 x double> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8) nounwind readnone + +define <4 x double> @rangepd_256(<4 x double> %a0, <4 x double> %a1) { +; ENABLE-LABEL: rangepd_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1 +; ENABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1 +; DISABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 -1) + %3 = fadd <4 x double> %a0, %a1 + %res = fadd <4 x double> %2, %3 + ret <4 x double> %res +} + +define <4 x double> @rangepd_mem_256(<4 x double> %a0, <4 x double>* %p1) { +; ENABLE-LABEL: rangepd_mem_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangepd $88, (%rdi), %ymm1, %ymm0 +; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_mem_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
DISABLE-NEXT: vrangepd $88, (%rdi), %ymm1, %ymm0 +; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x double>, <4 x double>* %p1, align 64 + %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 -1) + %res = fadd <4 x double> %2, %a0 + ret <4 x double> %res +} + +define <4 x double> @rangepd_broadcast_256(<4 x double> %a0, double* %p1) { +; ENABLE-LABEL: rangepd_broadcast_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangepd $88, (%rdi){1to4}, %ymm1, %ymm0 +; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_broadcast_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; DISABLE-NEXT: vrangepd $88, (%rdi){1to4}, %ymm1, %ymm0 +; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load double, double* %p1, align 4 + %t0 = insertelement <4 x double> undef, double %v1, i64 0 + %a1 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer + %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 -1) + %res = fadd <4 x double> %2, %a0 + ret <4 x double> %res +} + +define <4 x double> @rangepd_maskz_256(<4 x double> %a0, <4 x double> %a1, i8* %pmask) { +; ENABLE-LABEL: rangepd_maskz_256: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1 {%k1} {z} +; ENABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_maskz_256: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; DISABLE-NEXT: vrangepd $88, %ymm2, %ymm0, %ymm1 {%k1} {z} +; DISABLE-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %a0, <4 x double> %a1, i32 88, <4 x double> undef, i8 %mask) + %3 = fadd <4 x double> %a0, %a1 + %res = fadd <4 x double> %2, %3 + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) nounwind readnone + +define <8 x double> @rangepd_512(<8 x double> %a0, <8 x double> %a1) { +; ENABLE-LABEL: rangepd_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1 +; ENABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1 +; DISABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 -1, i32 4) + %3 = fadd <8 x double> %a0, %a1 + %res = fadd <8 x double> %2, %3 + ret <8 x double> %res +} + +define <8 x double> @rangepd_mem_512(<8 x double> %a0, <8 x double>* %p1) { +; ENABLE-LABEL: rangepd_mem_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangepd $88, (%rdi), %zmm1, %zmm0 +; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_mem_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vrangepd $88, (%rdi), %zmm1, %zmm0 +; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <8 x double>, <8 x double>* %p1, align 64 + %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x 
double> %a1, i32 88, <8 x double> undef, i8 -1, i32 4) + %res = fadd <8 x double> %2, %a0 + ret <8 x double> %res +} + +define <8 x double> @rangepd_broadcast_512(<8 x double> %a0, double* %p1) { +; ENABLE-LABEL: rangepd_broadcast_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangepd $88, (%rdi){1to8}, %zmm1, %zmm0 +; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_broadcast_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; DISABLE-NEXT: vrangepd $88, (%rdi){1to8}, %zmm1, %zmm0 +; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %v1 = load double, double* %p1, align 4 + %t0 = insertelement <8 x double> undef, double %v1, i64 0 + %a1 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer + %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 -1, i32 4) + %res = fadd <8 x double> %2, %a0 + ret <8 x double> %res +} + +define <8 x double> @rangepd_maskz_512(<8 x double> %a0, <8 x double> %a1, i8* %pmask) { +; ENABLE-LABEL: rangepd_maskz_512: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1 {%k1} {z} +; ENABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangepd_maskz_512: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; DISABLE-NEXT: vrangepd $88, %zmm2, %zmm0, %zmm1 {%k1} {z} +; DISABLE-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %a0, <8 x double> %a1, i32 88, <8 x double> undef, i8 %mask, i32 4) + %3 = fadd <8 x double> %a0, %a1 + %res = fadd <8 x double> %2, %3 + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8, i32) 
nounwind readnone + +define <4 x float> @rangess(<4 x float> %a0, <4 x float> %a1) { +; ENABLE-LABEL: rangess: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangess: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4, i32 4) + %3 = fadd <4 x float> %a1, %a0 + %res = fadd <4 x float> %2, %3 + ret <4 x float> %res +} + +define <4 x float> @rangess_mem(<4 x float> %a0, <4 x float>* %p1) { +; ENABLE-LABEL: rangess_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangess $4, (%rdi), %xmm0, %xmm1 +; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangess_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vrangess $4, (%rdi), %xmm0, %xmm1 +; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <4 x float>, <4 x float>* %p1, align 64 + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4, i32 4) + %res = fadd <4 x float> %2, %a0 + ret <4 x float> %res +} + +define <4 x float> @rangess_maskz(<4 x float> %a0, <4 x float> %a1, i8* %pmask) { +; ENABLE-LABEL: rangess_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1 {%k1} {z} +; ENABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangess_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload +; DISABLE-NEXT: vrangess $4, %xmm2, %xmm0, %xmm1 {%k1} {z} +; DISABLE-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 %mask, i32 4, i32 4) + %3 = fadd <4 x float> %a0, %a1 + %res = fadd <4 x float> %2, %3 + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) + +define <2 x double> @rangesd(<2 x double> %a0, <2 x double> %a1) { +; ENABLE-LABEL: rangesd: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1 +; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangesd: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1 +; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> undef, i8 -1, i32 4, i32 4) + %3 = fadd <2 x double> %a0, %a1 + %res = fadd <2 x double> %2, %3 + ret <2 x double> %res +} + +define <2 x double> @rangesd_mem(<2 x double> %a0, <2 x double>* %p1) { +; ENABLE-LABEL: rangesd_mem: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ENABLE-NEXT: vrangesd $4, (%rdi), %xmm1, %xmm0 +; ENABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangesd_mem: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; DISABLE-NEXT: vrangesd $4, (%rdi), %xmm1, %xmm0 +; DISABLE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %a1 = load <2 x double>, <2 x double>* %p1, align 64 + %2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> undef, i8 -1, i32 4, i32 4) + %res = fadd <2 x double> %2, %a0 + ret <2 x double> %res +} + +define <2 x double> @rangesd_maskz(<2 x double> %a0, <2 x double> %a1, i8* %pmask) { +; ENABLE-LABEL: rangesd_maskz: +; ENABLE: # %bb.0: +; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ENABLE-NEXT: #APP +; ENABLE-NEXT: nop +; ENABLE-NEXT: #NO_APP +; ENABLE-NEXT: kmovb (%rdi), %k1 +; ENABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ENABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1 {%k1} {z} +; ENABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; ENABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; ENABLE-NEXT: retq +; +; DISABLE-LABEL: rangesd_maskz: +; DISABLE: # %bb.0: +; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; DISABLE-NEXT: #APP +; DISABLE-NEXT: nop +; DISABLE-NEXT: #NO_APP +; DISABLE-NEXT: kmovb (%rdi), %k1 +; DISABLE-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; DISABLE-NEXT: vrangesd $4, %xmm2, %xmm0, %xmm1 {%k1} {z} +; DISABLE-NEXT: vaddpd %xmm2, %xmm0, %xmm0 +; DISABLE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; DISABLE-NEXT: retq + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %pmask + %2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> undef, i8 %mask, i32 4, i32 4) + %3 = fadd <2 x double> %a0, %a1 + %res = fadd <2 x double> %2, %3 + ret <2 x double> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) -- 2.7.4