From 75745d0c3e612b54af6b1cfb62be69356ad155a2 Mon Sep 17 00:00:00 2001 From: "Andrew V. Tischenko" Date: Fri, 14 Apr 2017 07:44:23 +0000 Subject: [PATCH] This patch closes PR#32216: Better testing of schedule model instruction latencies/throughputs. The details are here: https://reviews.llvm.org/D30941 llvm-svn: 300311 --- llvm/include/llvm/CodeGen/AsmPrinter.h | 3 + llvm/include/llvm/CodeGen/TargetSchedule.h | 4 + llvm/include/llvm/MC/MCObjectStreamer.h | 3 +- llvm/include/llvm/MC/MCStreamer.h | 4 +- llvm/include/llvm/MC/MCSubtargetInfo.h | 11 + .../include/llvm/Target/TargetSubtargetInfo.h | 8 + llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 46 +- llvm/lib/CodeGen/TargetSchedule.cpp | 71 +- llvm/lib/CodeGen/TargetSubtargetInfo.cpp | 46 + llvm/lib/MC/MCAsmStreamer.cpp | 34 +- llvm/lib/MC/MCObjectStreamer.cpp | 2 +- llvm/lib/MC/MCStreamer.cpp | 4 +- llvm/lib/Object/RecordStreamer.cpp | 2 +- llvm/lib/Object/RecordStreamer.h | 3 +- .../MCTargetDesc/AArch64ELFStreamer.cpp | 4 +- .../ARM/MCTargetDesc/ARMELFStreamer.cpp | 4 +- .../MCTargetDesc/HexagonMCELFStreamer.cpp | 2 +- .../MCTargetDesc/HexagonMCELFStreamer.h | 3 +- .../Mips/MCTargetDesc/MipsELFStreamer.cpp | 2 +- .../Mips/MCTargetDesc/MipsELFStreamer.h | 3 +- .../Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 4 +- .../X86/InstPrinter/X86InstComments.cpp | 2 - llvm/lib/Target/X86/X86MCInstLower.cpp | 18 +- llvm/lib/Target/X86/X86Subtarget.h | 3 + llvm/test/CodeGen/X86/recip-fastmath.ll | 412 +++++---- llvm/test/CodeGen/X86/recip-fastmath2.ll | 814 +++++++++--------- 26 files changed, 874 insertions(+), 638 deletions(-) diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 772043fa3ce3..fb8c8408fc77 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -112,6 +112,9 @@ public: typedef std::pair GOTEquivUsePair; MapVector GlobalGOTEquivs; + /// Enable print [latency:throughput] in output + bool EnablePrintSchedInfo = false; + 
private: MCSymbol *CurrentFnBegin = nullptr; MCSymbol *CurrentFnEnd = nullptr; diff --git a/llvm/include/llvm/CodeGen/TargetSchedule.h b/llvm/include/llvm/CodeGen/TargetSchedule.h index 0c5a84e0e3b8..1992412120aa 100644 --- a/llvm/include/llvm/CodeGen/TargetSchedule.h +++ b/llvm/include/llvm/CodeGen/TargetSchedule.h @@ -189,6 +189,10 @@ public: /// This is typically one cycle. unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *DepMI) const; + + /// \brief Compute the reciprocal throughput of the given instruction. + Optional computeInstrRThroughput(const MachineInstr *MI) const; + Optional computeInstrRThroughput(unsigned Opcode) const; }; } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index 11f8dfa24484..7c1189e46ab2 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -98,7 +98,8 @@ public: void EmitSLEB128Value(const MCExpr *Value) override; void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override; void ChangeSection(MCSection *Section, const MCExpr *Subsection) override; - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo& STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool = false) override; /// \brief Emit an instruction to a special fragment, because this instruction /// can change its size during relaxation. diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index c0d322e3ed3a..e466b368ed34 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -836,7 +836,9 @@ public: } /// \brief Emit the given \p Instruction into the current section. 
- virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI); + /// If PrintSchedInfo == true then a scheduling comment should be added to the output + virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool PrintSchedInfo = false); /// \brief Set the bundle alignment mode from now on in the section. /// The argument is the power of 2 to which the alignment is set. The diff --git a/llvm/include/llvm/MC/MCSubtargetInfo.h b/llvm/include/llvm/MC/MCSubtargetInfo.h index bbdac8fad5f5..d8fb2dc8dcdf 100644 --- a/llvm/include/llvm/MC/MCSubtargetInfo.h +++ b/llvm/include/llvm/MC/MCSubtargetInfo.h @@ -26,6 +26,8 @@ #include namespace llvm { +class MachineInstr; +class MCInst; //===----------------------------------------------------------------------===// /// @@ -167,6 +169,15 @@ public: auto Found = std::lower_bound(ProcDesc.begin(), ProcDesc.end(), CPU); return Found != ProcDesc.end() && StringRef(Found->Key) == CPU; } + + /// Returns string representation of scheduler comment + virtual std::string getSchedInfoStr(const MachineInstr &MI) const { + return std::string(); + } + + virtual std::string getSchedInfoStr(MCInst const &MCI) const { + return std::string(); + } }; } // end namespace llvm diff --git a/llvm/include/llvm/Target/TargetSubtargetInfo.h b/llvm/include/llvm/Target/TargetSubtargetInfo.h index 0b4351596021..83950a9cd027 100644 --- a/llvm/include/llvm/Target/TargetSubtargetInfo.h +++ b/llvm/include/llvm/Target/TargetSubtargetInfo.h @@ -20,6 +20,7 @@ #include "llvm/CodeGen/PBQPRAConstraint.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/CodeGen.h" #include @@ -143,6 +144,9 @@ public: /// TargetLowering preference). It does not yet disable the postRA scheduler. virtual bool enableMachineScheduler() const; + /// \brief Support printing of [latency:throughput] comment in output .S file. 
+ virtual bool supportPrintSchedInfo() const { return false; } + /// \brief True if the machine scheduler should disable the TLI preference /// for preRA scheduling with the source level scheduler. virtual bool enableMachineSchedDefaultSched() const { return true; } @@ -227,6 +231,10 @@ public: /// Please use MachineRegisterInfo::subRegLivenessEnabled() instead where /// possible. virtual bool enableSubRegLiveness() const { return false; } + + /// Returns string representation of scheduler comment + std::string getSchedInfoStr(const MachineInstr &MI) const override; + std::string getSchedInfoStr(MCInst const &MCI) const override; }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 834a59a12a6a..6c18d56b8272 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -123,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription = STATISTIC(EmittedInsts, "Number of machine instrs printed"); +static cl::opt + PrintSchedule("print-schedule", cl::Hidden, cl::init(false), + cl::desc("Print 'sched: [latency:throughput]' in .s output")); + char AsmPrinter::ID = 0; typedef DenseMap> gcp_map_type; @@ -720,7 +724,8 @@ void AsmPrinter::EmitFunctionEntryLabel() { } /// emitComments - Pretty-print comments for instructions. -static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { +static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS, + AsmPrinter *AP) { const MachineFunction *MF = MI.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); @@ -728,6 +733,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { int FI; const MachineFrameInfo &MFI = MF->getFrameInfo(); + bool Commented = false; // We assume a single instruction only has a spill or reload, not // both. 
@@ -735,24 +741,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { if (TII->isLoadFromStackSlotPostFE(MI, FI)) { if (MFI.isSpillSlotObjectIndex(FI)) { MMO = *MI.memoperands_begin(); - CommentOS << MMO->getSize() << "-byte Reload\n"; + CommentOS << MMO->getSize() << "-byte Reload"; + Commented = true; } } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) { - if (MFI.isSpillSlotObjectIndex(FI)) - CommentOS << MMO->getSize() << "-byte Folded Reload\n"; + if (MFI.isSpillSlotObjectIndex(FI)) { + CommentOS << MMO->getSize() << "-byte Folded Reload"; + Commented = true; + } } else if (TII->isStoreToStackSlotPostFE(MI, FI)) { if (MFI.isSpillSlotObjectIndex(FI)) { MMO = *MI.memoperands_begin(); - CommentOS << MMO->getSize() << "-byte Spill\n"; + CommentOS << MMO->getSize() << "-byte Spill"; + Commented = true; } } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) { - if (MFI.isSpillSlotObjectIndex(FI)) - CommentOS << MMO->getSize() << "-byte Folded Spill\n"; + if (MFI.isSpillSlotObjectIndex(FI)) { + CommentOS << MMO->getSize() << "-byte Folded Spill"; + Commented = true; + } } // Check for spill-induced copies - if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) - CommentOS << " Reload Reuse\n"; + if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) { + Commented = true; + CommentOS << " Reload Reuse"; + } + + if (Commented && AP->EnablePrintSchedInfo) + // If any comment was added above and we need sched info comment then + // add this new comment just after the above comment w/o "\n" between them. 
+ CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n"; + else if (Commented) + CommentOS << "\n"; } /// emitImplicitDef - This method emits the specified machine instruction @@ -966,7 +987,7 @@ void AsmPrinter::EmitFunctionBody() { } if (isVerbose()) - emitComments(MI, OutStreamer->GetCommentOS()); + emitComments(MI, OutStreamer->GetCommentOS(), this); switch (MI.getOpcode()) { case TargetOpcode::CFI_INSTRUCTION: @@ -1383,6 +1404,11 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { ORE = &getAnalysis().getORE(); if (isVerbose()) LI = &getAnalysis(); + + const TargetSubtargetInfo &STI = MF.getSubtarget(); + EnablePrintSchedInfo = PrintSchedule.getNumOccurrences() + ? PrintSchedule + : STI.supportPrintSchedInfo(); } namespace { diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp index 04edf0e62857..0df34ce43112 100644 --- a/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/llvm/lib/CodeGen/TargetSchedule.cpp @@ -277,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { if (SCDesc->isValid() && !SCDesc->isVariant()) return computeInstrLatency(*SCDesc); - llvm_unreachable("No MI sched latency"); + if (SCDesc->isValid()) { + assert (!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()"); + return computeInstrLatency(*SCDesc); + } + return 0; } unsigned @@ -331,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx, } return 0; } + +static Optional +getRTroughputFromItineraries(unsigned schedClass, + const InstrItineraryData *IID){ + double Unknown = std::numeric_limits::infinity(); + double Throughput = Unknown; + + for (const InstrStage *IS = IID->beginStage(schedClass), + *E = IID->endStage(schedClass); + IS != E; ++IS) { + unsigned Cycles = IS->getCycles(); + if (!Cycles) + continue; + Throughput = + std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles); + } + // We need reciprocal throughput that's why we return such value. 
+ return 1 / Throughput; +} + +static Optional +getRTroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc, + const TargetSubtargetInfo *STI, + const MCSchedModel &SchedModel) { + double Unknown = std::numeric_limits::infinity(); + double Throughput = Unknown; + + for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc), + *WEnd = STI->getWriteProcResEnd(SCDesc); + WPR != WEnd; ++WPR) { + unsigned Cycles = WPR->Cycles; + if (!Cycles) + return Optional(); + + unsigned NumUnits = + SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits; + Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles); + } + // We need reciprocal throughput that's why we return such value. + return 1 / Throughput; +} + +Optional +TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const { + if (hasInstrItineraries()) + return getRTroughputFromItineraries(MI->getDesc().getSchedClass(), + getInstrItineraries()); + if (hasInstrSchedModel()) + return getRTroughputFromInstrSchedModel(resolveSchedClass(MI), STI, + SchedModel); + return Optional(); +} + +Optional +TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const { + unsigned SchedClass = TII->get(Opcode).getSchedClass(); + if (hasInstrItineraries()) + return getRTroughputFromItineraries(SchedClass, getInstrItineraries()); + if (hasInstrSchedModel()) { + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass); + if (SCDesc->isValid() && !SCDesc->isVariant()) + return getRTroughputFromInstrSchedModel(SCDesc, STI, SchedModel); + } + return Optional(); +} diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp index c74707d95b9e..0a444e0fff07 100644 --- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp +++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp @@ -11,6 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include 
"llvm/Support/raw_ostream.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -52,3 +55,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const { bool TargetSubtargetInfo::useAA() const { return false; } + +static std::string createSchedInfoStr(unsigned Latency, + Optional RThroughput) { + static const char *SchedPrefix = " sched: ["; + std::string Comment; + raw_string_ostream CS(Comment); + if (Latency > 0 && RThroughput.hasValue()) + CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue()) + << "]"; + else if (Latency > 0) + CS << SchedPrefix << Latency << ":?]"; + else if (RThroughput.hasValue()) + CS << SchedPrefix << "?:" << RThroughput.getValue() << "]"; + CS.flush(); + return Comment; +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const { + if (MI.isPseudo() || MI.isTerminator()) + return std::string(); + // We don't cache TSchedModel because it depends on TargetInstrInfo + // that could be changed during the compilation + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + Optional RThroughput = TSchedModel.computeInstrRThroughput(&MI); + return createSchedInfoStr(Latency, RThroughput); +} + +/// Returns string representation of scheduler comment +std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const { + // We don't cache TSchedModel because it depends on TargetInstrInfo + // that could be changed during the compilation + TargetSchedModel TSchedModel; + TSchedModel.init(getSchedModel(), this, getInstrInfo()); + if (!TSchedModel.hasInstrSchedModel()) + return std::string(); + unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode()); + Optional RThroughput = + TSchedModel.computeInstrRThroughput(MCI.getOpcode()); + return createSchedInfoStr(Latency, RThroughput); +} diff --git 
a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 92dcf535ec99..9e5553fa8d42 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -103,7 +103,10 @@ public: void AddComment(const Twine &T, bool EOL = true) override; /// AddEncodingComment - Add a comment showing the encoding of an instruction. - void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &); + /// If PrintSchedInfo is true then the comment sched:[x:y] should + /// be added to the output if it is supported by the target + void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &, + bool PrintSchedInfo); /// GetCommentOS - Return a raw_ostream that comments can be written to. /// Unlike AddComment, you are required to terminate comments with \n if you @@ -278,7 +281,8 @@ public: void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except) override; void EmitWinEHHandlerData() override; - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool PrintSchedInfo) override; void EmitBundleAlignMode(unsigned AlignPow2) override; void EmitBundleLock(bool AlignToEnd) override; @@ -1504,7 +1508,8 @@ void MCAsmStreamer::EmitWinCFIEndProlog() { } void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, + bool PrintSchedInfo) { raw_ostream &OS = GetCommentOS(); SmallString<256> Code; SmallVector Fixups; @@ -1577,7 +1582,11 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, } } } - OS << "]\n"; + OS << "]"; + // If we are not going to add fixup or scheduling comments after this point then + // we have to end the current comment line with "\n". 
+ if (Fixups.size() || !PrintSchedInfo) + OS << "\n"; for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { MCFixup &F = Fixups[i]; @@ -1588,16 +1597,19 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst, } void MCAsmStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, + bool PrintSchedInfo) { assert(getCurrentSectionOnly() && "Cannot emit contents before setting section!"); // Show the encoding in a comment if we have a code emitter. if (Emitter) - AddEncodingComment(Inst, STI); + AddEncodingComment(Inst, STI, PrintSchedInfo); // Show the MCInst if enabled. if (ShowInst) { + if (PrintSchedInfo) + GetCommentOS() << "\n"; Inst.dump_pretty(GetCommentOS(), InstPrinter.get(), "\n "); GetCommentOS() << "\n"; } @@ -1607,6 +1619,16 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst, else InstPrinter->printInst(&Inst, OS, "", STI); + if (PrintSchedInfo) { + std::string SI = STI.getSchedInfoStr(Inst); + if (!SI.empty()) + GetCommentOS() << SI; + } + + StringRef Comments = CommentToEmit; + if (Comments.size() && Comments.back() != '\n') + GetCommentOS() << "\n"; + EmitEOL(); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 726326be2ee1..f7f2253256eb 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -238,7 +238,7 @@ bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const { } void MCObjectStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { MCStreamer::EmitInstruction(Inst, STI); MCSection *Sec = getCurrentSectionOnly(); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index b9c01c66f31d..c9a6f12b6a58 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -777,8 +777,8 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) { } } -void MCStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { +void 
MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) { // Scan for values. for (unsigned i = Inst.getNumOperands(); i--;) if (Inst.getOperand(i).isExpr()) diff --git a/llvm/lib/Object/RecordStreamer.cpp b/llvm/lib/Object/RecordStreamer.cpp index a5018443b87d..c9c27451f809 100644 --- a/llvm/lib/Object/RecordStreamer.cpp +++ b/llvm/lib/Object/RecordStreamer.cpp @@ -78,7 +78,7 @@ RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); } RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {} void RecordStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { MCStreamer::EmitInstruction(Inst, STI); } diff --git a/llvm/lib/Object/RecordStreamer.h b/llvm/lib/Object/RecordStreamer.h index c3bd5b09a9bf..a845ecd786a8 100644 --- a/llvm/lib/Object/RecordStreamer.h +++ b/llvm/lib/Object/RecordStreamer.h @@ -34,7 +34,8 @@ public: const_iterator begin(); const_iterator end(); RecordStreamer(MCContext &Context); - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override; void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 5903e1e36d45..271263507ae1 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -102,8 +102,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. 
- void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 774a0b3771b1..6fa890ba1cd5 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -477,8 +477,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst& Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override { if (IsThumb) EmitThumbMappingSymbol(); else diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 09819ccedd8f..9e1ff9ca35d7 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -44,7 +44,7 @@ static cl::opt GPSize cl::init(8)); void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { assert(MCB.getOpcode() == Hexagon::BUNDLE); assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE); assert(HexagonMCInstrInfo::bundleSize(MCB) > 0); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h index 5cb84a48a313..024dff1a2f97 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h @@ -34,7 +34,8 @@ public: MCELFStreamer(Context, TAB, OS, Emitter), MCII (createHexagonMCInstrInfo()) {} - 
void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override; void EmitSymbol(const MCInst &Inst); void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 4eeccc3995fd..ae3278322311 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -20,7 +20,7 @@ using namespace llvm; void MipsELFStreamer::EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) { + const MCSubtargetInfo &STI, bool) { MCELFStreamer::EmitInstruction(Inst, STI); MCContext &Context = getContext(); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index 72cde1c90845..f5eda112817e 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -45,7 +45,8 @@ public: /// \p Inst is actually emitted. For example, we can inspect the operands and /// gather sufficient information that allows us to reason about the register /// usage for the translation unit. - void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override; + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool = false) override; /// Overriding this function allows us to record all labels that should be /// marked as microMIPS. 
Based on this data marking is done in diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index 8c2617a687b8..9266f0e216d1 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -139,8 +139,8 @@ private: public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to mask dangerous instructions. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override { // Sandbox indirect jumps. if (isIndirectJump(Inst)) { if (PendingCall) diff --git a/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index 8594addb5dd4..6e062ec59347 100644 --- a/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -1189,8 +1189,6 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, OS << ']'; --i; // For loop increments element #. } - //MI->print(OS, 0); - OS << "\n"; // We successfully added a comment to this instruction. 
return true; diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 55b090b67640..550e3543a71e 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -102,7 +102,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -1529,7 +1529,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodePSHUFBMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), + !EnablePrintSchedInfo); } break; } @@ -1600,7 +1601,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), + !EnablePrintSchedInfo); } break; } @@ -1630,7 +1632,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), + !EnablePrintSchedInfo); } break; } @@ -1646,7 +1649,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector Mask; DecodeVPPERMMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), + !EnablePrintSchedInfo); } break; } @@ -1706,7 +1710,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { CS << 
"?"; } CS << "]"; - OutStreamer->AddComment(CS.str()); + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } else if (auto *CV = dyn_cast(C)) { CS << "<"; for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { @@ -1738,7 +1742,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << ">"; - OutStreamer->AddComment(CS.str()); + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } } break; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index c2c95658482d..d0d88d326949 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -624,6 +624,9 @@ public: /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } + // TODO: Update the regression tests and return true. + bool supportPrintSchedInfo() const override { return false; } + bool enableEarlyIfConversion() const override; /// Return the instruction itineraries based on the subtarget selection. 
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll index 5fd553b301aa..16e261bf3c5e 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -2,12 +2,12 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX 
--check-prefix=AVX512 --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX ; If the target's divss/divps instructions are substantially ; slower than rcpss/rcpps with a Newton-Raphson refinement, @@ -25,11 +25,47 @@ define float @f32_no_estimate(float %x) #0 { ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: f32_no_estimate: -; AVX: # BB#0: -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-RECIP-LABEL: f32_no_estimate: +; AVX-RECIP: # BB#0: +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-RECIP-NEXT: retq +; +; FMA-RECIP-LABEL: f32_no_estimate: +; FMA-RECIP: # BB#0: +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; FMA-RECIP-NEXT: retq +; +; BTVER2-LABEL: f32_no_estimate: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: f32_no_estimate: +; SANDY: # BB#0: +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: f32_no_estimate: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; HASWELL-NO-FMA-LABEL: f32_no_estimate: +; HASWELL-NO-FMA: # BB#0: +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; HASWELL-NO-FMA-NEXT: retq +; +; AVX512-LABEL: f32_no_estimate: +; AVX512: # BB#0: +; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -65,30 +101,30 @@ define float @f32_one_step(float %x) #1 { ; ; BTVER2-LABEL: f32_one_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: f32_one_step: 
; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_one_step: ; HASWELL-NO-FMA: # BB#0: @@ -105,7 +141,7 @@ define float @f32_one_step(float %x) #1 { ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -155,42 +191,42 @@ define float @f32_two_step(float %x) #2 { ; ; BTVER2-LABEL: f32_two_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, 
%xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_two_step: ; HASWELL-NO-FMA: # BB#0: @@ -209,13 +245,13 @@ define float @f32_two_step(float %x) #2 { ; AVX512-LABEL: f32_two_step: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -242,21 +278,21 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { ; ; BTVER2-LABEL: v4f32_no_estimate: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v4f32_no_estimate: ; SANDY: # BB#0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v4f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 -; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: @@ -266,9 +302,9 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { ; ; AVX512-LABEL: v4f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss 
{{.*}}(%rip), %xmm1 -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50] +; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -304,31 +340,31 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; ; BTVER2-LABEL: v4f32_one_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v4f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # 
sched: [5:1.00] ; ; HASWELL-LABEL: v4f32_one_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step: ; HASWELL-NO-FMA: # BB#0: @@ -342,18 +378,18 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; ; KNL-LABEL: v4f32_one_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v4f32_one_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -403,42 +439,42 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; ; BTVER2-LABEL: v4f32_two_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; 
BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v4f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v4f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 +; HASWELL-NEXT: vrcpps 
%xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step: ; HASWELL-NO-FMA: # BB#0: @@ -456,25 +492,25 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; ; KNL-LABEL: v4f32_two_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vmovaps %xmm1, %xmm3 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v4f32_two_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; SKX-NEXT: vmovaps %xmm1, %xmm3 +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -504,21 +540,21 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { ; ; BTVER2-LABEL: v8f32_no_estimate: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vdivps %ymm0, %ymm1, 
%ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_no_estimate: ; SANDY: # BB#0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 -; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: @@ -528,9 +564,9 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { ; ; AVX512-LABEL: v8f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00] +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -573,31 +609,31 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; ; BTVER2-LABEL: v8f32_one_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps 
%ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_one_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: retq 
+; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step: ; HASWELL-NO-FMA: # BB#0: @@ -611,18 +647,18 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; ; KNL-LABEL: v8f32_one_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_one_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -685,42 +721,42 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; ; BTVER2-LABEL: v8f32_two_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] 
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, 
%ymm0, %ymm3 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step: ; HASWELL-NO-FMA: # BB#0: @@ -738,25 +774,25 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; ; KNL-LABEL: v8f32_two_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vmovaps %ymm1, %ymm3 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_two_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; SKX-NEXT: vmovaps %ymm1, %ymm3 +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll index 730d2f130388..440a6f0bef13 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX ; It's the extra tests coverage for recip as discussed on D26855. @@ -32,33 +32,33 @@ define float @f32_no_step_2(float %x) #3 { ; ; BTVER2-LABEL: f32_no_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: f32_no_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: f32_no_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_no_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; 
HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; AVX512-LABEL: f32_no_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 1234.0, %x ret float %div } @@ -97,52 +97,52 @@ define float @f32_one_step_2(float %x) #1 { ; ; BTVER2-LABEL: f32_one_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovss 
{{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; AVX512-NEXT: vfmadd132ss 
%xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 3456.0, %x ret float %div } @@ -184,57 +184,57 @@ define float @f32_one_step_2_divs(float %x) #1 { ; ; BTVER2-LABEL: f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: 
[3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: 
[1:1.00] ; ; AVX512-LABEL: f32_one_step_2_divs: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 3456.0, %x %div2 = fdiv fast float %div, %x ret float %div2 @@ -288,72 +288,72 @@ define float @f32_two_step_2(float %x) #2 { ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: f32_two_step_2: ; SANDY: # BB#0: -; 
SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 ; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; 
HASWELL-NO-FMA-LABEL: f32_two_step_2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 ; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 ; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 ; AVX512-NEXT: 
vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast float 6789.0, %x ret float %div } @@ -392,62 +392,62 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: 
vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v4f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] ; 
KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v4f32_one_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -489,68 +489,68 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, 
%xmm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v4f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v4f32_one_step_2_divs: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 -; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> , %x %div2 = fdiv fast <4 x float> %div, %x ret <4 x float> %div2 @@ -604,84 +604,84 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 +; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 -; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: retq +; 
HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v4f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; KNL-NEXT: vmovaps %xmm1, %xmm3 +; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 -; SKX-NEXT: vmovaps %xmm1, %xmm3 +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 ; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: retq 
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -728,62 +728,62 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v8f32_one_step2: ; KNL: # BB#0: -; 
KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_one_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -834,68 +834,68 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:1.00] +; 
BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] +; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps 
%ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v8f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] +; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_one_step_2_divs: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 -; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] +; SKX-NEXT: vmulps 
%ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x %div2 = fdiv fast <8 x float> %div, %x ret <8 x float> %div2 @@ -963,84 +963,84 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] -; SANDY-NEXT: 
vsubps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; 
HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 -; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v8f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; KNL-NEXT: vmovaps %ymm1, %ymm3 +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] +; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; 
KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 -; SKX-NEXT: vmovaps %ymm1, %ymm3 +; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 ; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -1064,33 +1064,33 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { ; ; BTVER2-LABEL: v8f32_no_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_no_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_no_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v8f32_no_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_no_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> , %x ret <8 
x float> %div } @@ -1118,39 +1118,39 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; ; BTVER2-LABEL: v8f32_no_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_no_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: retq +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] ; ; KNL-LABEL: v8f32_no_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq +; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; KNL-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [1:1.00] %div = fdiv fast 
<8 x float> , %x ret <8 x float> %div } -- 2.34.1