From df323ba445f7fc4d29def8950e80dec6ba487961 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Thu, 29 Apr 2021 18:51:34 +0200
Subject: [PATCH] Revert "[X86] Support AMX fast register allocation"

This reverts commit 3b8ec86fd576b9808dc63da620d9a4f7bbe04372.

Revert "[X86] Refine AMX fast register allocation"

This reverts commit c3f95e9197643b699b891ca416ce7d72cf89f5fc.

This pass breaks using LLVM in a multi-threaded environment by
introducing global state.
---
 llvm/include/llvm/CodeGen/Passes.h                 |    3 -
 llvm/include/llvm/CodeGen/TargetPassConfig.h       |    4 -
 llvm/lib/CodeGen/TargetPassConfig.cpp              |    4 -
 llvm/lib/Target/X86/CMakeLists.txt                 |    2 -
 llvm/lib/Target/X86/X86.h                          |    5 -
 llvm/lib/Target/X86/X86FastTileConfig.cpp          |  306 --
 llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp      |   13 +-
 llvm/lib/Target/X86/X86LowerAMXType.cpp            |  362 +-
 llvm/lib/Target/X86/X86PreAMXConfig.cpp            |  422 --
 llvm/lib/Target/X86/X86TargetMachine.cpp           |   11 -
 llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll      | 4559 --------------------
 .../test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll |   78 -
 .../CodeGen/X86/AMX/amx-configO2toO0-precfg.ll     |  210 -
 llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll      |  513 ---
 llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir |  465 --
 .../X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll   |    2 +-
 llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll    |    2 +-
 llvm/test/CodeGen/X86/O0-pipeline.ll               |    2 -
 llvm/tools/opt/opt.cpp                             |   36 +-
 .../gn/secondary/llvm/lib/Target/X86/BUILD.gn      |    1 -
 20 files changed, 40 insertions(+), 6960 deletions(-)
 delete mode 100644 llvm/lib/Target/X86/X86FastTileConfig.cpp
 delete mode 100644 llvm/lib/Target/X86/X86PreAMXConfig.cpp
 delete mode 100644 llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
 delete mode 100644 llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
 delete mode 100644 llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
 delete mode 100644 llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
 delete mode 100644 llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index cc911ab..c01f529 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -501,9 +501,6 @@ namespace llvm {
   /// or split the data to two <128 x i32>.
   FunctionPass *createX86LowerAMXTypePass();
 
-  /// The pass inserts tile config intrinsics for AMX fast register allocation.
-  FunctionPass *createX86PreAMXConfigPass();
-
   /// The pass transforms amx intrinsics to scalar operations if the function
   /// has the optnone attribute or it is O0.
   FunctionPass *createX86LowerAMXIntrinsicsPass();

diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index 1113803..1511045 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -406,10 +406,6 @@ protected:
     return false;
   }
 
-  /// addPostFastRegAllocRewrite - Add passes to the optimized register
-  /// allocation pipeline after fast register allocation is complete.
-  virtual bool addPostFastRegAllocRewrite() { return false; }
-
   /// Add passes to be run immediately after virtual registers are rewritten
   /// to physical registers.
  virtual void addPostRewrite() { }

diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 22906c5..6e80235 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1316,10 +1316,6 @@ bool TargetPassConfig::addRegAssignAndRewriteFast() {
     report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
 
   addPass(createRegAllocPass(false));
-
-  // Allow targets to change the register assignments after
-  // fast register allocation.
-  addPostFastRegAllocRewrite();
   return true;
 }

diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index a2816f6..09ffc2e 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -34,10 +34,8 @@ set(sources
   X86DiscriminateMemOps.cpp
   X86LowerTileCopy.cpp
   X86LowerAMXType.cpp
-  X86PreAMXConfig.cpp
   X86LowerAMXIntrinsics.cpp
   X86TileConfig.cpp
-  X86FastTileConfig.cpp
   X86PreTileConfig.cpp
   X86ExpandPseudo.cpp
   X86FastISel.cpp

diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index eba5b6c..0240dc7 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -79,9 +79,6 @@ FunctionPass *createX86WinAllocaExpander();
 /// Return a pass that configures the tile registers.
 FunctionPass *createX86TileConfigPass();
 
-/// Return a pass that configures the tile registers after fast reg allocation.
-FunctionPass *createX86FastTileConfigPass();
-
 /// Return a pass that inserts the pseudo tile config instruction.
 FunctionPass *createX86PreTileConfigPass();
 
@@ -175,10 +172,8 @@ void initializeX86PartialReductionPass(PassRegistry &);
 void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
 void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
 void initializeX86PreTileConfigPass(PassRegistry &);
-void initializeX86FastTileConfigPass(PassRegistry &);
 void initializeX86TileConfigPass(PassRegistry &);
 void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
-void initializeX86PreAMXConfigPassPass(PassRegistry &);
 void initializeX86LowerTileCopyPass(PassRegistry &);
 void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);

diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
deleted file mode 100644
index 6fad91c..0000000
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-//===-- X86FastTileConfig.cpp - Fast Tile Register Configure---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Pass to config the shape of AMX physical registers
-/// AMX registers need to be configured before use. The ldtilecfg instruction
-/// is inserted before the fast register allocation pass, but at that point we
-/// don't yet know the shape of each physical tile register, because register
-/// allocation has not been done. This pass runs after the register allocation
-/// pass. It collects the shape information of each physical tile register and
-/// stores the shape in the stack slot that is allocated for loading the
-/// config into the tile config register.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86MachineFunctionInfo.h"
-#include "X86RegisterInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "fasttileconfig"
-
-namespace {
-
-class X86FastTileConfig : public MachineFunctionPass {
-  // context
-  MachineFunction *MF = nullptr;
-  const X86Subtarget *ST = nullptr;
-  const TargetRegisterInfo *TRI = nullptr;
-  const TargetInstrInfo *TII = nullptr;
-  MachineRegisterInfo *MRI = nullptr;
-
-  MachineInstr *getTileConfigPoint();
-  void tileConfig();
-
-public:
-  X86FastTileConfig() : MachineFunctionPass(ID) {}
-
-  bool fastTileConfig();
-  bool isTileLoad(MachineInstr &MI);
-  bool isTileStore(MachineInstr &MI);
-  bool isAMXInstr(MachineInstr &MI);
-  void getTileStoreShape(MachineInstr &MI,
-                         SmallVector<MachineOperand *> &ShapedTiles);
-
-  MachineInstr *getKeyAMXInstr(MachineInstr *MI);
-  void getTileShapesCfg(MachineInstr *MI,
-                        SmallVector<MachineOperand *> &ShapedTiles);
-  void getShapeCfgInstrs(MachineInstr *MI,
-                         std::map<unsigned, MachineInstr *> &RowCfgs,
-                         std::map<unsigned, MachineInstr *> &ColCfgs);
-
-  /// Return the pass name.
-  StringRef getPassName() const override {
-    return "Fast Tile Register Configure";
-  }
-
-  void materializeTileCfg(MachineInstr *MI);
-
-  void rewriteTileCfg(SmallVector<MachineOperand *> &ShapedTiles,
-                      std::map<unsigned, MachineInstr *> &RowCfgs,
-                      std::map<unsigned, MachineInstr *> &ColCfgs);
-
-  /// Perform tile register configuration.
-  bool runOnMachineFunction(MachineFunction &MFunc) override;
-
-  MachineFunctionProperties getRequiredProperties() const override {
-    return MachineFunctionProperties().set(
-        MachineFunctionProperties::Property::NoPHIs);
-  }
-
-  static char ID;
-};
-
-} // end anonymous namespace
-
-char X86FastTileConfig::ID = 0;
-
-INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
-                      "Fast Tile Register Configure", false, false)
-INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
-                    "Fast Tile Register Configure", false, false)
-
-static bool isTilePhysReg(MachineOperand &Op) {
-  if (!Op.isReg())
-    return false;
-
-  Register Reg = Op.getReg();
-  if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
-    return true;
-  return false;
-}
-
-static unsigned getTilePhysRegIdx(MachineOperand *Op) {
-  assert(isTilePhysReg(*Op) && "Tile Operand is invalid");
-  return Op->getReg() - X86::TMM0;
-}
-
-static inline void adjustRowCfg(unsigned TIdx, MachineInstr *MI) {
-  unsigned Offset = 48 + TIdx;
-  MI->getOperand(3).ChangeToImmediate(Offset);
-}
-
-static inline void adjustColCfg(unsigned TIdx, MachineInstr *MI) {
-  unsigned Offset = 16 + TIdx * 2;
-  MI->getOperand(3).ChangeToImmediate(Offset);
-}
-
-bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
-  return MI.getOpcode() == X86::PTILELOADDV;
-}
-bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
-  return MI.getOpcode() == X86::PTILESTOREDV;
-}
-bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) {
-  // TODO: May need to handle some special non-tile AMX instructions.
-  if (MI.getOpcode() == X86::LDTILECFG || MI.isDebugInstr())
-    return false;
-
-  for (MachineOperand &MO : MI.operands())
-    if (isTilePhysReg(MO))
-      return true;
-
-  return false;
-}
-
-MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) {
-  auto Cfg = MachineBasicBlock::iterator(MI);
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineInstr *KeyMI = nullptr;
-  int KeyAMXNum = 0;
-
-  for (auto II = Cfg; II != MBB->end(); II++) {
-    if (isTileLoad(*II)) {
-      KeyMI = &*II;
-      continue;
-    }
-
-    if (isTileStore(*II)) {
-      assert(KeyMI && "Key AMX should be found before!");
-      break;
-    }
-
-    if (isAMXInstr(*II)) {
-      assert((KeyAMXNum == 0) && "Too many key AMX instructions!");
-      KeyAMXNum++;
-      KeyMI = &*II;
-    }
-  }
-  assert(KeyMI && "There must be an AMX instruction.");
-  return KeyMI;
-}
-
-// Orderly get the tiles in the key AMX instruction, uses before defs.
-void X86FastTileConfig::getTileShapesCfg(
-    MachineInstr *CfgMI, SmallVector<MachineOperand *> &ShapedTiles) {
-  MachineInstr *KeyMI = getKeyAMXInstr(CfgMI);
-
-  SmallVector<MachineOperand *> DefTiles;
-  for (MachineOperand &MO : KeyMI->operands()) {
-    if (!isTilePhysReg(MO))
-      continue;
-    if (MO.isDef())
-      DefTiles.push_back(&MO);
-    else
-      ShapedTiles.push_back(&MO);
-  }
-  ShapedTiles.append(DefTiles);
-}
-
-// We pre-config the shapes at positions named "amx.tmm.N.shape.row*" and
-// "amx.tmm.N.shape.col*" in the "Pre AMX Tile Config" pass.
-// The 'N' implies the order of tiles in the key AMX intrinsic.
-void X86FastTileConfig::getShapeCfgInstrs(
-    MachineInstr *MI, std::map<unsigned, MachineInstr *> &RowCfgs,
-    std::map<unsigned, MachineInstr *> &ColCfgs) {
-  auto Cfg = MachineBasicBlock::iterator(MI);
-  MachineBasicBlock *MBB = MI->getParent();
-
-  for (auto II = Cfg; II != MBB->begin(); II--) {
-    if (isAMXInstr(*II) || II->isTerminator() || II->isCall())
-      break;
-    if (!II->mayStore() || !II->hasOneMemOperand())
-      continue;
-    const Value *MemPtr = II->memoperands()[0]->getValue();
-    if (!MemPtr)
-      continue;
-
-    StringRef Name = MemPtr->getName();
-    if (!Name.startswith("amx.tmm."))
-      continue;
-
-    // Get the 'N'th tile shape config in the key AMX instruction.
-    auto N = Name.find(".shape");
-    StringRef STileIdx = Name.slice(8, N);
-    unsigned Idx;
-    STileIdx.getAsInteger(10, Idx);
-
-    // And relate them with their store instructions.
-    if (Name.contains("row"))
-      RowCfgs[Idx] = &*II;
-    else if (Name.contains("col"))
-      ColCfgs[Idx] = &*II;
-    else
-      llvm_unreachable("Invalid tile shape info!");
-  }
-  assert((RowCfgs.size() == ColCfgs.size()) &&
-         "The number of tile rows and cols must be equal!");
-}
-
-// Here is the data format for the tile config.
-// 0      palette = 1 now.
-// 1      start_row = 0 now.
-// 2-15   reserved, must be zero
-// 16-17  tile0.colsb Tile 0 bytes per row.
-// 18-19  tile1.colsb Tile 1 bytes per row.
-// 20-21  tile2.colsb Tile 2 bytes per row.
-// ... (sequence continues)
-// 30-31  tile7.colsb Tile 7 bytes per row.
-// 32-47  reserved, must be zero
-// 48     tile0.rows Tile 0 rows.
-// 49     tile1.rows Tile 1 rows.
-// 50     tile2.rows Tile 2 rows.
-// ... (sequence continues)
-// 55     tile7.rows Tile 7 rows.
-// 56-63  reserved, must be zero
-void X86FastTileConfig::rewriteTileCfg(
-    SmallVector<MachineOperand *> &ShapedTiles,
-    std::map<unsigned, MachineInstr *> &RowCfgs,
-    std::map<unsigned, MachineInstr *> &ColCfgs) {
-  assert((RowCfgs.size() == ShapedTiles.size()) &&
-         "The number of tile shapes is not equal to the number of tiles!");
-
-  // Orderly get the tiles and adjust the shape config.
-  for (unsigned I = 0, E = ShapedTiles.size(); I < E; I++) {
-    MachineOperand *MO = ShapedTiles[I];
-    unsigned TmmIdx = getTilePhysRegIdx(MO);
-    if (I == TmmIdx)
-      continue;
-    adjustRowCfg(TmmIdx, RowCfgs[I]);
-    adjustColCfg(TmmIdx, ColCfgs[I]);
-  }
-}
-
-// We have already pre-configured the shapes before fast register allocation
-// at X86PreAMXConfig::preWriteTileCfg(). Now that fast register allocation is
-// done, the shapes pre-written before may not correspond to the correct tmm
-// registers anymore, so we need to adjust them.
-void X86FastTileConfig::materializeTileCfg(MachineInstr *CfgMI) {
-  SmallVector<MachineOperand *> ShapedTiles;
-  std::map<unsigned, MachineInstr *> RowCfgs;
-  std::map<unsigned, MachineInstr *> ColCfgs;
-
-  // Orderly keep the tile uses and defs in ShapedTiles.
-  getTileShapesCfg(CfgMI, ShapedTiles);
-  assert(ShapedTiles.size() && "Shape config not found!");
-
-  getShapeCfgInstrs(CfgMI, RowCfgs, ColCfgs);
-
-  rewriteTileCfg(ShapedTiles, RowCfgs, ColCfgs);
-}
-
-bool X86FastTileConfig::fastTileConfig() {
-  bool Changed = false;
-
-  for (MachineBasicBlock &MBB : *MF) {
-    SmallVector<MachineInstr *> CFGs;
-    for (MachineInstr &MI : MBB)
-      if (MI.getOpcode() == X86::LDTILECFG)
-        CFGs.push_back(&MI);
-    for (auto *MI : CFGs)
-      materializeTileCfg(MI);
-    if (!CFGs.empty())
-      Changed = true;
-  }
-  return Changed;
-}
-
-bool X86FastTileConfig::runOnMachineFunction(MachineFunction &MFunc) {
-  MF = &MFunc;
-  MRI = &MFunc.getRegInfo();
-  ST = &MFunc.getSubtarget<X86Subtarget>();
-  TRI = ST->getRegisterInfo();
-  TII = MFunc.getSubtarget().getInstrInfo();
-
-  return fastTileConfig();
-}
-
-FunctionPass *llvm::createX86FastTileConfigPass() {
-  return new X86FastTileConfig();
-}
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
index 248069f..f561c84 100644
--- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -34,7 +34,6 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@@ -53,10 +52,6 @@ static bool isV256I32Ty(Type *Ty) {
 }
 #endif
 
-static cl::opt<bool>
-    X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), cl::Hidden,
-                    cl::desc("X86: enable AMX scalarization."));
-
 namespace {
 class X86LowerAMXIntrinsics {
   Function &Func;
@@ -98,7 +93,6 @@ private:
   lowerTileDP(Instruction *TileDP);
   bool lowerTileZero(Instruction *TileZero);
 };
-} // anonymous namespace
 
 BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader,
                                               BasicBlock *Exit, Value *Bound,
@@ -630,6 +624,9 @@ bool X86LowerAMXIntrinsics::visit() {
   return C;
 }
 
+} // anonymous namespace
+
+namespace {
 
 class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass {
 public:
@@ -641,8 +638,6 @@ public:
   }
 
   bool runOnFunction(Function &F) override {
-    if (!X86ScalarizeAMX)
-      return false;
     TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
     if (!F.hasFnAttribute(Attribute::OptimizeNone) &&
         TM->getOptLevel() != CodeGenOpt::None)
@@ -666,6 +661,8 @@ public:
   }
 };
 
+} // anonymous namespace
+
 static const char PassName[] = "Lower AMX intrinsics";
 char X86LowerAMXIntrinsicsLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName,
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index cbf578a..2150a9d 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -1,4 +1,4 @@
-//===- Target/X86/X86LowerAMXType.cpp - -------------------------*- C++ -*-===//
+//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -14,27 +14,6 @@
 /// load/store <256 x i32> instruction to AMX load/store. If the bitcast can
 /// not be combined with load/store, we transform the bitcast to amx load/store
 /// and <256 x i32> store/load.
-///
-/// If the front end does not use O0 but the mid/back end uses O0 (e.g. "clang
-/// -O2 -S -emit-llvm t.c" + "llc t.ll"), we should make sure the amx data is
-/// volatile, because that is necessary for AMX fast register allocation. (In
-/// fast register allocation, registers are allocated before spill/reload, so
-/// there is no additional register for amx to identify the step in spill.)
-/// The volatileTileData() will handle this case.
-/// e.g.
-/// ----------------------------------------------------------
-/// | def %td = ...                                          |
-/// | ...                                                    |
-/// | "use %td"                                              |
-/// ----------------------------------------------------------
-/// will transfer to -->
-/// ----------------------------------------------------------
-/// | def %td = ...                                          |
-/// | call void @llvm.x86.tilestored64.internal(mem, %td)    |
-/// | ...                                                    |
-/// | %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)|
-/// | "use %td2"                                             |
-/// ----------------------------------------------------------
 //
 //===----------------------------------------------------------------------===//
 //
@@ -62,13 +41,7 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "lower-amx-type"
 
-// In AMX intrinsics we let Shape = {Row, Col}, but the
-// RealCol = Col / ElementSize. We may use the RealCol
-// as a new Row for other newly created AMX intrinsics.
-static std::map<Value *, Value *> Col2Row;
-
-static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
-                                           BasicBlock *BB) {
+static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
   Function &F = *BB->getParent();
   Module *M = BB->getModule();
   const DataLayout &DL = M->getDataLayout();
@@ -83,36 +56,7 @@ static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder,
   return AllocaRes;
 }
 
-static Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity) {
-  if (Col2Row.count(V))
-    return Col2Row[V];
-  IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt());
-  if (auto *I = dyn_cast<Instruction>(V)) {
-    BasicBlock::iterator Iter = I->getIterator();
-    ++Iter;
-    Builder.SetInsertPoint(&*Iter);
-  }
-  ConstantInt *Gran = Builder.getInt16(Granularity);
-  Value *RealRow = Builder.CreateUDiv(V, Gran);
-  Col2Row[V] = RealRow;
-  return RealRow;
-}
-
-namespace {
-class X86LowerAMXType {
-  Function &Func;
-  TargetMachine *TM = nullptr;
-
-public:
-  X86LowerAMXType(Function &F, TargetMachine *TargetM) : Func(F), TM(TargetM) {}
-  bool visit();
-  void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
-  void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
-  bool transformBitcast(BitCastInst *Bitcast);
-  std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
-};
-
-std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II,
-                                                      unsigned OpNo) {
+static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
   Value *Row = nullptr, *Col = nullptr;
   switch (II->getIntrinsicID()) {
   default:
@@ -141,13 +85,6 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II, unsigne
     break;
   case 5:
     Row = II->getArgOperand(2);
-    // FIXME: There is a design bug for AMX shape, where the Col should be
-    // Col/4 if it will be used as a Row, but the current greedy RA can't
-    // handle this case well; it may fail if we generate a new shape
-    // definition. So let's just do it at O0 first.
-    // Row = Row / 4
-    if (TM->getOptLevel() == CodeGenOpt::None)
-      Row = getRowFromCol(II, Row, 4);
     Col = II->getArgOperand(1);
     break;
   }
@@ -163,7 +100,7 @@ std::pair<Value *, Value *> X86LowerAMXType::getShape(IntrinsicInst *II, unsigne
 // -->
 // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
 //                                                  i8* %addr, i64 %stride64)
-void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
   Value *Row = nullptr, *Col = nullptr;
   Use &U = *(Bitcast->use_begin());
   unsigned OpNo = U.getOperandNo();
@@ -188,7 +125,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
 // -->
 // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
 //                                           %stride64, %13)
-void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
 
   Value *Tile = Bitcast->getOperand(0);
   auto *II = cast<IntrinsicInst>(Tile);
@@ -220,14 +157,14 @@ void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST)
 }
 
 // transform bitcast to instructions.
-bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
+static bool transformBitcast(BitCastInst *Bitcast) {
   IRBuilder<> Builder(Bitcast);
   AllocaInst *AllocaAddr;
   Value *I8Ptr, *Stride;
   auto *Src = Bitcast->getOperand(0);
 
   auto Prepare = [&]() {
-    AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent());
+    AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
     I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
     Stride = Builder.getInt64(64);
   };
@@ -278,9 +215,17 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
   return true;
 }
 
+namespace {
+class X86LowerAMXType {
+  Function &Func;
+
+public:
+  X86LowerAMXType(Function &F) : Func(F) {}
+  bool visit();
+};
+
 bool X86LowerAMXType::visit() {
   SmallVector<Instruction *, 8> DeadInsts;
-  Col2Row.clear();
 
   for (BasicBlock *BB : post_order(&Func)) {
     for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
@@ -377,260 +322,6 @@ bool X86LowerAMXType::visit() {
   }
 }
 } // anonymous namespace
 
-static Value *getAllocaPos(BasicBlock *BB) {
-  Module *M = BB->getModule();
-  Function *F = BB->getParent();
-  IRBuilder<> Builder(&F->getEntryBlock().front());
-  const DataLayout &DL = M->getDataLayout();
-  unsigned AllocaAS = DL.getAllocaAddrSpace();
-  Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
-  AllocaInst *AllocaRes =
-      new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front());
-  BasicBlock::iterator Iter = AllocaRes->getIterator();
-  ++Iter;
-  Builder.SetInsertPoint(&*Iter);
-  Value *I8Ptr = Builder.CreateBitCast(AllocaRes, Builder.getInt8PtrTy());
-  return I8Ptr;
-}
-
-static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
-  assert(TileDef->getType()->isX86_AMXTy() && "Not a tile definition!");
-  auto *II = cast<IntrinsicInst>(TileDef);
-  assert(II && "Not a tile intrinsic!");
-  Value *Row = II->getOperand(0);
-  Value *Col = II->getOperand(1);
-
-  BasicBlock *BB = TileDef->getParent();
-  BasicBlock::iterator Iter = TileDef->getIterator();
-  IRBuilder<> Builder(BB, ++Iter);
-  Value *Stride = Builder.getInt64(64);
-  std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};
-
-  Instruction *TileStore =
-      Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
-  return TileStore;
-}
-
-static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
-  Value *V = U.get();
-  assert(V->getType()->isX86_AMXTy() && "Not a tile definition!");
-
-  // Get tile shape.
-  IntrinsicInst *II = nullptr;
-  if (IsPHI) {
-    Value *PhiOp = dyn_cast<PHINode>(V)->getIncomingValue(0);
-    II = cast<IntrinsicInst>(PhiOp);
-  } else {
-    II = cast<IntrinsicInst>(V);
-  }
-  Value *Row = II->getOperand(0);
-  Value *Col = II->getOperand(1);
-
-  Instruction *UserI = dyn_cast<Instruction>(U.getUser());
-  IRBuilder<> Builder(UserI);
-  Value *Stride = Builder.getInt64(64);
-  std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};
-
-  Value *TileLoad =
-      Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
-  UserI->replaceUsesOfWith(V, TileLoad);
-}
-
-static bool isIncomingOfPHI(Instruction *I) {
-  for (Use &U : I->uses()) {
-    User *V = U.getUser();
-    if (isa<PHINode>(V))
-      return true;
-  }
-  return false;
-}
-
-// Let all AMX tile data become volatile data, to shorten the life range
-// of each tile register before fast register allocation.
-namespace {
-class X86VolatileTileData {
-  Function &F;
-
-public:
-  X86VolatileTileData(Function &Func) : F(Func) {}
-  Value *updatePhiIncomings(BasicBlock *BB,
-                            SmallVector<Instruction *, 2> &Incomings);
-  void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr);
-  bool volatileTileData();
-  void volatileTilePHI(PHINode *Inst);
-  void volatileTileNonPHI(Instruction *I);
-};
-
-Value *X86VolatileTileData::updatePhiIncomings(
-    BasicBlock *BB, SmallVector<Instruction *, 2> &Incomings) {
-  Value *I8Ptr = getAllocaPos(BB);
-
-  for (auto *I : Incomings) {
-    User *Store = createTileStore(I, I8Ptr);
-
-    // All its uses (except the phi) should load from the stored mem.
-    for (Use &U : I->uses()) {
-      User *V = U.getUser();
-      if (isa<PHINode>(V) || V == Store)
-        continue;
-      replaceWithTileLoad(U, I8Ptr);
-    }
-  }
-  return I8Ptr;
-}
-
-void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
-                                                Value *StorePtr) {
-  for (Use &U : PHI->uses())
-    replaceWithTileLoad(U, StorePtr, true);
-  PHI->eraseFromParent();
-}
-
-// Similar to volatileTileNonPHI, this function only handles PHI nodes
-// and their related AMX intrinsics.
-// 1) The PHI def should change to a tileload.
-// 2) The PHI incoming values should be tilestored just after their defs.
-// 3) The mem of these tileloads and tilestores should be the same.
-// e.g.
-// ------------------------------------------------------
-// bb_dom:
-//   ...
-//   br i1 %bool.cond, label %if.else, label %if.then
-//
-// if.then:
-//   def %t0 = ...
-//   ...
-//   use %t0
-//   ...
-//   br label %if.end
-//
-// if.else:
-//   def %t1 = ...
-//   br label %if.end
-//
-// if.end:
-//   %td = phi x86_amx [ %t1, %if.else ], [ %t0, %if.then ]
-//   ...
-//   use %td
-// ------------------------------------------------------
-// -->
-// ------------------------------------------------------
-// bb_entry:
-//   %mem = alloca <256 x i32>, align 1024                  *
-//   ...
-// bb_dom:
-//   ...
-//   br i1 %bool.cond, label %if.else, label %if.then
-//
-// if.then:
-//   def %t0 = ...
-//   call void @llvm.x86.tilestored64.internal(mem, %t0)    *
-//   ...
-//   %t0` = call x86_amx @llvm.x86.tileloadd64.internal(mem)*
-//   use %t0`                                               *
-//   ...
-//   br label %if.end
-//
-// if.else:
-//   def %t1 = ...
-//   call void @llvm.x86.tilestored64.internal(mem, %t1)    *
-//   br label %if.end
-//
-// if.end:
-//   ...
-//   %td = call x86_amx @llvm.x86.tileloadd64.internal(mem) *
-//   use %td
-// ------------------------------------------------------
-void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
-  BasicBlock *BB = PHI->getParent();
-  SmallVector<Instruction *, 2> Incomings;
-
-  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
-    Value *Op = PHI->getIncomingValue(I);
-    Instruction *Inst = dyn_cast<Instruction>(Op);
-    assert(Inst && "We shouldn't fold AMX instruction!");
-    Incomings.push_back(Inst);
-  }
-
-  Value *StorePtr = updatePhiIncomings(BB, Incomings);
-  replacePhiDefWithLoad(PHI, StorePtr);
-}
-
-// Store the defined tile and load it before use.
-// All its users are not PHI.
-// e.g.
-// ------------------------------------------------------
-// def %td = ...
-// ...
-// "use %td"
-// ------------------------------------------------------
-// -->
-// ------------------------------------------------------
-// def %td = ...
-// call void @llvm.x86.tilestored64.internal(mem, %td)
-// ...
-// %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)
-// "use %td2"
-// ------------------------------------------------------
-void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
-  BasicBlock *BB = I->getParent();
-  Value *I8Ptr = getAllocaPos(BB);
-  User *Store = createTileStore(I, I8Ptr);
-
-  // All its uses should load from the stored mem.
-  for (Use &U : I->uses()) {
-    User *V = U.getUser();
-    assert(!isa<PHINode>(V) && "PHI nodes should be excluded!");
-    if (V != Store)
-      replaceWithTileLoad(U, I8Ptr);
-  }
-}
-
-// Volatile Tile Model:
-// 1) All the uses of tile data come from tileloads in time.
-// 2) All the defs of tile data are tilestored into mem immediately.
-// For example:
-// --------------------------------------------------------------------------
-// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...)          key
-// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
-// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...)          amx
-// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
-// call void @llvm.x86.tilestored64.internal(... td)                     area
-// --------------------------------------------------------------------------
-// 3) No terminator, call or other amx instructions in the key amx area.
-bool X86VolatileTileData::volatileTileData() {
-  bool Changed = false;
-  for (BasicBlock &BB : F) {
-    SmallVector<Instruction *, 8> PHIInsts;
-    SmallVector<Instruction *, 8> AMXDefInsts;
-
-    for (Instruction &I : BB) {
-      if (!I.getType()->isX86_AMXTy())
-        continue;
-      if (isa<PHINode>(&I))
-        PHIInsts.push_back(&I);
-      else
-        AMXDefInsts.push_back(&I);
-    }
-
-    // First we "volatile" the non-phi-related amx intrinsics.
-    for (Instruction *I : AMXDefInsts) {
-      if (isIncomingOfPHI(I))
-        continue;
-      volatileTileNonPHI(I);
-      Changed = true;
-    }
-
-    for (Instruction *I : PHIInsts) {
-      volatileTilePHI(dyn_cast<PHINode>(I));
-      Changed = true;
-    }
-  }
-  return Changed;
-}
-
-} // anonymous namespace
-
 namespace {
 
 class X86LowerAMXTypeLegacyPass : public FunctionPass {
@@ -643,24 +334,11 @@ public:
 
   bool runOnFunction(Function &F) override {
     TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-
-    X86LowerAMXType LAT(F, TM);
+    if (F.hasFnAttribute(Attribute::OptimizeNone) ||
+        TM->getOptLevel() == CodeGenOpt::None)
+      return false;
+    X86LowerAMXType LAT(F);
     bool C = LAT.visit();
-
-    // Prepare for fast register allocation at O0.
-    // Todo: It may be better to check the volatile model of the AMX code, not
-    // just Attribute::OptimizeNone and CodeGenOpt::None.
-    if (TM->getOptLevel() == CodeGenOpt::None) {
-      // If the front end does not use O0 but the mid/back end uses O0 (e.g.
-      // "clang -O2 -S -emit-llvm t.c" + "llc t.ll"), we should make
-      // sure the amx data is volatile; that is necessary for AMX fast
-      // register allocation.
-      if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
-        X86VolatileTileData VTD(F);
-        C = VTD.volatileTileData() || C;
-      }
-    }
-
     return C;
   }
 
diff --git a/llvm/lib/Target/X86/X86PreAMXConfig.cpp b/llvm/lib/Target/X86/X86PreAMXConfig.cpp
deleted file mode 100644
index 243f2ed..0000000
--- a/llvm/lib/Target/X86/X86PreAMXConfig.cpp
+++ /dev/null
@@ -1,422 +0,0 @@
-//===- Target/X86/X86PreAMXConfig.cpp - ------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// Insert tilecfg for each area of key AMX intrinsic.
-/// All the key AMX intrinsic's tile operands must come from tileloads, and
-/// the def tile of the key AMX intrinsic must be tilestored.
-/// Take tdpbssd for example:
-/// --------------------------------------------------------------------------
-/// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(...)                key
-/// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(...)                 |
-/// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(...)                amx
-/// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(t1, t2, t3)         |
-/// call void @llvm.x86.tilestored64.internal(... td)                     area
-/// --------------------------------------------------------------------------
-/// This pass will insert tilecfg before every key-amx-area, like:
-/// --------------------------------------------------------------------------
-/// %cfgmem = alloca <16 x i32>, align 4                        * allocate mem
-/// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem       * zero init
-/// ...
-/// ... pre-config shape of %t1                                 *
-/// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1     *
-/// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2   * pre-config
-/// ...                                                         *
-/// ... pre-config shape of %t2                                 * shapes
-/// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1     *
-/// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2   *
-/// ...
-/// call void @llvm.x86.ldtilecfg(i8* %cfgmem)                  * tile config
-//
-//===----------------------------------------------------------------------===//
-//
-#include "X86.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "pre-amx-config"
-
-static bool isAMXIntrinsic(IntrinsicInst *II) {
-  for (Value *Operand : II->operands())
-    if (Operand->getType()->isX86_AMXTy())
-      return true;
-  return II->getType()->isX86_AMXTy();
-}
-
-static bool isTileLoad(IntrinsicInst *II) {
-  return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
-}
-
-static bool isTileStore(IntrinsicInst *II) {
-  return II->getIntrinsicID() == Intrinsic::x86_tilestored64_internal;
-}
-
-#ifndef NDEBUG
-static bool onlyTileDef(IntrinsicInst *II) {
-  for (Value *Operand : II->operands())
-    if (Operand->getType()->isX86_AMXTy())
-      return false;
-  return II->getType()->isX86_AMXTy();
-}
-
-static bool brokenVolatile(Instruction *I) {
-  // Todo: it is weak to identify a normal call here.
-  if ((isa<CallInst>(I) && !isa<IntrinsicInst>(I)) || I->isTerminator())
-    return true;
-  return false;
-}
-#endif
-
-namespace {
-class X86PreAMXConfig {
-  Function &F;
-
-public:
-  X86PreAMXConfig(Function &Func) : F(Func) {}
-  bool preTileConfig();
-  bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
-  bool findConfigShapes(
-      DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes);
-  bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes);
-  bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
-                       SmallVector<Value *, 8> &Shapes);
-  BasicBlock::iterator
-  getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
-                           SmallVector<Value *, 8> &Shapes);
-  bool checkVolatileModel(SmallSet<Value *, 4> &Loads, IntrinsicInst *Store,
-                          IntrinsicInst *KeyAMX);
-};
-
-// Orderly write the shapes in tilecfg's mem. This may not be right,
-// because the first shape may not correspond to the first tmm register,
-// so we need to handle it at X86FastTileConfig::materializeTileCfg()
-// after register allocation.
-// For example:
-// --------------------------------------------------------------------------
-// zeroinitialize tilecfg's mem (of ldtilecfg)
-// --------------------------------------------------------------------------
-// ... pre-config shape of %t1                                 *
-// %amx.tmm.0.shape.row = getelementptr i8, i8* %mem, i64 48   *
-// %amx.tmm.0.shape.col = getelementptr i16, i16* %mem, i64 16 *
-// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1     *
-// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2   * pre-config
-// ...                                                         *
-// ... pre-config shape of %t2                                 *
-// %amx.tmm.1.shape.row = getelementptr i8, i8* %mem, i64 49   *
-// %amx.tmm.1.shape.col = getelementptr i16, i16* %mem, i64 18 *
-// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1     * shapes
-// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2   *
-// ...                                                         *
-// ... pre-config shape of %t3                                 * of
-// %amx.tmm.2.shape.row = getelementptr i8, i8* %mem, i64 50   *
-// %amx.tmm.2.shape.col = getelementptr i16, i16* %mem, i64 20 *
-// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1     *
-// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2   *
-// ...                                                         * tiles
-// ... pre-config shape of %td                                 *
-// %amx.tmm.3.shape.row = getelementptr i8, i8* %mem, i64 51   *
-// %amx.tmm.3.shape.col = getelementptr i16, i16* %mem, i64 22 *
-// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1     *
-// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2   *
-// --------------------------------------------------------------------------
-// call void @llvm.x86.ldtilecfg(i8* %mem)                     * tile config
-// --------------------------------------------------------------------------
-// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...)          key
-// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
-// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...)          amx
-// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
-// call void @llvm.x86.tilestored64.internal(... td)                     area
-// --------------------------------------------------------------------------
-bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
-                                      SmallVector<Value *, 8> &Shapes) {
-  bool Write = false;
-  LLVMContext &Ctx = Pos->getParent()->getContext();
-  Type *I8Ty = Type::getInt8Ty(Ctx);
-  Type *I16Ty = Type::getInt16Ty(Ctx);
-
-  // TODO: Currently we set Palette = 1 by default; it may be assigned
-  // another value in the future.
-  Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
-  Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
-  Value *PalettePos =
-      GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos);
-  new StoreInst(PaletteValue, PalettePos, Pos);
-
-  for (int I = 0, E = Shapes.size() / 2; I < E; I++) {
-    Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I);
-    Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2);
-    const std::string ShapeName = "amx.tmm." + itostr(I);
-    Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset,
-                                              ShapeName + ".shape.row", Pos);
-    Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos);
-    ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0),
-                             ShapeName + ".shape.col", Pos);
-    Value *Row = Shapes[I * 2];
-    Value *Col = Shapes[I * 2 + 1];
-    Row = new TruncInst(Row, I8Ty, "", Pos);
-    new StoreInst(Row, RowPos, Pos);
-    new StoreInst(Col, ColPos, Pos);
-    Write = true;
-  }
-  return Write;
-}
-
-bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
-                                    SmallVector<Value *, 8> &Shapes) {
-  Module *M = F.getParent();
-  IRBuilder<> Builder(ModelStart);
-  const DataLayout &DL = M->getDataLayout();
-  unsigned AddrSpace = DL.getAllocaAddrSpace();
-  LLVMContext &Ctx = Builder.getContext();
-  Type *V512Ty = VectorType::get(Builder.getInt32Ty(), 16, false);
-  Align Alignment = DL.getPrefTypeAlign(Type::getInt32Ty(Ctx));
-
-  AllocaInst *Addr =
-      new AllocaInst(V512Ty, AddrSpace, "", &F.getEntryBlock().front());
-  Addr->setAlignment(Alignment);
-  Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
-
-  std::array<Value *, 1> Args = {I8Ptr};
-  Instruction *Cfg =
-      Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg, None, Args);
-
-  Value *Val0 = Constant::getNullValue(V512Ty);
-  Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg);
-  assert(Init0 && "Failed to zero-initialize the cfg mem!");
-
-  preWriteTileCfg(I8Ptr, Cfg, Shapes);
-
-  return Init0;
-}
-
-// Todo: We may need to handle the "more than one store" case in the future.
-bool X86PreAMXConfig::checkVolatileModel(SmallSet<Value *, 4> &Loads,
-                                         IntrinsicInst *Store,
-                                         IntrinsicInst *KeyAMX) {
-  Value *ST = Store->getOperand(4);
-
-  // Only has tileload and tilestore.
-  if (!KeyAMX)
-    return (Loads.size() == 1) && Loads.contains(ST);
-
-  // All Loads should be operands of KeyAMX.
-  // All tile operands of KeyAMX should come from Loads.
-  for (Value *Op : KeyAMX->operands()) {
-    if (Op->getType()->isX86_AMXTy())
-      if (!Loads.erase(Op))
-        return false;
-  }
-
-  // The def of KeyAMX should be stored into mem.
-  // Todo: can a key AMX have no def?
-  return Loads.empty() && (ST == cast<Value>(KeyAMX));
-}
-
-bool X86PreAMXConfig::getKeyAMXShapes(IntrinsicInst *KeyAMX,
-                                      SmallVector<Value *, 8> &Shapes) {
-  for (unsigned I = 0; I < KeyAMX->getNumOperands(); I++) {
-    Value *Op = KeyAMX->getOperand(I);
-    if (!Op->getType()->isX86_AMXTy())
-      continue;
-    IntrinsicInst *TileDef = dyn_cast<IntrinsicInst>(Op);
-    assert((TileDef && isTileLoad(TileDef)) &&
-           "All KeyAMX tile definitions should come from TileLoad!");
-    Shapes.push_back(TileDef->getOperand(0));
-    Shapes.push_back(TileDef->getOperand(1));
-  }
-  if (!isTileStore(KeyAMX)) {
-    Shapes.push_back(KeyAMX->getOperand(0));
-    Shapes.push_back(KeyAMX->getOperand(1));
-  }
-  return Shapes.size() != 0;
-}
-
-// Collect the shapes and skip the area of the current key AMX intrinsic.
-//
-// For example:
-// ...
-// --------------------------------------------------------------------------
-// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...)  record (m,k)
-// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)  record (k,n)
-// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...)  record (m,n)
-// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3)
-// call void @llvm.x86.tilestored64.internal(m, n,... td) <--PosEnd record (m,n)
-// --------------------------------------------------------------------------
-BasicBlock::iterator
-X86PreAMXConfig::getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
-                                          SmallVector<Value *, 8> &Shapes) {
-  IntrinsicInst *KeyAMX = nullptr;
-  BasicBlock *BB = Iter->getParent();
-  BasicBlock::iterator PosEnd = BB->end();
-  SmallSet<Value *, 4> Loads;
-
-  // See TileStore as "Config Position End" and check the volatile model.
-  for (auto I = Iter, E = BB->end(); I != E; ++I) {
-    assert(!brokenVolatile(&*I) && "Did not reach tile store!");
-    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
-    if (!II || !isAMXIntrinsic(II))
-      continue;
-
-    if (isTileLoad(II)) {
-      Loads.insert(II);
-    } else if (isTileStore(II)) {
-      if (!checkVolatileModel(Loads, II, KeyAMX))
-        report_fatal_error("Not Volatile AMX Model!");
-      PosEnd = I;
-      break;
-    } else {
-      assert(!KeyAMX && "Too many key AMX intrinsics!");
-      KeyAMX = II;
-    }
-  }
-  assert(PosEnd != BB->end() && "TileStore not found!");
-
-  // See KeyAMX as TileStore if there are only TileLoads and TileStores.
-  if (!KeyAMX)
-    KeyAMX = dyn_cast<IntrinsicInst>(&*PosEnd);
-
-  // Get Shapes in order.
-  assert(Shapes.empty() && "Shapes should be clean.");
-  getKeyAMXShapes(KeyAMX, Shapes);
-
-  return PosEnd;
-}
-
-// Record a key AMX area's shapes with its position.
-// Use the first tileload as its position.
-// For example:
-// ...
-// --------------------------------------------------------------------------
-// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...)   <-- pos
-// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)        /
-// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...)     shapes:
-// %td = call x86_amx @llvm.x86.tdpbssd.internal(...t1, t2, t3)    (m,k)(k,n)
-// call void @llvm.x86.tilestored64.internal(m, n,... td)          (m,n)(m,n)
-// --------------------------------------------------------------------------
-bool X86PreAMXConfig::findConfigShapes(
-    DenseMap<Instruction *, SmallVector<Value *, 8>> &PosAndShapes) {
-  bool Find = false;
-  for (BasicBlock &BB : F) {
-    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
-      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*I);
-      if (!II)
-        continue;
-      if (!isAMXIntrinsic(II))
-        continue;
-      assert(onlyTileDef(II) && "Not volatile model for AMX at O0!");
-
-      I = getShapesAndConfigPosEnd(I, PosAndShapes[&*I]);
-      Find = true;
-    }
-  }
-  return Find;
-}
-
-// Insert ldtilecfg and preconfig the shapes for each area of the key AMX
-// intrinsic.
-// e.g. (key amx = tdpbssd)
-// --------------------------------------------------------------------------
-// %cfgmem = alloca <16 x i32>, align 4                        * allocate mem
-// store <16 x i32> zeroinitializer, <16 x i32>* %cfgmem       * zero init
-// ...
-// ... pre-config shape of %t1                                 *
-// store volatile i8 %m, i8* %amx.tmm.0.shape.row, align 1     *
-// store volatile i16 %k, i16* %amx.tmm.0.shape.col, align 2   * pre-config
-// ...                                                         *
-// ... pre-config shape of %t2                                 *
-// store volatile i8 %k, i8* %amx.tmm.1.shape.row, align 1     * shapes
-// store volatile i16 %n, i16* %amx.tmm.1.shape.col, align 2   *
-// ...                                                         *
-// ... pre-config shape of %t3                                 * of
-// store volatile i8 %m, i8* %amx.tmm.2.shape.row, align 1     *
-// store volatile i16 %n, i16* %amx.tmm.2.shape.col, align 2   *
-// ...                                                         * tiles
-// ... pre-config shape of %td                                 *
-// store volatile i8 %m, i8* %amx.tmm.3.shape.row, align 1     *
-// store volatile i16 %n, i16* %amx.tmm.3.shape.col, align 2   *
-//
-// call void @llvm.x86.ldtilecfg(i8* %cfgmem)                  * pre-config
-// --------------------------------------------------------------------------
-// %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...)          key
-// %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
-// %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...)          amx
-// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
-// call void @llvm.x86.tilestored64.internal(... td)                     area
-// --------------------------------------------------------------------------
-bool X86PreAMXConfig::preTileConfig() {
-  DenseMap<Instruction *, SmallVector<Value *, 8>> PosAndShapes;
-  bool NeedCfg = findConfigShapes(PosAndShapes);
-  if (!NeedCfg)
-    return false;
-  for (auto &IPAndShapes : PosAndShapes)
-    addTileConfig(IPAndShapes.first, IPAndShapes.second);
-
-  return true;
-}
-} // anonymous namespace
-
-namespace {
-
-class X86PreAMXConfigPass : public FunctionPass {
-public:
-  static char ID;
-
-  X86PreAMXConfigPass() : FunctionPass(ID) {
-    initializeX86PreAMXConfigPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override {
-    TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
-    bool C = false;
-
-    // Prepare for fast register allocation at O0.
-    if (TM->getOptLevel() == CodeGenOpt::None) {
-
-      // We pre-config each key AMX intrinsic at O0.
-      // In theory, one tile config can cover several AMX intrinsics, but
-      // it is very difficult to classify the tile shapes at O0. So here we
-      // keep things simple and pre-config every key AMX intrinsic.
-      X86PreAMXConfig PCFG(F);
-      C = PCFG.preTileConfig();
-    }
-
-    return C;
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<TargetPassConfig>();
-  }
-};
-
-} // anonymous namespace
-
-static const char PassName[] = "Pre AMX Tile Config";
-char X86PreAMXConfigPass::ID = 0;
-INITIALIZE_PASS_BEGIN(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(X86PreAMXConfigPass, DEBUG_TYPE, PassName, false, false)
-
-FunctionPass *llvm::createX86PreAMXConfigPass() {
-  return new X86PreAMXConfigPass();
-}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 084376d..ff99186 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -64,7 +64,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeX86LowerAMXIntrinsicsLegacyPassPass(PR);
   initializeX86LowerAMXTypeLegacyPassPass(PR);
-  initializeX86PreAMXConfigPassPass(PR);
   initializeGlobalISel(PR);
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
@@ -75,7 +74,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   initializeX86CallFrameOptimizationPass(PR);
   initializeX86CmovConverterPassPass(PR);
   initializeX86TileConfigPass(PR);
-  initializeX86FastTileConfigPass(PR);
   initializeX86LowerTileCopyPass(PR);
   initializeX86ExpandPseudoPass(PR);
   initializeX86ExecutionDomainFixPass(PR);
@@ -379,7 +377,6 @@ public:
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
   void addPreRegAlloc() override;
-  bool addPostFastRegAllocRewrite() override;
   void addPostRegAlloc() override;
   void addPreEmitPass() override;
   void addPreEmitPass2() override;
@@ -419,9 +416,6 @@ void X86PassConfig::addIRPasses() {
   addPass(createX86LowerAMXIntrinsicsPass());
   addPass(createX86LowerAMXTypePass());
 
-  if (TM->getOptLevel() == CodeGenOpt::None)
-    addPass(createX86PreAMXConfigPass());
-
   TargetPassConfig::addIRPasses();
 
   if (TM->getOptLevel() != CodeGenOpt::None) {
@@ -589,11 +583,6 @@ void X86PassConfig::addPreEmitPass2() {
     addPass(createX86LoadValueInjectionRetHardeningPass());
 }
 
-bool X86PassConfig::addPostFastRegAllocRewrite() {
-  addPass(createX86FastTileConfigPass());
-  return true;
-}
-
 bool X86PassConfig::addPreRewrite() {
   addPass(createX86TileConfigPass());
   return true;
 }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
deleted file mode 100644
index f7089e9..0000000
--- a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
+++ /dev/null
@@ -1,4559 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
-
-
-source_filename = "amx_api.c"
-
-%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
-
-@buf = dso_local global [1024 x i8] zeroinitializer, align 16
-@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
-
-; Function Attrs: noinline nounwind optnone uwtable
-define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
-; AVX512-LABEL: test_api:
-; AVX512:       # %bb.0: # %entry
-; 
AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
-; AVX512-NEXT: .cfi_offset %rbp, -16
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: .cfi_def_cfa_register %rbp
-; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400
-; AVX512-NEXT: movw %dx, %ax
-; AVX512-NEXT: movw %si, %cx
-; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: movl $1088, %edx # imm = 0x440
-; AVX512-NEXT: callq memset@PLT
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: movl $1088, %edx # imm = 0x440
-; AVX512-NEXT: callq memset@PLT
-; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: movl $1088, %edx # imm = 0x440
-; AVX512-NEXT: callq memset@PLT
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: je .LBB0_2
-; AVX512-NEXT: # %bb.1: # %if.then
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw (%rax), %si
-; AVX512-NEXT: movw 2(%rax), %dx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %dil
-; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: addq $64, %rdx
-; AVX512-NEXT: movl $64, %esi
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw (%rax), %di
-; AVX512-NEXT: movw 2(%rax), %dx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %r8b
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: addq $64, %rdx
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw (%rax), %si
-; AVX512-NEXT: movw 2(%rax), %dx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %r8b
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg (%rdi)
-; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: addq $64, %rdx
-; AVX512-NEXT: movl $64, %esi
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: jmp .LBB0_3
-; AVX512-NEXT: .LBB0_2: # %if.else
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw (%rax), %si
-; AVX512-NEXT: movw 2(%rax), %dx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %dil
-; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: addq $64, %rdx
-; AVX512-NEXT: movl $64, %esi
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw (%rax), %di
-; AVX512-NEXT: movw 2(%rax), %dx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %r8b
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: addq $64, %rdx
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw (%rax), %si
-; AVX512-NEXT: movw 2(%rax), %dx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %r8b
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg (%rdi)
-; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: addq $64, %rdx
-; AVX512-NEXT: movl $64, %esi
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: .LBB0_3: # %if.end
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: movl $1088, %edx # imm = 0x440
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 320(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 512(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 576(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 640(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 704(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 768(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 832(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 896(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 960(%rax), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 1024(%rax), %zmm0
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
-; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: movl $1024, %edx # imm = 0x400
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: # kill: def $r8 killed $rax
-; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
-; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
-; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; AVX512-NEXT: movw %r10w, %di
-; AVX512-NEXT: shrl $2, %r10d
-; AVX512-NEXT: movw %r10w, %r9w
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %r8b
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: # kill: def $r10b killed $r10b killed $r10d
-; AVX512-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl $64, %r8d
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: tileloadd (%r10,%r8), %tmm0
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: tileloadd (%r10,%r8), %tmm1
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: tileloadd (%r10,%r8), %tmm2
-; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: addq $64, %rdi
-; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
-; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq memcpy@PLT
-; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
-; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: # kill: def $rdi killed $rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
-; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
-; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb %al, %r9b
-; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: ldtilecfg (%r8)
-; AVX512-NEXT: movl $64, %r8d
-; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0
-; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa %rsp, 8
-; AVX512-NEXT: tilerelease
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; AVX2-LABEL: test_api:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; AVX2-NEXT: subq $29696, %rsp # imm = 0x7400
-; AVX2-NEXT: movw %dx, %ax
-; AVX2-NEXT: movw %si, %cx
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: callq memset@PLT
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: callq memset@PLT
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: callq memset@PLT
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: je .LBB0_2
-; AVX2-NEXT: # %bb.1: # %if.then
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %di
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rdi)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: jmp .LBB0_3
-; AVX2-NEXT: .LBB0_2: # %if.else
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %di
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rdi)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: .LBB0_3: # %if.end
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovaps 64(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 96(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 128(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 160(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 192(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 224(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 256(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 288(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 320(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 352(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 384(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 416(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 448(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 480(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 512(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 544(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 576(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 608(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 640(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 672(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 704(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 736(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 768(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 800(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 832(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 864(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 896(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 928(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 960(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 992(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 1024(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 1056(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: movl $1024, %edx # imm = 0x400
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: # kill: def $r8 killed $rax
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-;
AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) 
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; AVX2-NEXT: movw %r10w, %di -; AVX2-NEXT: shrl $2, %r10d -; AVX2-NEXT: movw %r10w, %r9w -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r8b -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp) -; AVX2-NEXT: # kill: def $r10b killed $r10b killed $r10d -; AVX2-NEXT: movb %r10b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl $64, %r8d -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: tileloadd (%r10,%r8), %tmm0 -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: tileloadd (%r10,%r8), %tmm1 -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: tileloadd (%r10,%r8), %tmm2 -; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: addq $64, %rdi -; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq memcpy@PLT -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5 -; AVX2-NEXT: vmovaps 
{{[0-9]+}}(%rsp), %ymm6 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 
{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq memcpy@PLT -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: # kill: def $rdi killed $rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) -; 
AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %r9b -; AVX2-NEXT: movb %r9b, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%r8) -; AVX2-NEXT: movl $64, %r8d -; AVX2-NEXT: tileloadd (%rdi,%r8), %tmm0 -; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa %rsp, 8 -; AVX2-NEXT: tilerelease -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; SSE2-LABEL: test_api: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: .cfi_def_cfa_offset 16 -; SSE2-NEXT: .cfi_offset %rbp, -16 -; SSE2-NEXT: movq %rsp, %rbp -; SSE2-NEXT: .cfi_def_cfa_register %rbp -; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; SSE2-NEXT: subq $30720, %rsp # imm = 0x7800 -; SSE2-NEXT: movw %dx, %ax -; SSE2-NEXT: movw %si, %cx -; SSE2-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl $1088, %edx # imm = 0x440 -; SSE2-NEXT: callq memset@PLT -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl $1088, %edx # imm = 0x440 -; SSE2-NEXT: callq memset@PLT -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: movl $1088, %edx # imm = 0x440 -; SSE2-NEXT: callq memset@PLT -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: cmpl $0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: je .LBB0_2 -; SSE2-NEXT: # %bb.1: # %if.then -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %si -; SSE2-NEXT: movw 2(%rax), %dx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; SSE2-NEXT: 
movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: addq $64, %rdx -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %di -; SSE2-NEXT: movw 2(%rax), %dx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: addq $64, %rdx -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %si -; SSE2-NEXT: movw 2(%rax), %dx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rdi) -; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: addq $64, %rdx -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: jmp .LBB0_3 -; SSE2-NEXT: .LBB0_2: # %if.else -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %si -; SSE2-NEXT: movw 2(%rax), %dx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, 
{{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: addq $64, %rdx -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %di -; SSE2-NEXT: movw 2(%rax), %dx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: addq $64, %rdx -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw (%rax), %si -; SSE2-NEXT: movw 2(%rax), %dx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rdi) -; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: addq $64, %rdx -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: .LBB0_3: # %if.end -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movl $1088, %edx # imm = 0x440 -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: callq memcpy@PLT -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: callq memcpy@PLT -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq 
%rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movaps 64(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 80(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 96(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 112(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 128(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 144(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 160(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 176(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 192(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 208(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 224(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 240(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 256(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 272(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 288(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 304(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 320(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 336(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 352(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 368(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 384(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 400(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 416(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 432(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 448(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 464(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 480(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 496(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 512(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 528(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 544(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 560(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 576(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 592(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 608(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 624(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 640(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 656(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 672(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 688(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 704(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 720(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 736(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 752(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 768(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 784(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 800(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 816(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 832(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 848(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 864(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 880(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 896(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 912(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 928(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 944(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 960(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 976(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 992(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 1008(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 1024(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 1056(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps 1072(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
[... a long run of further deleted "-; SSE2-NEXT:" check lines follows: load/spill pairs copying {{[0-9]+}}(%rsp) through %xmm0 into 16-byte stack slots, then reload/store pairs copying those 16-byte slots back to {{[0-9]+}}(%rsp) ...]
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movl $1024, %edx # imm = 0x400
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: # kill: def $r8 killed $rax
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
[... a second long run of deleted "-; SSE2-NEXT:" load/spill pairs of %xmm0 follows, with another block of loads from {{[0-9]+}}(%rsp) into %xmm1-%xmm15 in its middle ...]
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
[... a further long run of deleted "-; SSE2-NEXT:" reload/store pairs copying the spilled 16-byte slots back to {{[0-9]+}}(%rsp) ...]
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; SSE2-NEXT: movw %r10w, %di
-; SSE2-NEXT: shrl $2, %r10d
-; SSE2-NEXT: movw %r10w, %r9w
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movups %xmm0, 
{{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r8b -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp) -; SSE2-NEXT: # kill: def $r10b killed $r10b killed $r10d -; SSE2-NEXT: movb %r10b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: movl $64, %r8d -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: tileloadd (%r10,%r8), %tmm0 -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: tileloadd (%r10,%r8), %tmm1 -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: tileloadd (%r10,%r8), %tmm2 -; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: addq $64, %rdi -; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: callq memcpy@PLT -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: 
movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq 
{{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: callq memcpy@PLT -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: # kill: def $rdi killed $rax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax -; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %r9b -; SSE2-NEXT: movb %r9b, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%r8) -; SSE2-NEXT: movl $64, %r8d -; SSE2-NEXT: tileloadd (%rdi,%r8), %tmm0 -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: movq %rbp, %rsp -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: .cfi_def_cfa %rsp, 8 -; SSE2-NEXT: tilerelease -; SSE2-NEXT: retq -entry: - %m.addr.i85 = alloca i16, align 2 - %n.addr.i86 = alloca i16, align 2 - %base.addr.i87 = alloca i8*, align 8 - %stride.addr.i88 = alloca i64, align 8 - %tile.addr.i = alloca <256 x i32>, align 64 - %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024 - 
%m.addr.i81 = alloca i16, align 2
- %n.addr.i82 = alloca i16, align 2
- %k.addr.i = alloca i16, align 2
- %dst.addr.i83 = alloca <256 x i32>, align 64
- %src1.addr.i = alloca <256 x i32>, align 64
- %src2.addr.i = alloca <256 x i32>, align 64
- %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
- %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
- %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
- %m.addr.i74 = alloca i16, align 2
- %n.addr.i75 = alloca i16, align 2
- %base.addr.i76 = alloca i8*, align 8
- %stride.addr.i77 = alloca i64, align 8
- %m.addr.i70 = alloca i16, align 2
- %n.addr.i71 = alloca i16, align 2
- %base.addr.i72 = alloca i8*, align 8
- %stride.addr.i73 = alloca i64, align 8
- %m.addr.i66 = alloca i16, align 2
- %n.addr.i67 = alloca i16, align 2
- %base.addr.i68 = alloca i8*, align 8
- %stride.addr.i69 = alloca i64, align 8
- %m.addr.i62 = alloca i16, align 2
- %n.addr.i63 = alloca i16, align 2
- %base.addr.i64 = alloca i8*, align 8
- %stride.addr.i65 = alloca i64, align 8
- %m.addr.i58 = alloca i16, align 2
- %n.addr.i59 = alloca i16, align 2
- %base.addr.i60 = alloca i8*, align 8
- %stride.addr.i61 = alloca i64, align 8
- %m.addr.i = alloca i16, align 2
- %n.addr.i = alloca i16, align 2
- %base.addr.i56 = alloca i8*, align 8
- %stride.addr.i57 = alloca i64, align 8
- %base.addr.i50 = alloca i8*, align 8
- %stride.addr.i51 = alloca i64, align 8
- %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
- %c49 = alloca %struct.__tile1024i_str, align 64
- %dst.addr.i44 = alloca %struct.__tile1024i_str*, align 8
- %indirect-arg-temp.i = alloca <256 x i32>, align 1024
- %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
- %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
- %b43 = alloca %struct.__tile1024i_str, align 64
- %a42 = alloca %struct.__tile1024i_str, align 64
- %dst.addr.i35 = alloca %struct.__tile1024i_str*, align 8
- %base.addr.i36 = alloca i8*, align 8
- %stride.addr.i37 = alloca i64, align 8
- %dst.addr.i28 = alloca %struct.__tile1024i_str*, align 8
- %base.addr.i29 = alloca i8*, align 8
- %stride.addr.i30 = alloca i64, align 8
- %dst.addr.i21 = alloca %struct.__tile1024i_str*, align 8
- %base.addr.i22 = alloca i8*, align 8
- %stride.addr.i23 = alloca i64, align 8
- %dst.addr.i14 = alloca %struct.__tile1024i_str*, align 8
- %base.addr.i15 = alloca i8*, align 8
- %stride.addr.i16 = alloca i64, align 8
- %dst.addr.i7 = alloca %struct.__tile1024i_str*, align 8
- %base.addr.i8 = alloca i8*, align 8
- %stride.addr.i9 = alloca i64, align 8
- %dst.addr.i = alloca %struct.__tile1024i_str*, align 8
- %base.addr.i = alloca i8*, align 8
- %stride.addr.i = alloca i64, align 8
- %cond.addr = alloca i32, align 4
- %row.addr = alloca i16, align 2
- %col.addr = alloca i16, align 2
- %a = alloca %struct.__tile1024i_str, align 64
- %b = alloca %struct.__tile1024i_str, align 64
- %c = alloca %struct.__tile1024i_str, align 64
- store i32 %cond, i32* %cond.addr, align 4
- store i16 %row, i16* %row.addr, align 2
- store i16 %col, i16* %col.addr, align 2
- %0 = bitcast %struct.__tile1024i_str* %a to i8*
- call void @llvm.memset.p0i8.i64(i8* align 64 %0, i8 0, i64 1088, i1 false)
- %row1 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 0
- %1 = load i16, i16* %row.addr, align 2
- store i16 %1, i16* %row1, align 64
- %col2 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 1
- store i16 8, i16* %col2, align 2
- %2 = bitcast %struct.__tile1024i_str* %b to i8*
- call void @llvm.memset.p0i8.i64(i8* align 64 %2, i8 0, i64 1088, i1 false)
- %row3 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 0
- store i16 8, i16* %row3, align 64
- %col4 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 1
- %3 = load i16, i16* %col.addr, align 2
- store i16 %3, i16* %col4, align 2
- %4 = bitcast %struct.__tile1024i_str* %c to i8*
- call void @llvm.memset.p0i8.i64(i8* align 64 %4, i8 0, i64 1088, i1 false)
- %row5 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 0
- %5 = load i16, i16* %row.addr, align 2
- store i16 %5, i16* %row5, align 64
- %col6 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 1
- %6 = load i16, i16* %col.addr, align 2
- store i16 %6, i16* %col6, align 2
- %7 = load i32, i32* %cond.addr, align 4
- %tobool = icmp ne i32 %7, 0
- br i1 %tobool, label %if.then, label %if.else
-
-if.then: ; preds = %entry
- store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i35, align 8
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i36, align 8
- store i64 32, i64* %stride.addr.i37, align 8
- %8 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
- %row.i38 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %8, i32 0, i32 0
- %9 = load i16, i16* %row.i38, align 64
- %10 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
- %col.i39 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %10, i32 0, i32 1
- %11 = load i16, i16* %col.i39, align 2
- %12 = load i8*, i8** %base.addr.i36, align 8
- %13 = load i64, i64* %stride.addr.i37, align 8
- store i16 %9, i16* %m.addr.i, align 2
- store i16 %11, i16* %n.addr.i, align 2
- store i8* %12, i8** %base.addr.i56, align 8
- store i64 %13, i64* %stride.addr.i57, align 8
- %14 = load i16, i16* %m.addr.i, align 2
- %15 = load i16, i16* %n.addr.i, align 2
- %16 = load i8*, i8** %base.addr.i56, align 8
- %17 = load i64, i64* %stride.addr.i57, align 8
- %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %14, i16 %15, i8* %16, i64 %17) #2
- %19 = bitcast x86_amx %18 to <256 x i32>
- %20 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
- %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %20, i32 0, i32 3
- store <256 x i32> %19, <256 x i32>* %tile.i41, align 64
- store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i28, align 8
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i29, align 8
- store i64 32, i64* %stride.addr.i30, align 8
- %21 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
- %row.i31 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %21, i32 0, i32 0
- %22 = load i16, i16* %row.i31, align 64
- %23 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
- %col.i32 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %23, i32 0, i32 1
- %24 = load i16, i16* %col.i32, align 2
- %25 = load i8*, i8** %base.addr.i29, align 8
- %26 = load i64, i64* %stride.addr.i30, align 8
- store i16 %22, i16* %m.addr.i58, align 2
- store i16 %24, i16* %n.addr.i59, align 2
- store i8* %25, i8** %base.addr.i60, align 8
- store i64 %26, i64* %stride.addr.i61, align 8
- %27 = load i16, i16* %m.addr.i58, align 2
- %28 = load i16, i16* %n.addr.i59, align 2
- %29 = load i8*, i8** %base.addr.i60, align 8
- %30 = load i64, i64* %stride.addr.i61, align 8
- %31 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %27, i16 %28, i8* %29, i64 %30) #2
- %32 = bitcast x86_amx %31 to <256 x i32>
- %33 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
- %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %33, i32 0, i32 3
- store <256 x i32> %32, <256 x i32>* %tile.i34, align 64
- store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i21, align 8
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i22, align 8
- store i64 32, i64* %stride.addr.i23, align 8
- %34 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
- %row.i24 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %34, i32 0, i32 0
- %35 = load i16, i16* %row.i24, align 64
- %36 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
- %col.i25 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %36, i32 0, i32 1
- %37 = load i16, i16* %col.i25, align 2
- %38 = load i8*, i8** %base.addr.i22, align 8
- %39 = load i64, i64* %stride.addr.i23, align 8
- store i16 %35, i16* %m.addr.i62, align 2
- store i16 %37, i16* %n.addr.i63, align 2
- store i8* %38, i8** %base.addr.i64, align 8
- store i64 %39, i64* %stride.addr.i65, align 8
- %40 = load i16, i16* %m.addr.i62, align 2
- %41 = load i16, i16* %n.addr.i63, align 2
- %42 = load i8*, i8** %base.addr.i64, align 8
- %43 = load i64, i64* %stride.addr.i65, align 8
- %44 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %40, i16 %41, i8* %42, i64 %43) #2
- %45 = bitcast x86_amx %44 to <256 x i32>
- %46 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
- %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %46, i32 0, i32 3
- store <256 x i32> %45, <256 x i32>* %tile.i27, align 64
- br label %if.end
-
-if.else: ; preds = %entry
- store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i14, align 8
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i15, align 8
- store i64 32, i64* %stride.addr.i16, align 8
- %47 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
- %row.i17 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %47, i32 0, i32 0
- %48 = load i16, i16* %row.i17, align 64
- %49 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
- %col.i18 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %49, i32 0, i32 1
- %50 = load i16, i16* %col.i18, align 2
- %51 = load i8*, i8** %base.addr.i15, align 8
- %52 = load i64, i64* %stride.addr.i16, align 8
- store i16 %48, i16* %m.addr.i66, align 2
- store i16 %50, i16* %n.addr.i67, align 2
- store i8* %51, i8** %base.addr.i68, align 8
- store i64 %52, i64* %stride.addr.i69, align 8
- %53 = load i16, i16* %m.addr.i66, align 2
- %54 = load i16, i16* %n.addr.i67, align 2
- %55 = load i8*, i8** %base.addr.i68, align 8
- %56 = load i64, i64* %stride.addr.i69, align 8
- %57 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %53, i16 %54, i8* %55, i64 %56) #2
- %58 = bitcast x86_amx %57 to <256 x i32>
- %59 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
- %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %59, i32 0, i32 3
- store <256 x i32> %58, <256 x i32>* %tile.i20, align 64
- store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i7, align 8
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i8, align 8
- store i64 32, i64* %stride.addr.i9, align 8
- %60 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
- %row.i10 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %60, i32 0, i32 0
- %61 = load i16, i16* %row.i10, align 64
- %62 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
- %col.i11 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %62, i32 0, i32 1
- %63 = load i16, i16* %col.i11, align 2
- %64 = load i8*, i8** %base.addr.i8, align 8
- %65 = load i64, i64* %stride.addr.i9, align 8
- store i16 %61, i16* %m.addr.i70, align 2
- store i16 %63, i16* %n.addr.i71, align 2
- store i8* %64, i8** %base.addr.i72, align 8
- store i64 %65, i64* %stride.addr.i73, align 8
- %66 = load i16, i16* %m.addr.i70, align 2
- %67 = load i16, i16* %n.addr.i71, align 2
- %68 = load i8*, i8** %base.addr.i72, align 8
- %69 = load i64, i64* %stride.addr.i73, align 8
- %70 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %66, i16 %67, i8* %68, i64 %69) #2
- %71 = bitcast x86_amx %70 to <256 x i32>
- %72 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
- %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %72, i32 0, i32 3
- store <256 x i32> %71, <256 x i32>* %tile.i13, align 64
- store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i, align 8
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i, align 8
- store i64 32, i64* %stride.addr.i, align 8
- %73 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
- %row.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %73, i32 0, i32 0
- %74 = load i16, i16* %row.i, align 64
- %75 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
- %col.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %75, i32 0, i32 1
- %76 = load i16, i16* %col.i, align 2
- %77 = load i8*, i8** %base.addr.i, align 8
- %78 = load i64, i64* %stride.addr.i, align 8
- store i16 %74, i16* %m.addr.i74, align 2
- store i16 %76, i16* %n.addr.i75, align 2
- store i8* %77, i8** %base.addr.i76, align 8
- store i64 %78, i64* %stride.addr.i77, align 8
- %79 = load i16, i16* %m.addr.i74, align 2
- %80 = load i16, i16* %n.addr.i75, align 2
- %81 = load i8*, i8** %base.addr.i76, align 8
- %82 = load i64, i64* %stride.addr.i77, align 8
- %83 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %79, i16 %80, i8* %81, i64 %82) #2
- %84 = bitcast x86_amx %83 to <256 x i32>
- %85 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
- %tile.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %85, i32 0, i32 3
- store <256 x i32> %84, <256 x i32>* %tile.i, align 64
- br label %if.end
-
-if.end: ; preds = %if.else, %if.then
- %86 = bitcast %struct.__tile1024i_str* %b43 to i8*
- %87 = bitcast %struct.__tile1024i_str* %b to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %86, i8* align 1 %87, i64 1088, i1 false) #2
- %88 = bitcast %struct.__tile1024i_str* %a42 to i8*
- %89 = bitcast %struct.__tile1024i_str* %a to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %88, i8* align 1 %89, i64 1088, i1 false) #2
- store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i44, align 8
- %row.i45 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 0
- %90 = load i16, i16* %row.i45, align 64
- %col.i46 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 1
- %91 = load i16, i16* %col.i46, align 2
- %col1.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 1
- %92 = load i16, i16* %col1.i, align 2
- %93 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8
- %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %93, i32 0, i32 3
- %94 = load <256 x i32>, <256 x i32>* %tile.i47, align 64
- %tile2.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 3
- %95 = load <256 x i32>, <256 x i32>* %tile2.i, align 64
- %tile3.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 3
- %96 = load <256 x i32>, <256 x i32>* %tile3.i, align 64
- store <256 x i32> %94, <256 x i32>* %indirect-arg-temp.i, align 1024
- store <256 x i32> %95, <256 x i32>* %indirect-arg-temp4.i, align 1024
- store <256 x i32> %96, <256 x i32>* %indirect-arg-temp5.i, align 1024
- %97 = bitcast <256 x i32>* %indirect-arg-temp5.i80 to i8*
- %98 = bitcast <256 x i32>* %indirect-arg-temp5.i to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %97, i8* align 1 %98, i64 1024, i1 false) #2
- %99 = bitcast <256 x i32>* %indirect-arg-temp4.i79 to i8*
- %100 = bitcast <256 x i32>* %indirect-arg-temp4.i to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %99, i8* align 1 %100, i64 1024, i1 false) #2
- %101 = bitcast <256 x i32>* %indirect-arg-temp.i78 to i8*
- %102 = bitcast <256 x i32>* %indirect-arg-temp.i to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %101, i8* align 1 %102, i64 1024, i1 false) #2
- %dst.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i78, align 1024
- %src1.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp4.i79, align 1024
- %src2.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp5.i80, align 1024
- store i16 %90, i16* %m.addr.i81, align 2
- store i16 %91, i16* %n.addr.i82, align 2
- store i16 %92, i16* %k.addr.i, align 2
- store <256 x i32> %dst.i, <256 x i32>* %dst.addr.i83, align 64
- store <256 x i32> %src1.i, <256 x i32>* %src1.addr.i, align 64
- store <256 x i32> %src2.i, <256 x i32>* %src2.addr.i, align 64
- %103 = load i16, i16* %m.addr.i81, align 2
- %104 = load i16, i16* %n.addr.i82, align 2
- %105 = load i16, i16* %k.addr.i, align 2
- %106 = load <256 x i32>, <256 x i32>* %dst.addr.i83, align 64
- %107 = bitcast <256 x i32> %106 to x86_amx
- %108 = load <256 x i32>, <256 x i32>* %src1.addr.i, align 64
- %109 = bitcast <256 x i32> %108 to x86_amx
- %110 = load <256 x i32>, <256 x i32>* %src2.addr.i, align 64
- %111 = bitcast <256 x i32> %110 to x86_amx
- %112 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %103, i16 %104, i16 %105, x86_amx %107, x86_amx %109, x86_amx %111) #2
- %113 = bitcast x86_amx %112 to <256 x i32>
- %114 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8
- %tile6.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %114, i32 0, i32 3
- store <256 x i32> %113, <256 x i32>* %tile6.i, align 64
- %115 = bitcast %struct.__tile1024i_str* %c49 to i8*
- %116 = bitcast %struct.__tile1024i_str* %c to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %115, i8* align 1 %116, i64 1088, i1 false) #2
- store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i50, align 8
- store i64 32, i64* %stride.addr.i51, align 8
- %row.i53 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 0
- %117 = load i16, i16* %row.i53, align 64
- %col.i54 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 1
- %118 = load i16, i16* %col.i54, align 2
- %119 = load i8*, i8** %base.addr.i50, align 8
- %120 = load i64, i64* %stride.addr.i51, align 8
- %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 3
- %121 = load <256 x i32>, <256 x i32>* %tile.i55, align 64
- store <256 x i32> %121, <256 x i32>* %indirect-arg-temp.i52, align 1024
- %122 = bitcast <256 x i32>* %indirect-arg-temp.i5284 to i8*
- %123 = bitcast <256 x i32>* %indirect-arg-temp.i52 to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %122, i8* align 1 %123, i64 1024, i1 false) #2
- %tile.i89 = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i5284, align 1024
- store i16 %117, i16* %m.addr.i85, align 2
- store i16 %118, i16* %n.addr.i86, align 2
- store i8* %119, i8** %base.addr.i87, align 8
- store i64 %120, i64* %stride.addr.i88, align 8
- store <256 x i32> %tile.i89, <256 x i32>* %tile.addr.i, align 64
- %124 = load i16, i16* %m.addr.i85, align 2
- %125 = load i16, i16* %n.addr.i86, align 2
- %126 = load i8*, i8** %base.addr.i87, align 8
- %127 = load i64, i64* %stride.addr.i88, align 8
- %128 = load <256 x i32>, <256 x i32>* %tile.addr.i, align 64
- %129 = bitcast <256 x i32> %128 to x86_amx
- call void @llvm.x86.tilestored64.internal(i16 %124, i16 %125, i8* %126, i64 %127, x86_amx %129) #2
- ret void
-}
-
-; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
-declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
-
-; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2
-
-; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
-
-; Function Attrs: nounwind
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2
-
-; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3
-
-attributes #0 = { noinline nounwind optnone uwtable }
-attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
-attributes #2 = { nounwind }
-attributes #3 = { argmemonly nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
deleted file mode 100644
index 9673b04..0000000
--- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -lower-amx-type -S | FileCheck %s
-
-@buf = dso_local global [1024 x i8] zeroinitializer, align 16
-@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
-
-; CHECK-LABEL: entry:
-; CHECK: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
-; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
-; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
-; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
-; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
-; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
-; CHECK: if.then:
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: br label %if.end
-; CHECK: if.else:
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: br label %if.end
-; CHECK: if.end:
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: ret void
-
-entry:
- %tobool.not = icmp eq i32 %cond, 0
- br i1 %tobool.not, label %if.else, label %if.then
-
-if.then: ; preds = %entry
- %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
- %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
- %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
- br label %if.end
-
-if.else: ; preds = %entry
- %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
- %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
- %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
- br label %if.end
-
-if.end: ; preds = %if.else, %if.then
- %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
- %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
- %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
- %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
- ret void
-}
-
-; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-
-; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-
-; Function Attrs: nounwind
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
deleted file mode 100644
index ef30879..0000000
--- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
+++ /dev/null
@@ -1,210 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -pre-amx-config -S | FileCheck %s
-
-@buf = dso_local global [1024 x i8] zeroinitializer, align 16
-@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
-
-; Function Attrs: nounwind uwtable
-define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
-; CHECK-LABEL: entry:
-; CHECK: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <16 x i32>, align 4
-; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
-; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
-; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024
-; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8*
%{{[0-9]+}} to i8* -; CHECK-NEXT: %{{[0-9]+}} = alloca <256 x i32>, align 1024 -; CHECK-NEXT: %{{[0-9]+}} = bitcast <256 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0 -; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then - -; CHECK: if.then: -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: br label %if.end - -; CHECK: if.else: -; CHECK-NEXT: 
%{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 8, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: br label %if.end -; CHECK: if.end: ; preds = %if.else, %if.then -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: 
%amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: %amx.tmm.1.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 49 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 18 -; CHECK-NEXT: %amx.tmm.1.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.1.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 8, i16* %amx.tmm.1.shape.col{{.*}}, align 2 -; CHECK-NEXT: %amx.tmm.2.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 50 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 20 -; CHECK-NEXT: %amx.tmm.2.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 8 to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.2.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.2.shape.col{{.*}}, align 2 -; CHECK-NEXT: %amx.tmm.3.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 51 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 22 -; CHECK-NEXT: %amx.tmm.3.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.3.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.3.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %{{[0-9]+}}, i64 64) -; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %{{[0-9]+}}, i64 64) -; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64) -; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = bitcast <16 x i32>* %{{[0-9]+}} to i8* -; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* %{{[0-9]+}}, align 4 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 0 -; CHECK-NEXT: store i8 1, i8* %{{[0-9]+}}, align 1 -; CHECK-NEXT: %amx.tmm.0.shape.row{{.*}} = getelementptr i8, i8* %{{[0-9]+}}, i64 48 -; CHECK-NEXT: %{{[0-9]+}} = getelementptr i8, i8* %{{[0-9]+}}, i64 16 -; CHECK-NEXT: %amx.tmm.0.shape.col{{.*}} = bitcast i8* %{{[0-9]+}} to i16* -; CHECK-NEXT: %{{[0-9]+}} = trunc i16 %row to i8 -; CHECK-NEXT: store i8 %{{[0-9]+}}, i8* %amx.tmm.0.shape.row{{.*}}, align 1 -; CHECK-NEXT: store i16 %col, i16* %amx.tmm.0.shape.col{{.*}}, align 2 -; CHECK-NEXT: call void @llvm.x86.ldtilecfg(i8* %{{[0-9]+}}) -; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %{{[0-9]+}}, i64 64) -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %{{[0-9]+}}) -; CHECK-NEXT: ret void - -entry: - %0 = alloca <256 x i32>, align 1024 - %1 = 
bitcast <256 x i32>* %0 to i8* - %2 = alloca <256 x i32>, align 1024 - %3 = bitcast <256 x i32>* %2 to i8* - %4 = alloca <256 x i32>, align 1024 - %5 = bitcast <256 x i32>* %4 to i8* - %6 = alloca <256 x i32>, align 1024 - %7 = bitcast <256 x i32>* %6 to i8* - %tobool.not = icmp eq i32 %cond, 0 - br i1 %tobool.not, label %if.else, label %if.then - -if.then: ; preds = %entry - %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %8) - %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %9) - %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %10) - br label %if.end - -if.else: ; preds = %entry - %11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %5, i64 64, x86_amx %11) - %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %3, i64 64, x86_amx %12) - %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %1, i64 64, x86_amx %13) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %5, i64 64) - %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %3, i64 64) - %16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %1, i64 64) - %17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %16, x86_amx %14, x86_amx %15) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %7, i64 64, x86_amx %17) - %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %7, i64 64) - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %18) - ret void -} - -; Function Attrs: nounwind -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) - -; Function Attrs: nounwind -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) - -; Function Attrs: nounwind -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll deleted file mode 100644 index 0771d93..0000000 --- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll +++ /dev/null @@ -1,513 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 
-mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2 - -@buf = dso_local global [1024 x i8] zeroinitializer, align 16 -@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 - -; Function Attrs: nounwind uwtable -define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr { -; AVX512-LABEL: test_api: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: .cfi_def_cfa_offset 16 -; AVX512-NEXT: .cfi_offset %rbp, -16 -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: .cfi_def_cfa_register %rbp -; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800 -; AVX512-NEXT: movw %dx, %ax -; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512-NEXT: movw %si, %ax -; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: cmpl $0, %edi -; AVX512-NEXT: je .LBB0_2 -; AVX512-NEXT: # %bb.1: # %if.then -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl $buf, %r9d -; AVX512-NEXT: movl $32, %r10d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) -; AVX512-NEXT: movl $buf, %esi -; AVX512-NEXT: movl $32, %edi -; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: jmp .LBB0_3 -; AVX512-NEXT: .LBB0_2: # %if.else -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl $buf2, %r9d -; AVX512-NEXT: movl $32, %r10d -; AVX512-NEXT: movw $8, %si -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: movl $64, %r8d -; AVX512-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) -; AVX512-NEXT: movl $buf2, %esi -; AVX512-NEXT: movl $32, %edi -; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: .LBB0_3: # %if.end -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %sil -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: movw $8, %di -; AVX512-NEXT: tileloadd (%r10,%rsi), %tmm1 -; AVX512-NEXT: tileloadd (%r9,%rsi), %tmm2 -; AVX512-NEXT: tileloadd (%r8,%rsi), %tmm0 -; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movb %al, %dil -; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: ldtilecfg (%rsi) -; AVX512-NEXT: movl $64, %esi -; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; AVX512-NEXT: movl $buf, %edx -; AVX512-NEXT: movl $32, %esi -; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: .cfi_def_cfa %rsp, 8 -; AVX512-NEXT: tilerelease -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; AVX2-LABEL: test_api: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; AVX2-NEXT: subq $6144, %rsp # imm = 0x1800 -; AVX2-NEXT: movw %dx, %ax -; AVX2-NEXT: movw %ax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX2-NEXT: movw %si, %ax -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: cmpl $0, %edi -; AVX2-NEXT: je .LBB0_2 -; AVX2-NEXT: # %bb.1: # %if.then -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %sil -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl $buf, %r9d -; AVX2-NEXT: movl $32, %r10d -; AVX2-NEXT: movw $8, %si -; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX2-NEXT: movl $64, %r8d -; AVX2-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rsi) -; AVX2-NEXT: movl $buf, %esi -; AVX2-NEXT: movl $32, %edi -; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; AVX2-NEXT: movl $64, %esi -; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-NEXT: jmp .LBB0_3 -; AVX2-NEXT: .LBB0_2: # %if.else -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %sil -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl $buf2, %r9d -; AVX2-NEXT: movl $32, %r10d -; AVX2-NEXT: movw $8, %si -; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 -; AVX2-NEXT: movl $64, %r8d -; AVX2-NEXT: tilestored %tmm0, (%r11,%r8) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0 -; 
AVX2-NEXT: tilestored %tmm0, (%rdi,%r8) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rsi) -; AVX2-NEXT: movl $buf2, %esi -; AVX2-NEXT: movl $32, %edi -; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; AVX2-NEXT: movl $64, %esi -; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-NEXT: .LBB0_3: # %if.end -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %sil -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl $64, %esi -; AVX2-NEXT: movw $8, %di -; AVX2-NEXT: tileloadd (%r10,%rsi), %tmm1 -; AVX2-NEXT: tileloadd (%r9,%rsi), %tmm2 -; AVX2-NEXT: tileloadd (%r8,%rsi), %tmm0 -; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movb %al, %dil -; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: ldtilecfg (%rsi) -; AVX2-NEXT: movl $64, %esi -; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; AVX2-NEXT: movl $buf, %edx -; AVX2-NEXT: movl $32, %esi -; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: .cfi_def_cfa %rsp, 8 -; AVX2-NEXT: tilerelease -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; SSE2-LABEL: test_api: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: .cfi_def_cfa_offset 16 -; SSE2-NEXT: .cfi_offset %rbp, -16 -; SSE2-NEXT: movq %rsp, %rbp -; SSE2-NEXT: .cfi_def_cfa_register %rbp -; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; SSE2-NEXT: subq $6144, %rsp # imm = 0x1800 -; SSE2-NEXT: movw %dx, %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: movw %si, %ax -; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: cmpl $0, %edi -; SSE2-NEXT: je .LBB0_2 -; SSE2-NEXT: # %bb.1: # %if.then -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; SSE2-NEXT: movw 
{{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %sil -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: movl $buf, %r9d -; SSE2-NEXT: movl $32, %r10d -; SSE2-NEXT: movw $8, %si -; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 -; SSE2-NEXT: movl $64, %r8d -; SSE2-NEXT: tilestored %tmm0, (%r11,%r8) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 -; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rsi) -; SSE2-NEXT: movl $buf, %esi -; SSE2-NEXT: movl $32, %edi -; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: jmp .LBB0_3 -; SSE2-NEXT: .LBB0_2: # %if.else -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %sil -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: movl $buf2, %r9d -; SSE2-NEXT: movl $32, %r10d -; SSE2-NEXT: movw $8, %si -; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 -; SSE2-NEXT: movl $64, %r8d -; SSE2-NEXT: tilestored %tmm0, (%r11,%r8) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0 -; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; 
SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rsi) -; SSE2-NEXT: movl $buf2, %esi -; SSE2-NEXT: movl $32, %edi -; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: .LBB0_3: # %if.end -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload -; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %sil -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: movw $8, %di -; SSE2-NEXT: tileloadd (%r10,%rsi), %tmm1 -; SSE2-NEXT: tileloadd (%r9,%rsi), %tmm2 -; SSE2-NEXT: tileloadd (%r8,%rsi), %tmm0 -; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movb %al, %dil -; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; SSE2-NEXT: ldtilecfg (%rsi) -; SSE2-NEXT: movl $64, %esi -; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0 -; SSE2-NEXT: movl $buf, %edx -; SSE2-NEXT: movl $32, %esi -; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi) -; SSE2-NEXT: movq %rbp, %rsp -; SSE2-NEXT: popq %rbp -; SSE2-NEXT: .cfi_def_cfa %rsp, 8 -; SSE2-NEXT: tilerelease -; SSE2-NEXT: retq -entry: - %tobool.not = icmp eq i32 %cond, 0 - br i1 %tobool.not, label %if.else, label %if.then - -if.then: ; preds = %entry - %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - br label %if.end - -if.else: ; preds = %entry - %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* 
getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ] - %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ] - %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ] - %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in) - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6) - ret void -} - -; Function Attrs: nounwind -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) - -; Function Attrs: nounwind -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) - -; Function Attrs: nounwind -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir b/llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir deleted file mode 100644 index da2bcab..0000000 --- a/llvm/test/CodeGen/X86/AMX/amx-fast-tile-config.mir +++ /dev/null @@ -1,465 +0,0 @@ -# RUN: llc -o - -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -run-pass=fasttileconfig %s | FileCheck %s - ---- | - - @buf = dso_local global [1024 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16 - - define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr #0 { - entry: - %0 = alloca <16 x i32>, align 4 - %1 = alloca <16 x i32>, align 4 - %2 = alloca <16 x i32>, align 4 - %3 = alloca <16 x i32>, align 4 - %4 = alloca <16 x i32>, align 4 - %5 = alloca <16 x i32>, align 4 - %6 = alloca <16 x i32>, align 4 - %7 = alloca <16 x i32>, align 4 - %8 = alloca <256 x i32>, align 1024 - %9 = bitcast <256 x i32>* %8 to i8* - %10 = alloca <256 x i32>, align 1024 - %11 = bitcast <256 x i32>* %10 to i8* - %12 = alloca <256 x i32>, align 1024 - %13 = bitcast <256 x i32>* %12 to i8* - %14 = alloca <256 x i32>, align 1024 - %15 = bitcast <256 x i32>* %14 to i8* - %tobool.not = icmp eq i32 %cond, 0 - br i1 %tobool.not, label %if.else, label %if.then - - if.then: ; preds = %entry - %16 = bitcast <16 x i32>* %6 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %6, align 64 - %amx.tmm.0.shape.row1 = getelementptr i8, i8* %16, i64 48 - %17 = getelementptr i8, i8* %16, i64 16 - %amx.tmm.0.shape.col2 = bitcast i8* %17 to i16* - %18 = trunc i16 %row to i8 - store volatile i8 %18, i8* %amx.tmm.0.shape.row1, align 1 - store volatile i16 8, i16* %amx.tmm.0.shape.col2, align 2 - call void @llvm.x86.ldtilecfg(i8* %16) - %19 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %19) - %20 = bitcast <16 x i32>* %2 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %2, align 64 - %amx.tmm.0.shape.row9 = getelementptr i8, i8* %20, i64 48 - %21 = getelementptr i8, i8* %20, i64 16 - %amx.tmm.0.shape.col10 = bitcast i8* %21 to i16* - %22 = trunc i16 8 to i8 - store volatile i8 %22, i8* %amx.tmm.0.shape.row9, align 1 - store volatile i16 %col, i16* %amx.tmm.0.shape.col10, align 2 - call void @llvm.x86.ldtilecfg(i8* %20) - %23 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* 
getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %23) - %24 = bitcast <16 x i32>* %3 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %3, align 64 - %amx.tmm.0.shape.row7 = getelementptr i8, i8* %24, i64 48 - %25 = getelementptr i8, i8* %24, i64 16 - %amx.tmm.0.shape.col8 = bitcast i8* %25 to i16* - %26 = trunc i16 %row to i8 - store volatile i8 %26, i8* %amx.tmm.0.shape.row7, align 1 - store volatile i16 %col, i16* %amx.tmm.0.shape.col8, align 2 - call void @llvm.x86.ldtilecfg(i8* %24) - %27 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %27) - br label %if.end - - if.else: ; preds = %entry - %28 = bitcast <16 x i32>* %1 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %1, align 64 - %amx.tmm.0.shape.row11 = getelementptr i8, i8* %28, i64 48 - %29 = getelementptr i8, i8* %28, i64 16 - %amx.tmm.0.shape.col12 = bitcast i8* %29 to i16* - %30 = trunc i16 %row to i8 - store volatile i8 %30, i8* %amx.tmm.0.shape.row11, align 1 - store volatile i16 8, i16* %amx.tmm.0.shape.col12, align 2 - call void @llvm.x86.ldtilecfg(i8* %28) - %31 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, i8* %13, i64 64, x86_amx %31) - %32 = bitcast <16 x i32>* %7 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %7, align 64 - %amx.tmm.0.shape.row = getelementptr i8, i8* %32, i64 48 - %33 = getelementptr i8, i8* %32, i64 16 - %amx.tmm.0.shape.col = bitcast i8* %33 to i16* - %34 = trunc i16 8 to i8 - store volatile i8 %34, i8* %amx.tmm.0.shape.row, align 1 - store volatile i16 %col, i16* %amx.tmm.0.shape.col, align 2 - call void @llvm.x86.ldtilecfg(i8* %32) - %35 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, i8* %11, i64 64, x86_amx %35) - %36 = bitcast <16 x i32>* %0 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %0, align 64 - %amx.tmm.0.shape.row13 = getelementptr i8, i8* %36, i64 48 - %37 = getelementptr i8, i8* %36, i64 16 - %amx.tmm.0.shape.col14 = bitcast i8* %37 to i16* - %38 = trunc i16 %row to i8 - store volatile i8 %38, i8* %amx.tmm.0.shape.row13, align 1 - store volatile i16 %col, i16* %amx.tmm.0.shape.col14, align 2 - call void @llvm.x86.ldtilecfg(i8* %36) - %39 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %9, i64 64, x86_amx %39) - br label %if.end - - if.end: ; preds = %if.else, %if.then - %40 = bitcast <16 x i32>* %4 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %4, align 64 - %amx.tmm.0.shape.row5 = getelementptr i8, i8* %40, i64 48 - %41 = getelementptr i8, i8* %40, i64 16 - %amx.tmm.0.shape.col6 = bitcast i8* %41 to i16* - %42 = trunc i16 %row to i8 - store volatile i8 %42, i8* %amx.tmm.0.shape.row5, align 1 - store volatile i16 %col, i16* %amx.tmm.0.shape.col6, align 2 - %amx.tmm.1.shape.row = getelementptr i8, i8* %40, i64 49 - %43 = getelementptr i8, i8* %40, 
i64 18 - %amx.tmm.1.shape.col = bitcast i8* %43 to i16* - %44 = trunc i16 %row to i8 - store volatile i8 %44, i8* %amx.tmm.1.shape.row, align 1 - store volatile i16 8, i16* %amx.tmm.1.shape.col, align 2 - %amx.tmm.2.shape.row = getelementptr i8, i8* %40, i64 50 - %45 = getelementptr i8, i8* %40, i64 20 - %amx.tmm.2.shape.col = bitcast i8* %45 to i16* - %46 = trunc i16 8 to i8 - store volatile i8 %46, i8* %amx.tmm.2.shape.row, align 1 - store volatile i16 %col, i16* %amx.tmm.2.shape.col, align 2 - %amx.tmm.3.shape.row = getelementptr i8, i8* %40, i64 51 - %47 = getelementptr i8, i8* %40, i64 22 - %amx.tmm.3.shape.col = bitcast i8* %47 to i16* - %48 = trunc i16 %row to i8 - store volatile i8 %48, i8* %amx.tmm.3.shape.row, align 1 - store volatile i16 %col, i16* %amx.tmm.3.shape.col, align 2 - call void @llvm.x86.ldtilecfg(i8* %40) - %49 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* %13, i64 64) - %50 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* %11, i64 64) - %51 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %9, i64 64) - %52 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %51, x86_amx %49, x86_amx %50) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* %15, i64 64, x86_amx %52) - %53 = bitcast <16 x i32>* %5 to i8* - store <16 x i32> zeroinitializer, <16 x i32>* %5, align 64 - %amx.tmm.0.shape.row3 = getelementptr i8, i8* %53, i64 48 - %54 = getelementptr i8, i8* %53, i64 16 - %amx.tmm.0.shape.col4 = bitcast i8* %54 to i16* - %55 = trunc i16 %row to i8 - store volatile i8 %55, i8* %amx.tmm.0.shape.row3, align 1 - store volatile i16 %col, i16* %amx.tmm.0.shape.col4, align 2 - call void @llvm.x86.ldtilecfg(i8* %53) - %56 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* %15, i64 64) - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %56) - ret void - } - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #1 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1 - - ; Function Attrs: nounwind - declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #1 - - ; Function Attrs: nounwind - declare void @llvm.x86.ldtilecfg(i8*) #2 - - attributes #0 = { "target-features"="+amx-int8,+avx512f" } - attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" } - attributes #2 = { nounwind } - -... 
---- -name: test_api -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: false - stackProtector: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: '' - restorePoint: '' -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: default, offset: 0, size: 64, alignment: 16, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 8, name: '', type: default, offset: 0, size: 1024, alignment: 1024, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 9, name: '', type: default, offset: 0, size: 1024, alignment: 1024, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 10, name: '', type: default, offset: 0, size: 1024, alignment: 1024, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 11, name: '', type: default, offset: 0, size: 1024, alignment: 1024, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', 
debug-info-expression: '', debug-info-location: '' } - - { id: 12, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 13, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 14, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 15, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 16, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 17, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: {} -body: | - bb.0.entry: - successors: %bb.2(0x40000000), %bb.1(0x40000000) - liveins: $edi, $esi, $edx - - renamable $ax = COPY renamable $dx, implicit killed $edx - MOV16mr %stack.17, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.17) - renamable $ax = COPY renamable $si, implicit killed $esi - MOV16mr %stack.16, 1, $noreg, 0, $noreg, killed $ax :: (store 2 into %stack.16) - renamable $rax = LEA64r %stack.8, 1, $noreg, 0, $noreg - MOV64mr %stack.15, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.15) - renamable $rax = LEA64r %stack.9, 1, $noreg, 0, $noreg - MOV64mr %stack.14, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.14) - renamable $rax = LEA64r %stack.10, 1, $noreg, 0, $noreg - MOV64mr %stack.13, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.13) - renamable $rax = LEA64r %stack.11, 1, $noreg, 0, $noreg - MOV64mr %stack.12, 1, $noreg, 0, $noreg, killed $rax :: (store 8 into %stack.12) - CMP32ri8 killed renamable $edi, 0, implicit-def $eflags - JCC_1 %bb.2, 4, implicit killed $eflags - - bb.1.if.then: - successors: %bb.3(0x80000000) - ; CHECK-LABEL: bb.1.if.then - ; tmm0 --> row_offset = 48, col_offset = 16 - ; CHECK: MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1) - ; CHECK: MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2) - ; CHECK: LDTILECFG %stack.6, 1, $noreg, 0, $noreg - ; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg - ; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - - ; tmm1 --> row_offset = 49, col_offset = 18 - ; CHECK: MOV8mi %stack.2, 1, $noreg, 49, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9) - ; CHECK: MOV16mr %stack.2, 1, $noreg, 18, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10) - ; CHECK: LDTILECFG %stack.2, 1, $noreg, 0, $noreg - 
-    ; CHECK: renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
-    ; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1
-
-    ; tmm2 --> row_offset = 50, col_offset = 20
-    ; CHECK: MOV8mr %stack.3, 1, $noreg, 50, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7)
-    ; CHECK: MOV16mr %stack.3, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8)
-    ; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg
-    ; CHECK: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
-    ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2
-
-    $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
-    $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
-    $rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
-    $rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
-    $r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
-    renamable $zmm0 = AVX512_512_SET0
-    VMOVDQA64Zmr %stack.6, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.6)
-    renamable $sil = COPY renamable $al
-    MOV8mr %stack.6, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row1)
-    MOV16mi %stack.6, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col2)
-    LDTILECFG %stack.6, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $r9 = MOV32ri64 @buf
-    renamable $r10 = MOV32ri64 32
-    renamable $si = MOV16ri 8
-    renamable $tmm0 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
-    renamable $r8 = MOV32ri64 64
-    PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
-    VMOVDQA64Zmr %stack.2, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.2)
-    MOV8mi %stack.2, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row9)
-    MOV16mr %stack.2, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col10)
-    LDTILECFG %stack.2, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $tmm1 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
-    PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm1
-    renamable $rsi = LEA64r %stack.3, 1, $noreg, 0, $noreg
-    VMOVDQA64Zmr %stack.3, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.3)
-    renamable $dil = COPY renamable $al
-    MOV8mr %stack.3, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row7)
-    MOV16mr %stack.3, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col8)
-    LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $rsi = MOV32ri64 @buf
-    renamable $rdi = MOV32ri64 32
-    renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
-    renamable $rsi = MOV32ri64 64
-    PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm2
-    JMP_1 %bb.3
-
-  bb.2.if.else:
-    successors: %bb.3(0x80000000)
-
-    ; CHECK-LABEL: bb.2.if.else
-    ; tmm3 --> row_offset = 51, col_offset = 22
-    ; CHECK: MOV8mr %stack.1, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11)
-    ; CHECK: MOV16mi %stack.1, 1, $noreg, 22, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12)
-    ; CHECK: LDTILECFG %stack.1, 1, $noreg, 0, $noreg
-    ; CHECK: renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
-    ; CHECK: PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3
-
-    ; tmm4 --> row_offset = 52, col_offset = 24
-    ; CHECK: MOV8mi %stack.7, 1, $noreg, 52, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row)
-    ; CHECK: MOV16mr %stack.7, 1, $noreg, 24, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col)
-    ; CHECK: LDTILECFG %stack.7, 1, $noreg, 0, $noreg
-    ; CHECK: renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
-    ; CHECK: PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4
-
-    ; tmm4 --> row_offset = 53, col_offset = 26
-    ; CHECK: MOV8mr %stack.0, 1, $noreg, 53, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13)
-    ; CHECK: MOV16mr %stack.0, 1, $noreg, 26, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14)
-    ; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg
-    ; CHECK: renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
-    ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5
-
-    $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
-    $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
-    $rdx = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
-    $rdi = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
-    $r11 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
-    renamable $zmm0 = AVX512_512_SET0
-    VMOVDQA64Zmr %stack.1, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.1)
-    renamable $sil = COPY renamable $al
-    MOV8mr %stack.1, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row11)
-    MOV16mi %stack.1, 1, $noreg, 16, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.0.shape.col12)
-    LDTILECFG %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $r9 = MOV32ri64 @buf2
-    renamable $r10 = MOV32ri64 32
-    renamable $si = MOV16ri 8
-    renamable $tmm3 = PTILELOADDV renamable $ax, renamable $si, renamable $r9, 1, renamable $r10, 0, $noreg
-    renamable $r8 = MOV32ri64 64
-    PTILESTOREDV renamable $ax, renamable $si, renamable $r11, 1, renamable $r8, 0, $noreg, killed renamable $tmm3
-    VMOVDQA64Zmr %stack.7, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.7)
-    MOV8mi %stack.7, 1, $noreg, 48, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.0.shape.row)
-    MOV16mr %stack.7, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col)
-    LDTILECFG %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $tmm4 = PTILELOADDV renamable $si, renamable $cx, killed renamable $r9, 1, killed renamable $r10, 0, $noreg
-    PTILESTOREDV killed renamable $si, renamable $cx, renamable $rdi, 1, killed renamable $r8, 0, $noreg, killed renamable $tmm4
-    renamable $rsi = LEA64r %stack.0, 1, $noreg, 0, $noreg
-    VMOVDQA64Zmr %stack.0, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.0)
-    renamable $dil = COPY renamable $al
-    MOV8mr %stack.0, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row13)
-    MOV16mr %stack.0, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col14)
-    LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $rsi = MOV32ri64 @buf2
-    renamable $rdi = MOV32ri64 32
-    renamable $tmm5 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rsi, 1, killed renamable $rdi, 0, $noreg
-    renamable $rsi = MOV32ri64 64
-    PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm5
-
-  bb.3.if.end:
-    ; CHECK-LABEL: bb.3.if.end
-    ; tmm0 --> row_offset = 48, col_offset = 16
-    ; tmm1 --> row_offset = 49, col_offset = 18
-    ; tmm2 --> row_offset = 50, col_offset = 20
-    ; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5)
-    ; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6)
-    ; CHECK: MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row)
-    ; CHECK: MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col)
-    ; CHECK: MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row)
-    ; CHECK: MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col)
-    ; CHECK: MOV8mr %stack.4, 1, $noreg, 48, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row)
-    ; CHECK: MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col)
-    ; CHECK: LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0
-    ; CHECK: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg
-    ; CHECK: renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
-    ; CHECK: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg
-    ; CHECK: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
-    ; CHECK: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
-
-    ; tmm6 --> row_offset = 54, col_offset = 28
-    ; CHECK: MOV8mr %stack.5, 1, $noreg, 54, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3)
-    ; CHECK: MOV16mr %stack.5, 1, $noreg, 28, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4)
-    ; CHECK: LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg
-    ; CHECK: renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
-    ; CHECK: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6
-
-    $ax = MOV16rm %stack.16, 1, $noreg, 0, $noreg :: (load 2 from %stack.16)
-    $cx = MOV16rm %stack.17, 1, $noreg, 0, $noreg :: (load 2 from %stack.17)
-    $rdx = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load 8 from %stack.12)
-    $r8 = MOV64rm %stack.15, 1, $noreg, 0, $noreg :: (load 8 from %stack.15)
-    $r9 = MOV64rm %stack.14, 1, $noreg, 0, $noreg :: (load 8 from %stack.14)
-    $r10 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load 8 from %stack.13)
-    renamable $zmm0 = AVX512_512_SET0
-    VMOVDQA64Zmr %stack.4, 1, $noreg, 0, $noreg, renamable $zmm0 :: (store 64 into %ir.4)
-    renamable $sil = COPY renamable $al
-    MOV8mr %stack.4, 1, $noreg, 48, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.0.shape.row5)
-    MOV16mr %stack.4, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col6)
-    MOV8mr %stack.4, 1, $noreg, 49, $noreg, renamable $sil :: (volatile store 1 into %ir.amx.tmm.1.shape.row)
-    MOV16mi %stack.4, 1, $noreg, 18, $noreg, 8 :: (volatile store 2 into %ir.amx.tmm.1.shape.col)
-    MOV8mi %stack.4, 1, $noreg, 50, $noreg, 8 :: (volatile store 1 into %ir.amx.tmm.2.shape.row)
-    MOV16mr %stack.4, 1, $noreg, 20, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.2.shape.col)
-    MOV8mr %stack.4, 1, $noreg, 51, $noreg, killed renamable $sil :: (volatile store 1 into %ir.amx.tmm.3.shape.row)
-    MOV16mr %stack.4, 1, $noreg, 22, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.3.shape.col)
-    LDTILECFG %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $rsi = MOV32ri64 64
-    renamable $di = MOV16ri 8
-    renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r10, 1, renamable $rsi, 0, $noreg
-    renamable $tmm2 = PTILELOADDV renamable $di, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
-    renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r8, 1, renamable $rsi, 0, $noreg
-    renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
-    PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
-    renamable $rsi = LEA64r %stack.5, 1, $noreg, 0, $noreg
-    VMOVDQA64Zmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store 64 into %ir.5)
-    renamable $dil = COPY renamable $al
-    MOV8mr %stack.5, 1, $noreg, 48, $noreg, killed renamable $dil :: (volatile store 1 into %ir.amx.tmm.0.shape.row3)
-    MOV16mr %stack.5, 1, $noreg, 16, $noreg, renamable $cx :: (volatile store 2 into %ir.amx.tmm.0.shape.col4)
-    LDTILECFG killed renamable $rsi, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7
-    renamable $rsi = MOV32ri64 64
-    renamable $tmm6 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
-    renamable $rdx = MOV32ri64 @buf
-    renamable $rsi = MOV32ri64 32
-    PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm6
-    RETQ
-
-...
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
index 2046d84..1145ff7 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
 
 define dso_local void @test_no_bitcast(i32* %A_mem, i32* %B_mem, i32* %C_mem) local_unnamed_addr #0 {
 ; CHECK-LABEL: @test_no_bitcast(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
index 708c7ab..9b05356 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics %s -S | FileCheck %s
 
 define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, i8 *%ptr, i64 %stride, <256 x i32>* %vptr) {
 ; CHECK-LABEL: @test_amx_load_non_O0(
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index ef9a19c7..e5b3584 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -20,7 +20,6 @@
 ; CHECK-NEXT: Expand Atomic instructions
 ; CHECK-NEXT: Lower AMX intrinsics
 ; CHECK-NEXT: Lower AMX type for load/store
-; CHECK-NEXT: Pre AMX Tile Config
 ; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
@@ -46,7 +45,6 @@
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Fast Register Allocator
-; CHECK-NEXT: Fast Tile Register Configure
 ; CHECK-NEXT: X86 Lower Tile Copy
 ; CHECK-NEXT: Bundle Machine CFG Edges
 ; CHECK-NEXT: X86 FP Stackifier
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 8f93ca8..bb0d6b9 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -509,29 +509,19 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
       "mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-", "si-",
       "gcn-", "amdgpu-", "aarch64-", "amdgcn-", "polly-"};
   std::vector<StringRef> PassNameContain = {"ehprepare"};
-  std::vector<StringRef> PassNameExact = {"safe-stack",
-                                          "cost-model",
-                                          "codegenprepare",
-                                          "interleaved-load-combine",
-                                          "unreachableblockelim",
-                                          "verify-safepoint-ir",
-                                          "atomic-expand",
-                                          "hardware-loops",
-                                          "type-promotion",
-                                          "mve-tail-predication",
-                                          "interleaved-access",
- "global-merge", - "pre-isel-intrinsic-lowering", - "expand-reductions", - "indirectbr-expand", - "generic-to-nvvm", - "expandmemcmp", - "loop-reduce", - "lower-amx-type", - "pre-amx-config", - "lower-amx-intrinsics", - "polyhedral-info", - "replace-with-veclib"}; + std::vector PassNameExact = { + "safe-stack", "cost-model", + "codegenprepare", "interleaved-load-combine", + "unreachableblockelim", "verify-safepoint-ir", + "atomic-expand", + "hardware-loops", "type-promotion", + "mve-tail-predication", "interleaved-access", + "global-merge", "pre-isel-intrinsic-lowering", + "expand-reductions", "indirectbr-expand", + "generic-to-nvvm", "expandmemcmp", + "loop-reduce", "lower-amx-type", + "lower-amx-intrinsics", "polyhedral-info", + "replace-with-veclib"}; for (const auto &P : PassNamePrefix) if (Pass.startswith(P)) return true; diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn index 876d051..6184508 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -87,7 +87,6 @@ static_library("LLVMX86CodeGen") { "X86EvexToVex.cpp", "X86ExpandPseudo.cpp", "X86FastISel.cpp", - "X86FastTileConfig.cpp", "X86FixupBWInsts.cpp", "X86FixupLEAs.cpp", "X86FixupSetCC.cpp", -- 2.7.4