SDValue &Index, SDValue &Disp,
SDValue &Segment);
- // Convience method where P is also root.
+ // Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
+ // Try to fold a vector load. This makes sure the load isn't non-temporal.
+ bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
bool matchBEXTRFromAnd(SDNode *Node);
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
+
+ MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node);
+ MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node,
+ SDValue &InFlag);
};
}
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
+// Check whether the vector load N can be folded into its user. Root is the
+// pattern root and P is the load's direct user. Rejects extending loads and
+// non-temporal loads (folding would silently drop the non-temporal hint),
+// then applies the generic profitability/legality rules before matching the
+// address into Base/Scale/Index/Disp/Segment.
+bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+                                     SDValue &Base, SDValue &Scale,
+                                     SDValue &Index, SDValue &Disp,
+                                     SDValue &Segment) {
+  if (!ISD::isNON_EXTLoad(N.getNode()) ||
+      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
+      !IsProfitableToFold(N, P, Root) ||
+      !IsLegalToFold(N, P, Root, OptLevel))
+    return false;
+
+  // Operand 1 of a LoadSDNode is the base pointer.
+  return selectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
return true;
}
+// Emit a PCMPISTR(I/M) instruction.
+// ROpc and MOpc are the register and memory forms of the instruction; MOpc
+// is used only when MayFoldLoad is true and operand 1 is a foldable vector
+// load hidden behind a bitcast. VT is the type of the first result (v16i8
+// for the mask form, i32 for the index form); the second result is always
+// the i32 flags value.
+MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
+                                             bool MayFoldLoad, const SDLoc &dl,
+                                             MVT VT, SDNode *Node) {
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+  SDValue Imm = Node->getOperand(2);
+  // Rewrite the immediate as a target constant so it is encoded directly
+  // into the instruction rather than selected separately.
+  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+  // If there is a load, it will be behind a bitcast. We don't need to check
+  // alignment on this load.
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
+      tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+                     Tmp3, Tmp4)) {
+    SDValue Load = N1.getOperand(0);
+    // Memory form: the five address components, the immediate, and the
+    // load's input chain.
+    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+                      Load.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
+    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain.
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    // Record the mem-refs
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+    CNode->setMemRefs(MemOp, MemOp + 1);
+    return CNode;
+  }
+
+  // Register form: no chain result is needed.
+  SDValue Ops[] = { N0, N1, Imm };
+  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
+  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+  return CNode;
+}
+
+// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
+// need to emit a second instruction after this one. This is needed since we
+// have two copyToReg nodes glued before this and we need to continue that
+// glue through.
+// ROpc and MOpc are the register and memory forms; MOpc is used only when
+// MayFoldLoad is true and operand 2 is a foldable vector load behind a
+// bitcast. VT is the type of the first result (v16i8 for the mask form, i32
+// for the index form); the second result is the i32 flags value. InFlag
+// carries the incoming glue (EAX/EDX copies) and is updated to this node's
+// glue result on return.
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
+                                             bool MayFoldLoad, const SDLoc &dl,
+                                             MVT VT, SDNode *Node,
+                                             SDValue &InFlag) {
+  SDValue N0 = Node->getOperand(0);
+  SDValue N2 = Node->getOperand(2);
+  SDValue Imm = Node->getOperand(4);
+  // Rewrite the immediate as a target constant so it is encoded directly
+  // into the instruction rather than selected separately.
+  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+  // If there is a load, it will be behind a bitcast. We don't need to check
+  // alignment on this load.
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
+      tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+                     Tmp3, Tmp4)) {
+    SDValue Load = N2.getOperand(0);
+    // Memory form: the five address components, the immediate, the load's
+    // input chain, and the incoming glue.
+    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+                      Load.getOperand(0), InFlag };
+    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
+    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    InFlag = SDValue(CNode, 3);
+    // Update the chain.
+    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    // Record the mem-refs
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+    CNode->setMemRefs(MemOp, MemOp + 1);
+    return CNode;
+  }
+
+  // Register form: no chain result; glue is result 2.
+  SDValue Ops[] = { N0, N2, Imm, InFlag };
+  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
+  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+  InFlag = SDValue(CNode, 2);
+  return CNode;
+}
+
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
}
break;
}
+  case X86ISD::PCMPISTR: {
+    if (!Subtarget->hasSSE42())
+      break;
+
+    // Which of the node's three results (0 = index, 1 = mask, 2 = flags)
+    // are actually used?
+    bool NeedIndex = !SDValue(Node, 0).use_empty();
+    bool NeedMask = !SDValue(Node, 1).use_empty();
+    // We can't fold a load if we are going to make two instructions.
+    bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+    MachineSDNode *CNode;
+    if (NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
+      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+    }
+    // Emit the index form if the index is needed, or if neither index nor
+    // mask is needed (we still need one instruction to produce the flags).
+    if (NeedIndex || !NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
+      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    }
+
+    // Connect the flag usage to the last instruction created. The flags are
+    // result 1 of the machine node (result 0 is the index or mask), matching
+    // the PCMPESTR case below.
+    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+  case X86ISD::PCMPESTR: {
+    if (!Subtarget->hasSSE42())
+      break;
+
+    // Copy the two implicit register inputs. PCMPESTR takes the string
+    // lengths in EAX and EDX; the copies are glued together and the glue is
+    // threaded into the instruction(s) below.
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
+                                          Node->getOperand(1),
+                                          SDValue()).getValue(1);
+    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+                                  Node->getOperand(3), InFlag).getValue(1);
+
+    // Which of the node's three results (0 = index, 1 = mask, 2 = flags)
+    // are actually used?
+    bool NeedIndex = !SDValue(Node, 0).use_empty();
+    bool NeedMask = !SDValue(Node, 1).use_empty();
+    // We can't fold a load if we are going to make two instructions.
+    bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+    MachineSDNode *CNode;
+    if (NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
+      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
+                           InFlag);
+      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+    }
+    // Emit the index form if the index is needed, or if neither index nor
+    // mask is needed (we still need one instruction to produce the flags).
+    if (NeedIndex || !NeedMask) {
+      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
+      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
+      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    }
+    // Connect the flag usage to the last instruction created. The flags are
+    // result 1 of the machine node (result 0 is the index or mask).
+    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+
+
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
- SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
+ SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
- Opcode = X86ISD::PCMPISTRI;
+ Opcode = X86ISD::PCMPISTR;
else
- Opcode = X86ISD::PCMPESTRI;
+ Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
+ case Intrinsic::x86_sse42_pcmpistrm128:
+ case Intrinsic::x86_sse42_pcmpestrm128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
+ Opcode = X86ISD::PCMPISTR;
+ else
+ Opcode = X86ISD::PCMPESTR;
+
+ SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
+ }
+
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
- case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
- case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
+ case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
+ case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
return sinkMBB;
}
-// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
-// or XMM0_V32I8 in AVX all of this code can be replaced with that
-// in the .td file.
-static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
- case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
- case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
- case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
- case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
- case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
- case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
- case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
- }
-
- DebugLoc dl = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
- unsigned NumArgs = MI.getNumOperands();
- for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!(Op.isReg() && Op.isImplicit()))
- MIB.add(Op);
- }
- if (MI.hasOneMemOperand())
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::XMM0);
-
- MI.eraseFromParent();
- return BB;
-}
-
-// FIXME: Custom handling because TableGen doesn't support multiple implicit
-// defs in an instruction pattern
-static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII) {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
- case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
- case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
- case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
- case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
- case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
- case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
- case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
- }
-
- DebugLoc dl = MI.getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
-
- unsigned NumArgs = MI.getNumOperands(); // remove the results
- for (unsigned i = 1; i < NumArgs; ++i) {
- MachineOperand &Op = MI.getOperand(i);
- if (!(Op.isReg() && Op.isImplicit()))
- MIB.add(Op);
- }
- if (MI.hasOneMemOperand())
- MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
- .addReg(X86::ECX);
-
- MI.eraseFromParent();
- return BB;
-}
-
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI.getDebugLoc();
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
- // String/text processing lowering.
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- assert(Subtarget.hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
- return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
-
- // String/text processing lowering.
- case X86::PCMPISTRIREG:
- case X86::VPCMPISTRIREG:
- case X86::PCMPISTRIMEM:
- case X86::VPCMPISTRIMEM:
- case X86::PCMPESTRIREG:
- case X86::VPCMPESTRIREG:
- case X86::PCMPESTRIMEM:
- case X86::VPCMPESTRIMEM:
- assert(Subtarget.hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
- return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
-
// Thread synchronization.
case X86::MONITOR:
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
RDSEED,
// SSE42 string comparisons.
- PCMPISTRI,
- PCMPESTRI,
+  // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
+  // will emit one or two instructions based on which results are used. If
+  // the flags are used along with the index or mask, this allows us to use
+  // a single instruction since we won't have to pick an opcode just for the
+  // flags. Instead we can rely on the DAG to CSE everything and decide at
+  // isel.
+ PCMPISTR,
+ PCMPESTR,
// Test if in transactional execution.
XTEST,
def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
-def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
- SDTCisVT<4, i8>]>;
-def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
- SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
- SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
- SDTCisVT<6, i8>]>;
-
-def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
-def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
-
def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
{ X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
{ X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, 0 },
- { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, 0 },
+ { X86::PCMPESTRMrr, X86::PCMPESTRMrm, 0 },
{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, 0 },
- { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, 0 },
+ { X86::PCMPISTRMrr, X86::PCMPISTRMrm, 0 },
{ X86::PHMINPOSUWrr, X86::PHMINPOSUWrm, TB_ALIGN_16 },
{ X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
{ X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
{ X86::VPABSDrr, X86::VPABSDrm, 0 },
{ X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
- { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
+ { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
- { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
- { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 },
+ { X86::VPCMPISTRMrr, X86::VPCMPISTRMrm, 0 },
+ { X86::VPHMINPOSUWrr, X86::VPHMINPOSUWrm, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
{ X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//
-// Packed Compare Implicit Length Strings, Return Mask
-multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
- imm:$src3))]>;
- def MEM : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
- (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
- Requires<[HasAVX]>, VEX_WIG;
- defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", loadv2i64>,
- Requires<[UseSSE42]>;
-}
-
multiclass pcmpistrm_SS42AI<string asm> {
def rr : SS42AI<0x62, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
- defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
-}
-
-// Packed Compare Explicit Length Strings, Return Mask
-multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, VR128:$src3, u8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
- VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
- def MEM : PseudoI<(outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
- (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
- Requires<[HasAVX]>;
- defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", loadv2i64>,
- Requires<[UseSSE42]>;
+ defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
multiclass SS42AI_pcmpestrm<string asm> {
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
- defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
-}
-
-// Packed Compare Implicit Length Strings, Return Index
-multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
- def MEM : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
- (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
-}
-
-let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
- defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
- Requires<[HasAVX]>, VEX_WIG;
- defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", loadv2i64>,
- Requires<[UseSSE42]>;
+ defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}
multiclass SS42AI_pcmpistri<string asm> {
defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
-// Packed Compare Explicit Length Strings, Return Index
-multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
- def REG : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, VR128:$src3, u8imm:$src5),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
- def MEM : PseudoI<(outs GR32:$dst),
- (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
- [(set GR32:$dst, EFLAGS,
- (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
- imm:$src5))]>;
-}
-
-let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in {
- defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
- Requires<[HasAVX]>;
- defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", loadv2i64>,
- Requires<[UseSSE42]>;
-}
-
multiclass SS42AI_pcmpestri<string asm> {
def rr : SS42AI<0x61, MRMSrcReg, (outs),
(ins VR128:$src1, VR128:$src3, u8imm:$src5),
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
; X32-LABEL: pcmpestri_reg_eq_i8:
%result_ext = zext i16 %result to i32
ret i32 %result_ext
}
+
+define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpestr_index_flag:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ecx, (%edi)
+; X32-NEXT: movl %ebx, (%esi)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %edi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestr_index_flag:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: xorl %r10d, %r10d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: setb %r10b
+; X64-NEXT: movl %ecx, (%r9)
+; X64-NEXT: movl %r10d, (%r8)
+; X64-NEXT: retq
+entry:
+ %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ store i32 %index, i32* %iptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpestr_mask_flag:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0
+; X32-NEXT: setb %bl
+; X32-NEXT: movdqa %xmm0, (%esi)
+; X32-NEXT: movl %ebx, (%ecx)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestr_mask_flag:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
+; X64-NEXT: setb %r9b
+; X64-NEXT: movdqa %xmm0, (%r8)
+; X64-NEXT: movl %r9d, (%rcx)
+; X64-NEXT: retq
+entry:
+ %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind {
+; X32-LABEL: pcmpestr_mask_index:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm2
+; X32-NEXT: movdqa %xmm0, (%edi)
+; X32-NEXT: movl %ecx, (%esi)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %edi
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestr_mask_index:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm0, (%r9)
+; X64-NEXT: movl %ecx, (%r8)
+; X64-NEXT: retq
+entry:
+ %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %index, i32* %iptr
+ ret void
+}
+
+define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpestr_mask_index_flag:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: pcmpestrm $24, %xmm1, %xmm0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm2
+; X32-NEXT: setb %bl
+; X32-NEXT: movdqa %xmm0, (%ebp)
+; X32-NEXT: movl %ecx, (%edi)
+; X32-NEXT: movl %ebx, (%esi)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %edi
+; X32-NEXT: popl %ebx
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestr_mask_index_flag:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rcx, %r9
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestrm $24, %xmm1, %xmm0
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm2
+; X64-NEXT: setb %sil
+; X64-NEXT: movdqa %xmm0, (%r10)
+; X64-NEXT: movl %ecx, (%r9)
+; X64-NEXT: movl %esi, (%r8)
+; X64-NEXT: retq
+entry:
+ %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %index, i32* %iptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_index_flag:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: setb %dl
+; X32-NEXT: movl %ecx, (%esi)
+; X32-NEXT: movl %edx, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistr_index_flag:
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: setb %al
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: movl %eax, (%rsi)
+; X64-NEXT: retq
+entry:
+ %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ store i32 %index, i32* %iptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_mask_flag:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: setb %cl
+; X32-NEXT: movdqa %xmm0, (%edx)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistr_mask_flag:
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
+; X64-NEXT: setb %al
+; X64-NEXT: movdqa %xmm0, (%rdi)
+; X64-NEXT: movl %eax, (%rsi)
+; X64-NEXT: retq
+entry:
+ %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind {
+; X32-LABEL: pcmpistr_mask_index:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movdqa %xmm0, (%edx)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistr_mask_index:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
+; X64-NEXT: movdqa %xmm0, (%rdi)
+; X64-NEXT: movl %ecx, (%rsi)
+; X64-NEXT: retq
+entry:
+ %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %index, i32* %iptr
+ ret void
+}
+
+define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_mask_index_flag:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: pcmpistrm $24, %xmm1, %xmm0
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm2
+; X32-NEXT: setb %bl
+; X32-NEXT: movdqa %xmm0, (%esi)
+; X32-NEXT: movl %ecx, (%edx)
+; X32-NEXT: movl %ebx, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistr_mask_index_flag:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pcmpistrm $24, %xmm1, %xmm0
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm2
+; X64-NEXT: setb %al
+; X64-NEXT: movdqa %xmm0, (%rdi)
+; X64-NEXT: movl %ecx, (%rsi)
+; X64-NEXT: movl %eax, (%rdx)
+; X64-NEXT: retq
+entry:
+ %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %index, i32* %iptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
+define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind {
+; X32-LABEL: pcmpistr_mask_index_flag_load:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
+; X32-NEXT: movdqa %xmm0, %xmm1
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm2
+; X32-NEXT: pcmpistrm $24, %xmm2, %xmm0
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpistri $24, %xmm2, %xmm1
+; X32-NEXT: setb %bl
+; X32-NEXT: movdqa %xmm0, (%esi)
+; X32-NEXT: movl %ecx, (%edx)
+; X32-NEXT: movl %ebx, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistr_mask_index_flag_load:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: movdqu (%rdi), %xmm2
+; X64-NEXT: pcmpistrm $24, %xmm2, %xmm0
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: pcmpistri $24, %xmm2, %xmm1
+; X64-NEXT: setb %dil
+; X64-NEXT: movdqa %xmm0, (%rsi)
+; X64-NEXT: movl %ecx, (%rdx)
+; X64-NEXT: movl %edi, (%rax)
+; X64-NEXT: retq
+entry:
+ %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1
+ %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+ store <16 x i8> %mask, <16 x i8>* %mptr
+ store i32 %index, i32* %iptr
+ store i32 %flag, i32* %fptr
+ ret void
+}
+
+; Make sure we don't fold nontemporal loads.
+define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_nontemporal:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movntdqa (%ecx), %xmm1
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_nontemporal:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movntdqa (%rsi), %xmm1
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: setb %sil
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+entry:
+ %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0
+ %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+ ret i32 %flag
+}
+
+!0 = !{ i32 1 }