// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
// hoisting the move immediate would make it worthwhile with a less optimal
// BEXTR?
- if (!Subtarget->hasTBM() &&
- !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
+ bool PreferBEXTR =
+ Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
+ if (!PreferBEXTR && !Subtarget->hasBMI2())
return nullptr;
// Must have a shift right.
if (Shift + MaskSize > NVT.getSizeInBits())
return nullptr;
- SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
- unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
- unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
+ // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
+ // does not fit into 32 bits. Load folding is not a sufficient reason.
+ if (!PreferBEXTR && MaskSize <= 32)
+ return nullptr;
- // BMI requires the immediate to placed in a register.
- if (!Subtarget->hasTBM()) {
- ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
- MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ SDValue Control;
+ unsigned ROpc, MOpc;
+
+ if (!PreferBEXTR) {
+ assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
+ // If we can't make use of BEXTR then we can't fuse shift+mask stages.
+ // Let's perform the mask first, and apply shift later. Note that we need to
+ // widen the mask to account for the fact that we'll apply shift afterwards!
+ Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
+ ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
+ MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
- New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ } else {
+ // The 'control' of BEXTR has the pattern of:
+ // [15...8 bit][ 7...0 bit] location
+ // [ bit count][ shift] name
+ // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
+ Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+ if (Subtarget->hasTBM()) {
+ ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+ MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ } else {
+ assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
+ // BMI requires the immediate to be placed in a register.
+ ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+ MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ }
}
MachineSDNode *NewNode;
SDValue Input = N0->getOperand(0);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+ SDValue Ops[] = {
+ Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
+ }
+
+ if (!PreferBEXTR) {
+ // We still need to apply the shift.
+ SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
+ unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
+ NewNode =
+ CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
}
return NewNode;
;
; BMI2-SLOW-LABEL: bextr64d:
; BMI2-SLOW: # %bb.0: # %entry
-; BMI2-SLOW-NEXT: shrq $2, %rdi
-; BMI2-SLOW-NEXT: movb $33, %al
+; BMI2-SLOW-NEXT: movl $35, %eax
; BMI2-SLOW-NEXT: bzhiq %rax, %rdi, %rax
+; BMI2-SLOW-NEXT: shrq $2, %rax
; BMI2-SLOW-NEXT: retq
;
; BEXTR-FAST-LABEL: bextr64d:
;
; BMI2-SLOW-LABEL: bextr64d_load:
; BMI2-SLOW: # %bb.0: # %entry
-; BMI2-SLOW-NEXT: movq (%rdi), %rax
+; BMI2-SLOW-NEXT: movl $35, %eax
+; BMI2-SLOW-NEXT: bzhiq %rax, (%rdi), %rax
; BMI2-SLOW-NEXT: shrq $2, %rax
-; BMI2-SLOW-NEXT: movb $33, %cl
-; BMI2-SLOW-NEXT: bzhiq %rcx, %rax, %rax
; BMI2-SLOW-NEXT: retq
;
; BEXTR-FAST-LABEL: bextr64d_load: