bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
void PreprocessISelDAG() override;
+ void PostprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
}
+void X86DAGToDAGISel::PostprocessISelDAG() {
+ // Skip peepholes at -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+  // Attempt to remove vector moves that were inserted to zero upper bits.
+
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
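+  // Look for (SUBREG_TO_REG (VMOV*rr src), sub_xmm/sub_ymm), where the move
+  // exists only so the upper bits of the wider register are known to be zero.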
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
+ continue;
+
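+    // We only care about values placed in the low 128 or 256 bits of a wider
+    // register; the SUBREG_TO_REG asserts that the bits above the subregister
+    // are zero.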
+ unsigned SubRegIdx = N->getConstantOperandVal(2);
+ if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
+ continue;
+
+ SDValue Move = N->getOperand(1);
+ if (!Move.isMachineOpcode())
+ continue;
+
+    // Make sure it's one of the move opcodes we recognize.
+ switch (Move.getMachineOpcode()) {
+ default:
+ continue;
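+    // VEX-encoded 128-bit and 256-bit register-to-register moves.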
+ case X86::VMOVAPDrr: case X86::VMOVUPDrr:
+ case X86::VMOVAPSrr: case X86::VMOVUPSrr:
+ case X86::VMOVDQArr: case X86::VMOVDQUrr:
+ case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
+ case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
+ case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
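+    // EVEX-encoded (AVX-512VL) 128-bit and 256-bit register-to-register moves.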
+ case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
+ case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
+ case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
+ case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
+ case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
+ case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
+ case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
+ case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
+ break;
+ }
+
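+    // The moved value must itself be defined by a real target instruction,
+    // not a generic or unselected opcode.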
+ SDValue In = Move.getOperand(0);
+ if (!In.isMachineOpcode() ||
+ In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
+ continue;
+
+    // The producing instruction is another vector instruction, so it already
+    // zeroes the upper bits and the move can be dropped.
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
+
+ // If the move is now dead, delete it.
+ if (Move.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Move.getNode());
+ }
+}
+
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
v16i32, loadv4i64, sub_ymm>;
}
-// List of opcodes that guaranteed to zero the upper elements of vector regs.
-// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
-// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
-// this difficult. So starting with a couple opcodes used by reduction loops
-// where we explicitly insert zeros.
-class veczeroupper<ValueType vt, RegisterClass RC> :
- PatLeaf<(vt RC:$src), [{
- return N->getOpcode() == X86ISD::VPMADDWD ||
- N->getOpcode() == X86ISD::PSADBW;
- }]>;
-
-def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
-def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
-def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
-def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
-def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
-def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
-
-def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
-def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
-def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
-def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
-def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
-def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
-
-
-// If we can guarantee the upper elements have already been zeroed we can elide
-// an explicit zeroing.
-multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
- ValueType SrcTy, ValueType ZeroTy,
- SubRegIndex SubIdx, PatLeaf Zeroupper> {
- def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
- Zeroupper:$src, (iPTR 0))),
- (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
-}
-
-// 128->256
-defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>;
-
-// 128->512
-defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>;
-
-// 256->512
-defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>;
-defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>;
-defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>;
-defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>;
-defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
-defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>;
-
-
class maskzeroupper<ValueType vt, RegisterClass RC> :
PatLeaf<(vt RC:$src), [{
return isMaskZeroExtended(N);