// converts.
}
- // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
- bool isLE = DAG.getDataLayout().isLittleEndian();
- unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
- if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
- ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
+ if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST) {
+ // The vector index of the LSBs of the source depends on the endianness.
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+
+ // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
+ unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1;
SDValue BCSrc = InVec.getOperand(0);
- if (BCSrc.getValueType().isScalarInteger())
+ if (InVec.hasOneUse() && ConstEltNo->getZExtValue() == BCTruncElt &&
+ VT.isInteger() && BCSrc.getValueType().isScalarInteger())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
}
ret i8 %ext
}
+; TODO: This should have folded to avoid vector ops, but the transform
+; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU
+; codegen better.
+
define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
; X86-LABEL: extractelt_bitcast_extra_use:
; X86: # %bb.0: