static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
unsigned Opcode,
const TargetTransformInfo &TTI,
- Instruction *&ConvertToShuffle) {
+ Instruction *&ConvertToShuffle,
+ unsigned PreferredExtractIndex) {
assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
isa<ConstantInt>(Ext1->getOperand(1)) &&
"Expected constant extract indexes");
NewCost +=
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- // The more expensive extract will be replaced by a shuffle. If the extracts
- // have the same cost, replace the extract with the higher index.
+ // The more expensive extract will be replaced by a shuffle. If the costs
+ // are equal and there is a preferred extract index, shuffle the opposite
+ // operand. Otherwise, replace the extract with the higher index.
if (Extract0Cost > Extract1Cost)
ConvertToShuffle = Ext0;
else if (Extract1Cost > Extract0Cost)
ConvertToShuffle = Ext1;
+ else if (PreferredExtractIndex == Ext0Index)
+ ConvertToShuffle = Ext1;
+ else if (PreferredExtractIndex == Ext1Index)
+ ConvertToShuffle = Ext0;
else
ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
}
V0->getType() != V1->getType())
return false;
+ // If the scalar value 'I' is going to be re-inserted into a vector, then try
+ // to create an extract to that same element. The extract/insert can be
+ // reduced to a "select shuffle".
+ // TODO: If we add a larger pattern match that starts from an insert, this
+ // probably becomes unnecessary.
+ uint64_t InsertIndex = std::numeric_limits<uint64_t>::max();
+ if (I.hasOneUse())
+ match(I.user_back(), m_InsertElement(m_Value(), m_Value(),
+ m_ConstantInt(InsertIndex)));
+
Instruction *ConvertToShuffle;
- if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle))
+ if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle,
+ InsertIndex))
return false;
if (ConvertToShuffle) {
define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @ins_bo_ext_ext(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i64 3
; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[TMP3]], i32 3
; CHECK-NEXT: ret <4 x float> [[V3]]
;
ret <4 x float> %v3
}
+; TODO: This is conservatively left to extract from the lower index value,
+; but it is likely that extracting from index 3 is the better option.
+
define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @ins_bo_ext_ext_uses(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-LABEL: @PR34724(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[A]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[B]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[B]], [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[B]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP8]], i64 3
; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 1
; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP7]], i32 2
; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP9]], i32 3
; SSE-NEXT: ret <4 x i1> [[R]]
;
; AVX-LABEL: @ins_fcmp_ext_ext(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
-; AVX-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP1]], [[A]]
-; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
+; AVX-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[A]], [[TMP1]]
+; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
; AVX-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 2
; AVX-NEXT: ret <4 x i1> [[R]]
;
define <4 x i1> @ins_icmp_ext_ext(<4 x i32> %a, <4 x i1> %b) {
; CHECK-LABEL: @ins_icmp_ext_ext(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[A]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[TMP1]], [[A]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 3
; CHECK-NEXT: ret <4 x i1> [[R]]
;