D47985 replaced the old SK_Alternate 'alternating' shuffle kind with SK_Select, which accepts either input operand for each lane and is equivalent to a vector select with a constant condition operand.
This patch updates SLPVectorizer to make full use of the SK_Select shuffle pattern by removing the 'isOdd()' limitation, so the main and alternate opcodes no longer have to alternate strictly between even and odd lanes.
The AArch64 regression will be fixed by D48172.
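For illustration, a minimal hypothetical IR sketch (not taken from this patch, %a and %b are made-up inputs) of the pattern SK_Select describes: each lane of the result may come from either the main-opcode vector or the alternate-opcode vector, with the choice encoded in a constant shuffle mask:

  %add = fadd <4 x float> %a, %b
  %sub = fsub <4 x float> %a, %b
  ; Lanes 0-2 take the fadd result, lane 3 the fsub result; equivalent to a
  ; vector select with constant condition <true, true, true, false>.
  %r = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 7>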
Differential Revision: https://reviews.llvm.org/D48174
llvm-svn: 335130
}
}
-static bool isOdd(unsigned Value) {
- return Value & 1;
-}
-
static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
unsigned CheckedOpcode) {
return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
unsigned AltOpcode = getAltOpcode(Opcode);
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
- if (InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode))
+ if (!sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
return InstructionsState(VL[0], 0, false);
}
}
// Create shuffle to take alternate operations from the vector.
- // Also, gather up odd and even scalar ops to propagate IR flags to
+ // Also, gather up main and alternate scalar ops to propagate IR flags to
// each vector operation.
- ValueList OddScalars, EvenScalars;
+ ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (isOdd(i)) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ unsigned InstOpcode = OpInst->getOpcode();
+ assert(sameOpcodeOrAlt(S.Opcode, AltOpcode, InstOpcode) &&
+ "Unexpected main/alternate opcode");
+ if (InstOpcode == AltOpcode) {
Mask[i] = Builder.getInt32(e + i);
- OddScalars.push_back(E->Scalars[i]);
+ AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
- EvenScalars.push_back(E->Scalars[i]);
+ OpScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
- propagateIRFlags(V0, EvenScalars);
- propagateIRFlags(V1, OddScalars);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
if (Instruction *I = dyn_cast<Instruction>(V))
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP0_0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0_2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1_2]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP0_1]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], <i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP14]])
-; CHECK-NEXT: ret i32 [[TMP15]]
+; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
+; CHECK-NEXT: [[TMP3_0:%.*]] = lshr i32 [[TMP2_0]], 15
+; CHECK-NEXT: [[TMP3_1:%.*]] = lshr i32 [[TMP2_1]], 15
+; CHECK-NEXT: [[TMP3_2:%.*]] = lshr i32 [[TMP2_2]], 15
+; CHECK-NEXT: [[TMP3_3:%.*]] = lshr i32 [[TMP2_3]], 15
+; CHECK-NEXT: [[TMP4_0:%.*]] = and i32 [[TMP3_0]], 65537
+; CHECK-NEXT: [[TMP4_1:%.*]] = and i32 [[TMP3_1]], 65537
+; CHECK-NEXT: [[TMP4_2:%.*]] = and i32 [[TMP3_2]], 65537
+; CHECK-NEXT: [[TMP4_3:%.*]] = and i32 [[TMP3_3]], 65537
+; CHECK-NEXT: [[TMP5_0:%.*]] = mul nuw i32 [[TMP4_0]], 65535
+; CHECK-NEXT: [[TMP5_1:%.*]] = mul nuw i32 [[TMP4_1]], 65535
+; CHECK-NEXT: [[TMP5_2:%.*]] = mul nuw i32 [[TMP4_2]], 65535
+; CHECK-NEXT: [[TMP5_3:%.*]] = mul nuw i32 [[TMP4_3]], 65535
+; CHECK-NEXT: [[TMP6_0:%.*]] = add i32 [[TMP5_0]], [[TMP2_0]]
+; CHECK-NEXT: [[TMP6_1:%.*]] = add i32 [[TMP5_1]], [[TMP2_1]]
+; CHECK-NEXT: [[TMP6_2:%.*]] = add i32 [[TMP5_2]], [[TMP2_2]]
+; CHECK-NEXT: [[TMP6_3:%.*]] = add i32 [[TMP5_3]], [[TMP2_3]]
+; CHECK-NEXT: [[TMP7_0:%.*]] = xor i32 [[TMP6_0]], [[TMP5_0]]
+; CHECK-NEXT: [[TMP7_1:%.*]] = xor i32 [[TMP6_1]], [[TMP5_1]]
+; CHECK-NEXT: [[TMP7_2:%.*]] = xor i32 [[TMP6_2]], [[TMP5_2]]
+; CHECK-NEXT: [[TMP7_3:%.*]] = xor i32 [[TMP6_3]], [[TMP5_3]]
+; CHECK-NEXT: [[REDUCE_0:%.*]] = add i32 [[TMP7_1]], [[TMP7_0]]
+; CHECK-NEXT: [[REDUCE_1:%.*]] = add i32 [[REDUCE_0]], [[TMP7_2]]
+; CHECK-NEXT: [[REDUCE_2:%.*]] = add i32 [[REDUCE_1]], [[TMP7_3]]
+; CHECK-NEXT: ret i32 [[REDUCE_2]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
}
; Function Attrs: nounwind uwtable
-define void @No_faddfsub() #0 {
-; CHECK-LABEL: @No_faddfsub(
+define void @faddfsub_select() #0 {
+; CHECK-LABEL: @faddfsub_select(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NEXT: store float [[ADD]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
-; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store float [[ADD1]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
-; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[TMP4]], [[TMP5]]
-; CHECK-NEXT: store float [[ADD2]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[TMP6]], [[TMP7]]
-; CHECK-NEXT: store float [[SUB]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT: ret void
;
entry: