bool isConstant = true;
bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
+ unsigned NumDifferentLanes = 0;
+ unsigned NumUndefLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
AllLanesExtractElt = false;
- if (V.isUndef())
+ if (V.isUndef()) {
+ ++NumUndefLanes;
continue;
+ }
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
if (!Value.getNode())
Value = V;
- else if (V != Value)
+ else if (V != Value) {
usesOnlyOneValue = false;
+ ++NumDifferentLanes;
+ }
}
if (!Value.getNode()) {
}
}
+ // If most lanes share a common non-constant value and only a small number
+ // of lanes differ (fewer than half of the non-undef lanes), prefer a DUP of
+ // the common value followed by INSERT_VECTOR_ELT for each differing lane.
+ // When DUP is preferred, skip the constant lane handling below.
+ bool PreferDUPAndInsert =
+ !isConstant && NumDifferentLanes >= 1 &&
+ NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
+ NumDifferentLanes >= NumConstantLanes;
+
// If there was only one constant value used and for more than one lane,
// start by splatting that value, then replace the non-constant lanes. This
// is better than the default, which will perform a separate initialization
// for each lane.
- if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+ if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
// Firstly, try to materialize the splat constant.
SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
Val = ConstantBuildVector(Vec, DAG);
return shuffle;
}
+ if (PreferDUPAndInsert) {
+ // First, build a splat of the common (non-constant) element in every lane.
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0; I < NumElts; ++I)
+ Ops.push_back(Value);
+ SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
+ // Next, insert the elements that do not match the common value.
+ for (unsigned I = 0; I < NumElts; ++I)
+ if (Op.getOperand(I) != Value)
+ NewVector =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
+ Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
+
+ return NewVector;
+ }
+
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
ret void
}
-; TODO: This should jsut be a dup + clearing lane 4.
-
define <16 x i8> @test_insert_v16i8_insert_1(i8 %a) {
; CHECK-LABEL: test_insert_v16i8_insert_1:
; CHECK: // %bb.0:
define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: mov.b v0[0], w0
-; CHECK-NEXT: mov.b v0[1], w0
-; CHECK-NEXT: mov.b v0[2], w0
-; CHECK-NEXT: mov.b v0[3], w0
-; CHECK-NEXT: mov.b v0[4], w0
-; CHECK-NEXT: mov.b v0[6], w0
-; CHECK-NEXT: mov.b v0[7], w0
-; CHECK-NEXT: mov.b v0[10], w0
-; CHECK-NEXT: mov.b v0[11], w0
-; CHECK-NEXT: mov.b v0[12], w0
-; CHECK-NEXT: mov.b v0[13], w0
-; CHECK-NEXT: mov.b v0[14], w0
-; CHECK-NEXT: mov.b v0[15], w0
+; CHECK-NEXT: dup.16b v0, w0
+; CHECK-NEXT: mov.b v0[5], wzr
+; CHECK-NEXT: mov.b v0[9], wzr
; CHECK-NEXT: ret
%v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> , i8 %a, i32 0
%v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) {
; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: mov.b v0[0], w0
-; CHECK-NEXT: mov.b v0[1], w0
+; CHECK-NEXT: dup.16b v0, w0
; CHECK-NEXT: mov.b v0[2], w1
-; CHECK-NEXT: mov.b v0[3], w0
-; CHECK-NEXT: mov.b v0[4], w0
-; CHECK-NEXT: mov.b v0[6], w0
+; CHECK-NEXT: mov.b v0[5], wzr
; CHECK-NEXT: mov.b v0[7], w1
-; CHECK-NEXT: mov.b v0[10], w0
-; CHECK-NEXT: mov.b v0[11], w0
+; CHECK-NEXT: mov.b v0[9], wzr
; CHECK-NEXT: mov.b v0[12], w1
-; CHECK-NEXT: mov.b v0[13], w0
-; CHECK-NEXT: mov.b v0[14], w0
; CHECK-NEXT: mov.b v0[15], w1
; CHECK-NEXT: ret
%v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> , i8 %a, i32 0
define <8 x half> @test_insert_v8f16_insert_1(half %a) {
; CHECK-LABEL: test_insert_v8f16_insert_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT: mov.h v1[0], v0[0]
-; CHECK-NEXT: mov.h v1[1], v0[0]
-; CHECK-NEXT: mov.h v1[2], v0[0]
-; CHECK-NEXT: mov.h v1[3], v0[0]
-; CHECK-NEXT: mov.h v1[4], v0[0]
-; CHECK-NEXT: mov.h v1[5], v0[0]
-; CHECK-NEXT: mov.h v1[6], v0[0]
-; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: adrp x8, .LCPI6_0
+; CHECK-NEXT: dup.8h v0, v0[0]
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI6_0
+; CHECK-NEXT: ld1.h { v0 }[7], [x8]
; CHECK-NEXT: ret
%v.0 = insertelement <8 x half> <half undef, half undef, half undef, half undef, half undef, half undef, half undef, half 0.0>, half %a, i32 0
%v.1 = insertelement <8 x half> %v.0, half %a, i32 1
define <8 x i16> @test_insert_v8i16_insert_2(i16 %a) {
; CHECK-LABEL: test_insert_v8i16_insert_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: mov.h v0[0], w0
-; CHECK-NEXT: mov.h v0[1], w0
-; CHECK-NEXT: mov.h v0[2], w0
-; CHECK-NEXT: mov.h v0[4], w0
-; CHECK-NEXT: mov.h v0[5], w0
-; CHECK-NEXT: mov.h v0[6], w0
+; CHECK-NEXT: dup.8h v0, w0
+; CHECK-NEXT: mov.h v0[3], wzr
+; CHECK-NEXT: mov.h v0[7], wzr
; CHECK-NEXT: ret
%v.0 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 0>, i16 %a, i32 0
%v.1 = insertelement <8 x i16> %v.0, i16 %a, i32 1
define <8 x i16> @test_insert_v8i16_insert_3(i16 %a) {
; CHECK-LABEL: test_insert_v8i16_insert_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: mov.h v0[0], w0
-; CHECK-NEXT: mov.h v0[2], w0
-; CHECK-NEXT: mov.h v0[4], w0
-; CHECK-NEXT: mov.h v0[5], w0
-; CHECK-NEXT: mov.h v0[6], w0
+; CHECK-NEXT: dup.8h v0, w0
+; CHECK-NEXT: mov.h v0[1], wzr
+; CHECK-NEXT: mov.h v0[3], wzr
+; CHECK-NEXT: mov.h v0[7], wzr
; CHECK-NEXT: ret
%v.0 = insertelement <8 x i16> <i16 undef, i16 0, i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 0>, i16 %a, i32 0
%v.2 = insertelement <8 x i16> %v.0, i16 %a, i32 2
define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: mov.s v1[0], v0[0]
-; CHECK-NEXT: mov.s v1[1], v0[0]
-; CHECK-NEXT: mov.s v1[2], v0[0]
-; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: fmov s1, wzr
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: mov.s v0[3], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %a, i32 0
%v.1 = insertelement <4 x float> %v.0, float %a, i32 1