// TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
ISD::NodeType Opc;
- SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}, true);
if (!Rdx)
return SDValue();
"Reduction doesn't end in an extract from index 0");
EVT VT = ExtElt->getValueType(0);
- EVT VecVT = ExtElt->getOperand(0).getValueType();
+ EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)
return SDValue();
// vXi8 reduction - sum lo/hi halves then use PSADBW.
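  // (PSADBW against an all-zeros vector sums each group of 8 bytes into a
  //  64-bit lane, so once the halves have been folded together a single
  //  psadbw finishes the byte reduction; wraparound in the vector adds is
  //  fine because the final result is truncated to i8 anyway.)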
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
- EVT RdxVT = Rdx.getValueType();
- unsigned HalfSize = RdxVT.getSizeInBits() / 2;
- unsigned HalfElts = RdxVT.getVectorNumElements() / 2;
+ unsigned HalfSize = VecVT.getSizeInBits() / 2;
+ unsigned HalfElts = VecVT.getVectorNumElements() / 2;
SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
+ VecVT = Rdx.getValueType();
}
- assert(Rdx.getValueType() == MVT::v16i8 && "v16i8 reduction expected");
+ assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
    SDValue Hi = DAG.getVectorShuffle(
        MVT::v16i8, DL, Rdx, Rdx,
        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
- VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
- Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
+ VecVT = Rdx.getValueType();
}
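  // Note: 256-bit HADD/FHADD operate on their two 128-bit halves
  // independently, which is why the wide case above is narrowed with an
  // extract + hop of the Hi/Lo halves before the per-128-bit reduction
  // loop below.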
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
- assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
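To make the effect concrete, here is an illustrative IR pattern for the partial_reduction_add_v8i32 test referenced below, modeled on the v16i32 bodies shown later in this excerpt (the actual source of this function is not included here, so treat it as a sketch). Only the low four lanes of the v8i32 input are summed; with AllowPartials set, matchBinOpReduction now accepts this, and with fast-hops it lowers to the two-vphaddd sequence in the AVX-FAST check lines:

define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
  %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = add <8 x i32> %x, %x23
  %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = add <8 x i32> %x0213, %x13
  %r = extractelement <8 x i32> %x0123, i32 0
  ret i32 %r
}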
;
; AVX-FAST-LABEL: partial_reduction_add_v8i32:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: vzeroupper
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
-; AVX1-FAST-LABEL: partial_reduction_add_v16i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-FAST-LABEL: partial_reduction_add_v16i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %eax
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-FAST-LABEL: partial_reduction_add_v16i32:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %eax
-; AVX512-FAST-NEXT: vzeroupper
-; AVX512-FAST-NEXT: retq
+; AVX-FAST-LABEL: partial_reduction_add_v16i32:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vmovd %xmm0, %eax
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x0213 = add <16 x i32> %x, %x23
%x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
;
; AVX-FAST-LABEL: hadd32_8:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: vzeroupper
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
-; AVX1-FAST-LABEL: hadd32_16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-FAST-LABEL: hadd32_16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %eax
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-FAST-LABEL: hadd32_16:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %eax
-; AVX512-FAST-NEXT: vzeroupper
-; AVX512-FAST-NEXT: retq
+; AVX-FAST-LABEL: hadd32_16:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vmovd %xmm0, %eax
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = add <16 x i32> %x225, %x226
%x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
;
; AVX-LABEL: hadd32_8_optsize:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vzeroupper
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: retq
;
-; AVX1-SLOW-LABEL: hadd32_16_optsize:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: hadd32_16_optsize:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-SLOW-LABEL: hadd32_16_optsize:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: hadd32_16_optsize:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovd %xmm0, %eax
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-SLOW-LABEL: hadd32_16_optsize:
-; AVX512-SLOW: # %bb.0:
-; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX512-SLOW-NEXT: vzeroupper
-; AVX512-SLOW-NEXT: retq
-;
-; AVX512-FAST-LABEL: hadd32_16_optsize:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vmovd %xmm0, %eax
-; AVX512-FAST-NEXT: vzeroupper
-; AVX512-FAST-NEXT: retq
+; AVX-LABEL: hadd32_16_optsize:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%x227 = add <16 x i32> %x225, %x226
%x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
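For reference, the AVX-FAST/AVX-SLOW check prefixes above are driven by the test's RUN lines, which are not part of this excerpt; a fast-hops configuration corresponds roughly to the following sketch (the exact triple and attribute list in the real test may differ):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST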