return DAG.getBitcast(VT, V);
}
- // To obtain pop count for each i16 element, shuffle the byte pop count to get
- // even and odd elements into distinct vectors, add them and zero-extend each
- // i8 elemento into i16, i.e.:
- //
- // B -> pop count per i8
- // W -> pop count per i16
- //
- // Y = shuffle B, undef <0, 2, ...>
- // Z = shuffle B, undef <1, 3, ...>
- // W = zext <... x i8> to <... x i16> (Y + Z)
- //
- // Use a byte shuffle mask that matches PSHUFB.
- //
+ // The only element type left is i16.
assert(EltVT == MVT::i16 && "Unknown how to handle type");
- SDValue Undef = DAG.getUNDEF(ByteVecVT);
- SmallVector<int, 32> MaskA, MaskB;
-
- // We can't use PSHUFB across lanes, so do the shuffle and sum inside each
- // 128-bit lane, and then collapse the result.
- int NumLanes = VecSize / 128;
- assert(VecSize % 128 == 0 && "Must have 16-byte multiple vectors!");
- for (int i = 0; i < NumLanes; ++i) {
- for (int j = 0; j < 8; ++j) {
- MaskA.push_back(i * 16 + j * 2);
- MaskB.push_back(i * 16 + (j * 2) + 1);
- }
- MaskA.append((size_t)8, -1);
- MaskB.append((size_t)8, -1);
- }
-
- SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, V, Undef, MaskA);
- SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, V, Undef, MaskB);
- V = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB);
-
- SmallVector<int, 4> Mask;
- for (int i = 0; i < NumLanes; ++i)
- Mask.push_back(2 * i);
- Mask.append((size_t)NumLanes, -1);
-
- int NumI64Elts = VecSize / 64;
- MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts);
-
- V = DAG.getBitcast(VecI64VT, V);
- V = DAG.getVectorShuffle(VecI64VT, DL, V, DAG.getUNDEF(VecI64VT), Mask);
- V = DAG.getBitcast(ByteVecVT, V);
-
- // Zero extend i8s into i16 elts
- SmallVector<int, 16> ZExtInRegMask;
- for (int i = 0; i < NumElts; ++i) {
- ZExtInRegMask.push_back(i);
- ZExtInRegMask.push_back(2 * NumElts);
- }
- return DAG.getBitcast(
- VT, DAG.getVectorShuffle(ByteVecVT, DL, V,
- getZeroVector(ByteVecVT, Subtarget, DAG, DL),
- ZExtInRegMask));
+ // To obtain the pop count for each i16 element, starting from the pop count
+ // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift the
+ // i16s right by 8. The shifts must be done as i16s because i8 vector shifts
+ // aren't directly supported.
+ SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
+ SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+ DAG.getBitcast(ByteVecVT, V));
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
}
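For intuition, here is a minimal scalar sketch of the new i16 path (hypothetical helper names, not part of this patch): it mirrors the SHL-by-8, byte-wise ADD, SRL-by-8 sequence on a single 16-bit lane. Each byte holds a pop count of at most 8, so the i8 add can never carry across bytes.

#include <cstdint>
#include <cstdio>

// Scalar model of one i16 lane: the input holds per-i8 pop counts
// (high byte's count in bits 15:8, low byte's count in bits 7:0).
static uint16_t horizontalByteSumI16(uint16_t ByteCounts) {
  uint16_t Shl = static_cast<uint16_t>(ByteCounts << 8); // low count moves up
  // Byte-wise add, as PADDB does it: each byte wraps independently.
  uint8_t Lo = static_cast<uint8_t>((ByteCounts & 0xff) + (Shl & 0xff));
  uint8_t Hi = static_cast<uint8_t>(((ByteCounts >> 8) & 0xff) + ((Shl >> 8) & 0xff));
  uint16_t Sum = static_cast<uint16_t>((Hi << 8) | Lo);
  return static_cast<uint16_t>(Sum >> 8); // summed count lands in the low byte
}

int main() {
  // 0x0305: low byte contributed 5 set bits, high byte 3; expect 8.
  std::printf("%u\n", unsigned(horizontalByteSumI16(0x0305)));
  return 0;
}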
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
// At this point, V contains the byte-wise population count, and we are
// merely doing a horizontal sum if necessary to get the wider element
// counts.
- //
- // FIXME: There is a different lowering strategy above for the horizontal sum
- // of byte-wise population counts. This one and that one should be merged,
- // using the fastest of the two for each size.
- MVT ByteVT = MVT::getVectorVT(MVT::i8, VecSize / 8);
- MVT ShiftVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
- V = DAG.getBitcast(ByteVT, V);
- assert(Len <= 64 && "We don't support element sizes of more than 64 bits!");
- assert(isPowerOf2_32(Len) && "Only power of two element sizes supported!");
- for (int i = Len; i > 8; i /= 2) {
- SDValue Shl = DAG.getBitcast(
- ByteVT, GetShift(ISD::SHL, DAG.getBitcast(ShiftVT, V), i / 2));
- V = DAG.getNode(ISD::ADD, DL, ByteVT, V, Shl);
- }
-
- // The high byte now contains the sum of the element bytes. Shift it right
- // (if needed) to make it the low byte.
- V = DAG.getBitcast(VT, V);
- if (Len > 8)
- V = GetShift(ISD::SRL, V, Len - 8);
+ if (EltVT == MVT::i8)
+ return V;
- return V;
+ return LowerHorizontalByteSum(
+ DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+ DAG);
}
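For the wider element types, the updated v2i64 and v4i32 checks below show the horizontal byte sum being done with PSADBW against an all-zeros register. A rough scalar model of that step for one 64-bit lane (illustrative only, hypothetical name):

#include <cstdint>

// PSADBW against zero: the sum of absolute differences |b - 0| over the eight
// bytes of a qword is just the sum of those bytes, i.e. the per-i64 pop count
// when the bytes hold per-i8 pop counts. The hardware leaves this sum in the
// low 16 bits of the 64-bit lane, with the upper bits zeroed.
static uint64_t sadAgainstZero(uint64_t BytePopCounts) {
  uint64_t Sum = 0;
  for (int i = 0; i < 8; ++i)
    Sum += (BytePopCounts >> (8 * i)) & 0xff;
  return Sum;
}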
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $16, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $8, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: psrlq $56, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3-NEXT: psrlq $4, %xmm1
; SSE3-NEXT: paddq %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pxor %xmm0, %xmm0
+; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllq $32, %xmm0
-; SSE3-NEXT: paddb %xmm1, %xmm0
-; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psllq $16, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllq $8, %xmm0
-; SSE3-NEXT: paddb %xmm1, %xmm0
-; SSE3-NEXT: psrlq $56, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psllq $16, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psllq $8, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm0
-; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: psadbw %xmm0, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3-NEXT: psrld $4, %xmm1
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psllq $16, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psllq $8, %xmm0
-; SSE3-NEXT: paddb %xmm2, %xmm0
-; SSE3-NEXT: psrld $24, %xmm0
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE3-NEXT: psadbw %xmm0, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: psadbw %xmm0, %xmm1
+; SSE3-NEXT: packuswb %xmm2, %xmm1
+; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $8, %xmm0
+; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
; SSE3-NEXT: paddw %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllq $8, %xmm0
+; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: paddb %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: paddb %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: paddb %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: paddb %xmm3, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: paddb %xmm0, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: paddb %xmm3, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
ret <8 x i16> %out
; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
%out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
ret <16 x i16> %out