setOperationAction(ISD::FNEARBYINT, VT, Legal);
}
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
+
+ // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
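+ // (v64i8 only becomes a legal type when BWI is available.)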
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
if (Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
-static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
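+// e.g. (v8i64 (sign_extend_vector_inreg (v64i8 X))) sign extends the low
+// 8 bytes of X into the 8 i64 elements of the result.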
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
MVT InSVT = InVT.getVectorElementType();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
- !(VT.is256BitVector() && Subtarget.hasInt256()))
+ !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+ !(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
- if (VT.is256BitVector())
- In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
- MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
- In, DAG.getIntPtrConstant(0, dl));
+ // For 512-bit vectors, we need the lower 128 or 256 bits of the input.
+ if (VT.getSizeInBits() > 128) {
+ // The extracted input needs at least as many elements as the output, and
+ // must be at least 128 bits (the smallest legal vector size).
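+ // e.g. a v8i64 result from a v64i8 input only needs 8 * 8 == 64 bits of
+ // input, which we round up to a 128-bit extract.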
+ int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+ In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+ }
+
+ assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
+ InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
-// SSE41 targets can use the pmovsx* instructions directly.
+// SSE41 targets can use the pmovsx*/pmovzx* instructions directly.
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
if (Subtarget.hasSSE41())
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ return DAG.getNode(ExtOpc, dl, VT, In);
+
+ // We should only get here for sign extend.
+ assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+ "Unexpected opcode!");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
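+ // e.g. for v16i8 -> v8i16: punpcklbw against undef puts each source byte in
+ // the high byte of an i16 lane, and psraw $8 then fills in the sign bits.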
SDValue Curr = In;
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
- return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+ return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
// ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
-// Also use this if we don't have SSE41 to allow the legalizer do its job.
+// Also use this if we don't have SSE41, to allow the legalizer to do its job.
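+// With AVX512 this also covers 512-bit results: e.g. (v8i64 (sext (v8i8 X)))
+// becomes a SIGN_EXTEND_VECTOR_INREG of v64i8 and lowers to one vpmovsxbq,
+// as the tests below show.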
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasInt256())) {
+ (VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.hasAVX512())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
return Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
; X32-LABEL: test_llvm_x86_avx512_pmovsxbq:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: vpsllq $56, %zmm0, %zmm0
-; X32-NEXT: vpsraq $56, %zmm0, %zmm0
+; X32-NEXT: vpmovsxbq (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxbq:
; X64: ## BB#0:
-; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vpsllq $56, %zmm0, %zmm0
-; X64-NEXT: vpsraq $56, %zmm0, %zmm0
+; X64-NEXT: vpmovsxbq (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; X32-LABEL: test_llvm_x86_avx512_pmovzxbq:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-NEXT: vpand %ymm2, %ymm1, %ymm1
-; X32-NEXT: vpand %ymm2, %ymm0, %ymm0
-; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vmovdqu (%eax), %xmm0
+; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxbq:
; X64: ## BB#0:
-; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; X64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-NEXT: vmovdqu (%rdi), %xmm0
+; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX512F-LABEL: zext_16i8_to_8i64:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: zext_16i8_to_8i64:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>