[AVX-512] Add support for creating SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_IN...

author Craig Topper <craig.topper@gmail.com>

Tue, 25 Oct 2016 04:00:29 +0000 (04:00 +0000)

committer Craig Topper <craig.topper@gmail.com>

Tue, 25 Oct 2016 04:00:29 +0000 (04:00 +0000)
author Craig Topper <craig.topper@gmail.com>
Tue, 25 Oct 2016 04:00:29 +0000 (04:00 +0000)
committer Craig Topper <craig.topper@gmail.com>
Tue, 25 Oct 2016 04:00:29 +0000 (04:00 +0000)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 3ffebf8..4423b01 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1309,6 +1309,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
        setOperationAction(ISD::FNEARBYINT, VT, Legal);
      }
  
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
+
+    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
      setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
@@ -1509,6 +1516,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
      setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
  
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+
      setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
      if (Subtarget.hasVLX()) {
        setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
@@ -16368,9 +16377,13 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
    return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
  }
  
-static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
-                                             const X86Subtarget &Subtarget,
-                                             SelectionDAG &DAG) {
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
    SDValue In = Op->getOperand(0);
    MVT VT = Op->getSimpleValueType(0);
    MVT InVT = In.getSimpleValueType();
@@ -16385,20 +16398,33 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
    if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
      return SDValue();
    if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
-      !(VT.is256BitVector() && Subtarget.hasInt256()))
+      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+      !(VT.is512BitVector() && Subtarget.hasAVX512()))
      return SDValue();
  
    SDLoc dl(Op);
  
    // For 256-bit vectors, we only need the lower (128-bit) half of the input.
-  if (VT.is256BitVector())
-    In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
-                     MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
-                     In, DAG.getIntPtrConstant(0, dl));
+  // For 512-bit vectors, we need 128-bits or 256-bits.
+  if (VT.getSizeInBits() > 128) {
+    // Input needs to be at least the same number of elements as output, and
+    // at least 128-bits.
+    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+  }
+
+  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
+          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
  
    // SSE41 targets can use the pmovsx* instructions directly.
+  unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+                      X86ISD::VSEXT : X86ISD::VZEXT;
    if (Subtarget.hasSSE41())
-    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+    return DAG.getNode(ExtOpc, dl, VT, In);
+
+  // We should only get here for sign extend.
+  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+         "Unexpected opcode!");
  
    // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
    SDValue Curr = In;
@@ -22075,8 +22101,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
    case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
    case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
    case ISD::SIGN_EXTEND_VECTOR_INREG:
-    return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
    case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
    case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
    case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
@@ -31120,7 +31147,8 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
    // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
    // Also use this if we don't have SSE41 to allow the legalizer do its job.
    if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
-      (VT.is256BitVector() && Subtarget.hasInt256())) {
+      (VT.is256BitVector() && Subtarget.hasInt256()) ||
+      (VT.is512BitVector() && Subtarget.hasAVX512())) {
      SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
      return Opcode == ISD::SIGN_EXTEND
                 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
diff --git a/llvm/test/CodeGen/X86/avx512-pmovxrm.ll b/llvm/test/CodeGen/X86/avx512-pmovxrm.ll

index 91f0afe..7c3965e 100644 (file)
--- a/llvm/test/CodeGen/X86/avx512-pmovxrm.ll
+++ b/llvm/test/CodeGen/X86/avx512-pmovxrm.ll
@@ -38,16 +38,12 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(<16 x i8>* %a) {
  ; X32-LABEL: test_llvm_x86_avx512_pmovsxbq:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    vpsllq $56, %zmm0, %zmm0
-; X32-NEXT:    vpsraq $56, %zmm0, %zmm0
+; X32-NEXT:    vpmovsxbq (%eax), %zmm0
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: test_llvm_x86_avx512_pmovsxbq:
  ; X64:       ## BB#0:
-; X64-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    vpsllq $56, %zmm0, %zmm0
-; X64-NEXT:    vpsraq $56, %zmm0, %zmm0
+; X64-NEXT:    vpmovsxbq (%rdi), %zmm0
  ; X64-NEXT:    retq
    %1 = load <16 x i8>, <16 x i8>* %a, align 1
    %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -139,22 +135,14 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(<16 x i8>* %a) {
  ; X32-LABEL: test_llvm_x86_avx512_pmovzxbq:
  ; X32:       ## BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X32-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT:    vmovdqu (%eax), %xmm0
+; X32-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: test_llvm_x86_avx512_pmovzxbq:
  ; X64:       ## BB#0:
-; X64-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; X64-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; X64-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
  ; X64-NEXT:    retq
    %1 = load <16 x i8>, <16 x i8>* %a, align 1
    %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll

index f332f48..de946c0 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -599,9 +599,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
  ;
  ; AVX512-LABEL: sext_16i8_to_8i64:
  ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT:    vpsllq $56, %zmm0, %zmm0
-; AVX512-NEXT:    vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovsxbq %xmm0, %zmm0
  ; AVX512-NEXT:    retq
  ;
  ; X32-SSE41-LABEL: sext_16i8_to_8i64:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll

index 72dda57..e679cc9 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -461,17 +461,12 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
  ; AVX512F-LABEL: zext_16i8_to_8i64:
  ; AVX512F:       # BB#0: # %entry
  ; AVX512F-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
  ; AVX512F-NEXT:    retq
  ;
  ; AVX512BW-LABEL: zext_16i8_to_8i64:
  ; AVX512BW:       # BB#0: # %entry
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
  ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
  ; AVX512BW-NEXT:    retq
  entry:
    %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
author	Craig Topper <craig.topper@gmail.com>
	Tue, 25 Oct 2016 04:00:29 +0000 (04:00 +0000)
committer	Craig Topper <craig.topper@gmail.com>
	Tue, 25 Oct 2016 04:00:29 +0000 (04:00 +0000)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/X86/avx512-pmovxrm.ll		patch \| blob \| history
llvm/test/CodeGen/X86/vector-sext.ll		patch \| blob \| history
llvm/test/CodeGen/X86/vector-zext.ll		patch \| blob \| history