[X86] Don't use RCP14 and RSQRT14 for reciprocal estimations or for legacy SSE rcp...

author Craig Topper <craig.topper@intel.com>

Sat, 4 Nov 2017 18:26:41 +0000 (18:26 +0000)

committer Craig Topper <craig.topper@intel.com>

Sat, 4 Nov 2017 18:26:41 +0000 (18:26 +0000)
author Craig Topper <craig.topper@intel.com>
Sat, 4 Nov 2017 18:26:41 +0000 (18:26 +0000)
committer Craig Topper <craig.topper@intel.com>
Sat, 4 Nov 2017 18:26:41 +0000 (18:26 +0000)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 3883415..3a923ce 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24808,9 +24808,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FMAXC:              return "X86ISD::FMAXC";
    case X86ISD::FMINC:              return "X86ISD::FMINC";
    case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
-  case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
    case X86ISD::FRCP:               return "X86ISD::FRCP";
-  case X86ISD::FRCPS:              return "X86ISD::FRCPS";
    case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
    case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
    case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
@@ -24988,9 +24986,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::SELECT:             return "X86ISD::SELECT";
    case X86ISD::SELECTS:            return "X86ISD::SELECTS";
    case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
+  case X86ISD::RCP14:              return "X86ISD::RCP14";
+  case X86ISD::RCP14S:             return "X86ISD::RCP14S";
    case X86ISD::RCP28:              return "X86ISD::RCP28";
    case X86ISD::RCP28S:             return "X86ISD::RCP28S";
    case X86ISD::EXP2:               return "X86ISD::EXP2";
+  case X86ISD::RSQRT14:            return "X86ISD::RSQRT14";
+  case X86ISD::RSQRT14S:           return "X86ISD::RSQRT14S";
    case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
    case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
    case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h

index 17cb976..6aa806d 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -254,7 +254,9 @@ namespace llvm {
        /// Note that these typically require refinement
        /// in order to obtain suitable precision.
        FRSQRT, FRCP,
-      FRSQRTS, FRCPS,
+
+      // AVX-512 reciprocal approximations with a little more precision.
+      RSQRT14, RSQRT14S, RCP14, RCP14S,
  
        // Thread Local Storage.
        TLSADDR,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td

index 17b5e10..4b2b8c9 100644 (file)
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -7362,13 +7362,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
  }
  }
  
-defm VRCP14SS   : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+defm VRCP14SS   : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, f32x_info>,
                    EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD   : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+defm VRCP14SD   : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, f64x_info>,
                    VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS   : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+defm VRSQRT14SS   : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, f32x_info>,
                    EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, f64x_info>,
                    VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
  
  /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
@@ -7414,8 +7414,8 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
    }
  }
  
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14>;
  
  /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
  multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

index c4f34bd..22e9afc 100644 (file)
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -56,8 +56,6 @@ def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
  def X86fandn   : SDNode<"X86ISD::FANDN",     SDTFPBinOp>;
  def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
  def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
-def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS",  SDTFPBinOp>;
-def X86frcp14s : SDNode<"X86ISD::FRCPS",    SDTFPBinOp>;
  def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
  def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
  def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
@@ -498,10 +496,14 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
  def x86vpmadd52l     : SDNode<"X86ISD::VPMADD52L",     SDTIFma, [SDNPCommutative]>;
  def x86vpmadd52h     : SDNode<"X86ISD::VPMADD52H",     SDTIFma, [SDNPCommutative]>;
  
+def X86rsqrt14   : SDNode<"X86ISD::RSQRT14",  SDTFPUnaryOp>;
+def X86rcp14     : SDNode<"X86ISD::RCP14",    SDTFPUnaryOp>;
  def X86rsqrt28   : SDNode<"X86ISD::RSQRT28",  SDTFPUnaryOpRound>;
  def X86rcp28     : SDNode<"X86ISD::RCP28",    SDTFPUnaryOpRound>;
  def X86exp2      : SDNode<"X86ISD::EXP2",     SDTFPUnaryOpRound>;
  
+def X86rsqrt14s  : SDNode<"X86ISD::RSQRT14S",   SDTFPBinOp>;
+def X86rcp14s    : SDNode<"X86ISD::RCP14S",     SDTFPBinOp>;
  def X86rsqrt28s  : SDNode<"X86ISD::RSQRT28S",   SDTFPBinOpRound>;
  def X86rcp28s    : SDNode<"X86ISD::RCP28S",     SDTFPBinOpRound>;
  def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td

index d4676b5..ec57241 100644 (file)
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -3252,9 +3252,9 @@ defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
  // Reciprocal approximations. Note that these typically require refinement
  // in order to obtain suitable precision.
  defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
-             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
  defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
-             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
  
  // There is no f64 version of the reciprocal approximation instructions.
  
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h

index 6f5e41b..448a0ef 100644 (file)
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1428,26 +1428,26 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
    X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
    X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
    X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
    X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
    X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
    X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
    X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
    X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
    X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
    X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll

index b0cf4e3..e508e34 100644 (file)
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -581,15 +581,10 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
  
  
  define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rcp_ps_256:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vrcp14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
-; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rcp_ps_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
    ret <8 x float> %res
  }
@@ -619,15 +614,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
  
  
  define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vrsqrt14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
-; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
    ret <8 x float> %res
  }
diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll

index 44d13db..858a27b 100644 (file)
--- a/llvm/test/CodeGen/X86/avx-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx-schedule.ll
@@ -3982,8 +3982,8 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
  ;
  ; SKX-LABEL: test_rcpps:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT:    vrcp14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps (%rdi), %ymm1 # sched: [11:1.00]
  ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
  ; SKX-NEXT:    retq # sched: [7:1.00]
  ;
@@ -4174,8 +4174,8 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
  ;
  ; SKX-LABEL: test_rsqrtps:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrsqrt14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT:    vrsqrt14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
  ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
  ; SKX-NEXT:    retq # sched: [7:1.00]
  ;
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll

index 0e9d149..56856d2 100644 (file)
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -416,7 +416,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
  ;
  ; SKX-LABEL: v4f32_one_step:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
  ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
  ; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
  ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -533,7 +533,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
  ;
  ; SKX-LABEL: v4f32_two_step:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
  ; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
  ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
  ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -691,7 +691,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
  ;
  ; SKX-LABEL: v8f32_one_step:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
  ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
  ; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
  ; SKX-NEXT:    retq # sched: [7:1.00]
@@ -821,7 +821,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
  ;
  ; SKX-LABEL: v8f32_two_step:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
  ; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
  ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
  ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll

index a263e9d..338f23f 100644 (file)
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -478,7 +478,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
  ;
  ; SKX-LABEL: v4f32_one_step2:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
  ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
  ; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
  ; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
@@ -580,7 +580,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
  ;
  ; SKX-LABEL: v4f32_one_step_2_divs:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
  ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
  ; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
  ; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
@@ -708,7 +708,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
  ;
  ; SKX-LABEL: v4f32_two_step2:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
  ; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
  ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
  ; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -814,7 +814,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
  ;
  ; SKX-LABEL: v8f32_one_step2:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
  ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
  ; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
  ; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
@@ -925,7 +925,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
  ;
  ; SKX-LABEL: v8f32_one_step_2_divs:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
  ; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
  ; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
  ; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
@@ -1067,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
  ;
  ; SKX-LABEL: v8f32_two_step2:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
  ; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
  ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
  ; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
@@ -1124,7 +1124,7 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
  ;
  ; SKX-LABEL: v8f32_no_step:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
  ; SKX-NEXT:    retq # sched: [7:1.00]
    %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
    ret <8 x float> %div
@@ -1183,7 +1183,7 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
  ;
  ; SKX-LABEL: v8f32_no_step2:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
  ; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
  ; SKX-NEXT:    retq # sched: [7:1.00]
    %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll

index 5ba9f9a..de02eba 100644 (file)
--- a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -401,15 +401,10 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
  ; SSE-NEXT:    rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0]
  ; SSE-NEXT:    retl ## encoding: [0xc3]
  ;
-; AVX2-LABEL: test_x86_sse_rcp_ps:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rcp_ps:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
-; SKX-NEXT:    retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rcp_ps:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
    %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
    ret <4 x float> %res
  }
@@ -438,15 +433,10 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
  ; SSE-NEXT:    rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0]
  ; SSE-NEXT:    retl ## encoding: [0xc3]
  ;
-; AVX2-LABEL: test_x86_sse_rsqrt_ps:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rsqrt_ps:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
-; SKX-NEXT:    retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rsqrt_ps:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
    %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
    ret <4 x float> %res
  }
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll

index b5c2bff..d3c9951 100644 (file)
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -2547,8 +2547,8 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
  ;
  ; SKX-LABEL: test_rcpps:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT:    vrcp14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT:    vrcpps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT:    vrcpps (%rdi), %xmm1 # sched: [10:1.00]
  ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
  ; SKX-NEXT:    retq # sched: [7:1.00]
  ;
@@ -2719,8 +2719,8 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
  ;
  ; SKX-LABEL: test_rsqrtps:
  ; SKX:       # BB#0:
-; SKX-NEXT:    vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT:    vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
  ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
  ; SKX-NEXT:    retq # sched: [7:1.00]
  ;
author	Craig Topper <craig.topper@intel.com>
	Sat, 4 Nov 2017 18:26:41 +0000 (18:26 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Sat, 4 Nov 2017 18:26:41 +0000 (18:26 +0000)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
llvm/lib/Target/X86/X86InstrAVX512.td		patch \| blob \| history
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td		patch \| blob \| history
llvm/lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
llvm/lib/Target/X86/X86IntrinsicsInfo.h		patch \| blob \| history
llvm/test/CodeGen/X86/avx-intrinsics-x86.ll		patch \| blob \| history
llvm/test/CodeGen/X86/avx-schedule.ll		patch \| blob \| history
llvm/test/CodeGen/X86/recip-fastmath.ll		patch \| blob \| history
llvm/test/CodeGen/X86/recip-fastmath2.ll		patch \| blob \| history
llvm/test/CodeGen/X86/sse-intrinsics-x86.ll		patch \| blob \| history
llvm/test/CodeGen/X86/sse-schedule.ll		patch \| blob \| history