Added more insertps optimizations

author Filipe Cabecinhas <me@filcab.net>

Mon, 19 May 2014 19:45:57 +0000 (19:45 +0000)

committer Filipe Cabecinhas <me@filcab.net>

Mon, 19 May 2014 19:45:57 +0000 (19:45 +0000)
author Filipe Cabecinhas <me@filcab.net>
Mon, 19 May 2014 19:45:57 +0000 (19:45 +0000)
committer Filipe Cabecinhas <me@filcab.net>
Mon, 19 May 2014 19:45:57 +0000 (19:45 +0000)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index e1db618..cfdaf0f 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7412,6 +7412,23 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                                getShuffleSHUFImmediate(SVOp), DAG);
  }
  
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
  // It is only safe to call this function if isINSERTPSMask is true for
  // this shufflevector mask.
  static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7423,7 +7440,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
    // If we're transferring an i32 from memory to a specific element in a
    // register, we output a generic DAG that will match the PINSRD
    // instruction.
-  // TODO: Optimize for AVX cases too (VINSERTPS)
    MVT VT = SVOp->getSimpleValueType(0);
    MVT EVT = VT.getVectorElementType();
    SDValue V1 = SVOp->getOperand(0);
@@ -7456,17 +7472,10 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
      // Trivial case, when From comes from a load and is only used by the
      // shuffle. Make it use insertps from the vector that we need from that
      // load.
-    SDValue Addr = From.getOperand(1);
-    SDValue NewAddr =
-        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-                    DAG.getConstant(DestIndex * EVT.getStoreSize(),
-                                    Addr.getSimpleValueType()));
-
-    LoadSDNode *Load = cast<LoadSDNode>(From);
      SDValue NewLoad =
-        DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                    DAG.getMachineFunction().getMachineMemOperand(
-                        Load->getMemOperand(), 0, EVT.getStoreSize()));
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
  
      if (EVT == MVT::f32) {
        // Create this as a scalar to vector to match the instruction pattern.
@@ -20281,6 +20290,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert(VT == MVT::v4f32 ||
+         VT == MVT::v4i32 && "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    // Extract the countS bits from the immediate so we can get the proper
+    // address when narrowing the vector load to a specific element.
+    // When the second source op is a memory address, interps doesn't use
+    // countS and just gets an f32 from that address.
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
  // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
  // as "sbb reg,reg", since it can be extended without zext and produces
  // an all-ones bit which is more useful than 0/1 in some cases.
@@ -20584,6 +20620,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
    case ISD::INTRINSIC_WO_CHAIN:
      return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
    }
  
    return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td

index ac28d07..1eb0485 100644 (file)
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6550,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in {
      defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
  }
  
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                   imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
  //===----------------------------------------------------------------------===//
  // SSE4.1 - Round Instructions
  //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll

index 208e384..6069c14 100644 (file)
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
  
  define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
  ; CHECK-LABEL: @blendvb_fallback_v4i32
@@ -23,3 +24,113 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
    %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
    ret <8 x float> %ret
  }
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl    4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl    4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps    $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl    4(%esp), %eax
+; X32: movl    8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: vinsertps    $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl    8(%esp), %eax
+; X32: movl    4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl    4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl    8(%esp), %eax
+; X32: movl    4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK: insertps    $48
+; CHECK: insertps    $48
+; CHECK: insertps    $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll

index e85d8f7..96c5be4 100644 (file)
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
  ; loads from m32.
  define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
  ; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
  ; CHECK: insertps
  entry:
    %source.addr = alloca <4 x float>*, align 8
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll

index 3652d8c..a3c6201 100644 (file)
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -584,3 +584,111 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
    %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
    ret <8 x i16> %ret
  }
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl    4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl    4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps    $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl    4(%esp), %eax
+; X32: movl    8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps    $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl    8(%esp), %eax
+; X32: movl    4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl    4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl    8(%esp), %eax
+; X32: movl    4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps    $48
+; CHECK: insertps    $48
+; CHECK: insertps    $48
+; CHECK: insertps    $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
author	Filipe Cabecinhas <me@filcab.net>
	Mon, 19 May 2014 19:45:57 +0000 (19:45 +0000)
committer	Filipe Cabecinhas <me@filcab.net>
	Mon, 19 May 2014 19:45:57 +0000 (19:45 +0000)
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
llvm/test/CodeGen/X86/avx.ll		patch \| blob \| history
llvm/test/CodeGen/X86/fold-load-vec.ll		patch \| blob \| history
llvm/test/CodeGen/X86/sse41.ll		patch \| blob \| history