[SDAG] Make the DAGCombine worklist not grow endlessly due to duplicate insertions.

author Chandler Carruth <chandlerc@gmail.com>

Wed, 23 Jul 2014 07:08:53 +0000 (07:08 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Wed, 23 Jul 2014 07:08:53 +0000 (07:08 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Wed, 23 Jul 2014 07:08:53 +0000 (07:08 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Wed, 23 Jul 2014 07:08:53 +0000 (07:08 +0000)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index c9a5919aaa87f31548bf72c2455579688fdafa13..35d6256a24662c7d23fc0716d89089af7c7c64c4 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18,6 +18,7 @@
  
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -87,25 +88,21 @@ namespace {
      bool LegalTypes;
      bool ForCodeSize;
  
-    // Worklist of all of the nodes that need to be simplified.
-    //
-    // This has the semantics that when adding to the worklist,
-    // the item added must be next to be processed. It should
-    // also only appear once. The naive approach to this takes
-    // linear time.
-    //
-    // To reduce the insert/remove time to logarithmic, we use
-    // a set and a vector to maintain our worklist.
-    //
-    // The set contains the items on the worklist, but does not
-    // maintain the order they should be visited.
-    //
-    // The vector maintains the order nodes should be visited, but may
-    // contain duplicate or removed nodes. When choosing a node to
-    // visit, we pop off the order stack until we find an item that is
-    // also in the contents set. All operations are O(log N).
-    SmallPtrSet<SDNode*, 64> WorklistContents;
-    SmallVector<SDNode*, 64> WorklistOrder;
+    /// \brief Worklist of all of the nodes that need to be simplified.
+    ///
+    /// This must behave as a stack -- new nodes to process are pushed onto the
+    /// back and when processing we pop off of the back.
+    ///
+    /// The worklist will not contain duplicates but may contain null entries
+    /// due to nodes being deleted from the underlying DAG.
+    SmallVector<SDNode *, 64> Worklist;
+
+    /// \brief Mapping from an SDNode to its position on the worklist.
+    ///
+    /// This is used to find and remove nodes from the worklist (by nulling
+    /// them) when they are deleted from the underlying DAG. It relies on
+    /// stable indices of nodes within the worklist.
+    DenseMap<SDNode *, unsigned> WorklistMap;
  
      // AA - Used for DAG load/store alias analysis.
      AliasAnalysis &AA;
@@ -132,16 +129,24 @@ namespace {
        if (N->getOpcode() == ISD::HANDLENODE)
          return;
  
-      WorklistContents.insert(N);
-      WorklistOrder.push_back(N);
+      if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
+        Worklist.push_back(N);
      }
  
      /// removeFromWorklist - remove all instances of N from the worklist.
      ///
      void removeFromWorklist(SDNode *N) {
-      WorklistContents.erase(N);
+      auto It = WorklistMap.find(N);
+      if (It == WorklistMap.end())
+        return; // Not in the worklist.
+
+      // Null out the entry rather than erasing it to avoid a linear operation.
+      Worklist[It->second] = nullptr;
+      WorklistMap.erase(It);
      }
  
+    bool recursivelyDeleteUnusedNodes(SDNode *N);
+
      SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                        bool AddTo = true);
  
@@ -1072,6 +1077,35 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
    return false;
  }
  
+/// \brief Recursively delete a node which has no uses and any operands for
+/// which it is the only use.
+///
+/// Note that this both deletes the nodes and removes them from the worklist.
+/// It also adds any nodes who have had a user deleted to the worklist as they
+/// may now have only one use and be subject to other combines.
+bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
+  if (!N->use_empty())
+    return false;
+
+  SmallSetVector<SDNode *, 16> Nodes;
+  Nodes.insert(N);
+  do {
+    N = Nodes.pop_back_val();
+    if (!N)
+      continue;
+
+    if (N->use_empty()) {
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        Nodes.insert(N->getOperand(i).getNode());
+
+      removeFromWorklist(N);
+      DAG.DeleteNode(N);
+    } else {
+      AddToWorklist(N);
+    }
+  } while (!Nodes.empty());
+  return true;
+}
  
  //===----------------------------------------------------------------------===//
  //  Main DAG Combiner implementation
@@ -1099,27 +1133,25 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
  
    // while the worklist isn't empty, find a node and
    // try and combine it.
-  while (!WorklistContents.empty()) {
+  while (!WorklistMap.empty()) {
      SDNode *N;
-    // The WorklistOrder holds the SDNodes in order, but it may contain
-    // duplicates.
-    // In order to avoid a linear scan, we use a set (O(log N)) to hold what the
-    // worklist *should* contain, and check the node we want to visit is should
-    // actually be visited.
+    // The Worklist holds the SDNodes in order, but it may contain null entries.
      do {
-      N = WorklistOrder.pop_back_val();
-    } while (!WorklistContents.erase(N));
+      N = Worklist.pop_back_val();
+    } while (!N);
+
+    bool GoodWorklistEntry = WorklistMap.erase(N);
+    (void)GoodWorklistEntry;
+    assert(GoodWorklistEntry &&
+           "Found a worklist entry without a corresponding map entry!");
  
      // If N has no uses, it is dead.  Make sure to revisit all N's operands once
      // N is deleted from the DAG, since they too may now be dead or may have a
      // reduced number of uses, allowing other xforms.
-    if (N->use_empty()) {
-      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-        AddToWorklist(N->getOperand(i).getNode());
-
-      DAG.DeleteNode(N);
+    if (recursivelyDeleteUnusedNodes(N))
        continue;
-    }
+
+    WorklistRemover DeadNodes(*this);
  
      SDValue RV = combine(N);
  
@@ -1147,7 +1179,6 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
  
      // Transfer debug value.
      DAG.TransferDbgValues(SDValue(N, 0), RV);
-    WorklistRemover DeadNodes(*this);
      if (N->getNumValues() == RV.getNode()->getNumValues())
        DAG.ReplaceAllUsesWith(N, RV.getNode());
      else {
@@ -1161,23 +1192,11 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
      AddToWorklist(RV.getNode());
      AddUsersToWorklist(RV.getNode());
  
-    // Add any uses of the old node to the worklist in case this node is the
-    // last one that uses them.  They may become dead after this node is
-    // deleted.
-    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-      AddToWorklist(N->getOperand(i).getNode());
-
      // Finally, if the node is now dead, remove it from the graph.  The node
      // may not be dead if the replacement process recursively simplified to
-    // something else needing this node.
-    if (N->use_empty()) {
-      // Nodes can be reintroduced into the worklist.  Make sure we do not
-      // process a node that has been replaced.
-      removeFromWorklist(N);
-
-      // Finally, since the node is now dead, remove it from the graph.
-      DAG.DeleteNode(N);
-    }
+    // something else needing this node. This will also take care of adding any
+    // operands which have lost a user to the worklist.
+    recursivelyDeleteUnusedNodes(N);
    }
  
    // If the root changed (e.g. it was a dead load, update the root).
diff --git a/llvm/test/CodeGen/ARM/fold-stack-adjust.ll b/llvm/test/CodeGen/ARM/fold-stack-adjust.ll

index eb0120f7c1bbce73f6edf1ec4ed2cc5b7a5e04d7..1d10464017cee398a1e7f82312c41350b30659c9 100644 (file)
--- a/llvm/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/llvm/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -167,9 +167,9 @@ end:
  define void @test_varsize(...) minsize {
  ; CHECK-T1-LABEL: test_varsize:
  ; CHECK-T1: sub        sp, #16
-; CHECK-T1: push       {r2, r3, r4, r5, r7, lr}
+; CHECK-T1: push       {r5, r6, r7, lr}
  ; ...
-; CHECK-T1: pop        {r2, r3, r4, r5, r7}
+; CHECK-T1: pop        {r2, r3, r7}
  ; CHECK-T1: pop        {r3}
  ; CHECK-T1: add        sp, #16
  ; CHECK-T1: bx r3
diff --git a/llvm/test/CodeGen/ARM/sxt_rot.ll b/llvm/test/CodeGen/ARM/sxt_rot.ll

index 5ddea2ec13dc0494c41d0b90c9b33b4c02c33526..41626910c3b151721f7a64f2d55ae52fb7caa330 100644 (file)
--- a/llvm/test/CodeGen/ARM/sxt_rot.ll
+++ b/llvm/test/CodeGen/ARM/sxt_rot.ll
@@ -9,7 +9,8 @@ define i32 @test0(i8 %A) {
  
  define signext i8 @test1(i32 %A) {
  ; CHECK: test1
-; CHECK: sxtb r0, r0, ror #8
+; CHECK: lsr r0, r0, #8
+; CHECK: sxtb r0, r0
    %B = lshr i32 %A, 8
    %C = shl i32 %A, 24
    %D = or i32 %B, %C
diff --git a/llvm/test/CodeGen/PowerPC/complex-return.ll b/llvm/test/CodeGen/PowerPC/complex-return.ll

index 8a6adaee555602e0057b1d896355d231c03a4f01..9d25e619d2e578d48489d05a8a9ab5e0e9912012 100644 (file)
--- a/llvm/test/CodeGen/PowerPC/complex-return.ll
+++ b/llvm/test/CodeGen/PowerPC/complex-return.ll
@@ -24,10 +24,10 @@ entry:
  }
  
  ; CHECK-LABEL: foo:
-; CHECK: lfd 3
-; CHECK: lfd 4
  ; CHECK: lfd 1
  ; CHECK: lfd 2
+; CHECK: lfd 3
+; CHECK: lfd 4
  
  define { float, float } @oof() nounwind {
  entry:
diff --git a/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll

index da637cd2548b1a5d65e6bfd825e86cd55b7ce739..c510e36cb41375e8ffd9aba4753baf87e87ea36b 100644 (file)
--- a/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll
+++ b/llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll
@@ -35,7 +35,7 @@ if.then9.i39:                                     ; preds = %if.end7.i37
    br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
  
  ; CHECK: .LBB0_7:
-; CHECK:       beq 1, .LBB0_10
+; CHECK:       bne 1, .LBB0_10
  ; CHECK:       beq 0, .LBB0_10
  ; CHECK: .LBB0_9:
  
diff --git a/llvm/test/CodeGen/R600/r600-export-fix.ll b/llvm/test/CodeGen/R600/r600-export-fix.ll

index 73bc0635ab2193ccc47db86cc493c009a4c2c0e0..7d728563207850b2f65eb55ed919af97c8ea491c 100644 (file)
--- a/llvm/test/CodeGen/R600/r600-export-fix.ll
+++ b/llvm/test/CodeGen/R600/r600-export-fix.ll
@@ -3,9 +3,9 @@
  ;CHECK:        EXPORT T{{[0-9]}}.XYZW
  ;CHECK:        EXPORT T{{[0-9]}}.0000
  ;CHECK: EXPORT T{{[0-9]}}.0000
-;CHECK: EXPORT T{{[0-9]}}.0XZW
+;CHECK: EXPORT T{{[0-9]}}.0XYZ
  ;CHECK: EXPORT T{{[0-9]}}.XYZW
-;CHECK: EXPORT T{{[0-9]}}.YX00
+;CHECK: EXPORT T{{[0-9]}}.YZ00
  ;CHECK: EXPORT T{{[0-9]}}.0000
  ;CHECK: EXPORT T{{[0-9]}}.0000
  
diff --git a/llvm/test/CodeGen/R600/swizzle-export.ll b/llvm/test/CodeGen/R600/swizzle-export.ll

index 16c3f191935ce156465b68b70d7d1129822e5dfb..0a68f76c7656aaa3efa19e21ae7f3b3c8a0a9798 100644 (file)
--- a/llvm/test/CodeGen/R600/swizzle-export.ll
+++ b/llvm/test/CodeGen/R600/swizzle-export.ll
@@ -94,7 +94,7 @@ main_body:
  
  ; EG-CHECK: @main2
  ; EG-CHECK: T{{[0-9]+}}.XY__
-; EG-CHECK: T{{[0-9]+}}.YXZ0
+; EG-CHECK: T{{[0-9]+}}.ZXY0
  
  define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
  main_body:
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll

index cef3490e2a38df41ecacfc932f914ce0567756f3..5e0977efd20a003856137c622cd605d3d93b6987 100644 (file)
--- a/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -10,7 +10,8 @@ define i32 @test0(i8 %A) {
  
  define signext i8 @test1(i32 %A)  {
  ; CHECK: test1
-; CHECK: sxtb.w r0, r0, ror #8
+; CHECK: lsrs r0, r0, #8
+; CHECK: sxtb r0, r0
         %B = lshr i32 %A, 8
         %C = shl i32 %A, 24
         %D = or i32 %B, %C
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll

index bcd4a0fa38ffbef031b79bf3e0da68f88badb0fa..06e78d5b0b8f2384c0ddb04592545af433c4faba 100644 (file)
--- a/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -25,7 +25,7 @@ define zeroext i32 @test2(i32 %A.u, i32 %B.u)  {
  
  define zeroext i32 @test3(i32 %A.u)  {
  ; A8: test3
-; A8: uxth.w r0, r0, ror #8
+; A8: ubfx  r0, r0, #8, #16
      %B.u = lshr i32 %A.u, 8
      %C.u = shl i32 %A.u, 24
      %D.u = or i32 %B.u, %C.u
diff --git a/llvm/test/CodeGen/X86/avx512-zext-load-crash.ll b/llvm/test/CodeGen/X86/avx512-zext-load-crash.ll

deleted file mode 100644 (file)

index 07ded13..0000000
--- a/llvm/test/CodeGen/X86/avx512-zext-load-crash.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-
-define <8 x i16> @test_zext_load() {
-  ; CHECK: vmovq
-entry:
-  %0 = load <2 x i16> ** undef, align 8
-  %1 = getelementptr inbounds <2 x i16>* %0, i64 1
-  %2 = load <2 x i16>* %0, align 1
-  %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %4 = load <2 x i16>* %1, align 1
-  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x i16> %6
-}
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll

index 2681c109ef5db45ad965ff141fa0422927cc9782..cc40bcf2946e33479b96d0bcc3ed2d68de3a1760 100644 (file)
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -237,44 +237,6 @@ exit:
    ret i32 %base
  }
  
-define void @test_loop_rotate_reversed_blocks() {
-; This test case (greatly reduced from an Olden bencmark) ensures that the loop
-; rotate implementation doesn't assume that loops are laid out in a particular
-; order. The first loop will get split into two basic blocks, with the loop
-; header coming after the loop latch.
-;
-; CHECK: test_loop_rotate_reversed_blocks
-; CHECK: %entry
-; Look for a jump into the middle of the loop, and no branches mid-way.
-; CHECK: jmp
-; CHECK: %loop1
-; CHECK-NOT: j{{\w*}} .LBB{{.*}}
-; CHECK: %loop1
-; CHECK: je
-
-entry:
-  %cond1 = load volatile i1* undef
-  br i1 %cond1, label %loop2.preheader, label %loop1
-
-loop1:
-  call i32 @f()
-  %cond2 = load volatile i1* undef
-  br i1 %cond2, label %loop2.preheader, label %loop1
-
-loop2.preheader:
-  call i32 @f()
-  %cond3 = load volatile i1* undef
-  br i1 %cond3, label %exit, label %loop2
-
-loop2:
-  call i32 @f()
-  %cond4 = load volatile i1* undef
-  br i1 %cond4, label %exit, label %loop2
-
-exit:
-  ret void
-}
-
  define i32 @test_loop_align(i32 %i, i32* %a) {
  ; Check that we provide basic loop body alignment with the block placement
  ; pass.
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll

index 21225e340826bca17f1104f61513b03cd13860e1..fd07a3f551001c761981bd9d1889fd279ae2bedb 100644 (file)
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -31,6 +31,7 @@ entry:
  ; CHECK-LABEL: test3:
  ; CHECK: movzbl  8(%esp), %eax
  ; CHECK-NEXT: imull    $171, %eax
+; CHECK-NEXT: andl $65024, %eax
  ; CHECK-NEXT: shrl     $9, %eax
  ; CHECK-NEXT: ret
  }
@@ -56,9 +57,10 @@ entry:
    %div = sdiv i16 %x, 10
    ret i16 %div
  ; CHECK-LABEL: test6:
-; CHECK: imull $26215, %eax, %ecx
-; CHECK: sarl $18, %ecx
-; CHECK: shrl $15, %eax
+; CHECK: imull $26215, %eax
+; CHECK: movl %eax, %ecx
+; CHECK: shrl $31, %ecx
+; CHECK: sarl $18, %eax
  }
  
  define i32 @test7(i32 %x) nounwind {
diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll

deleted file mode 100644 (file)

index 1d315ff..0000000
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
-; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
-
-; i386 test has been disabled when scheduler 2-addr hack is disabled.
-
-; This testcase shouldn't need to spill the -1 value,
-; so it should just use pcmpeqd to materialize an all-ones vector.
-; For i386, cp load of -1 are folded.
-
-; With -regalloc=greedy, the live range is split before spilling, so the first
-; pcmpeq doesn't get folded as a constant pool load.
-
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-
-; X86-64: pcmpeqd
-; X86-64-NOT: pcmpeqd
-
-       %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
-       %struct._cl_image_format_t = type <{ i32, i32, i32 }>
-       %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
-
-define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
-entry:
-       %tmp3.i = load i32* null                ; <i32> [#uses=1]
-       %cmp = icmp sgt i32 %tmp3.i, 200                ; <i1> [#uses=1]
-       br i1 %cmp, label %forcond, label %ifthen
-
-ifthen:                ; preds = %entry
-       ret void
-
-forcond:               ; preds = %entry
-       %tmp3.i536 = load i32* null             ; <i32> [#uses=1]
-       %cmp12 = icmp slt i32 0, %tmp3.i536             ; <i1> [#uses=1]
-       br i1 %cmp12, label %forbody, label %afterfor
-
-forbody:               ; preds = %forcond
-       %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float>             ; <<4 x float>> [#uses=1]
-       %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer            ; <<4 x float>> [#uses=1]
-       %mul257 = fmul <4 x float> %mul233, zeroinitializer             ; <<4 x float>> [#uses=1]
-       %mul275 = fmul <4 x float> %mul257, zeroinitializer             ; <<4 x float>> [#uses=1]
-       %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind               ; <<4 x float>> [#uses=1]
-       %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32>             ; <<4 x i32>> [#uses=0]
-       %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float>             ; <<4 x float>> [#uses=1]
-       %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind          ; <<4 x i32>> [#uses=1]
-       %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind            ; <<4 x float>> [#uses=1]
-       %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70          ; <<4 x float>> [#uses=2]
-       %mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78             ; <<4 x float>> [#uses=1]
-       %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 >          ; <<4 x float>> [#uses=1]
-       %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78         ; <<4 x float>> [#uses=1]
-       %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >          ; <<4 x float>> [#uses=1]
-       %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32>          ; <<4 x i32>> [#uses=1]
-       %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer         ; <<4 x i32>> [#uses=1]
-       %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float>               ; <<4 x float>> [#uses=1]
-       %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer         ; <<4 x float>> [#uses=1]
-       %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32>          ; <<4 x i32>> [#uses=1]
-       %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer                ; <<4 x i32>> [#uses=1]
-       %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >         ; <<4 x i32>> [#uses=1]
-       %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102               ; <<4 x i32>> [#uses=1]
-       %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float>               ; <<4 x float>> [#uses=1]
-       %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind                ; <<4 x float>> [#uses=1]
-       %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind           ; <<4 x float>> [#uses=1]
-       %sub140.i = fsub <4 x float> zeroinitializer, %tmp80            ; <<4 x float>> [#uses=1]
-       %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32>                ; <<4 x i32>> [#uses=1]
-       %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 >              ; <<4 x i32>> [#uses=0]
-       %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i         ; <<4 x float>> [#uses=1]
-       %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >              ; <<4 x float>> [#uses=1]
-       %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32>              ; <<4 x i32>> [#uses=1]
-       %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer             ; <<4 x i32>> [#uses=1]
-       %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float>           ; <<4 x float>> [#uses=1]
-       %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer             ; <<4 x float>> [#uses=1]
-       %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32>                ; <<4 x i32>> [#uses=0]
-       %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32>              ; <<4 x i32>> [#uses=1]
-       %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer             ; <<4 x i32>> [#uses=1]
-       %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32>               ; <<4 x i32>> [#uses=1]
-       %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 >              ; <<4 x i32>> [#uses=1]
-       %orps203.i = or <4 x i32> %andnps192.i, %xorps.i                ; <<4 x i32>> [#uses=1]
-       %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float>             ; <<4 x float>> [#uses=1]
-       %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer            ; <<4 x float>> [#uses=1]
-       %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer            ; <<4 x float>> [#uses=2]
-       %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer               ; <<4 x float>> [#uses=1]
-       %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind               ; <<4 x float>> [#uses=1]
-       %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32>                ; <<4 x i32>> [#uses=1]
-       %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer             ; <<4 x i32>> [#uses=1]
-       %orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer           ; <<4 x i32>> [#uses=1]
-       %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>             ; <<4 x float>> [#uses=1]
-       %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind               ; <<4 x float>> [#uses=1]
-       %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>          ; <<4 x i32>> [#uses=1]
-       %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32>         ; <<4 x i32>> [#uses=2]
-       %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4             ; <<4 x i32>> [#uses=1]
-       %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32>         ; <<4 x i32>> [#uses=1]
-       %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 >                ; <<4 x i32>> [#uses=1]
-       %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7               ; <<4 x i32>> [#uses=1]
-       %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5           ; <<4 x i32>> [#uses=1]
-       %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float>              ; <<4 x float>> [#uses=1]
-       %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32>           ; <<4 x i32>> [#uses=1]
-       %andps.i = and <4 x i32> %bitcast.i, zeroinitializer            ; <<4 x i32>> [#uses=1]
-       %orps.i = or <4 x i32> zeroinitializer, %andps.i                ; <<4 x i32>> [#uses=1]
-       %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float>         ; <<4 x float>> [#uses=1]
-       call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
-       unreachable
-
-afterfor:              ; preds = %forcond
-       ret void
-}
-
-declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/narrow-shl-load.ll b/llvm/test/CodeGen/X86/narrow-shl-load.ll

index 30387925b34d76a01d99bd8c29ca338671ba92c6..5175bfc2bcb1d6a0a9e56d1d0749316c4563ddf6 100644 (file)
--- a/llvm/test/CodeGen/X86/narrow-shl-load.ll
+++ b/llvm/test/CodeGen/X86/narrow-shl-load.ll
@@ -30,40 +30,6 @@ while.end:                                        ; preds = %while.cond
    ret void
  }
  
-
-; DAGCombiner shouldn't fold the sdiv (ashr) away.
-; rdar://8636812
-; CHECK-LABEL: test2:
-; CHECK:   sarl
-
-define i32 @test2() nounwind {
-entry:
-  %i = alloca i32, align 4
-  %j = alloca i8, align 1
-  store i32 127, i32* %i, align 4
-  store i8 0, i8* %j, align 1
-  %tmp3 = load i32* %i, align 4
-  %mul = mul nsw i32 %tmp3, 2
-  %conv4 = trunc i32 %mul to i8
-  %conv5 = sext i8 %conv4 to i32
-  %div6 = sdiv i32 %conv5, 2
-  %conv7 = trunc i32 %div6 to i8
-  %conv9 = sext i8 %conv7 to i32
-  %cmp = icmp eq i32 %conv9, -1
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  ret i32 0
-
-if.end:                                           ; preds = %entry
-  call void @abort() noreturn
-  unreachable
-}
-
-declare void @abort() noreturn
-
-declare void @exit(i32) noreturn
-
  ; DAG Combiner can't fold this into a load of the 1'th byte.
  ; PR8757
  define i32 @test3(i32 *%P) nounwind ssp {
diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll

index 7557f255658dc4f1e91a67f6f2e056f2d554fecc..51f6fb0dbbe08480e6eab660f3aed90dd344432a 100644 (file)
--- a/llvm/test/CodeGen/X86/store-narrow.ll
+++ b/llvm/test/CodeGen/X86/store-narrow.ll
@@ -34,8 +34,8 @@ entry:
  ; X64: movb    %sil, 1(%rdi)
  
  ; X32-LABEL: test2:
-; X32: movb    8(%esp), %[[REG:[abcd]l]]
-; X32: movb    %[[REG]], 1(%{{.*}})
+; X32: movzbl  8(%esp), %e[[REG:[abcd]]]x
+; X32: movb    %[[REG]]l, 1(%{{.*}})
  }
  
  define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
diff --git a/llvm/test/CodeGen/X86/vec_extract-sse4.ll b/llvm/test/CodeGen/X86/vec_extract-sse4.ll

index 747c8a8e8d02e8d46854411ea55847cbf110e9d3..51f7a9898fa053da4a730459b0c22dabf9a5f5b8 100644 (file)
--- a/llvm/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-sse4.ll
@@ -4,8 +4,8 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
  ; CHECK-LABEL: @t1
  ; CHECK:         movl 4(%esp), %[[R0:e[abcd]x]]
  ; CHECK-NEXT:    movl 8(%esp), %[[R1:e[abcd]x]]
-; CHECK-NEXT:    movl 12(%[[R1]]), %[[R2:e[abcd]x]]
-; CHECK-NEXT:    movl %[[R2]], (%[[R0]])
+; CHECK-NEXT:    movss 12(%[[R1]]), %[[R2:xmm.*]]
+; CHECK-NEXT:    movss %[[R2]], (%[[R0]])
  ; CHECK-NEXT:    retl
  
         %X = load <4 x float>* %P1
author	Chandler Carruth <chandlerc@gmail.com>
	Wed, 23 Jul 2014 07:08:53 +0000 (07:08 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Wed, 23 Jul 2014 07:08:53 +0000 (07:08 +0000)
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
llvm/test/CodeGen/ARM/fold-stack-adjust.ll		patch \| blob \| history
llvm/test/CodeGen/ARM/sxt_rot.ll		patch \| blob \| history
llvm/test/CodeGen/PowerPC/complex-return.ll		patch \| blob \| history
llvm/test/CodeGen/PowerPC/subsumes-pred-regs.ll		patch \| blob \| history
llvm/test/CodeGen/R600/r600-export-fix.ll		patch \| blob \| history
llvm/test/CodeGen/R600/swizzle-export.ll		patch \| blob \| history
llvm/test/CodeGen/Thumb2/thumb2-sxt_rot.ll		patch \| blob \| history
llvm/test/CodeGen/Thumb2/thumb2-uxt_rot.ll		patch \| blob \| history
llvm/test/CodeGen/X86/avx512-zext-load-crash.ll	[deleted file]	patch \| blob \| history
llvm/test/CodeGen/X86/block-placement.ll		patch \| blob \| history
llvm/test/CodeGen/X86/divide-by-constant.ll		patch \| blob \| history
llvm/test/CodeGen/X86/fold-pcmpeqd-0.ll	[deleted file]	patch \| blob \| history
llvm/test/CodeGen/X86/narrow-shl-load.ll		patch \| blob \| history
llvm/test/CodeGen/X86/store-narrow.ll		patch \| blob \| history
llvm/test/CodeGen/X86/vec_extract-sse4.ll		patch \| blob \| history