From 6438ea45e053378a3c461a879805174eaa864bdb Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 22 May 2020 12:13:18 -0400 Subject: [PATCH] [VectorCombine] position pass after SLP in the optimization pipeline rather than before There are 2 known problem patterns shown in the test diffs here: vector horizontal ops (an x86 specialization) and vector reductions. SLP has greater ability to match and fold those than vector-combine, so let SLP have first chance at that. This is a quick fix while we continue to improve vector-combine and possibly canonicalize to reduction intrinsics. In the longer term, we should improve matching of these patterns because if they were created in the "bad" forms shown here, then we would miss optimizing them. I'm not sure what is happening with alias analysis on the addsub test. The old pass manager now shows an extra line for that, and we see an improvement that comes from SLP vectorizing a store. I don't know what's missing with the new pass manager to make that happen. Strangely, I can't reproduce the behavior if I compile from C++ with clang and invoke the new PM with "-fexperimental-new-pass-manager". Differential Revision: https://reviews.llvm.org/D80236 --- llvm/lib/Passes/PassBuilder.cpp | 7 ++-- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 6 ++- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll | 14 ++++--- llvm/test/Other/new-pm-defaults.ll | 4 +- llvm/test/Other/new-pm-thinlto-defaults.ll | 4 +- .../Other/new-pm-thinlto-postlink-pgo-defaults.ll | 4 +- .../new-pm-thinlto-postlink-samplepgo-defaults.ll | 4 +- llvm/test/Other/opt-O2-pipeline.ll | 5 ++- llvm/test/Other/opt-O3-pipeline.ll | 5 ++- llvm/test/Other/opt-Os-pipeline.ll | 5 ++- llvm/test/Other/opt-pipeline-vector-passes.ll | 12 +++--- llvm/test/Transforms/PhaseOrdering/X86/addsub.ll | 44 +++++++++++++--------- .../Transforms/PhaseOrdering/X86/horiz-math.ll | 33 +++++----------- .../PhaseOrdering/X86/vector-reductions.ll | 16 +++----- 14 files changed, 79 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 9b4cc1c..d4d437a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -986,10 +986,6 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( OptimizePM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - // Enhance/cleanup vector code. - OptimizePM.addPass(VectorCombinePass()); - OptimizePM.addPass(EarlyCSEPass()); - // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. OptimizePM.addPass(LoopLoadEliminationPass()); @@ -1016,6 +1012,9 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( if (PTO.SLPVectorization) OptimizePM.addPass(SLPVectorizerPass()); + // Enhance/cleanup vector code. 
+ OptimizePM.addPass(VectorCombinePass()); + OptimizePM.addPass(EarlyCSEPass()); OptimizePM.addPass(InstCombinePass()); // Unroll small loops to hide loop backedge latency and saturate any parallel diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index a0fa1fa..6a2cc02 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -741,8 +741,6 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopDistributePass()); MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); - MPM.add(createVectorCombinePass()); - MPM.add(createEarlyCSEPass()); // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. @@ -783,6 +781,10 @@ void PassManagerBuilder::populateModulePassManager( } } + // Enhance/cleanup vector code. + MPM.add(createVectorCombinePass()); + MPM.add(createEarlyCSEPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createInstructionCombiningPass()); diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll index 87f3435..8b1b404 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -230,8 +230,6 @@ ; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Inject TLI Mappings ; GCN-O1-NEXT: Loop Vectorization -; GCN-O1-NEXT: Optimize scalar/vector ops -; GCN-O1-NEXT: Early CSE ; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: Scalar Evolution Analysis ; GCN-O1-NEXT: Function Alias Analysis Results @@ -247,6 +245,8 @@ ; GCN-O1-NEXT: Combine redundant instructions ; GCN-O1-NEXT: Simplify the CFG ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Optimize scalar/vector ops +; GCN-O1-NEXT: Early CSE ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information @@ -571,8 +571,6 @@ ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: Loop Vectorization -; GCN-O2-NEXT: Optimize scalar/vector ops -; GCN-O2-NEXT: Early CSE ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Function Alias Analysis Results @@ -598,6 +596,9 @@ ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: SLP Vectorizer +; GCN-O2-NEXT: Optimize scalar/vector ops +; GCN-O2-NEXT: Early CSE +; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Combine redundant instructions ; GCN-O2-NEXT: Canonicalize natural loops @@ -924,8 +925,6 @@ ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: Loop Vectorization -; GCN-O3-NEXT: Optimize scalar/vector ops -; GCN-O3-NEXT: Early CSE ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Function Alias Analysis Results @@ -951,6 +950,9 @@ ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: SLP Vectorizer +; GCN-O3-NEXT: Optimize scalar/vector ops +; GCN-O3-NEXT: Early CSE +; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Combine redundant instructions ; GCN-O3-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index ae3e1af..1c4ef40 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ 
b/llvm/test/Other/new-pm-defaults.ll @@ -253,8 +253,6 @@ ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis -; CHECK-O-NEXT: Running pass: VectorCombinePass -; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -262,6 +260,8 @@ ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll index 6bbad89..6d278c9 100644 --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -223,8 +223,6 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis -; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass -; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass @@ -232,6 +230,8 @@ ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass +; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index c84a74d..44a4a3a 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -191,8 +191,6 @@ ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass -; CHECK-O-NEXT: Running pass: VectorCombinePass -; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -200,6 +198,8 @@ ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index d4895ba..4f3ddca 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -202,8 +202,6 @@ ; CHECK-O-NEXT: Running pass: 
LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass -; CHECK-O-NEXT: Running pass: VectorCombinePass -; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -211,6 +209,8 @@ ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index 6a331bb..3edc2a6 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -227,8 +227,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -254,6 +252,9 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll index 2710e54..ce44c2b 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -232,8 +232,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -259,6 +257,9 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index 1015286..fdc1906 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -213,8 +213,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -240,6 +238,9 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll index 
1a1ad56..ae462095 100644 --- a/llvm/test/Other/opt-pipeline-vector-passes.ll +++ b/llvm/test/Other/opt-pipeline-vector-passes.ll @@ -12,15 +12,15 @@ ; OLDPM_O1-LABEL: Pass Arguments: ; OLDPM_O1: Loop Vectorization -; OLDPM_O1: Optimize scalar/vector ops ; OLDPM_O1-NOT: SLP Vectorizer +; OLDPM_O1: Optimize scalar/vector ops ; Everything runs at -O2. ; OLDPM_O2-LABEL: Pass Arguments: ; OLDPM_O2: Loop Vectorization -; OLDPM_O2: Optimize scalar/vector ops ; OLDPM_O2: SLP Vectorizer +; OLDPM_O2: Optimize scalar/vector ops ; The loop vectorizer still runs at both -O1/-O2 even with the ; debug flag, but it only works on loops explicitly annotated @@ -28,24 +28,24 @@ ; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments: ; OLDPM_O1_FORCE_OFF: Loop Vectorization -; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops ; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer +; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops ; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments: ; OLDPM_O2_FORCE_OFF: Loop Vectorization -; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops ; OLDPM_O2_FORCE_OFF: SLP Vectorizer +; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops ; There should be no difference with the new pass manager. ; This is tested more thoroughly in other test files. ; NEWPM_O1-LABEL: Running pass: LoopVectorizePass -; NEWPM_O1: Running pass: VectorCombinePass ; NEWPM_O1-NOT: Running pass: SLPVectorizerPass +; NEWPM_O1: Running pass: VectorCombinePass ; NEWPM_O2-LABEL: Running pass: LoopVectorizePass -; NEWPM_O2: Running pass: VectorCombinePass ; NEWPM_O2: Running pass: SLPVectorizerPass +; NEWPM_O2: Running pass: VectorCombinePass define void @f() { ret void diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll index 57acbee..0e50efa 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -O3 -S | FileCheck %s -; RUN: opt < %s -passes='default' -S | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s --check-prefixes=CHECK,OLDPM +; RUN: opt < %s -passes='default' -S | FileCheck %s --check-prefixes=CHECK,NEWPM target triple = "x86_64--" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -69,22 +69,30 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> } define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) { -; CHECK-LABEL: @add_aggregate_store( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0 -; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1 -; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2 -; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; CHECK-NEXT: 
[[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3 -; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4 -; CHECK-NEXT: ret void +; OLDPM-LABEL: @add_aggregate_store( +; OLDPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] +; OLDPM-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] +; OLDPM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> +; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>* +; OLDPM-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4 +; OLDPM-NEXT: ret void +; +; NEWPM-LABEL: @add_aggregate_store( +; NEWPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] +; NEWPM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; NEWPM-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0 +; NEWPM-NEXT: store float [[TMP2]], float* [[R0]], align 4 +; NEWPM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; NEWPM-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1 +; NEWPM-NEXT: store float [[TMP3]], float* [[R1]], align 4 +; NEWPM-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] +; NEWPM-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NEWPM-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2 +; NEWPM-NEXT: store float [[TMP5]], float* [[R2]], align 4 +; NEWPM-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NEWPM-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3 +; NEWPM-NEXT: store float [[TMP6]], float* [[R3]], align 4 +; NEWPM-NEXT: ret void ; %a00 = extractelement <2 x float> %a0, i32 0 %b00 = extractelement <2 x float> %b0, i32 0 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll index bd86e64..749ed96 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll @@ -9,18 +9,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: @hadd_reverse_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]] -; CHECK-NEXT: [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]] -; CHECK-NEXT: [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]] -; CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINIT14]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; 
CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x float> [[TMP3]] ; %shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> %shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> @@ -45,18 +37,11 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 { define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: @reverse_hadd_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[TMP11]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 66ec815..304b38b 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -5,19 +5,15 @@ target triple = "x86_64--" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; FIXME: This should only need 2 'or' instructions. 
- define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @ext_ext_or_reduction_v4i32( ; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; CHECK-NEXT: ret i32 [[TMP7]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: ret i32 [[TMP1]] ; %z = and <4 x i32> %x, %y %z0 = extractelement <4 x i32> %z, i32 0 -- 2.7.4
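Note (illustration only, not part of the applied patch): the net effect of the change above is the same in both pass managers: LoopVectorize runs first, SLPVectorizer (at -O2 and higher) gets the first chance at horizontal-op and reduction patterns, and VectorCombine plus EarlyCSE now run afterwards as cleanup, just before InstCombine. The sketch below shows that ordering with the new-pass-manager API. buildVectorTail is a hypothetical helper written for this note, not an in-tree function, and it assumes an LLVM (circa 10/11-era) development environment providing the listed headers; the real pipeline is built in PassBuilder::buildModuleOptimizationPipeline as shown in the diff.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

// Build only the vectorization tail of a function pipeline, in the order
// this patch establishes: LoopVectorize ... SLPVectorizer ->
// VectorCombine -> EarlyCSE -> InstCombine.
static FunctionPassManager buildVectorTail(bool EnableSLP) {
  FunctionPassManager FPM;
  FPM.addPass(LoopVectorizePass());
  if (EnableSLP)
    FPM.addPass(SLPVectorizerPass());
  // Enhance/cleanup vector code. Running these after SLP lets SLP match
  // and fold horizontal ops and reductions before VectorCombine sees them.
  FPM.addPass(VectorCombinePass());
  FPM.addPass(EarlyCSEPass());
  FPM.addPass(InstCombinePass());
  return FPM;
}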