From b6e376ddfa1715c0ea0e975fc7fb033da2814661 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Fri, 15 Mar 2019 21:17:53 +0000
Subject: [PATCH] [X86] Promote i8 CMOV's (PR40965)

Summary:
@mclow.lists brought this issue up in IRC; it came up during the
implementation of libc++ `std::midpoint()` (D59099)
https://godbolt.org/z/oLrHBP

Currently LLVM X86 backend only promotes i8 CMOV if it came from 2x`trunc`.
This differential proposes to always promote i8 CMOV.

There are several concerns here:
* Is this actually more performant, or is it just the ASM that looks cuter?
* Does this result in partial register stalls?
* What about branch predictor?

# Indeed, performance should be the main point here.
Let's look at a simple microbenchmark: {F8412076}
```
#include "benchmark/benchmark.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <limits>
#include <random>
#include <type_traits>
#include <utility>
#include <vector>

// Future preliminary libc++ code, from Marshall Clow.
namespace std {
// Preliminary midpoint(): returns the value halfway between __a and __b.
// The distance between the operands is taken in the unsigned counterpart of
// _Tp, halved (rounding down), and then applied to __a in the direction of
// __b, so an odd distance rounds the result towards __a.
template <class _Tp>
__inline _Tp midpoint(_Tp __a, _Tp __b) noexcept {
  using _Up = typename std::make_unsigned<typename remove_cv<_Tp>::type>::type;

  const bool __descending = __a > __b;
  // Direction of the step from __a towards __b.
  const int __dir = __descending ? -1 : 1;
  const _Up __lo = __descending ? __b : __a;
  const _Up __hi = __descending ? __a : __b;
  const _Up __half = _Up(__hi - __lo) >> 1;
  return __a + __dir * _Tp(__half);
}
}  // namespace std

template <typename T>
std::vector<T> getVectorOfRandomNumbers(size_t count) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<T> dis(std::numeric_limits<T>::min(),
                                       std::numeric_limits<T>::max());
  std::vector<T> v;
  v.reserve(count);
  std::generate_n(std::back_inserter(v), count,
                  [&dis, &gen]() { return dis(gen); });
  assert(v.size() == count);
  return v;
}

// Input generator: both operand vectors are filled with random values.
struct RandRand {
  // Returns a pair of `count`-element vectors, each produced by a separate
  // call to getVectorOfRandomNumbers<T>().
  template <typename T>
  static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) {
    std::vector<T> lhs = getVectorOfRandomNumbers<T>(count);
    std::vector<T> rhs = getVectorOfRandomNumbers<T>(count);
    return {std::move(lhs), std::move(rhs)};
  }
};
// Input generator: the first operand vector is all zeros, the second is
// filled with random values.
struct ZeroRand {
  // Returns {`count` zeros, `count` values from getVectorOfRandomNumbers<T>()}.
  template <typename T>
  static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) {
    std::vector<T> zeros(count, T(0));
    std::vector<T> randoms = getVectorOfRandomNumbers<T>(count);
    return {std::move(zeros), std::move(randoms)};
  }
};

// Benchmark body: applies std::midpoint() element-wise to two input vectors.
//
// T   - element type of the operands.
// Gen - input generator type; Gen::Gen<T>(Length) must return the pair of
//       operand vectors (see RandRand / ZeroRand).
// The number of elements per vector is taken from state.range(0).
template <class T, class Gen>
void BM_StdMidpoint(benchmark::State& state) {
  const size_t Length = state.range(0);

  // Inputs are generated once, outside of the timed loop.
  const std::pair<std::vector<T>, std::vector<T>> Data =
      Gen::template Gen<T>(Length);
  const std::vector<T>& a = Data.first;
  const std::vector<T>& b = Data.second;
  assert(a.size() == Length && b.size() == a.size());

  // Optimizer barriers on the inputs (vector objects and their storage);
  // see the Google Benchmark documentation for the exact guarantees.
  benchmark::ClobberMemory();
  benchmark::DoNotOptimize(a);
  benchmark::DoNotOptimize(a.data());
  benchmark::DoNotOptimize(b);
  benchmark::DoNotOptimize(b.data());

  // Timed region: one full pass over both vectors per benchmark iteration.
  for (auto _ : state) {
    for (size_t i = 0; i < Length; i++) {
      const auto calculated = std::midpoint(a[i], b[i]);
      // Mark the result as used so that the computation is not discarded.
      benchmark::DoNotOptimize(calculated);
    }
  }
  // Problem size for the complexity (BigO) report.
  state.SetComplexityN(Length);
  // User counters: number of midpoints computed, as a total and as a rate.
  state.counters["midpoints"] =
      benchmark::Counter(Length, benchmark::Counter::kIsIterationInvariant);
  state.counters["midpoints/sec"] =
      benchmark::Counter(Length, benchmark::Counter::kIsIterationInvariantRate);
  // Each pass reads Length elements of sizeof(T) bytes from each of the two
  // input vectors. Byte counters are reported with 1024-based unit prefixes.
  const size_t BytesRead = 2 * sizeof(T) * Length;
  state.counters["bytes_read/iteration"] =
      benchmark::Counter(BytesRead, benchmark::Counter::kDefaults,
                         benchmark::Counter::OneK::kIs1024);
  state.counters["bytes_read/sec"] = benchmark::Counter(
      BytesRead, benchmark::Counter::kIsIterationInvariantRate,
      benchmark::Counter::OneK::kIs1024);
}

// Configures the argument range for a benchmark over element type T:
// RangeMultiplier(2) with Range(1, max), where max is the largest element
// count for which both input buffers together use at most half of an assumed
// 2 MiB L2 cache. Also requests O(N) complexity reporting.
template <typename T>
static void CustomArguments(benchmark::internal::Benchmark* b) {
  constexpr size_t kAssumedL2Bytes = 2 * 1024 * 1024;
  constexpr size_t kInputBuffers = 2;  // the two operand vectors
  constexpr size_t kSafetyMargin = 2;
  // Largest per-buffer element count that always fits within the budget.
  constexpr size_t kMaxElements =
      kAssumedL2Bytes / kInputBuffers / sizeof(T) / kSafetyMargin;
  b->RangeMultiplier(2)->Range(1, kMaxElements)->Complexity(benchmark::oN);
}

// RandRand: both operands are random, so the `__a > __b` comparison inside
// std::midpoint() is unpredictable.
BENCHMARK_TEMPLATE(BM_StdMidpoint, int32_t, RandRand)
    ->Apply(CustomArguments<int32_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint32_t, RandRand)
    ->Apply(CustomArguments<uint32_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, int64_t, RandRand)
    ->Apply(CustomArguments<int64_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint64_t, RandRand)
    ->Apply(CustomArguments<uint64_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, int16_t, RandRand)
    ->Apply(CustomArguments<int16_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint16_t, RandRand)
    ->Apply(CustomArguments<uint16_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, int8_t, RandRand)
    ->Apply(CustomArguments<int8_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint8_t, RandRand)
    ->Apply(CustomArguments<uint8_t>);

// ZeroRand: the first operand is always zero and the second is an unsigned
// random value, i.e. greater than or equal to zero, so the comparison always
// has the same outcome and is predictable.
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint32_t, ZeroRand)
    ->Apply(CustomArguments<uint32_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint64_t, ZeroRand)
    ->Apply(CustomArguments<uint64_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint16_t, ZeroRand)
    ->Apply(CustomArguments<uint16_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint8_t, ZeroRand)
    ->Apply(CustomArguments<uint8_t>);
```

```
$ ~/src/googlebenchmark/tools/compare.py --no-utest benchmarks ./llvm-cmov-bench-OLD ./llvm-cmov-bench-NEW
RUNNING: ./llvm-cmov-bench-OLD --benchmark_out=/tmp/tmp5a5qjm
2019-03-06 21:53:31
Running ./llvm-cmov-bench-OLD
Run on (8 X 4000 MHz CPU s)
CPU Caches:
  L1 Data 16K (x8)
  L1 Instruction 64K (x4)
  L2 Unified 2048K (x4)
  L3 Unified 8192K (x1)
Load Average: 1.78, 1.81, 1.36
----------------------------------------------------------------------------------------------------
Benchmark                                          Time             CPU   Iterations UserCounters<...>
----------------------------------------------------------------------------------------------------
<...>
BM_StdMidpoint<int32_t, RandRand>/131072      300398 ns       300404 ns         2330 bytes_read/iteration=1024k bytes_read/sec=3.25083G/s midpoints=305.398M midpoints/sec=436.319M/s
BM_StdMidpoint<int32_t, RandRand>_BigO          2.29 N          2.29 N
BM_StdMidpoint<int32_t, RandRand>_RMS              2 %             2 %
<...>
BM_StdMidpoint<uint32_t, RandRand>/131072     300433 ns       300433 ns         2330 bytes_read/iteration=1024k bytes_read/sec=3.25052G/s midpoints=305.398M midpoints/sec=436.278M/s
BM_StdMidpoint<uint32_t, RandRand>_BigO         2.29 N          2.29 N
BM_StdMidpoint<uint32_t, RandRand>_RMS             2 %             2 %
<...>
BM_StdMidpoint<int64_t, RandRand>/65536       169857 ns       169858 ns         4121 bytes_read/iteration=1024k bytes_read/sec=5.74929G/s midpoints=270.074M midpoints/sec=385.828M/s
BM_StdMidpoint<int64_t, RandRand>_BigO          2.59 N          2.59 N
BM_StdMidpoint<int64_t, RandRand>_RMS              3 %             3 %
<...>
BM_StdMidpoint<uint64_t, RandRand>/65536      169770 ns       169771 ns         4125 bytes_read/iteration=1024k bytes_read/sec=5.75223G/s midpoints=270.336M midpoints/sec=386.026M/s
BM_StdMidpoint<uint64_t, RandRand>_BigO         2.59 N          2.59 N
BM_StdMidpoint<uint64_t, RandRand>_RMS             3 %             3 %
<...>
BM_StdMidpoint<int16_t, RandRand>/262144      591169 ns       591179 ns         1182 bytes_read/iteration=1024k bytes_read/sec=1.65189G/s midpoints=309.854M midpoints/sec=443.426M/s
BM_StdMidpoint<int16_t, RandRand>_BigO          2.25 N          2.25 N
BM_StdMidpoint<int16_t, RandRand>_RMS              1 %             1 %
<...>
BM_StdMidpoint<uint16_t, RandRand>/262144     591264 ns       591274 ns         1184 bytes_read/iteration=1024k bytes_read/sec=1.65162G/s midpoints=310.378M midpoints/sec=443.354M/s
BM_StdMidpoint<uint16_t, RandRand>_BigO         2.25 N          2.25 N
BM_StdMidpoint<uint16_t, RandRand>_RMS             1 %             1 %
<...>
BM_StdMidpoint<int8_t, RandRand>/524288      2983669 ns      2983689 ns          235 bytes_read/iteration=1024k bytes_read/sec=335.156M/s midpoints=123.208M midpoints/sec=175.718M/s
BM_StdMidpoint<int8_t, RandRand>_BigO           5.69 N          5.69 N
BM_StdMidpoint<int8_t, RandRand>_RMS               0 %             0 %
<...>
BM_StdMidpoint<uint8_t, RandRand>/524288     2668398 ns      2668419 ns          262 bytes_read/iteration=1024k bytes_read/sec=374.754M/s midpoints=137.363M midpoints/sec=196.479M/s
BM_StdMidpoint<uint8_t, RandRand>_BigO          5.09 N          5.09 N
BM_StdMidpoint<uint8_t, RandRand>_RMS              0 %             0 %
<...>
BM_StdMidpoint<uint32_t, ZeroRand>/131072     300887 ns       300887 ns         2331 bytes_read/iteration=1024k bytes_read/sec=3.24561G/s midpoints=305.529M midpoints/sec=435.619M/s
BM_StdMidpoint<uint32_t, ZeroRand>_BigO         2.29 N          2.29 N
BM_StdMidpoint<uint32_t, ZeroRand>_RMS             2 %             2 %
<...>
BM_StdMidpoint<uint64_t, ZeroRand>/65536      169634 ns       169634 ns         4102 bytes_read/iteration=1024k bytes_read/sec=5.75688G/s midpoints=268.829M midpoints/sec=386.338M/s
BM_StdMidpoint<uint64_t, ZeroRand>_BigO         2.59 N          2.59 N
BM_StdMidpoint<uint64_t, ZeroRand>_RMS             3 %             3 %
<...>
BM_StdMidpoint<uint16_t, ZeroRand>/262144     592252 ns       592255 ns         1182 bytes_read/iteration=1024k bytes_read/sec=1.64889G/s midpoints=309.854M midpoints/sec=442.62M/s
BM_StdMidpoint<uint16_t, ZeroRand>_BigO         2.26 N          2.26 N
BM_StdMidpoint<uint16_t, ZeroRand>_RMS             1 %             1 %
<...>
BM_StdMidpoint<uint8_t, ZeroRand>/524288      987295 ns       987309 ns          711 bytes_read/iteration=1024k bytes_read/sec=1012.85M/s midpoints=372.769M midpoints/sec=531.028M/s
BM_StdMidpoint<uint8_t, ZeroRand>_BigO          1.88 N          1.88 N
BM_StdMidpoint<uint8_t, ZeroRand>_RMS              1 %             1 %
RUNNING: ./llvm-cmov-bench-NEW --benchmark_out=/tmp/tmpPvwpfW
2019-03-06 21:56:58
Running ./llvm-cmov-bench-NEW
Run on (8 X 4000 MHz CPU s)
CPU Caches:
  L1 Data 16K (x8)
  L1 Instruction 64K (x4)
  L2 Unified 2048K (x4)
  L3 Unified 8192K (x1)
Load Average: 1.17, 1.46, 1.30
----------------------------------------------------------------------------------------------------
Benchmark                                          Time             CPU   Iterations UserCounters<...>
----------------------------------------------------------------------------------------------------
<...>
BM_StdMidpoint<int32_t, RandRand>/131072      300878 ns       300880 ns         2324 bytes_read/iteration=1024k bytes_read/sec=3.24569G/s midpoints=304.611M midpoints/sec=435.629M/s
BM_StdMidpoint<int32_t, RandRand>_BigO          2.29 N          2.29 N
BM_StdMidpoint<int32_t, RandRand>_RMS              2 %             2 %
<...>
BM_StdMidpoint<uint32_t, RandRand>/131072     300231 ns       300226 ns         2330 bytes_read/iteration=1024k bytes_read/sec=3.25276G/s midpoints=305.398M midpoints/sec=436.578M/s
BM_StdMidpoint<uint32_t, RandRand>_BigO         2.29 N          2.29 N
BM_StdMidpoint<uint32_t, RandRand>_RMS             2 %             2 %
<...>
BM_StdMidpoint<int64_t, RandRand>/65536       170819 ns       170777 ns         4115 bytes_read/iteration=1024k bytes_read/sec=5.71835G/s midpoints=269.681M midpoints/sec=383.752M/s
BM_StdMidpoint<int64_t, RandRand>_BigO          2.60 N          2.60 N
BM_StdMidpoint<int64_t, RandRand>_RMS              3 %             3 %
<...>
BM_StdMidpoint<uint64_t, RandRand>/65536      171705 ns       171708 ns         4106 bytes_read/iteration=1024k bytes_read/sec=5.68733G/s midpoints=269.091M midpoints/sec=381.671M/s
BM_StdMidpoint<uint64_t, RandRand>_BigO         2.62 N          2.62 N
BM_StdMidpoint<uint64_t, RandRand>_RMS             3 %             3 %
<...>
BM_StdMidpoint<int16_t, RandRand>/262144      592510 ns       592516 ns         1182 bytes_read/iteration=1024k bytes_read/sec=1.64816G/s midpoints=309.854M midpoints/sec=442.425M/s
BM_StdMidpoint<int16_t, RandRand>_BigO          2.26 N          2.26 N
BM_StdMidpoint<int16_t, RandRand>_RMS              1 %             1 %
<...>
BM_StdMidpoint<uint16_t, RandRand>/262144     614823 ns       614823 ns         1180 bytes_read/iteration=1024k bytes_read/sec=1.58836G/s midpoints=309.33M midpoints/sec=426.373M/s
BM_StdMidpoint<uint16_t, RandRand>_BigO         2.33 N          2.33 N
BM_StdMidpoint<uint16_t, RandRand>_RMS             4 %             4 %
<...>
BM_StdMidpoint<int8_t, RandRand>/524288      1073181 ns      1073201 ns          650 bytes_read/iteration=1024k bytes_read/sec=931.791M/s midpoints=340.787M midpoints/sec=488.527M/s
BM_StdMidpoint<int8_t, RandRand>_BigO           2.05 N          2.05 N
BM_StdMidpoint<int8_t, RandRand>_RMS               1 %             1 %
BM_StdMidpoint<uint8_t, RandRand>/524288     1071010 ns      1071020 ns          653 bytes_read/iteration=1024k bytes_read/sec=933.689M/s midpoints=342.36M midpoints/sec=489.522M/s
BM_StdMidpoint<uint8_t, RandRand>_BigO          2.05 N          2.05 N
BM_StdMidpoint<uint8_t, RandRand>_RMS              1 %             1 %
<...>
BM_StdMidpoint<uint32_t, ZeroRand>/131072     300413 ns       300416 ns         2330 bytes_read/iteration=1024k bytes_read/sec=3.2507G/s midpoints=305.398M midpoints/sec=436.302M/s
BM_StdMidpoint<uint32_t, ZeroRand>_BigO         2.29 N          2.29 N
BM_StdMidpoint<uint32_t, ZeroRand>_RMS             2 %             2 %
<...>
BM_StdMidpoint<uint64_t, ZeroRand>/65536      169667 ns       169669 ns         4123 bytes_read/iteration=1024k bytes_read/sec=5.75568G/s midpoints=270.205M midpoints/sec=386.257M/s
BM_StdMidpoint<uint64_t, ZeroRand>_BigO         2.59 N          2.59 N
BM_StdMidpoint<uint64_t, ZeroRand>_RMS             3 %             3 %
<...>
BM_StdMidpoint<uint16_t, ZeroRand>/262144     591396 ns       591404 ns         1184 bytes_read/iteration=1024k bytes_read/sec=1.65126G/s midpoints=310.378M midpoints/sec=443.257M/s
BM_StdMidpoint<uint16_t, ZeroRand>_BigO         2.26 N          2.26 N
BM_StdMidpoint<uint16_t, ZeroRand>_RMS             1 %             1 %
<...>
BM_StdMidpoint<uint8_t, ZeroRand>/524288     1069421 ns      1069413 ns          655 bytes_read/iteration=1024k bytes_read/sec=935.092M/s midpoints=343.409M midpoints/sec=490.258M/s
BM_StdMidpoint<uint8_t, ZeroRand>_BigO          2.04 N          2.04 N
BM_StdMidpoint<uint8_t, ZeroRand>_RMS              0 %             0 %
Comparing ./llvm-cmov-bench-OLD to ./llvm-cmov-bench-NEW
Benchmark                                                   Time             CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------------------------
<...>
BM_StdMidpoint<int32_t, RandRand>/131072                 +0.0016         +0.0016        300398        300878        300404        300880
<...>
BM_StdMidpoint<uint32_t, RandRand>/131072                -0.0007         -0.0007        300433        300231        300433        300226
<...>
BM_StdMidpoint<int64_t, RandRand>/65536                  +0.0057         +0.0054        169857        170819        169858        170777
<...>
BM_StdMidpoint<uint64_t, RandRand>/65536                 +0.0114         +0.0114        169770        171705        169771        171708
<...>
BM_StdMidpoint<int16_t, RandRand>/262144                 +0.0023         +0.0023        591169        592510        591179        592516
<...>
BM_StdMidpoint<uint16_t, RandRand>/262144                +0.0398         +0.0398        591264        614823        591274        614823
<...>
BM_StdMidpoint<int8_t, RandRand>/524288                  -0.6403         -0.6403       2983669       1073181       2983689       1073201
<...>
BM_StdMidpoint<uint8_t, RandRand>/524288                 -0.5986         -0.5986       2668398       1071010       2668419       1071020
<...>
BM_StdMidpoint<uint32_t, ZeroRand>/131072                -0.0016         -0.0016        300887        300413        300887        300416
<...>
BM_StdMidpoint<uint64_t, ZeroRand>/65536                 +0.0002         +0.0002        169634        169667        169634        169669
<...>
BM_StdMidpoint<uint16_t, ZeroRand>/262144                -0.0014         -0.0014        592252        591396        592255        591404
<...>
BM_StdMidpoint<uint8_t, ZeroRand>/524288                 +0.0832         +0.0832        987295       1069421        987309       1069413
```

What can we tell from the benchmark?
* `BM_StdMidpoint<[u]int8_t, RandRand>` indeed has the worst performance.
* `BM_StdMidpoint<uint{8,16,32}_t, ZeroRand>` are all performant, even the 8-bit case.
  That is because there we are computing mid point between zero and some random number,
  thus if the branch predictor is in use, it is in optimal situation.
* Promoting 8-bit CMOV did improve performance of `BM_StdMidpoint<[u]int8_t, RandRand>`, by -59%..-64%.

# What about branch predictor?
* `BM_StdMidpoint<uint8_t, ZeroRand>` was faster than `BM_StdMidpoint<uint{16,32,64}_t, ZeroRand>`,
  which may mean that well-predicted branch is better than `cmov`.
* Promoting 8-bit CMOV degraded performance of `BM_StdMidpoint<uint8_t, ZeroRand>`,
  `cmov` is up to +10% worse than well-predicted branch.
* However, i do not believe this is a concern. If the branch is well predicted,  then the PGO
  will also say that it is well predicted, and LLVM will happily expand cmov back into branch:
  https://godbolt.org/z/P5ufig

# What about partial register stalls?
I'm not really able to answer that.
What I can say is that if the branch is unpredictable (if it is predictable, use PGO and you will get a branch),
then in ~50% of cases you will have to pay the branch misprediction penalty.
```
$ grep -i MispredictPenalty X86Sched*.td
X86SchedBroadwell.td:  let MispredictPenalty = 16;
X86SchedHaswell.td:  let MispredictPenalty = 16;
X86SchedSandyBridge.td:  let MispredictPenalty = 16;
X86SchedSkylakeClient.td:  let MispredictPenalty = 14;
X86SchedSkylakeServer.td:  let MispredictPenalty = 14;
X86ScheduleBdVer2.td:  let MispredictPenalty = 20; // Minimum branch misdirection penalty.
X86ScheduleBtVer2.td:  let MispredictPenalty = 14; // Minimum branch misdirection penalty
X86ScheduleSLM.td:  let MispredictPenalty = 10;
X86ScheduleZnver1.td:  let MispredictPenalty = 17;
```
.. which can be as small as 10 cycles and as large as 20 cycles.
Partial register stalls do not seem to be an issue for AMD CPU's.
For intel CPU's, they should be around ~5 cycles?
Is that actually an issue here? I'm not sure.

In short, i'd say this is an improvement, at least on this microbenchmark.

Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40965 | PR40965 ]].

Reviewers: craig.topper, RKSimon, spatel, andreadb, nikic

Reviewed By: craig.topper, andreadb

Subscribers: jfb, jdoerfert, llvm-commits, mclow.lists

Tags: #llvm, #libc

Differential Revision: https://reviews.llvm.org/D59035

llvm-svn: 356300
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            |  11 +-
 .../CodeGen/X86/8bit_cmov_of_trunc_promotion.ll    | 310 +++++++++++++--------
 llvm/test/CodeGen/X86/cmov-promotion.ll            |  57 ++--
 llvm/test/CodeGen/X86/cmov.ll                      |  27 +-
 llvm/test/CodeGen/X86/cmovcmov.ll                  |  23 +-
 llvm/test/CodeGen/X86/copy-eflags.ll               |  37 +--
 .../CodeGen/X86/fast-isel-select-pseudo-cmov.ll    |  40 +--
 llvm/test/CodeGen/X86/fshl.ll                      |  12 +-
 llvm/test/CodeGen/X86/fshr.ll                      |  12 +-
 llvm/test/CodeGen/X86/i386-shrink-wrapping.ll      |  38 +--
 llvm/test/CodeGen/X86/midpoint-int.ll              |  85 ++----
 llvm/test/CodeGen/X86/pr5145.ll                    |  64 ++---
 llvm/test/CodeGen/X86/sadd_sat.ll                  |  32 +--
 llvm/test/CodeGen/X86/sadd_sat_vec.ll              |  22 +-
 llvm/test/CodeGen/X86/sat-add.ll                   |  67 ++---
 llvm/test/CodeGen/X86/select.ll                    |  33 +--
 llvm/test/CodeGen/X86/select_const.ll              |   9 +-
 llvm/test/CodeGen/X86/ssub_sat.ll                  |  32 +--
 llvm/test/CodeGen/X86/ssub_sat_vec.ll              |  22 +-
 llvm/test/CodeGen/X86/uadd_sat.ll                  |  24 +-
 llvm/test/CodeGen/X86/uadd_sat_vec.ll              |  16 +-
 llvm/test/CodeGen/X86/usub_sat.ll                  |  19 +-
 llvm/test/CodeGen/X86/usub_sat_vec.ll              |  14 +-
 llvm/test/CodeGen/X86/x86-shrink-wrapping.ll       |  20 +-
 24 files changed, 483 insertions(+), 543 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9be77c9..56f0c09 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20530,8 +20530,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
-  // Promote i16 cmovs if it won't prevent folding a load.
-  if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
+  // Or finally, promote i8 cmovs if we have CMOV,
+  //                 or i16 cmovs if it won't prevent folding a load.
+  // FIXME: we should not limit promotion of i8 case to only when the CMOV is
+  //        legal, but EmitLoweredSelect() can not deal with these extensions
+  //        being inserted between two CMOV's. (in i16 case too TBN)
+  //        https://bugs.llvm.org/show_bug.cgi?id=40974
+  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
+      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
+       !MayFoldLoad(Op2))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
     SDValue Ops[] = { Op2, Op1, CC, Cond };
diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
index ff4dcfa..c8235e8 100644
--- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -80,44 +80,65 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; Values don't come from regs, but there is only one truncation.
 
 define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nounwind {
-; I386-LABEL: neg_only_one_truncation:
-; I386:       # %bb.0:
-; I386-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    addl %ecx, %eax
-; I386-NEXT:    addb {{[0-9]+}}(%esp), %cl
-; I386-NEXT:    cmpb %cl, %al
-; I386-NEXT:    jge .LBB1_2
-; I386-NEXT:  # %bb.1:
-; I386-NEXT:    movl %ecx, %eax
-; I386-NEXT:  .LBB1_2:
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; I386-NOCMOV-LABEL: neg_only_one_truncation:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    addl %ecx, %eax
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I386-NOCMOV-NEXT:    cmpb %cl, %al
+; I386-NOCMOV-NEXT:    jge .LBB1_2
+; I386-NOCMOV-NEXT:  # %bb.1:
+; I386-NOCMOV-NEXT:    movl %ecx, %eax
+; I386-NOCMOV-NEXT:  .LBB1_2:
+; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: neg_only_one_truncation:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addl %eax, %ecx
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
+; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %al, %eax
+; I386-CMOV-NEXT:    cmovgel %ecx, %eax
+; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT:    retl
 ;
-; I686-LABEL: neg_only_one_truncation:
-; I686:       # %bb.0:
-; I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NEXT:    addl %ecx, %eax
-; I686-NEXT:    addb {{[0-9]+}}(%esp), %cl
-; I686-NEXT:    cmpb %cl, %al
-; I686-NEXT:    jge .LBB1_2
-; I686-NEXT:  # %bb.1:
-; I686-NEXT:    movl %ecx, %eax
-; I686-NEXT:  .LBB1_2:
-; I686-NEXT:    # kill: def $al killed $al killed $eax
-; I686-NEXT:    retl
+; I686-NOCMOV-LABEL: neg_only_one_truncation:
+; I686-NOCMOV:       # %bb.0:
+; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    addl %ecx, %eax
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I686-NOCMOV-NEXT:    cmpb %cl, %al
+; I686-NOCMOV-NEXT:    jge .LBB1_2
+; I686-NOCMOV-NEXT:  # %bb.1:
+; I686-NOCMOV-NEXT:    movl %ecx, %eax
+; I686-NOCMOV-NEXT:  .LBB1_2:
+; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I686-NOCMOV-NEXT:    retl
+;
+; I686-CMOV-LABEL: neg_only_one_truncation:
+; I686-CMOV:       # %bb.0:
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addl %eax, %ecx
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
+; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %al, %eax
+; I686-CMOV-NEXT:    cmovgel %ecx, %eax
+; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: neg_only_one_truncation:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %edi, %eax
-; X86_64-NEXT:    addl %edx, %eax
+; X86_64-NEXT:    addl %edx, %edi
 ; X86_64-NEXT:    addb %sil, %dl
-; X86_64-NEXT:    cmpb %dl, %al
-; X86_64-NEXT:    jge .LBB1_2
-; X86_64-NEXT:  # %bb.1:
-; X86_64-NEXT:    movl %edx, %eax
-; X86_64-NEXT:  .LBB1_2:
+; X86_64-NEXT:    cmpb %dl, %dil
+; X86_64-NEXT:    movzbl %dl, %eax
+; X86_64-NEXT:    cmovgel %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a1_wide = add i32 %a1_wide_orig, %inc
@@ -132,44 +153,63 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou
 ; Values don't come from regs, but truncation from different types.
 
 define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nounwind {
-; I386-LABEL: neg_type_mismatch:
-; I386:       # %bb.0:
-; I386-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    addl %ecx, %eax
-; I386-NEXT:    addw {{[0-9]+}}(%esp), %cx
-; I386-NEXT:    cmpb %cl, %al
-; I386-NEXT:    jge .LBB2_2
-; I386-NEXT:  # %bb.1:
-; I386-NEXT:    movl %ecx, %eax
-; I386-NEXT:  .LBB2_2:
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; I386-NOCMOV-LABEL: neg_type_mismatch:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    addl %ecx, %eax
+; I386-NOCMOV-NEXT:    addw {{[0-9]+}}(%esp), %cx
+; I386-NOCMOV-NEXT:    cmpb %cl, %al
+; I386-NOCMOV-NEXT:    jge .LBB2_2
+; I386-NOCMOV-NEXT:  # %bb.1:
+; I386-NOCMOV-NEXT:    movl %ecx, %eax
+; I386-NOCMOV-NEXT:  .LBB2_2:
+; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: neg_type_mismatch:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addl %eax, %ecx
+; I386-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    cmovgel %ecx, %eax
+; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT:    retl
 ;
-; I686-LABEL: neg_type_mismatch:
-; I686:       # %bb.0:
-; I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NEXT:    addl %ecx, %eax
-; I686-NEXT:    addw {{[0-9]+}}(%esp), %cx
-; I686-NEXT:    cmpb %cl, %al
-; I686-NEXT:    jge .LBB2_2
-; I686-NEXT:  # %bb.1:
-; I686-NEXT:    movl %ecx, %eax
-; I686-NEXT:  .LBB2_2:
-; I686-NEXT:    # kill: def $al killed $al killed $eax
-; I686-NEXT:    retl
+; I686-NOCMOV-LABEL: neg_type_mismatch:
+; I686-NOCMOV:       # %bb.0:
+; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    addl %ecx, %eax
+; I686-NOCMOV-NEXT:    addw {{[0-9]+}}(%esp), %cx
+; I686-NOCMOV-NEXT:    cmpb %cl, %al
+; I686-NOCMOV-NEXT:    jge .LBB2_2
+; I686-NOCMOV-NEXT:  # %bb.1:
+; I686-NOCMOV-NEXT:    movl %ecx, %eax
+; I686-NOCMOV-NEXT:  .LBB2_2:
+; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I686-NOCMOV-NEXT:    retl
+;
+; I686-CMOV-LABEL: neg_type_mismatch:
+; I686-CMOV:       # %bb.0:
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addl %eax, %ecx
+; I686-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
+; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    cmovgel %ecx, %eax
+; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: neg_type_mismatch:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %edi, %eax
-; X86_64-NEXT:    addl %edx, %eax
-; X86_64-NEXT:    addl %edx, %esi
-; X86_64-NEXT:    cmpb %sil, %al
-; X86_64-NEXT:    jge .LBB2_2
-; X86_64-NEXT:  # %bb.1:
 ; X86_64-NEXT:    movl %esi, %eax
-; X86_64-NEXT:  .LBB2_2:
+; X86_64-NEXT:    addl %edx, %edi
+; X86_64-NEXT:    addl %edx, %eax
+; X86_64-NEXT:    cmpb %al, %dil
+; X86_64-NEXT:    cmovgel %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a1_wide = add i32 %a1_wide_orig, %inc
@@ -185,39 +225,56 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
 ; One value come from regs
 
 define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounwind {
-; I386-LABEL: negative_CopyFromReg:
-; I386:       # %bb.0:
-; I386-NEXT:    movb {{[0-9]+}}(%esp), %al
-; I386-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; I386-NEXT:    cmpb %cl, %al
-; I386-NEXT:    jge .LBB3_2
-; I386-NEXT:  # %bb.1:
-; I386-NEXT:    movl %ecx, %eax
-; I386-NEXT:  .LBB3_2:
-; I386-NEXT:    retl
+; I386-NOCMOV-LABEL: negative_CopyFromReg:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    cmpb %cl, %al
+; I386-NOCMOV-NEXT:    jge .LBB3_2
+; I386-NOCMOV-NEXT:  # %bb.1:
+; I386-NOCMOV-NEXT:    movl %ecx, %eax
+; I386-NOCMOV-NEXT:  .LBB3_2:
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: negative_CopyFromReg:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    cmovgel %ecx, %eax
+; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT:    retl
 ;
-; I686-LABEL: negative_CopyFromReg:
-; I686:       # %bb.0:
-; I686-NEXT:    movb {{[0-9]+}}(%esp), %al
-; I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; I686-NEXT:    cmpb %cl, %al
-; I686-NEXT:    jge .LBB3_2
-; I686-NEXT:  # %bb.1:
-; I686-NEXT:    movl %ecx, %eax
-; I686-NEXT:  .LBB3_2:
-; I686-NEXT:    retl
+; I686-NOCMOV-LABEL: negative_CopyFromReg:
+; I686-NOCMOV:       # %bb.0:
+; I686-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %al
+; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    cmpb %cl, %al
+; I686-NOCMOV-NEXT:    jge .LBB3_2
+; I686-NOCMOV-NEXT:  # %bb.1:
+; I686-NOCMOV-NEXT:    movl %ecx, %eax
+; I686-NOCMOV-NEXT:  .LBB3_2:
+; I686-NOCMOV-NEXT:    retl
+;
+; I686-CMOV-LABEL: negative_CopyFromReg:
+; I686-CMOV:       # %bb.0:
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    cmovgel %ecx, %eax
+; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: negative_CopyFromReg:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %edi, %eax
-; X86_64-NEXT:    addl %edx, %esi
-; X86_64-NEXT:    cmpb %sil, %al
-; X86_64-NEXT:    jge .LBB3_2
-; X86_64-NEXT:  # %bb.1:
 ; X86_64-NEXT:    movl %esi, %eax
-; X86_64-NEXT:  .LBB3_2:
+; X86_64-NEXT:    addl %edx, %eax
+; X86_64-NEXT:    cmpb %al, %dil
+; X86_64-NEXT:    cmovgel %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a2_wide = add i32 %a2_wide_orig, %inc
@@ -231,36 +288,51 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; Both values come from regs
 
 define i8 @negative_CopyFromRegs(i32 %a1_wide, i32 %a2_wide) nounwind {
-; I386-LABEL: negative_CopyFromRegs:
-; I386:       # %bb.0:
-; I386-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; I386-NEXT:    movb {{[0-9]+}}(%esp), %al
-; I386-NEXT:    cmpb %cl, %al
-; I386-NEXT:    jge .LBB4_2
-; I386-NEXT:  # %bb.1:
-; I386-NEXT:    movl %ecx, %eax
-; I386-NEXT:  .LBB4_2:
-; I386-NEXT:    retl
+; I386-NOCMOV-LABEL: negative_CopyFromRegs:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; I386-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %al
+; I386-NOCMOV-NEXT:    cmpb %cl, %al
+; I386-NOCMOV-NEXT:    jge .LBB4_2
+; I386-NOCMOV-NEXT:  # %bb.1:
+; I386-NOCMOV-NEXT:    movl %ecx, %eax
+; I386-NOCMOV-NEXT:  .LBB4_2:
+; I386-NOCMOV-NEXT:    retl
 ;
-; I686-LABEL: negative_CopyFromRegs:
-; I686:       # %bb.0:
-; I686-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; I686-NEXT:    movb {{[0-9]+}}(%esp), %al
-; I686-NEXT:    cmpb %cl, %al
-; I686-NEXT:    jge .LBB4_2
-; I686-NEXT:  # %bb.1:
-; I686-NEXT:    movl %ecx, %eax
-; I686-NEXT:  .LBB4_2:
-; I686-NEXT:    retl
+; I386-CMOV-LABEL: negative_CopyFromRegs:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    cmovgel %ecx, %eax
+; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT:    retl
+;
+; I686-NOCMOV-LABEL: negative_CopyFromRegs:
+; I686-NOCMOV:       # %bb.0:
+; I686-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; I686-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %al
+; I686-NOCMOV-NEXT:    cmpb %cl, %al
+; I686-NOCMOV-NEXT:    jge .LBB4_2
+; I686-NOCMOV-NEXT:  # %bb.1:
+; I686-NOCMOV-NEXT:    movl %ecx, %eax
+; I686-NOCMOV-NEXT:  .LBB4_2:
+; I686-NOCMOV-NEXT:    retl
+;
+; I686-CMOV-LABEL: negative_CopyFromRegs:
+; I686-CMOV:       # %bb.0:
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    cmovgel %ecx, %eax
+; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: negative_CopyFromRegs:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    movl %edi, %eax
-; X86_64-NEXT:    cmpb %sil, %al
-; X86_64-NEXT:    jge .LBB4_2
-; X86_64-NEXT:  # %bb.1:
 ; X86_64-NEXT:    movl %esi, %eax
-; X86_64-NEXT:  .LBB4_2:
+; X86_64-NEXT:    cmpb %al, %dil
+; X86_64-NEXT:    cmovgel %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
   %a1 = trunc i32 %a1_wide to i8
diff --git a/llvm/test/CodeGen/X86/cmov-promotion.ll b/llvm/test/CodeGen/X86/cmov-promotion.ll
index 59ddc77..4a98b6c 100644
--- a/llvm/test/CodeGen/X86/cmov-promotion.ll
+++ b/llvm/test/CodeGen/X86/cmov-promotion.ll
@@ -6,12 +6,9 @@ define i16 @cmov_zpromotion_8_to_16(i1 %c) {
 ; CMOV-LABEL: cmov_zpromotion_8_to_16:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    testb $1, %dil
-; CMOV-NEXT:    movb $117, %al
-; CMOV-NEXT:    jne .LBB0_2
-; CMOV-NEXT:  # %bb.1:
-; CMOV-NEXT:    movb $-19, %al
-; CMOV-NEXT:  .LBB0_2:
-; CMOV-NEXT:    movzbl %al, %eax
+; CMOV-NEXT:    movl $117, %ecx
+; CMOV-NEXT:    movl $237, %eax
+; CMOV-NEXT:    cmovnel %ecx, %eax
 ; CMOV-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CMOV-NEXT:    retq
 ;
@@ -35,12 +32,9 @@ define i32 @cmov_zpromotion_8_to_32(i1 %c) {
 ; CMOV-LABEL: cmov_zpromotion_8_to_32:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    testb $1, %dil
-; CMOV-NEXT:    movb $126, %al
-; CMOV-NEXT:    jne .LBB1_2
-; CMOV-NEXT:  # %bb.1:
-; CMOV-NEXT:    movb $-1, %al
-; CMOV-NEXT:  .LBB1_2:
-; CMOV-NEXT:    movzbl %al, %eax
+; CMOV-NEXT:    movl $126, %ecx
+; CMOV-NEXT:    movl $255, %eax
+; CMOV-NEXT:    cmovnel %ecx, %eax
 ; CMOV-NEXT:    retq
 ;
 ; NO_CMOV-LABEL: cmov_zpromotion_8_to_32:
@@ -62,12 +56,9 @@ define i64 @cmov_zpromotion_8_to_64(i1 %c) {
 ; CMOV-LABEL: cmov_zpromotion_8_to_64:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    testb $1, %dil
-; CMOV-NEXT:    movb $126, %al
-; CMOV-NEXT:    jne .LBB2_2
-; CMOV-NEXT:  # %bb.1:
-; CMOV-NEXT:    movb $-1, %al
-; CMOV-NEXT:  .LBB2_2:
-; CMOV-NEXT:    movzbl %al, %eax
+; CMOV-NEXT:    movl $126, %ecx
+; CMOV-NEXT:    movl $255, %eax
+; CMOV-NEXT:    cmovnel %ecx, %eax
 ; CMOV-NEXT:    retq
 ;
 ; NO_CMOV-LABEL: cmov_zpromotion_8_to_64:
@@ -161,12 +152,10 @@ define i16 @cmov_spromotion_8_to_16(i1 %c) {
 ; CMOV-LABEL: cmov_spromotion_8_to_16:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    testb $1, %dil
-; CMOV-NEXT:    movb $117, %al
-; CMOV-NEXT:    jne .LBB6_2
-; CMOV-NEXT:  # %bb.1:
-; CMOV-NEXT:    movb $-19, %al
-; CMOV-NEXT:  .LBB6_2:
-; CMOV-NEXT:    movsbl %al, %eax
+; CMOV-NEXT:    movl $117, %eax
+; CMOV-NEXT:    movl $237, %ecx
+; CMOV-NEXT:    cmovnel %eax, %ecx
+; CMOV-NEXT:    movsbl %cl, %eax
 ; CMOV-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CMOV-NEXT:    retq
 ;
@@ -190,12 +179,10 @@ define i32 @cmov_spromotion_8_to_32(i1 %c) {
 ; CMOV-LABEL: cmov_spromotion_8_to_32:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    testb $1, %dil
-; CMOV-NEXT:    movb $126, %al
-; CMOV-NEXT:    jne .LBB7_2
-; CMOV-NEXT:  # %bb.1:
-; CMOV-NEXT:    movb $-1, %al
-; CMOV-NEXT:  .LBB7_2:
-; CMOV-NEXT:    movsbl %al, %eax
+; CMOV-NEXT:    movl $126, %eax
+; CMOV-NEXT:    movl $255, %ecx
+; CMOV-NEXT:    cmovnel %eax, %ecx
+; CMOV-NEXT:    movsbl %cl, %eax
 ; CMOV-NEXT:    retq
 ;
 ; NO_CMOV-LABEL: cmov_spromotion_8_to_32:
@@ -217,12 +204,10 @@ define i64 @cmov_spromotion_8_to_64(i1 %c) {
 ; CMOV-LABEL: cmov_spromotion_8_to_64:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    testb $1, %dil
-; CMOV-NEXT:    movb $126, %al
-; CMOV-NEXT:    jne .LBB8_2
-; CMOV-NEXT:  # %bb.1:
-; CMOV-NEXT:    movb $-1, %al
-; CMOV-NEXT:  .LBB8_2:
-; CMOV-NEXT:    movsbq %al, %rax
+; CMOV-NEXT:    movl $126, %eax
+; CMOV-NEXT:    movl $255, %ecx
+; CMOV-NEXT:    cmovnel %eax, %ecx
+; CMOV-NEXT:    movsbq %cl, %rax
 ; CMOV-NEXT:    retq
 ;
 ; NO_CMOV-LABEL: cmov_spromotion_8_to_64:
diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll
index a504538..612df79 100644
--- a/llvm/test/CodeGen/X86/cmov.ll
+++ b/llvm/test/CodeGen/X86/cmov.ll
@@ -91,23 +91,21 @@ define i1 @test4() nounwind {
 ; CHECK-NEXT:    movb {{.*}}(%rip), %cl
 ; CHECK-NEXT:  .LBB3_2: # %func_4.exit.i
 ; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    testb %dl, %dl
 ; CHECK-NEXT:    setne %bl
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    je .LBB3_4
-; CHECK-NEXT:  # %bb.3: # %func_4.exit.i
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:  .LBB3_4: # %func_4.exit.i
+; CHECK-NEXT:    movzbl %al, %ecx
+; CHECK-NEXT:    cmovnel %esi, %ecx
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB3_7
-; CHECK-NEXT:  # %bb.5: # %func_4.exit.i
+; CHECK-NEXT:    je .LBB3_5
+; CHECK-NEXT:  # %bb.3: # %func_4.exit.i
 ; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    jne .LBB3_7
-; CHECK-NEXT:  # %bb.6: # %bb.i.i
+; CHECK-NEXT:    jne .LBB3_5
+; CHECK-NEXT:  # %bb.4: # %bb.i.i
 ; CHECK-NEXT:    movb {{.*}}(%rip), %cl
 ; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:  .LBB3_7: # %func_1.exit
+; CHECK-NEXT:  .LBB3_5: # %func_1.exit
 ; CHECK-NEXT:    movb %cl, {{.*}}(%rip)
 ; CHECK-NEXT:    movzbl %cl, %esi
 ; CHECK-NEXT:    movl $_2E_str, %edi
@@ -193,14 +191,9 @@ entry:
 define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    jne .LBB6_1
-; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB6_1:
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovel %edx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %d = select i1 %c, i8 %a, i8 %b
diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll
index 454a798..0360c0a 100644
--- a/llvm/test/CodeGen/X86/cmovcmov.ll
+++ b/llvm/test/CodeGen/X86/cmovcmov.ll
@@ -325,26 +325,11 @@ define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {
 ; CMOV-LABEL: no_cascade_opt:
 ; CMOV:       # %bb.0: # %entry
 ; CMOV-NEXT:    cmpl %edx, %esi
-; CMOV-NEXT:    movb $20, %al
-; CMOV-NEXT:    movb $20, %dl
-; CMOV-NEXT:    jge .LBB7_1
-; CMOV-NEXT:  # %bb.2: # %entry
-; CMOV-NEXT:    jle .LBB7_3
-; CMOV-NEXT:  .LBB7_4: # %entry
-; CMOV-NEXT:    testl %edi, %edi
-; CMOV-NEXT:    jne .LBB7_5
-; CMOV-NEXT:  .LBB7_6: # %entry
-; CMOV-NEXT:    movb %al, {{.*}}(%rip)
-; CMOV-NEXT:    retq
-; CMOV-NEXT:  .LBB7_1: # %entry
-; CMOV-NEXT:    movl %ecx, %edx
-; CMOV-NEXT:    jg .LBB7_4
-; CMOV-NEXT:  .LBB7_3: # %entry
-; CMOV-NEXT:    movl %edx, %eax
+; CMOV-NEXT:    movl $20, %eax
+; CMOV-NEXT:    cmovll %eax, %ecx
+; CMOV-NEXT:    cmovlel %ecx, %eax
 ; CMOV-NEXT:    testl %edi, %edi
-; CMOV-NEXT:    je .LBB7_6
-; CMOV-NEXT:  .LBB7_5: # %entry
-; CMOV-NEXT:    movl %edx, %eax
+; CMOV-NEXT:    cmovnel %ecx, %eax
 ; CMOV-NEXT:    movb %al, {{.*}}(%rip)
 ; CMOV-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll
index a742158..ce4aed4 100644
--- a/llvm/test/CodeGen/X86/copy-eflags.ll
+++ b/llvm/test/CodeGen/X86/copy-eflags.ll
@@ -247,35 +247,26 @@ define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %
 ;
 ; X64-LABEL: PR37100:
 ; X64:       # %bb.0: # %bb
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; X64-NEXT:    jmp .LBB3_1
+; X64-NEXT:    movzbl %cl, %r11d
 ; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_5: # %bb1
-; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X64-NEXT:    movl %r10d, %eax
-; X64-NEXT:    cltd
-; X64-NEXT:    idivl %esi
 ; X64-NEXT:  .LBB3_1: # %bb1
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movsbq %dil, %rax
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpq %rax, %r11
-; X64-NEXT:    setl %sil
-; X64-NEXT:    negl %esi
-; X64-NEXT:    cmpq %rax, %r11
-; X64-NEXT:    jl .LBB3_3
-; X64-NEXT:  # %bb.2: # %bb1
-; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X64-NEXT:    movl %ecx, %edi
-; X64-NEXT:  .LBB3_3: # %bb1
-; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    cmpq %rax, %rsi
+; X64-NEXT:    setl %cl
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    cmpq %rax, %rsi
+; X64-NEXT:    movzbl %al, %edi
+; X64-NEXT:    cmovgel %r11d, %edi
 ; X64-NEXT:    movb %dil, (%r8)
-; X64-NEXT:    jl .LBB3_5
-; X64-NEXT:  # %bb.4: # %bb1
-; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X64-NEXT:    movl (%r9), %esi
-; X64-NEXT:    jmp .LBB3_5
+; X64-NEXT:    cmovgel (%r9), %ecx
+; X64-NEXT:    movl %r10d, %eax
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %ecx
+; X64-NEXT:    jmp .LBB3_1
 bb:
   br label %bb1
 
diff --git a/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll b/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
index 58b378a..a3171ed 100644
--- a/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10                                              | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1                  | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10                             -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10                                              | FileCheck %s --check-prefix=CHECK --check-prefixes=ISEL,SSE,SSE-ISEL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1                  | FileCheck %s --check-prefix=CHECK --check-prefixes=FASTISEL,SSE,SSE-FASTISEL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10                             -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefixes=ISEL,AVX,AVX-ISEL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefixes=FASTISEL,AVX,AVX-FASTISEL
 
 
 define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) {
@@ -278,18 +278,26 @@ define float @select_icmp_sle_f32(i64 %a, i64 %b, float %c, float %d) {
 }
 
 define i8 @select_icmp_sle_i8(i64 %a, i64 %b, i8 %c, i8 %d) {
-; CHECK-LABEL: select_icmp_sle_i8:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    jle LBB12_1
-; CHECK-NEXT:  ## %bb.2:
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
-; CHECK-NEXT:  LBB12_1:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; ISEL-LABEL: select_icmp_sle_i8:
+; ISEL:       ## %bb.0:
+; ISEL-NEXT:    movl %edx, %eax
+; ISEL-NEXT:    cmpq %rsi, %rdi
+; ISEL-NEXT:    cmovgl %ecx, %eax
+; ISEL-NEXT:    ## kill: def $al killed $al killed $eax
+; ISEL-NEXT:    retq
+;
+; FASTISEL-LABEL: select_icmp_sle_i8:
+; FASTISEL:       ## %bb.0:
+; FASTISEL-NEXT:    cmpq %rsi, %rdi
+; FASTISEL-NEXT:    jle LBB12_1
+; FASTISEL-NEXT:  ## %bb.2:
+; FASTISEL-NEXT:    movl %ecx, %eax
+; FASTISEL-NEXT:    ## kill: def $al killed $al killed $eax
+; FASTISEL-NEXT:    retq
+; FASTISEL-NEXT:  LBB12_1:
+; FASTISEL-NEXT:    movl %edx, %eax
+; FASTISEL-NEXT:    ## kill: def $al killed $al killed $eax
+; FASTISEL-NEXT:    retq
   %1 = icmp sle i64 %a, %b
   %2 = select i1 %1, i8 %c, i8 %d
   ret i8 %2
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 0e1bcb2..c7ca98d 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -36,19 +36,17 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 ;
 ; X64-LABEL: var_shift_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    andb $7, %dl
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shlb %cl, %dil
+; X64-NEXT:    shlb %cl, %al
 ; X64-NEXT:    movb $8, %cl
 ; X64-NEXT:    subb %dl, %cl
 ; X64-NEXT:    shrb %cl, %sil
+; X64-NEXT:    orb %al, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    testb %dl, %dl
-; X64-NEXT:    je .LBB0_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    orb %sil, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:  .LBB0_2:
+; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %z)
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 09d63b6..a655c58 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -36,19 +36,17 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 ;
 ; X64-LABEL: var_shift_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    andb $7, %dl
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrb %cl, %sil
+; X64-NEXT:    shrb %cl, %al
 ; X64-NEXT:    movb $8, %cl
 ; X64-NEXT:    subb %dl, %cl
 ; X64-NEXT:    shlb %cl, %dil
+; X64-NEXT:    orb %al, %dil
+; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    testb %dl, %dl
-; X64-NEXT:    je .LBB0_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    orb %sil, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:  .LBB0_2:
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp = tail call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 %z)
diff --git a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
index 495ead2..4c233c4 100644
--- a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -19,8 +19,7 @@ target triple = "i386-apple-macosx10.5"
 ; CHECK-LABEL: eflagsLiveInPrologue:
 ;
 ; DISABLE: pushl
-; DISABLE-NEXT: pushl
-; DISABLE-NEXT: subl $20, %esp
+; DISABLE-NEXT: subl $8, %esp
 ;
 ; CHECK: movl L_a$non_lazy_ptr, [[A:%[a-z]+]]
 ; CHECK-NEXT: cmpl $0, ([[A]])
@@ -40,35 +39,26 @@ target triple = "i386-apple-macosx10.5"
 ; The for.end block is split to accomadate the different selects.
 ; We are interested in the one with the call, so skip until the branch.
 ; CHECK: [[FOREND_LABEL]]:
-; CHECK-NEXT: xorl
+
+; ENABLE: pushl
+; ENABLE-NEXT: subl $8, %esp
+
+; CHECK: xorl [[CMOVE_VAL:%edx]], [[CMOVE_VAL]]
 ; CHECK-NEXT: cmpb $0, _d
-; CHECK-NEXT: movl $0, %edx
-; CHECK-NEXT: jne [[CALL_LABEL:LBB[0-9_]+]]
-;
-; CHECK: movb $6, %dl
-;
-; CHECK: [[CALL_LABEL]]
-;
-; ENABLE-NEXT: pushl
-; ENABLE-NEXT: pushl
-; We must not use sub here otherwise we will clobber the eflags.
-; ENABLE-NEXT: leal -20(%esp), %esp
-;
-; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]]
-; CHECK-NEXT: movb %dl, ([[E]])
-; CHECK-NEXT: movzbl %dl, [[CONV:%[a-z]+]]
-; CHECK-NEXT: movl $6, [[CONV:%[a-z]+]]
+; CHECK-NEXT: movl $6, [[IMM_VAL:%ecx]]
 ; The eflags is used in the next instruction.
 ; If that instruction disappear, we are not exercising the bug
 ; anymore.
-; CHECK-NEXT: cmovnel {{%[a-z]+}}, [[CONV]]
-;
-; Skip all the crust of vaarg lowering.
+; CHECK-NEXT: cmovnel [[CMOVE_VAL]], [[IMM_VAL]]
+
+; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]]
+; CHECK-NEXT: movb %cl, ([[E]])
+; CHECK-NEXT: leal 1(%ecx), %esi
+
 ; CHECK: calll _varfunc
 ; Set the return value to 0.
 ; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: addl $20, %esp
-; CHECK-NEXT: popl
+; CHECK-NEXT: addl $8, %esp
 ; CHECK-NEXT: popl
 ; CHECK-NEXT: retl
 define i32 @eflagsLiveInPrologue() #0 {
diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index 7d2eae7..c032020 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -1019,22 +1019,17 @@ define i16 @scalar_i16_signed_mem_mem(i16* %a1_addr, i16* %a2_addr) nounwind {
 define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind {
 ; X64-LABEL: scalar_i8_signed_reg_reg:
 ; X64:       # %bb.0:
-; X64-NEXT:    cmpb %sil, %dil
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    cmpb %al, %dil
 ; X64-NEXT:    setle %cl
-; X64-NEXT:    movl %esi, %edx
-; X64-NEXT:    jg .LBB15_2
-; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:  .LBB15_2:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    jge .LBB15_4
-; X64-NEXT:  # %bb.3:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:  .LBB15_4:
-; X64-NEXT:    subb %dl, %al
+; X64-NEXT:    cmovgl %esi, %edx
+; X64-NEXT:    cmovgel %edi, %eax
 ; X64-NEXT:    addb %cl, %cl
 ; X64-NEXT:    decb %cl
+; X64-NEXT:    subb %dl, %al
 ; X64-NEXT:    shrb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    mulb %cl
 ; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    retq
@@ -1079,17 +1074,12 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind {
 ; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    cmpb %al, %dil
 ; X64-NEXT:    setbe %cl
-; X64-NEXT:    ja .LBB16_1
-; X64-NEXT:  # %bb.2:
 ; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    jmp .LBB16_3
-; X64-NEXT:  .LBB16_1:
-; X64-NEXT:    movl %eax, %edx
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:  .LBB16_3:
-; X64-NEXT:    subb %dl, %al
+; X64-NEXT:    cmoval %esi, %edx
+; X64-NEXT:    cmoval %edi, %eax
 ; X64-NEXT:    addb %cl, %cl
 ; X64-NEXT:    decb %cl
+; X64-NEXT:    subb %dl, %al
 ; X64-NEXT:    shrb %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    mulb %cl
@@ -1133,23 +1123,18 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind {
 define i8 @scalar_i8_signed_mem_reg(i8* %a1_addr, i8 %a2) nounwind {
 ; X64-LABEL: scalar_i8_signed_mem_reg:
 ; X64:       # %bb.0:
-; X64-NEXT:    movb (%rdi), %cl
+; X64-NEXT:    movzbl (%rdi), %ecx
 ; X64-NEXT:    cmpb %sil, %cl
 ; X64-NEXT:    setle %dl
-; X64-NEXT:    movl %esi, %edi
-; X64-NEXT:    jg .LBB17_2
-; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    movl %ecx, %edi
-; X64-NEXT:  .LBB17_2:
+; X64-NEXT:    cmovgl %esi, %edi
 ; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    jge .LBB17_4
-; X64-NEXT:  # %bb.3:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:  .LBB17_4:
-; X64-NEXT:    subb %dil, %al
+; X64-NEXT:    cmovll %esi, %eax
 ; X64-NEXT:    addb %dl, %dl
 ; X64-NEXT:    decb %dl
+; X64-NEXT:    subb %dil, %al
 ; X64-NEXT:    shrb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    mulb %dl
 ; X64-NEXT:    addb %cl, %al
 ; X64-NEXT:    retq
@@ -1193,23 +1178,17 @@ define i8 @scalar_i8_signed_mem_reg(i8* %a1_addr, i8 %a2) nounwind {
 define i8 @scalar_i8_signed_reg_mem(i8 %a1, i8* %a2_addr) nounwind {
 ; X64-LABEL: scalar_i8_signed_reg_mem:
 ; X64:       # %bb.0:
-; X64-NEXT:    movb (%rsi), %dl
-; X64-NEXT:    cmpb %dl, %dil
+; X64-NEXT:    movzbl (%rsi), %eax
+; X64-NEXT:    cmpb %al, %dil
 ; X64-NEXT:    setle %cl
-; X64-NEXT:    movl %edx, %esi
-; X64-NEXT:    jg .LBB18_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    movl %edi, %esi
-; X64-NEXT:  .LBB18_2:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    jge .LBB18_4
-; X64-NEXT:  # %bb.3:
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:  .LBB18_4:
-; X64-NEXT:    subb %sil, %al
+; X64-NEXT:    movl %edi, %edx
+; X64-NEXT:    cmovgl %eax, %edx
+; X64-NEXT:    cmovgel %edi, %eax
 ; X64-NEXT:    addb %cl, %cl
 ; X64-NEXT:    decb %cl
+; X64-NEXT:    subb %dl, %al
 ; X64-NEXT:    shrb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    mulb %cl
 ; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    retq
@@ -1253,26 +1232,20 @@ define i8 @scalar_i8_signed_reg_mem(i8 %a1, i8* %a2_addr) nounwind {
 define i8 @scalar_i8_signed_mem_mem(i8* %a1_addr, i8* %a2_addr) nounwind {
 ; X64-LABEL: scalar_i8_signed_mem_mem:
 ; X64:       # %bb.0:
-; X64-NEXT:    movb (%rdi), %dil
-; X64-NEXT:    movb (%rsi), %cl
-; X64-NEXT:    cmpb %cl, %dil
+; X64-NEXT:    movzbl (%rdi), %ecx
+; X64-NEXT:    movzbl (%rsi), %eax
+; X64-NEXT:    cmpb %al, %cl
 ; X64-NEXT:    setle %dl
 ; X64-NEXT:    movl %ecx, %esi
-; X64-NEXT:    jg .LBB19_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    movl %edi, %esi
-; X64-NEXT:  .LBB19_2:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    jge .LBB19_4
-; X64-NEXT:  # %bb.3:
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:  .LBB19_4:
-; X64-NEXT:    subb %sil, %al
+; X64-NEXT:    cmovgl %eax, %esi
+; X64-NEXT:    cmovgel %ecx, %eax
 ; X64-NEXT:    addb %dl, %dl
 ; X64-NEXT:    decb %dl
+; X64-NEXT:    subb %sil, %al
 ; X64-NEXT:    shrb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    mulb %dl
-; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    addb %cl, %al
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: scalar_i8_signed_mem_mem:
diff --git a/llvm/test/CodeGen/X86/pr5145.ll b/llvm/test/CodeGen/X86/pr5145.ll
index 02e9b4c..cb7e1bc 100644
--- a/llvm/test/CodeGen/X86/pr5145.ll
+++ b/llvm/test/CodeGen/X86/pr5145.ll
@@ -10,61 +10,49 @@ define void @atomic_maxmin_i8() {
 ; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpb $4, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    jg .LBB0_3
-; CHECK-NEXT:  # %bb.2: # %atomicrmw.start
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb $5, %cl
-; CHECK-NEXT:  .LBB0_3: # %atomicrmw.start
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl $5, %ecx
+; CHECK-NEXT:    cmovgl %eax, %ecx
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
 ; CHECK-NEXT:    jne .LBB0_1
-; CHECK-NEXT:  # %bb.4: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
 ; CHECK-NEXT:    movb {{.*}}(%rip), %al
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start2
+; CHECK-NEXT:  .LBB0_3: # %atomicrmw.start2
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpb $7, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    jl .LBB0_7
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start2
-; CHECK-NEXT:    # in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    movb $6, %cl
-; CHECK-NEXT:  .LBB0_7: # %atomicrmw.start2
-; CHECK-NEXT:    # in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl $6, %ecx
+; CHECK-NEXT:    cmovll %eax, %ecx
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
-; CHECK-NEXT:    jne .LBB0_5
-; CHECK-NEXT:  # %bb.8: # %atomicrmw.end1
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  # %bb.4: # %atomicrmw.end1
 ; CHECK-NEXT:    movb {{.*}}(%rip), %al
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_9: # %atomicrmw.start8
+; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start8
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpb $7, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    ja .LBB0_11
-; CHECK-NEXT:  # %bb.10: # %atomicrmw.start8
-; CHECK-NEXT:    # in Loop: Header=BB0_9 Depth=1
-; CHECK-NEXT:    movb $7, %cl
-; CHECK-NEXT:  .LBB0_11: # %atomicrmw.start8
-; CHECK-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl $7, %ecx
+; CHECK-NEXT:    cmoval %eax, %ecx
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
-; CHECK-NEXT:    jne .LBB0_9
-; CHECK-NEXT:  # %bb.12: # %atomicrmw.end7
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.6: # %atomicrmw.end7
 ; CHECK-NEXT:    movb {{.*}}(%rip), %al
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_13: # %atomicrmw.start14
+; CHECK-NEXT:  .LBB0_7: # %atomicrmw.start14
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpb $9, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    jb .LBB0_15
-; CHECK-NEXT:  # %bb.14: # %atomicrmw.start14
-; CHECK-NEXT:    # in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movb $8, %cl
-; CHECK-NEXT:  .LBB0_15: # %atomicrmw.start14
-; CHECK-NEXT:    # in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl $8, %ecx
+; CHECK-NEXT:    cmovbl %eax, %ecx
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
-; CHECK-NEXT:    jne .LBB0_13
-; CHECK-NEXT:  # %bb.16: # %atomicrmw.end13
+; CHECK-NEXT:    jne .LBB0_7
+; CHECK-NEXT:  # %bb.8: # %atomicrmw.end13
 ; CHECK-NEXT:    retq
   %1 = atomicrmw max  i8* @sc8, i8 5 acquire
   %2 = atomicrmw min  i8* @sc8, i8 6 acquire
diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll
index b5524e1..e395bb4 100644
--- a/llvm/test/CodeGen/X86/sadd_sat.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat.ll
@@ -96,32 +96,30 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    shlb $4, %dl
 ; X86-NEXT:    shlb $4, %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addb %dl, %cl
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movb %al, %ah
+; X86-NEXT:    addb %dl, %ah
 ; X86-NEXT:    setns %cl
+; X86-NEXT:    addl $127, %ecx
 ; X86-NEXT:    addb %dl, %al
-; X86-NEXT:    jno .LBB2_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    addb $127, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    cmovol %ecx, %eax
 ; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shlb $4, %sil
-; X64-NEXT:    shlb $4, %al
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    addb %sil, %cl
-; X64-NEXT:    setns %cl
+; X64-NEXT:    shlb $4, %dil
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    addb %sil, %al
-; X64-NEXT:    jno .LBB2_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    addb $127, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:  .LBB2_2:
+; X64-NEXT:    setns %cl
+; X64-NEXT:    addl $127, %ecx
+; X64-NEXT:    addb %sil, %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    cmovol %ecx, %eax
 ; X64-NEXT:    sarb $4, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 186141a..58cdc87 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -499,32 +499,30 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movb (%rdi), %cl
 ; SSE-NEXT:    movb (%rsi), %dil
+; SSE-NEXT:    xorl %esi, %esi
 ; SSE-NEXT:    movl %ecx, %eax
 ; SSE-NEXT:    addb %dil, %al
 ; SSE-NEXT:    setns %sil
+; SSE-NEXT:    addl $127, %esi
 ; SSE-NEXT:    addb %dil, %cl
-; SSE-NEXT:    jno .LBB13_2
-; SSE-NEXT:  # %bb.1:
-; SSE-NEXT:    addb $127, %sil
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:  .LBB13_2:
-; SSE-NEXT:    movb %cl, (%rdx)
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    cmovol %esi, %eax
+; SSE-NEXT:    movb %al, (%rdx)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v1i8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movb (%rdi), %cl
 ; AVX-NEXT:    movb (%rsi), %dil
+; AVX-NEXT:    xorl %esi, %esi
 ; AVX-NEXT:    movl %ecx, %eax
 ; AVX-NEXT:    addb %dil, %al
 ; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $127, %esi
 ; AVX-NEXT:    addb %dil, %cl
-; AVX-NEXT:    jno .LBB13_2
-; AVX-NEXT:  # %bb.1:
-; AVX-NEXT:    addb $127, %sil
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:  .LBB13_2:
-; AVX-NEXT:    movb %cl, (%rdx)
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    movb %al, (%rdx)
 ; AVX-NEXT:    retq
   %x = load <1 x i8>, <1 x i8>* %px
   %y = load <1 x i8>, <1 x i8>* %py
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 78e4d5a..63c78fa 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -10,12 +10,9 @@
 define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i8_using_min:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movl %edi, %eax
-; ANY-NEXT:    cmpb $-43, %al
-; ANY-NEXT:    jb .LBB0_2
-; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movb $-43, %al
-; ANY-NEXT:  .LBB0_2:
+; ANY-NEXT:    cmpb $-43, %dil
+; ANY-NEXT:    movl $213, %eax
+; ANY-NEXT:    cmovbl %edi, %eax
 ; ANY-NEXT:    addb $42, %al
 ; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
@@ -29,11 +26,10 @@ define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
 ; ANY:       # %bb.0:
 ; ANY-NEXT:    addb $42, %dil
-; ANY-NEXT:    movb $-1, %al
-; ANY-NEXT:    jb .LBB1_2
-; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movl %edi, %eax
-; ANY-NEXT:  .LBB1_2:
+; ANY-NEXT:    movzbl %dil, %ecx
+; ANY-NEXT:    movl $255, %eax
+; ANY-NEXT:    cmovael %ecx, %eax
+; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
   %a = add i8 %x, 42
   %c = icmp ugt i8 %x, %a
@@ -45,11 +41,10 @@ define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i8_using_cmp_notval:
 ; ANY:       # %bb.0:
 ; ANY-NEXT:    addb $42, %dil
-; ANY-NEXT:    movb $-1, %al
-; ANY-NEXT:    jb .LBB2_2
-; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movl %edi, %eax
-; ANY-NEXT:  .LBB2_2:
+; ANY-NEXT:    movzbl %dil, %ecx
+; ANY-NEXT:    movl $255, %eax
+; ANY-NEXT:    cmovael %ecx, %eax
+; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
   %a = add i8 %x, 42
   %c = icmp ugt i8 %x, -43
@@ -183,14 +178,11 @@ define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) {
 define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
 ; ANY-LABEL: unsigned_sat_variable_i8_using_min:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movl %edi, %eax
-; ANY-NEXT:    movl %esi, %ecx
-; ANY-NEXT:    notb %cl
-; ANY-NEXT:    cmpb %cl, %al
-; ANY-NEXT:    jb .LBB12_2
-; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movl %ecx, %eax
-; ANY-NEXT:  .LBB12_2:
+; ANY-NEXT:    movl %esi, %eax
+; ANY-NEXT:    notb %al
+; ANY-NEXT:    cmpb %al, %dil
+; ANY-NEXT:    movzbl %al, %eax
+; ANY-NEXT:    cmovbl %edi, %eax
 ; ANY-NEXT:    addb %sil, %al
 ; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
@@ -205,11 +197,10 @@ define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
 ; ANY-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
 ; ANY:       # %bb.0:
 ; ANY-NEXT:    addb %sil, %dil
-; ANY-NEXT:    movb $-1, %al
-; ANY-NEXT:    jb .LBB13_2
-; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movl %edi, %eax
-; ANY-NEXT:  .LBB13_2:
+; ANY-NEXT:    movzbl %dil, %ecx
+; ANY-NEXT:    movl $255, %eax
+; ANY-NEXT:    cmovael %ecx, %eax
+; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
   %a = add i8 %x, %y
   %c = icmp ugt i8 %x, %a
@@ -220,15 +211,15 @@ define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
 define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) {
 ; ANY-LABEL: unsigned_sat_variable_i8_using_cmp_notval:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movl %esi, %eax
-; ANY-NEXT:    notb %al
-; ANY-NEXT:    cmpb %al, %dil
-; ANY-NEXT:    movb $-1, %al
-; ANY-NEXT:    ja .LBB14_2
-; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    addb %sil, %dil
-; ANY-NEXT:    movl %edi, %eax
-; ANY-NEXT:  .LBB14_2:
+; ANY-NEXT:    # kill: def $esi killed $esi def $rsi
+; ANY-NEXT:    # kill: def $edi killed $edi def $rdi
+; ANY-NEXT:    leal (%rdi,%rsi), %eax
+; ANY-NEXT:    notb %sil
+; ANY-NEXT:    cmpb %sil, %dil
+; ANY-NEXT:    movzbl %al, %ecx
+; ANY-NEXT:    movl $255, %eax
+; ANY-NEXT:    cmovbel %ecx, %eax
+; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
   %noty = xor i8 %y, -1
   %a = add i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 558dc7d..6064917 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -1136,11 +1136,8 @@ define void @clamp_i8(i32 %src, i8* %dst) {
 ; GENERIC-NEXT:    movl $127, %eax
 ; GENERIC-NEXT:    cmovlel %edi, %eax
 ; GENERIC-NEXT:    cmpl $-128, %eax
-; GENERIC-NEXT:    movb $-128, %cl
-; GENERIC-NEXT:    jl LBB21_2
-; GENERIC-NEXT:  ## %bb.1:
-; GENERIC-NEXT:    movl %eax, %ecx
-; GENERIC-NEXT:  LBB21_2:
+; GENERIC-NEXT:    movl $128, %ecx
+; GENERIC-NEXT:    cmovgel %eax, %ecx
 ; GENERIC-NEXT:    movb %cl, (%rsi)
 ; GENERIC-NEXT:    retq
 ;
@@ -1148,30 +1145,24 @@ define void @clamp_i8(i32 %src, i8* %dst) {
 ; ATOM:       ## %bb.0:
 ; ATOM-NEXT:    cmpl $127, %edi
 ; ATOM-NEXT:    movl $127, %eax
-; ATOM-NEXT:    movb $-128, %cl
+; ATOM-NEXT:    movl $128, %ecx
 ; ATOM-NEXT:    cmovlel %edi, %eax
 ; ATOM-NEXT:    cmpl $-128, %eax
-; ATOM-NEXT:    jl LBB21_2
-; ATOM-NEXT:  ## %bb.1:
-; ATOM-NEXT:    movl %eax, %ecx
-; ATOM-NEXT:  LBB21_2:
+; ATOM-NEXT:    cmovgel %eax, %ecx
 ; ATOM-NEXT:    movb %cl, (%rsi)
 ; ATOM-NEXT:    retq
 ;
 ; ATHLON-LABEL: clamp_i8:
 ; ATHLON:       ## %bb.0:
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    cmpl $127, %edx
-; ATHLON-NEXT:    movl $127, %ecx
-; ATHLON-NEXT:    cmovlel %edx, %ecx
-; ATHLON-NEXT:    cmpl $-128, %ecx
-; ATHLON-NEXT:    movb $-128, %dl
-; ATHLON-NEXT:    jl LBB21_2
-; ATHLON-NEXT:  ## %bb.1:
-; ATHLON-NEXT:    movl %ecx, %edx
-; ATHLON-NEXT:  LBB21_2:
-; ATHLON-NEXT:    movb %dl, (%eax)
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    cmpl $127, %ecx
+; ATHLON-NEXT:    movl $127, %edx
+; ATHLON-NEXT:    cmovlel %ecx, %edx
+; ATHLON-NEXT:    cmpl $-128, %edx
+; ATHLON-NEXT:    movl $128, %ecx
+; ATHLON-NEXT:    cmovgel %edx, %ecx
+; ATHLON-NEXT:    movb %cl, (%eax)
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: clamp_i8:
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index 8f7989e..0f10649 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -379,11 +379,10 @@ define i8 @sel_67_neg125(i32 %x) {
 ; CHECK-LABEL: sel_67_neg125:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    cmpl $42, %edi
-; CHECK-NEXT:    movb $67, %al
-; CHECK-NEXT:    jg .LBB31_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    movb $-125, %al
-; CHECK-NEXT:  .LBB31_2:
+; CHECK-NEXT:    movl $67, %ecx
+; CHECK-NEXT:    movl $131, %eax
+; CHECK-NEXT:    cmovgl %ecx, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp sgt i32 %x, 42
   %sel = select i1 %cmp, i8 67, i8 -125
diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll
index 7fda35a..9a1633d 100644
--- a/llvm/test/CodeGen/X86/ssub_sat.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat.ll
@@ -96,32 +96,30 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    shlb $4, %dl
 ; X86-NEXT:    shlb $4, %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movb %al, %ah
+; X86-NEXT:    subb %dl, %ah
 ; X86-NEXT:    setns %cl
+; X86-NEXT:    addl $127, %ecx
 ; X86-NEXT:    subb %dl, %al
-; X86-NEXT:    jno .LBB2_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    addb $127, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    cmovol %ecx, %eax
 ; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shlb $4, %sil
-; X64-NEXT:    shlb $4, %al
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    subb %sil, %cl
-; X64-NEXT:    setns %cl
+; X64-NEXT:    shlb $4, %dil
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    subb %sil, %al
-; X64-NEXT:    jno .LBB2_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    addb $127, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:  .LBB2_2:
+; X64-NEXT:    setns %cl
+; X64-NEXT:    addl $127, %ecx
+; X64-NEXT:    subb %sil, %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    cmovol %ecx, %eax
 ; X64-NEXT:    sarb $4, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index b9adde1..70eaac5 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -499,32 +499,30 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movb (%rdi), %cl
 ; SSE-NEXT:    movb (%rsi), %dil
+; SSE-NEXT:    xorl %esi, %esi
 ; SSE-NEXT:    movl %ecx, %eax
 ; SSE-NEXT:    subb %dil, %al
 ; SSE-NEXT:    setns %sil
+; SSE-NEXT:    addl $127, %esi
 ; SSE-NEXT:    subb %dil, %cl
-; SSE-NEXT:    jno .LBB13_2
-; SSE-NEXT:  # %bb.1:
-; SSE-NEXT:    addb $127, %sil
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:  .LBB13_2:
-; SSE-NEXT:    movb %cl, (%rdx)
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    cmovol %esi, %eax
+; SSE-NEXT:    movb %al, (%rdx)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v1i8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movb (%rdi), %cl
 ; AVX-NEXT:    movb (%rsi), %dil
+; AVX-NEXT:    xorl %esi, %esi
 ; AVX-NEXT:    movl %ecx, %eax
 ; AVX-NEXT:    subb %dil, %al
 ; AVX-NEXT:    setns %sil
+; AVX-NEXT:    addl $127, %esi
 ; AVX-NEXT:    subb %dil, %cl
-; AVX-NEXT:    jno .LBB13_2
-; AVX-NEXT:  # %bb.1:
-; AVX-NEXT:    addb $127, %sil
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:  .LBB13_2:
-; AVX-NEXT:    movb %cl, (%rdx)
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    cmovol %esi, %eax
+; AVX-NEXT:    movb %al, (%rdx)
 ; AVX-NEXT:    retq
   %x = load <1 x i8>, <1 x i8>* %px
   %y = load <1 x i8>, <1 x i8>* %py
diff --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll
index 677db07..a483531 100644
--- a/llvm/test/CodeGen/X86/uadd_sat.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat.ll
@@ -51,17 +51,16 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $4, %cl
-; X86-NEXT:    addb %al, %cl
-; X86-NEXT:    movb $-1, %al
-; X86-NEXT:    jb .LBB2_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    addb %cl, %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl $255, %eax
+; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    shrb $4, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
@@ -69,12 +68,11 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X64-NEXT:    shlb $4, %sil
 ; X64-NEXT:    shlb $4, %dil
 ; X64-NEXT:    addb %sil, %dil
-; X64-NEXT:    movb $-1, %al
-; X64-NEXT:    jb .LBB2_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:  .LBB2_2:
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    movl $255, %eax
+; X64-NEXT:    cmovael %ecx, %eax
 ; X64-NEXT:    shrb $4, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %y);
   ret i4 %tmp;
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 83fe8c1..06e751a 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -499,11 +499,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movb (%rdi), %al
 ; SSE-NEXT:    addb (%rsi), %al
-; SSE-NEXT:    movb $-1, %cl
-; SSE-NEXT:    jb .LBB13_2
-; SSE-NEXT:  # %bb.1:
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:  .LBB13_2:
+; SSE-NEXT:    movzbl %al, %eax
+; SSE-NEXT:    movl $255, %ecx
+; SSE-NEXT:    cmovael %eax, %ecx
 ; SSE-NEXT:    movb %cl, (%rdx)
 ; SSE-NEXT:    retq
 ;
@@ -511,11 +509,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movb (%rdi), %al
 ; AVX-NEXT:    addb (%rsi), %al
-; AVX-NEXT:    movb $-1, %cl
-; AVX-NEXT:    jb .LBB13_2
-; AVX-NEXT:  # %bb.1:
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:  .LBB13_2:
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    movl $255, %ecx
+; AVX-NEXT:    cmovael %eax, %ecx
 ; AVX-NEXT:    movb %cl, (%rdx)
 ; AVX-NEXT:    retq
   %x = load <1 x i8>, <1 x i8>* %px
diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll
index ef822fa..0ad2bd6 100644
--- a/llvm/test/CodeGen/X86/usub_sat.ll
+++ b/llvm/test/CodeGen/X86/usub_sat.ll
@@ -55,25 +55,22 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shlb $4, %cl
 ; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    subb %cl, %al
-; X86-NEXT:    jae .LBB2_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:  .LBB2_2:
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    cmovbl %edx, %eax
 ; X86-NEXT:    shrb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shlb $4, %sil
-; X64-NEXT:    shlb $4, %al
-; X64-NEXT:    subb %sil, %al
-; X64-NEXT:    jae .LBB2_2
-; X64-NEXT:  # %bb.1:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:  .LBB2_2:
+; X64-NEXT:    shlb $4, %dil
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    subb %sil, %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    cmovbl %ecx, %eax
 ; X64-NEXT:    shrb $4, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index e3d47d2..a52bada 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -498,22 +498,20 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
 ; SSE-LABEL: v1i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movb (%rdi), %al
+; SSE-NEXT:    xorl %ecx, %ecx
 ; SSE-NEXT:    subb (%rsi), %al
-; SSE-NEXT:    jae .LBB13_2
-; SSE-NEXT:  # %bb.1:
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:  .LBB13_2:
+; SSE-NEXT:    movzbl %al, %eax
+; SSE-NEXT:    cmovbl %ecx, %eax
 ; SSE-NEXT:    movb %al, (%rdx)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v1i8:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movb (%rdi), %al
+; AVX-NEXT:    xorl %ecx, %ecx
 ; AVX-NEXT:    subb (%rsi), %al
-; AVX-NEXT:    jae .LBB13_2
-; AVX-NEXT:  # %bb.1:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:  .LBB13_2:
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    cmovbl %ecx, %eax
 ; AVX-NEXT:    movb %al, (%rdx)
 ; AVX-NEXT:    retq
   %x = load <1 x i8>, <1 x i8>* %px
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index 3d65eed..32ab650 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -73,7 +73,7 @@ declare i32 @doSomething(i32, i32*)
 ; CHECK-LABEL: freqSaveAndRestoreOutsideLoop:
 ;
 ; Shrink-wrapping allows to skip the prologue in the else case.
-; ENABLE: testl %edi, %edi  
+; ENABLE: testl %edi, %edi
 ; ENABLE: je [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
@@ -508,7 +508,7 @@ declare i32 @someVariadicFunc(i32, ...)
 declare hidden fastcc %struct.temp_slot* @find_temp_slot_from_address(%struct.rtx_def* readonly)
 
 ; CHECK-LABEL: useLEA:
-; DISABLE: pushq 
+; DISABLE: pushq
 ;
 ; CHECK: testq   %rdi, %rdi
 ; CHECK-NEXT: je      [[CLEANUP:LBB[0-9_]+]]
@@ -805,19 +805,9 @@ end:
 ; Create the zero value for the select assignment.
 ; CHECK: xorl [[CMOVE_VAL:%eax]], [[CMOVE_VAL]]
 ; CHECK-NEXT: cmpb $0, _b(%rip)
-; CHECK-NEXT: jne [[STOREC_LABEL:LBB[0-9_]+]]
-;
-; CHECK: movb $48, [[CMOVE_VAL:%al]]
-;
-; CHECK: [[STOREC_LABEL]]:
-;
-; ENABLE-NEXT: pushq
-; For the stack adjustment, we need to preserve the EFLAGS.
-; ENABLE-NEXT: leaq -16(%rsp), %rsp
-;
-; Technically, we should use CMOVE_VAL here or its subregister.
-; CHECK-NEXT: movb %al, _c(%rip)
-; testb set the EFLAGS read here.
+; CHECK-NEXT: movl $48, [[IMM_VAL:%ecx]]
+; CHECK-NEXT: cmovnel [[CMOVE_VAL]], [[IMM_VAL]]
+; CHECK-NEXT: movb %cl, _c(%rip)
 ; CHECK-NEXT: je [[VARFUNC_CALL:LBB[0-9_]+]]
 ;
 ; The code of the loop is not interesting.
-- 
2.7.4