From 28d8889d272590856e7e270aff66de080225d501 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 14 Dec 2022 19:23:55 -0500
Subject: [PATCH] AMDGPU: Teach fneg combines that select has source modifiers

We do match source modifiers for f32-typed selects already, but the
combiner code was never informed of this. A long time ago the
documentation lied and stated that source modifiers don't work for
v_cndmask_b32 when they in fact do. We had a bunch of code operating
under the assumption that they don't support source modifiers, so we
tried to move fnegs around to work around this. This gets a few small
improvements here and there.

The main hazard to watch out for is infinite loops in the combiner,
since we try to move fnegs up and down the DAG. For now, don't fold
fneg directly into select. The generic combiner does this for a
restricted set of cases where getNegatedExpression obviously shows an
improvement for both operands. It turns out to be trickier to avoid
infinitely looping the combiner in conjunction with pulling out source
modifiers, so leave this for a later commit.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp      |  48 ++++++-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h        |   4 +
 .../test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll | 128 ++++++++---
 llvm/test/CodeGen/AMDGPU/fneg-combines.ll          |  86 ++++++++++-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll      | 159 +++++++++------
 .../AMDGPU/select-fabs-fneg-extract-legacy.ll      |  14 +-
 .../CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll |  46 +++---
 .../AMDGPU/select-fabs-fneg-extract.legal.f16.ll   |   5 +-
 .../CodeGen/AMDGPU/select-fabs-fneg-extract.ll     |  82 +++++------
 9 files changed, 324 insertions(+), 248 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index bb02eb3..c74be2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -567,6 +567,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FMAXNUM:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
+  case ISD::SELECT:
   case ISD::FSIN:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -592,7 +593,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
 /// modifiers.
 LLVM_READONLY
 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
-  return N->getNumOperands() > 2 || VT == MVT::f64;
+  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
+         VT == MVT::f64;
 }

 // Most FP instructions support source modifiers, but this could be refined
@@ -604,7 +606,6 @@
 static bool hasSourceMods(const SDNode *N) {
   switch (N->getOpcode()) {
   case ISD::CopyToReg:
-  case ISD::SELECT:
   case ISD::FDIV:
   case ISD::FREM:
   case ISD::INLINEASM:
@@ -629,6 +630,9 @@ static bool hasSourceMods(const SDNode *N) {
       return true;
     }
   }
+  case ISD::SELECT:
+    // TODO: Only applies if select will be vector
+    return N->getValueType(0) == MVT::f32;
   default:
     return true;
   }
@@ -644,6 +648,8 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
   unsigned NumMayIncreaseSize = 0;
   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

+  assert(!N->use_empty());
+
   // XXX - Should this limit number of uses to check?
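   // Every user must be able to fold a source modifier for the fold to be
   // profitable; users that would only then need the larger VOP3 encoding
   // count against a small size-increase budget (NumMayIncreaseSize).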
   for (const SDNode *U : N->uses()) {
     if (!hasSourceMods(U))
@@ -3726,8 +3732,9 @@ static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
 //
 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
 // select c, (fabs x), +k -> fabs (select c, x, k)
-static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
-                                    SDValue N) {
+SDValue
+AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+                                           SDValue N) const {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Cond = N.getOperand(0);
   SDValue LHS = N.getOperand(1);
@@ -3736,6 +3743,9 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   EVT VT = N.getValueType();
   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+      return SDValue();
+
     return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond,
                                      LHS, RHS);
   }
@@ -3767,10 +3777,24 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   }

   if (ShouldFoldNeg) {
+    if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
+      return SDValue();
+
+    // We're going to be forced to use a source modifier anyway; there's no
+    // point in pulling the negate out unless we can get a size reduction by
+    // negating the constant.
+    //
+    // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
+    // about cheaper constants.
+    if (NewLHS.getOpcode() == ISD::FABS &&
+        getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
+      return SDValue();
+
+    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+      return SDValue();
+
     if (LHS.getOpcode() == ISD::FNEG)
       NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-    else if (CRHS->isNegative())
-      return SDValue();

     if (Inv)
       std::swap(NewLHS, NewRHS);
@@ -3785,7 +3809,6 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   return SDValue();
 }

-
 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
@@ -3861,6 +3884,12 @@ bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
   return false;
 }

+bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
+  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
+  return false;
+}
+
 static unsigned inverseMinMax(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
@@ -4096,6 +4125,11 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                     DAG.getConstant(0x8000, SL, SrcVT));
     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
   }
+  case ISD::SELECT: {
+    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+    // TODO: Invert conditions of foldFreeOpFromSelect
+    return SDValue();
+  }
   default:
     return SDValue();
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index d7d2c6e..d82f877 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -97,12 +97,16 @@ protected:
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
                                   SDValue RHS, DAGCombinerInfo &DCI) const;
+
+  SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+                               SDValue N) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
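   // Relative cost of materializing the negated value of an FP constant,
   // e.g. whether -C is still representable as an inline immediate.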
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const; bool isConstantCostlierToNegate(SDValue N) const; + bool isConstantCheaperToNegate(SDValue N) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll index 68ca9d7..33568bb 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,19 +1,16 @@ ; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: ; GCN-NOT: v_mul -; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 - -; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, 1.0 @@ -24,13 +21,12 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: ; GCN-NOT: v_mul -; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, -1.0 @@ -44,8 +40,7 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 ; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { @@ -61,8 +56,7 @@ define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { ; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: 
v_cndmask_b32_e64 v0, 1.0, -v0, vcc ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { @@ -73,13 +67,12 @@ define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1: -; SI-SAFE: v_min_legacy_f32_e64 v0, 1.0, -v0 +; SI: v_min_legacy_f32_e64 v0, 1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -1.0 @@ -88,13 +81,12 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1: -; SI-SAFE: v_max_legacy_f32_e64 v0, 1.0, -v0 +; SI: v_max_legacy_f32_e64 v0, 1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_max_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -1.0 @@ -103,13 +95,12 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1: -; SI-SAFE: v_min_legacy_f32_e64 v0, -v0, 1.0 +; SI: v_min_legacy_f32_e64 v0, -v0, 1.0 ; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -1.0 @@ -118,13 +109,12 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1: -; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, 1.0 +; SI: v_max_legacy_f32_e64 v0, -v0, 1.0 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NANN: v_max_f32_e64 v0, -v0, 1.0 +; VI-NANN: v_max_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -1.0 @@ -133,17 +123,16 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 
[[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -8.0 @@ -152,17 +141,16 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -8.0 @@ -171,17 +159,16 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -8.0 @@ -190,18 +177,17 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -8.0 @@ -210,13 +196,12 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, 
float %b) #0 } ; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1: -; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, -1.0 +; SI: v_max_legacy_f32_e64 v0, -v0, -1.0 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc -; GCN-NNAN: v_max_f32_e64 v0, -v0, -1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0 define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, 1.0 @@ -225,15 +210,14 @@ define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 } ; GCN-LABEL: {{^}}ult_a_select_fneg_a_b: -; SI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; SI: v_cmp_nge_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc ; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; GCN-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - +; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, %b @@ -242,14 +226,14 @@ define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { } ; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b: -; SI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; SI: v_cmp_nle_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc ; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; SI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 -; SI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 +; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index ff51bff..bc92f2a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -226,6 +226,10 @@ define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(ptr addrspace(1) %out ; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]], ; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]] ; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]] + +; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v +; GCN-NSZ: v_cmp_ngt_f32 +; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 @@ -246,9 +250,12 @@ define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i ; GCN-LABEL: {{^}}fneg_fadd_0_nsz: ; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]], -; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], -; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], -; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]] +; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000 +; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]] +; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]] +; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}}, +; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0 +; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0, define amdgpu_ps float @fneg_fadd_0_nsz(float 
inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { .entry: %tmp7 = fdiv afn float 1.000000e+00, %tmp6 @@ -2731,6 +2738,79 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha ret <2 x half> %add } +; FIXME: This fneg should fold into select +; GCN-LABEL: {{^}}v_fneg_select_f32: +; GCN: s_waitcnt +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 +define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) { + %cond = icmp eq i32 %arg0, 0 + %select = select i1 %cond, float %a, float %b + %fneg = fneg float %select + ret float %fneg +} + +; FIXME: This fneg should fold into select +; GCN-LABEL: {{^}}v_fneg_select_2_f32: +; GCN: s_waitcnt +; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1 +; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2 +; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc +; GCN-NSZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + +; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1 +; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2 +; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc +; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + +; GCN-NEXT: s_setpc_b64 +define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) { + %cond = icmp eq i32 %arg0, 0 + %add.0 = fadd float %a, 2.0 + %add.1 = fadd float %b, 4.0 + %select = select i1 %cond, float %add.0, float %add.1 + %neg.select = fneg float %select + ret float %neg.select +} + +; GCN-LABEL: {{^}}v_fneg_posk_select_f32: +; GCN: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext + %a = load volatile float, ptr addrspace(1) %a.gep + %cond = icmp eq i32 %tid, 0 + %select = select i1 %cond, float 4.0, float %a + %fneg = fneg float %select + store float %fneg, ptr addrspace(1) %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_negk_select_f32: +; GCN: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext + %a = load volatile float, ptr addrspace(1) %a.gep + %cond = icmp eq i32 %tid, 0 + %select = select i1 %cond, float -4.0, float %a + %fneg = fneg float %select + store float %fneg, ptr addrspace(1) %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 90cb48e..4f192db 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll 
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -216,10 +216,10 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc ; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc @@ -266,9 +266,9 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; VI-NSZ-NEXT: ; return to shader part epilog @@ -302,9 +302,9 @@ define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6 ; GCN-NSZ: ; %bb.0: ; %.entry ; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 ; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc ; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc @@ -2804,14 +2804,12 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -2820,14 +2818,12 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1] ; VI-NEXT: v_mov_b32_e32 
v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2846,9 +2842,7 @@ define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) { ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = fneg float %i @@ -2863,9 +2857,7 @@ define float @v_fneg_select_infloop_regression_f32_commute0(float %arg, i1 %arg1 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 0.0 %i2 = fneg float %i @@ -2880,9 +2872,7 @@ define float @v_fneg_select_infloop_regression_f32_commute1(float %arg, i1 %arg1 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = fneg float %i @@ -2897,9 +2887,7 @@ define float @v_fneg_select_infloop_regression_f32_commute2(float %arg, i1 %arg1 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 0.0 %i2 = fneg float %i @@ -2915,8 +2903,7 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32(float %arg, i1 %ar ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 2.0, float %arg %i2 = fneg float %i @@ -2931,8 +2918,7 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute0(float %ar ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 2.0 %i2 = fneg float %i @@ -2947,8 +2933,7 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute1(float %ar ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 2.0, float %arg %i2 = fneg float %i @@ -2963,8 +2948,7 @@ define float 
@v_fneg_select_infloop_regression_inline_imm_f32_commute2(float %ar ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 2.0 %i2 = fneg float %i @@ -2980,8 +2964,7 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32(float %arg, i1 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float -2.0, float %arg %i2 = fneg float %i @@ -2996,8 +2979,7 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0(float ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float -2.0 %i2 = fneg float %i @@ -3012,8 +2994,7 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1(float ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float -2.0, float %arg %i2 = fneg float %i @@ -3028,8 +3009,7 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float -2.0 %i2 = fneg float %i @@ -3044,14 +3024,16 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitcmp1_b32 s4, 0 +; SI-NEXT: s_and_b32 s4, 1, s4 +; SI-NEXT: s_cselect_b32 s3, 0, s3 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; SI-NEXT: s_cmp_eq_u32 s4, 1 ; SI-NEXT: s_cselect_b32 s3, 0, s3 ; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, 0x80000000, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_xor_b32 s2, s3, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -3062,14 +3044,16 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 0 +; VI-NEXT: s_and_b32 s4, 1, s4 +; VI-NEXT: s_cselect_b32 s3, 0, s3 +; VI-NEXT: s_cselect_b32 s2, 0, s2 +; VI-NEXT: s_xor_b32 s3, 
s3, 0x80000000 +; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: s_cselect_b32 s3, 0, s3 ; VI-NEXT: s_cselect_b32 s2, 0, s2 -; VI-NEXT: s_cselect_b32 s3, 0x80000000, s3 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_xor_b32 s2, s3, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -3087,10 +3071,10 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) { ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v2, 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, double 0.0, double %arg %i2 = fneg double %i @@ -3103,14 +3087,13 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_bfrev_b32_e32 v1, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_bitcmp1_b32 s2, 16 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e64 v2, -v0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: flat_store_short v[0:1], v2 @@ -3120,14 +3103,13 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_mov_b32_e32 v0, 0x8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 16 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] +; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -3146,11 +3128,9 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 1, v1 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; SI-NEXT: v_bfrev_b32_e32 v1, 1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_select_infloop_regression_f16: @@ -3159,9 +3139,8 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) { ; VI-NEXT: v_and_b32_e32 v1, 1, v1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v1, 0x8000 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; VI-NEXT: 
v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, half 0.0, half %arg %i2 = fneg half %i @@ -3318,7 +3297,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] -; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0 +; SI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -3332,7 +3311,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] -; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0 +; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3351,7 +3330,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) { ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = call float @llvm.fabs.f32(float %i) @@ -3363,14 +3342,12 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1] -; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -3379,14 +3356,12 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1] -; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3406,9 +3381,7 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) { ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v0|, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = call float @llvm.fabs.f32(float %i) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll 
b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll index c12198f..a156a3d 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -7,10 +7,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_rcp_legacy_f32_e64 [[RCP:v[0-9]+]], -[[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -25,10 +24,9 @@ define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { ; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { %x = load volatile float, ptr addrspace(1) undef %cmp = icmp eq i32 %c, 0 diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index d1ee9b2..95bc04b 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -993,20 +993,20 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_posk_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; VI-NEXT: v_mov_b32_e32 v3, 0xc400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0x4400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -v0, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) @@ -1023,20 +1023,20 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_posk_negfabs_f16: ; VI: ; %bb.0: ; 
VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; VI-NEXT: v_mov_b32_e32 v3, 0xc400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0x4400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -v0, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) @@ -1053,19 +1053,20 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_negk_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0xc400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) @@ -1082,19 +1083,20 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negk_negfabs_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0xc400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll index 35d2bf0..f4ad83b 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll @@ -6,11 +6,10 @@ define half @select_fneg_posk_src_rcp_f16(i32 %c, half %x, half %y) { ; VI-LABEL: select_fneg_posk_src_rcp_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v1, v1 -; VI-NEXT: v_mov_b32_e32 v2, 0xc000 +; VI-NEXT: v_rcp_f16_e64 v1, -v1 +; VI-NEXT: v_mov_b32_e32 v2, 0x4000 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %rcp = call half @llvm.amdgcn.rcp.f16(half %x) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll index 2bf4e2b..3e2d5f6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -372,8 +372,8 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { ; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] ; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { @@ -639,8 +639,8 @@ define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 { ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -659,8 +659,8 @@ define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -677,8 +677,8 @@ define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -696,9 +696,9 @@ define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN: s_cmp_lg_u32 -; GCN: s_cselect_b64 vcc, -1, 0 -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN: s_cselect_b64 s[0:1], -1, 0 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s[0:1] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -804,10 +804,9 @@ define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { ; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void 
@select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -824,10 +823,14 @@ define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 -; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0 + +; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]] +; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] + +; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]] +; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_inv2pi_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -845,10 +848,16 @@ define amdgpu_kernel void @mul_select_negfabs_posk_inv2pi_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 -; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] + +; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0 + +; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]] +; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] + + +; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]] +; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_inv2pi_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -864,12 +873,9 @@ define amdgpu_kernel void @mul_select_posk_inv2pi_negfabs_f32(i32 %c) #0 { ; GCN-LABEL: {{^}}mul_select_negfabs_negk_inv2pi_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 -; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc - -; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc - -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_negk_inv2pi_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -886,13 +892,11 @@ define amdgpu_kernel void @mul_select_negfabs_negk_inv2pi_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: s_cmp_lg_u32 -; GCN: s_cselect_b64 vcc, -1, 0 -; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc - -; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN: s_cselect_b64 s[0:1], -1, 0 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s[0:1] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negk_inv2pi_negfabs_f32(i32 %c) #0 { %x = load volatile 
float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -908,11 +912,10 @@ define amdgpu_kernel void @mul_select_negk_inv2pi_negfabs_f32(i32 %c) #0 { ; GCN-LABEL: {{^}}mul_select_negfabs_posk_0_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_0_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -930,11 +933,10 @@ define amdgpu_kernel void @mul_select_negfabs_posk_0_f32(i32 %c) #0 { ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_0_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef -- 2.7.4
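For reference, a minimal IR example of the pattern this improves (adapted
from the min_fneg_select_regression_0 test above; the function name is
illustrative):

define amdgpu_ps float @fneg_operand_of_select(float %a) {
  %fneg.a = fneg float %a
  %cmp.a = fcmp ult float %a, 1.0
  %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
  ret float %min.a
}

With this patch the fneg is folded into the select operand as a source
modifier on v_cndmask_b32 (on VI, roughly v_cmp_nle_f32_e32 vcc, 1.0, v0
followed by v_cndmask_b32_e64 v0, -1.0, -v0, vcc) instead of a
v_cndmask_b32_e32 followed by a separate v_xor_b32_e32 with 0x80000000.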