case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
case ISD::FRINT:
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
-  return N->getNumOperands() > 2 || VT == MVT::f64;
+  // SELECT has 3 operands (cond, t, f) but is deliberately exempted from the
+  // ">2 operands forces VOP3" rule; presumably its lowering (v_cndmask) does
+  // not require the VOP3 encoding — confirm against the select lowering.
+  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
+         VT == MVT::f64;
}
// Most FP instructions support source modifiers, but this could be refined
switch (N->getOpcode()) {
case ISD::CopyToReg:
- case ISD::SELECT:
case ISD::FDIV:
case ISD::FREM:
case ISD::INLINEASM:
return true;
}
}
+ case ISD::SELECT:
+ // TODO: Only applies if select will be vector
+ return N->getValueType(0) == MVT::f32;
default:
return true;
}
unsigned NumMayIncreaseSize = 0;
MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
+ assert(!N->use_empty());
+
// XXX - Should this limit number of uses to check?
for (const SDNode *U : N->uses()) {
if (!hasSourceMods(U))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
-static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
- SDValue N) {
+SDValue
+AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) const {
SelectionDAG &DAG = DCI.DAG;
SDValue Cond = N.getOperand(0);
SDValue LHS = N.getOperand(1);
EVT VT = N.getValueType();
if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
(LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+ return SDValue();
+
return distributeOpThroughSelect(DCI, LHS.getOpcode(),
SDLoc(N), Cond, LHS, RHS);
}
}
if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
+ return SDValue();
+
+ // We're going to be forced to use a source modifier anyway, there's no
+ // point to pulling the negate out unless we can get a size reduction by
+ // negating the constant.
+ //
+ // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
+ // about cheaper constants.
+ if (NewLHS.getOpcode() == ISD::FABS &&
+ getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
+ return SDValue();
+
+ if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+ return SDValue();
+
if (LHS.getOpcode() == ISD::FNEG)
NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- else if (CRHS->isNegative())
- return SDValue();
if (Inv)
std::swap(NewLHS, NewRHS);
return SDValue();
}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return false;
}
+// Returns true if N is a constant FP node (or a splat of one) whose negated
+// form is strictly cheaper to materialize, as reported by
+// getConstantNegateCost. Non-constant nodes conservatively return false.
+bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
+  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
+  return false;
+}
+
static unsigned inverseMinMax(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
DAG.getConstant(0x8000, SL, SrcVT));
return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
}
+ case ISD::SELECT: {
+ // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+ // TODO: Invert conditions of foldFreeOpFromSelect
+ return SDValue();
+ }
default:
return SDValue();
}
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
+
+ SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
TargetLowering::NegatibleCost
getConstantNegateCost(const ConstantFPSDNode *C) const;
bool isConstantCostlierToNegate(SDValue N) const;
+ bool isConstantCheaperToNegate(SDValue N) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s
; GCN-LABEL: {{^}}min_fneg_select_regression_0:
; GCN-NOT: v_mul
-; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-
-; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc
define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ult float %a, 1.0
; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
; GCN-NOT: v_mul
-; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
+; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ult float %a, -1.0
; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc
; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1:
-; SI-SAFE: v_min_legacy_f32_e64 v0, 1.0, -v0
+; SI: v_min_legacy_f32_e64 v0, 1.0, -v0
; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ugt float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1:
-; SI-SAFE: v_max_legacy_f32_e64 v0, 1.0, -v0
+; SI: v_max_legacy_f32_e64 v0, 1.0, -v0
; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NNAN: v_max_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ult float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1:
-; SI-SAFE: v_min_legacy_f32_e64 v0, -v0, 1.0
+; SI: v_min_legacy_f32_e64 v0, -v0, 1.0
; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ogt float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1:
-; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, 1.0
+; SI: v_max_legacy_f32_e64 v0, -v0, 1.0
; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NANN: v_max_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp olt float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ugt float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ult float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]]
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]]
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ogt float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]]
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]]
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp olt float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1:
-; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, -1.0
+; SI: v_max_legacy_f32_e64 v0, -v0, -1.0
; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc
-; GCN-NNAN: v_max_f32_e64 v0, -v0, -1.0
+; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0
define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp olt float %a, 1.0
}
; GCN-LABEL: {{^}}ult_a_select_fneg_a_b:
-; SI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
+; SI: v_cmp_nge_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
-; GCN-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
-
+; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ult float %a, %b
}
; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b:
-; SI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
+; SI: v_cmp_nle_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
-; SI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1
-; SI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
+; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1
+; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ugt float %a, %b
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
+
+; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v
+; GCN-NSZ: v_cmp_ngt_f32
+; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
%tmp7 = fdiv float 1.000000e+00, %tmp6
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
-; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
-; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
-; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
+; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000
+; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]]
+; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]]
+; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}},
+; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0
+; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0,
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
%tmp7 = fdiv afn float 1.000000e+00, %tmp6
ret <2 x half> %add
}
+; FIXME: This fneg should fold into select
+; GCN-LABEL: {{^}}v_fneg_select_f32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64
+; f32 select whose only user is an fneg. The checks above show the sign flip
+; is still emitted as a separate v_xor on the select result rather than being
+; folded into the v_cndmask operands (see FIXME above).
+define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) {
+  %cond = icmp eq i32 %arg0, 0
+  %select = select i1 %cond, float %a, float %b
+  %fneg = fneg float %select
+  ret float %fneg
+}
+
+; FIXME: This fneg should fold into select
+; GCN-LABEL: {{^}}v_fneg_select_2_f32:
+; GCN: s_waitcnt
+; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
+; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
+; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
+; GCN-NSZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
+; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
+; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
+; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-NEXT: s_setpc_b64
+; As above, but both select operands are fadd results, so source modifiers on
+; the adds could in principle absorb the negation. Checks show both the NSZ
+; and safe paths still emit a trailing v_xor instead of folding the fneg.
+define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) {
+  %cond = icmp eq i32 %arg0, 0
+  %add.0 = fadd float %a, 2.0
+  %add.1 = fadd float %b, 4.0
+  %select = select i1 %cond, float %add.0, float %add.1
+  %neg.select = fneg float %select
+  ret float %neg.select
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_select_f32:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; fneg of a select between a positive inline-immediate (4.0) and a value
+; loaded per-lane. The volatile load keeps the operand coming from memory.
+; Checks above show the negate is still a separate v_xor after the v_cndmask.
+define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load volatile float, ptr addrspace(1) %a.gep
+  %cond = icmp eq i32 %tid, 0
+  %select = select i1 %cond, float 4.0, float %a
+  %fneg = fneg float %select
+  store float %fneg, ptr addrspace(1) %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_select_f32:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; Same pattern as v_fneg_posk_select_f32 but with a negative inline-immediate
+; (-4.0) as the constant select operand; the negated constant (+4.0) would
+; also be an inline immediate. Checks above show the v_xor is still emitted.
+define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load volatile float, ptr addrspace(1) %a.gep
+  %cond = icmp eq i32 %tid, 0
+  %select = select i1 %cond, float -4.0, float %a
+  %fneg = fneg float %select
+  store float %fneg, ptr addrspace(1) %out.gep
+  ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
-; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
+; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0
-; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0
-; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0
; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
-; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0
-; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
+; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT: ; return to shader part epilog
; GCN-NSZ: ; %bb.0: ; %.entry
; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1
; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; GCN-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0
-; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
+; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-LABEL: s_fneg_select_infloop_regression_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dword v[0:1], v2
; VI-LABEL: s_fneg_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 0.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 0.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 2.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 2.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float -2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float -2.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float -2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float -2.0
%i2 = fneg float %i
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s4, 0
+; SI-NEXT: s_and_b32 s4, 1, s4
+; SI-NEXT: s_cselect_b32 s3, 0, s3
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s4, 1
; SI-NEXT: s_cselect_b32 s3, 0, s3
; SI-NEXT: s_cselect_b32 s2, 0, s2
-; SI-NEXT: s_cselect_b32 s3, 0x80000000, s3
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_xor_b32 s2, s3, 0x80000000
; SI-NEXT: v_mov_b32_e32 v3, s1
-; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s4, 0
+; VI-NEXT: s_and_b32 s4, 1, s4
+; VI-NEXT: s_cselect_b32 s3, 0, s3
+; VI-NEXT: s_cselect_b32 s2, 0, s2
+; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s4, 1
; VI-NEXT: s_cselect_b32 s3, 0, s3
; VI-NEXT: s_cselect_b32 s2, 0, s2
-; VI-NEXT: s_cselect_b32 s3, 0x80000000, s3
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_xor_b32 s2, s3, 0x80000000
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v2, 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, double 0.0, double %arg
%i2 = fneg double %i
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT: v_bfrev_b32_e32 v1, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_bitcmp1_b32 s2, 16
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT: v_cvt_f16_f32_e64 v2, -v0
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3]
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_short v[0:1], v2
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: v_mov_b32_e32 v0, 0x8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v1, 1, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT: v_bfrev_b32_e32 v1, 1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_select_infloop_regression_f16:
; VI-NEXT: v_and_b32_e32 v1, 1, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-NEXT: v_mov_b32_e32 v1, 0x8000
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, half 0.0, half %arg
%i2 = fneg half %i
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
+; SI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
+; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = call float @llvm.fabs.f32(float %i)
; SI-LABEL: s_fneg_fabs_select_infloop_regression:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1]
-; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dword v[0:1], v2
; VI-LABEL: s_fneg_fabs_select_infloop_regression:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; VI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1]
-; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -|v0|, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = call float @llvm.fabs.f32(float %i)
; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
-; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
-; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+; GCN: v_rcp_legacy_f32_e64 [[RCP:v[0-9]+]], -[[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
-; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
-; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
%x = load volatile float, ptr addrspace(1) undef
%cmp = icmp eq i32 %c, 0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -v0, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negfabs_posk_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; VI-NEXT: v_mov_b32_e32 v3, 0xc400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0x4400
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -v0, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -v0, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_posk_negfabs_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; VI-NEXT: v_mov_b32_e32 v3, 0xc400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0x4400
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -v0, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negfabs_negk_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, 0x4400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0xc400
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negk_negfabs_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, 0x4400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0xc400
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; VI-LABEL: select_fneg_posk_src_rcp_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_rcp_f16_e32 v1, v1
-; VI-NEXT: v_mov_b32_e32 v2, 0xc000
+; VI-NEXT: v_rcp_f16_e64 v1, -v1
+; VI-NEXT: v_mov_b32_e32 v2, 0x4000
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%rcp = call half @llvm.amdgcn.rcp.f16(half %x)
; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]]
; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 vcc, -1, 0
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN: s_cselect_b64 s[0:1], -1, 0
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s[0:1]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
-; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
-; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
-; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0
+
+; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]]
+; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+
+; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]]
+; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
-; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+
+; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0
+
+; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]]
+; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+
+
+; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]]
+; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_inv2pi_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}mul_select_negfabs_negk_inv2pi_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
-; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-
-; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_negk_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 vcc, -1, 0
-; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-
-; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN: s_cselect_b64 s[0:1], -1, 0
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s[0:1]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negk_inv2pi_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}mul_select_negfabs_posk_0_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_0_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_0_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef