case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
case ISD::FRINT:
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
-  return N->getNumOperands() > 2 || VT == MVT::f64;
+  // SELECT has 3 operands (cond, t, f) but is deliberately exempted from the
+  // ">2 operands forces VOP3" rule; presumably its lowering (v_cndmask) does
+  // not require the VOP3 encoding — confirm against the select lowering.
+  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
+         VT == MVT::f64;
}
// Most FP instructions support source modifiers, but this could be refined
switch (N->getOpcode()) {
case ISD::CopyToReg:
- case ISD::SELECT:
case ISD::FDIV:
case ISD::FREM:
case ISD::INLINEASM:
return true;
}
}
+ case ISD::SELECT:
+ // TODO: Only applies if select will be vector
+ return N->getValueType(0) == MVT::f32;
default:
return true;
}
unsigned NumMayIncreaseSize = 0;
MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
+ assert(!N->use_empty());
+
// XXX - Should this limit number of uses to check?
for (const SDNode *U : N->uses()) {
if (!hasSourceMods(U))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
-static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
- SDValue N) {
+SDValue
+AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) const {
SelectionDAG &DAG = DCI.DAG;
SDValue Cond = N.getOperand(0);
SDValue LHS = N.getOperand(1);
EVT VT = N.getValueType();
if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
(LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+ return SDValue();
+
return distributeOpThroughSelect(DCI, LHS.getOpcode(),
SDLoc(N), Cond, LHS, RHS);
}
}
if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
+ return SDValue();
+
+ // We're going to be forced to use a source modifier anyway, there's no
+ // point to pulling the negate out unless we can get a size reduction by
+ // negating the constant.
+ //
+ // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
+ // about cheaper constants.
+ if (NewLHS.getOpcode() == ISD::FABS &&
+ getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
+ return SDValue();
+
+ if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+ return SDValue();
+
if (LHS.getOpcode() == ISD::FNEG)
NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- else if (CRHS->isNegative())
- return SDValue();
if (Inv)
std::swap(NewLHS, NewRHS);
return SDValue();
}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return false;
}
+// Returns true if N is a constant FP node (or a splat of one) whose negated
+// form is strictly cheaper to materialize, as reported by
+// getConstantNegateCost. Non-constant nodes conservatively return false.
+bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
+  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
+  return false;
+}
+
static unsigned inverseMinMax(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
DAG.getConstant(0x8000, SL, SrcVT));
return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
}
+ case ISD::SELECT: {
+ // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+ // TODO: Invert conditions of foldFreeOpFromSelect
+ return SDValue();
+ }
default:
return SDValue();
}
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
+
+ SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
TargetLowering::NegatibleCost
getConstantNegateCost(const ConstantFPSDNode *C) const;
bool isConstantCostlierToNegate(SDValue N) const;
+ bool isConstantCheaperToNegate(SDValue N) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s
; GCN-LABEL: {{^}}min_fneg_select_regression_0:
; GCN-NOT: v_mul
-; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-
-; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc
define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ult float %a, 1.0
; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
; GCN-NOT: v_mul
-; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
+; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ult float %a, -1.0
; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc
; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1:
-; SI-SAFE: v_min_legacy_f32_e64 v0, 1.0, -v0
+; SI: v_min_legacy_f32_e64 v0, 1.0, -v0
; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ugt float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1:
-; SI-SAFE: v_max_legacy_f32_e64 v0, 1.0, -v0
+; SI: v_max_legacy_f32_e64 v0, 1.0, -v0
; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NNAN: v_max_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ult float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1:
-; SI-SAFE: v_min_legacy_f32_e64 v0, -v0, 1.0
+; SI: v_min_legacy_f32_e64 v0, -v0, 1.0
; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ogt float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1:
-; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, 1.0
+; SI: v_max_legacy_f32_e64 v0, -v0, 1.0
; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-; GCN-NANN: v_max_f32_e64 v0, -v0, 1.0
+; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0
define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp olt float %a, -1.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ugt float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ult float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]]
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]]
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ogt float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8:
-; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]]
+; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]]
; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000
+; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
+; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
+; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]]
define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp olt float %a, -8.0
}
; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1:
-; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, -1.0
+; SI: v_max_legacy_f32_e64 v0, -v0, -1.0
; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc
-; GCN-NNAN: v_max_f32_e64 v0, -v0, -1.0
+; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0
define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp olt float %a, 1.0
}
; GCN-LABEL: {{^}}ult_a_select_fneg_a_b:
-; SI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
+; SI: v_cmp_nge_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
-; GCN-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
-
+; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1
+; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ult float %a, %b
}
; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b:
-; SI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
+; SI: v_cmp_nle_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1
; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
-; SI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1
-; SI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
+; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1
+; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc
define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 {
%fneg.a = fneg float %a
%cmp.a = fcmp ugt float %a, %b
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
+
+; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v
+; GCN-NSZ: v_cmp_ngt_f32
+; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
%tmp7 = fdiv float 1.000000e+00, %tmp6
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
-; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
-; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
-; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
+; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000
+; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]]
+; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]]
+; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}},
+; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0
+; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0,
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
%tmp7 = fdiv afn float 1.000000e+00, %tmp6
ret <2 x half> %add
}
+; FIXME: This fneg should fold into select
+; GCN-LABEL: {{^}}v_fneg_select_f32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64
+; f32 select whose only user is an fneg. The checks above show the sign flip
+; is still emitted as a separate v_xor on the select result rather than being
+; folded into the v_cndmask operands (see FIXME above).
+define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) {
+  %cond = icmp eq i32 %arg0, 0
+  %select = select i1 %cond, float %a, float %b
+  %fneg = fneg float %select
+  ret float %fneg
+}
+
+; FIXME: This fneg should fold into select
+; GCN-LABEL: {{^}}v_fneg_select_2_f32:
+; GCN: s_waitcnt
+; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
+; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
+; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
+; GCN-NSZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
+; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
+; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
+; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-NEXT: s_setpc_b64
+; As above, but both select operands are fadd results, so source modifiers on
+; the adds could in principle absorb the negation. Checks show both the NSZ
+; and safe paths still emit a trailing v_xor instead of folding the fneg.
+define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) {
+  %cond = icmp eq i32 %arg0, 0
+  %add.0 = fadd float %a, 2.0
+  %add.1 = fadd float %b, 4.0
+  %select = select i1 %cond, float %add.0, float %add.1
+  %neg.select = fneg float %select
+  ret float %neg.select
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_select_f32:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; fneg of a select between a positive inline-immediate (4.0) and a value
+; loaded per-lane. The volatile load keeps the operand coming from memory.
+; Checks above show the negate is still a separate v_xor after the v_cndmask.
+define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load volatile float, ptr addrspace(1) %a.gep
+  %cond = icmp eq i32 %tid, 0
+  %select = select i1 %cond, float 4.0, float %a
+  %fneg = fneg float %select
+  store float %fneg, ptr addrspace(1) %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_select_f32:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; Same pattern as v_fneg_posk_select_f32 but with a negative inline-immediate
+; (-4.0) as the constant select operand; the negated constant (+4.0) would
+; also be an inline immediate. Checks above show the v_xor is still emitted.
+define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %a = load volatile float, ptr addrspace(1) %a.gep
+  %cond = icmp eq i32 %tid, 0
+  %select = select i1 %cond, float -4.0, float %a
+  %fneg = fneg float %select
+  store float %fneg, ptr addrspace(1) %out.gep
+  ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
-; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
+; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0
-; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0
-; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0
; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0
-; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0
-; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
+; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT: ; return to shader part epilog
; GCN-NSZ: ; %bb.0: ; %.entry
; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1
; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; GCN-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0
-; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0
+; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; SI-LABEL: s_fneg_select_infloop_regression_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dword v[0:1], v2
; VI-LABEL: s_fneg_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 0.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 0.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 2.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float 2.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float -2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float -2.0
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float -2.0, float %arg
%i2 = fneg float %i
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float %arg, float -2.0
%i2 = fneg float %i
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_bitcmp1_b32 s4, 0
+; SI-NEXT: s_and_b32 s4, 1, s4
+; SI-NEXT: s_cselect_b32 s3, 0, s3
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; SI-NEXT: s_cmp_eq_u32 s4, 1
; SI-NEXT: s_cselect_b32 s3, 0, s3
; SI-NEXT: s_cselect_b32 s2, 0, s2
-; SI-NEXT: s_cselect_b32 s3, 0x80000000, s3
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_xor_b32 s2, s3, 0x80000000
; SI-NEXT: v_mov_b32_e32 v3, s1
-; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s4, 0
+; VI-NEXT: s_and_b32 s4, 1, s4
+; VI-NEXT: s_cselect_b32 s3, 0, s3
+; VI-NEXT: s_cselect_b32 s2, 0, s2
+; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; VI-NEXT: s_cmp_eq_u32 s4, 1
; VI-NEXT: s_cselect_b32 s3, 0, s3
; VI-NEXT: s_cselect_b32 s2, 0, s2
-; VI-NEXT: s_cselect_b32 s3, 0x80000000, s3
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_xor_b32 s2, s3, 0x80000000
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v2, 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, double 0.0, double %arg
%i2 = fneg double %i
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT: v_bfrev_b32_e32 v1, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_bitcmp1_b32 s2, 16
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT: v_cvt_f16_f32_e64 v2, -v0
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3]
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_short v[0:1], v2
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: v_mov_b32_e32 v0, 0x8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
-; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v1, 1, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT: v_bfrev_b32_e32 v1, 1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_select_infloop_regression_f16:
; VI-NEXT: v_and_b32_e32 v1, 1, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-NEXT: v_mov_b32_e32 v1, 0x8000
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, half 0.0, half %arg
%i2 = fneg half %i
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
+; SI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
-; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
+; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = call float @llvm.fabs.f32(float %i)
; SI-LABEL: s_fneg_fabs_select_infloop_regression:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
-; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1]
-; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dword v[0:1], v2
; VI-LABEL: s_fneg_fabs_select_infloop_regression:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; VI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1]
-; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT: v_bfrev_b32_e32 v1, 1
-; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -|v0|, 0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%i = select i1 %arg1, float 0.0, float %arg
%i2 = call float @llvm.fabs.f32(float %i)
; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
-; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
-; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+; GCN: v_rcp_legacy_f32_e64 [[RCP:v[0-9]+]], -[[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
-; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
-; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
%x = load volatile float, ptr addrspace(1) undef
%cmp = icmp eq i32 %c, 0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -v0, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negfabs_posk_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; VI-NEXT: v_mov_b32_e32 v3, 0xc400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0x4400
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -v0, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -v0, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_posk_negfabs_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; VI-NEXT: v_mov_b32_e32 v3, 0xc400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0x4400
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -v0, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negfabs_negk_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, 0x4400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0xc400
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
-; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
+; CI-NEXT: v_mul_f32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negk_negfabs_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, 0x4400
+; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_mov_b32_e32 v3, 0xc400
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2
+; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
; VI-LABEL: select_fneg_posk_src_rcp_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_rcp_f16_e32 v1, v1
-; VI-NEXT: v_mov_b32_e32 v2, 0xc000
+; VI-NEXT: v_rcp_f16_e64 v1, -v1
+; VI-NEXT: v_mov_b32_e32 v2, 0x4000
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%rcp = call half @llvm.amdgcn.rcp.f16(half %x)
; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]]
; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 vcc, -1, 0
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN: s_cselect_b64 s[0:1], -1, 0
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s[0:1]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]]
-; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
-; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
-; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
-; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0
+
+; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]]
+; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+
+; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]]
+; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
-; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+
+; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0
+
+; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]]
+; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+
+
+; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]]
+; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_inv2pi_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}mul_select_negfabs_negk_inv2pi_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
-; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-
-; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_negk_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 vcc, -1, 0
-; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-
-; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
+; GCN: s_cselect_b64 s[0:1], -1, 0
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s[0:1]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negk_inv2pi_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN-LABEL: {{^}}mul_select_negfabs_posk_0_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_negfabs_posk_0_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
-; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
+; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]]
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @mul_select_posk_0_negfabs_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) undef
%y = load volatile float, ptr addrspace(1) undef