AMDGPU: Fold fneg into bitcast of build_vector

author Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 26 Jan 2023 19:46:00 +0000 (15:46 -0400)

committer Matt Arsenault <arsenm2@gmail.com>

Tue, 11 Apr 2023 11:12:01 +0000 (07:12 -0400)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 26 Jan 2023 19:46:00 +0000 (15:46 -0400)
committer Matt Arsenault <arsenm2@gmail.com>
Tue, 11 Apr 2023 11:12:01 +0000 (07:12 -0400)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 9873537..1077398 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -556,7 +556,7 @@ bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  //===----------------------------------------------------------------------===//
  
  LLVM_READNONE
-static bool fnegFoldsIntoOp(unsigned Opc) {
+static bool fnegFoldsIntoOpcode(unsigned Opc) {
    switch (Opc) {
    case ISD::FADD:
    case ISD::FSUB:
@@ -583,11 +583,27 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
    case AMDGPUISD::FMED3:
      // TODO: handle llvm.amdgcn.fma.legacy
      return true;
+  case ISD::BITCAST:
+    llvm_unreachable("bitcast is special cased");
    default:
      return false;
    }
  }
  
+static bool fnegFoldsIntoOp(const SDNode *N) {
+  unsigned Opc = N->getOpcode();
+  if (Opc == ISD::BITCAST) {
+    // TODO: Is there a benefit to checking the conditions performFNegCombine
+    // does? We don't for the other cases.
+    SDValue BCSrc = N->getOperand(0);
+    return BCSrc.getOpcode() == ISD::BUILD_VECTOR &&
+           BCSrc.getNumOperands() == 2 &&
+           BCSrc.getOperand(1).getValueSizeInBits() == 32;
+  }
+
+  return fnegFoldsIntoOpcode(Opc);
+}
+
  /// \p returns true if the operation will definitely need to use a 64-bit
  /// encoding, and thus will use a VOP3 encoding regardless of the source
  /// modifiers.
@@ -3773,7 +3789,7 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
  
      if (NewLHS.hasOneUse()) {
        unsigned Opc = NewLHS.getOpcode();
-      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
          ShouldFoldNeg = false;
        if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
          ShouldFoldNeg = false;
@@ -3915,8 +3931,6 @@ static unsigned inverseMinMax(unsigned Opc) {
  /// \return true if it's profitable to try to push an fneg into its source
  /// instruction.
  bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
-  unsigned Opc = N0.getOpcode();
-
    // If the input has multiple uses and we can either fold the negate down, or
    // the other uses cannot, give up. This both prevents unprofitable
    // transformations and infinite loops: we won't repeatedly try to fold around
@@ -3927,7 +3941,7 @@ bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
      if (allUsesHaveSourceMods(N, 0))
        return false;
    } else {
-    if (fnegFoldsIntoOp(Opc) &&
+    if (fnegFoldsIntoOp(N0.getNode()) &&
          (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
        return false;
    }
@@ -4133,6 +4147,43 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
      // TODO: Invert conditions of foldFreeOpFromSelect
      return SDValue();
    }
+  case ISD::BITCAST: {
+    SDLoc SL(N);
+    SDValue BCSrc = N0.getOperand(0);
+    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
+      if (HighBits.getValueType().getSizeInBits() != 32 ||
+          !fnegFoldsIntoOp(HighBits.getNode()))
+        return SDValue();
+
+      // f64 fneg only really needs to operate on the high half of the
+      // register, so try to force it to an f32 operation to help make use of
+      // source modifiers.
+      //
+      //
+      // fneg (f64 (bitcast (build_vector x, y))) ->
+      // f64 (bitcast (build_vector (bitcast i32:x to f32),
+      //                            (fneg (bitcast i32:y to f32)))
+
+      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
+      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
+      SDValue CastBack =
+          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
+
+      SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
+      Ops.back() = CastBack;
+      DCI.AddToWorklist(NegHi.getNode());
+      SDValue Build =
+          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
+      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
+
+      if (!N0.hasOneUse())
+        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
+      return Result;
+    }
+
+    return SDValue();
+  }
    default:
      return SDValue();
    }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll

index 4f192db..4a3e916 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3026,11 +3026,10 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
  ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  ; SI-NEXT:    s_and_b32 s4, 1, s4
  ; SI-NEXT:    s_cselect_b32 s3, 0, s3
-; SI-NEXT:    s_cselect_b32 s2, 0, s2
  ; SI-NEXT:    s_xor_b32 s3, s3, 0x80000000
  ; SI-NEXT:    s_cmp_eq_u32 s4, 1
-; SI-NEXT:    s_cselect_b32 s3, 0, s3
  ; SI-NEXT:    s_cselect_b32 s2, 0, s2
+; SI-NEXT:    s_cselect_b32 s3, 0, s3
  ; SI-NEXT:    v_mov_b32_e32 v3, s1
  ; SI-NEXT:    v_mov_b32_e32 v0, s2
  ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3046,11 +3045,10 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
  ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  ; VI-NEXT:    s_and_b32 s4, 1, s4
  ; VI-NEXT:    s_cselect_b32 s3, 0, s3
-; VI-NEXT:    s_cselect_b32 s2, 0, s2
  ; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
  ; VI-NEXT:    s_cmp_eq_u32 s4, 1
-; VI-NEXT:    s_cselect_b32 s3, 0, s3
  ; VI-NEXT:    s_cselect_b32 s2, 0, s2
+; VI-NEXT:    s_cselect_b32 s3, 0, s3
  ; VI-NEXT:    v_mov_b32_e32 v3, s1
  ; VI-NEXT:    v_mov_b32_e32 v0, s2
  ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3071,7 +3069,6 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
  ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
  ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
  ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
  ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
  ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

index 4360a47..4f2bba8 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -397,9 +397,9 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
  ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
  ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
  ; GCN-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: fneg_xor_select_f64:
@@ -407,11 +407,10 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
  ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
  ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
  ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
    %select = select i1 %cond, double %arg0, double %arg1
    %fneg = fneg double %select
@@ -501,28 +500,29 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
  ; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
  ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
  ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
  ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
+; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
  ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
  ; GCN-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: select_fneg_select_fneg_f64:
  ; GFX11:       ; %bb.0:
  ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
  ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
  ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
  ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v5, 0x80000000, v3
+; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
  ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
    %fneg0 = fneg double %arg0
    %select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -893,12 +893,12 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GCN-NEXT:    v_and_b32_e32 v5, 1, v0
  ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v1, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
  ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, v3
  ; GCN-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: cospiD_pattern1:
@@ -908,12 +908,13 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
  ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v0
  ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
  ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
  ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v5, 0x80000000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_cndmask_b32 v1, v4, v5
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
    %i = and i32 %arg, 1
    %i3 = icmp eq i32 %i, 0
@@ -1390,17 +1391,14 @@ define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float
  ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
  ; GCN:       ; %bb.0:
  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_add_f32_e32 v1, 2.0, v1
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_sub_f32_e32 v1, -2.0, v1
  ; GCN-NEXT:    s_setpc_b64 s[30:31]
  ;
  ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
  ; GFX11:       ; %bb.0:
  ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_f32_e32 v1, 2.0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_sub_f32_e32 v1, -2.0, v1
  ; GFX11-NEXT:    s_setpc_b64 s[30:31]
    %fadd = fadd nsz nnan float %elt1, 2.0
    %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 26 Jan 2023 19:46:00 +0000 (15:46 -0400)
committer	Matt Arsenault <arsenm2@gmail.com>
	Tue, 11 Apr 2023 11:12:01 +0000 (07:12 -0400)
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll		patch \| blob \| history