ISD::ZERO_EXTEND,
ISD::SIGN_EXTEND_INREG,
ISD::EXTRACT_VECTOR_ELT,
- ISD::INSERT_VECTOR_ELT});
+ ISD::INSERT_VECTOR_ELT,
+ ISD::FCOPYSIGN});
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
return SDValue();
}
+// DAG combine for ISD::FCOPYSIGN: narrow a 64-bit sign operand to 32 bits.
+// Only the sign bit of the second operand matters, and for an f64 that bit
+// lives in the upper 32-bit half, so extracting that half as an f32 is
+// sufficient and lets the backend materialize a cheaper operand.
+SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue SignOp = N->getOperand(1);
+ // Bail out unless the sign-providing operand is an f64; other types are
+ // already as narrow as we care to make them here.
+ if (SignOp.getValueType() != MVT::f64)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ // Reduce width of sign operand, we only need the highest bit.
+ //
+ // fcopysign f64:x, f64:y ->
+ // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
+ // TODO: In some cases it might make sense to go all the way to f16.
+ SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
+ SDValue SignAsF32 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+ DAG.getConstant(1, DL, MVT::i32));
+
+ // Rebuild the fcopysign with the original magnitude operand and the
+ // narrowed f32 sign operand; element 1 is the half holding the sign bit.
+ return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
+ SignAsF32);
+}
+
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
// This is a variant of
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performUCharToFloatCombine(N, DCI);
+ case ISD::FCOPYSIGN:
+ return performFCopySignCombine(N, DCI);
case AMDGPUISD::CVT_F32_UBYTE0:
case AMDGPUISD::CVT_F32_UBYTE1:
case AMDGPUISD::CVT_F32_UBYTE2:
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v4, s3
-; SI-NEXT: v_bfi_b32 v1, s8, v1, v4
; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_bfi_b32 v1, s8, v1, v6
; SI-NEXT: v_mov_b32_e32 v7, s2
; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1]
; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v1
; SI-NEXT: v_mov_b32_e32 v7, s3
+; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: v_mov_b32_e32 v9, s1
-; SI-NEXT: v_mov_b32_e32 v10, s1
-; SI-NEXT: v_mov_b32_e32 v11, s0
+; SI-NEXT: v_mov_b32_e32 v10, s0
; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1]
; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1]
; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; SI-NEXT: v_mov_b32_e32 v8, s12
; SI-NEXT: v_mov_b32_e32 v9, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v0
; SI-NEXT: v_mov_b32_e32 v2, s3
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v2
; SI-NEXT: v_mov_b32_e32 v6, s2
-; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v7, s1
; SI-NEXT: v_mov_b32_e32 v11, s0
; SI-NEXT: v_mov_b32_e32 v12, s7
-; SI-NEXT: v_mov_b32_e32 v13, s7
-; SI-NEXT: v_mov_b32_e32 v14, s6
-; SI-NEXT: v_mov_b32_e32 v15, s5
-; SI-NEXT: v_mov_b32_e32 v16, s5
-; SI-NEXT: v_mov_b32_e32 v17, s4
+; SI-NEXT: v_mov_b32_e32 v13, s6
+; SI-NEXT: v_mov_b32_e32 v14, s5
+; SI-NEXT: v_mov_b32_e32 v15, s4
; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5]
; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v3
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v7
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5]
; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v15
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v14
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5]
; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v4, v17, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm