// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
if (DCI.isBeforeLegalizeOps())
return SDValue();
// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
// It will transform the add/sub to a scalable version, so that we can
// make use of SVE's MLA/MLS that will be generated for that pattern
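+// (e.g. add(x, mul(y, z)) maps to an SVE MLA and sub(x, mul(y, z)) to an MLS).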
-static SDValue performMulAddSubCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
+static SDValue
+performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
// Make sure that the types are legal
if (!DCI.isAfterLegalizeDAG())
return SDValue();
return SDValue();
}
+// Given an i64 add from a v1i64 extract, convert to a NEON v1i64 add. This can
+// help, for example, to produce ssra from sshr+add.
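+// For example:
+//   add(extract_vector_elt(v1i64 X, 0), load(p))
+//     --> extract_vector_elt(add(v1i64 X, scalar_to_vector(load(p))), 0)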
+static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+  // At least one of the operands should be an extract, and the other should be
+  // something that is easy to convert to a v1i64 type (in this case a load).
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOpcode() != ISD::LOAD)
+ return SDValue();
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ SDLoc DL(N);
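+  // Reuse the v1i64 source of whichever operand is the extract, and wrap the
+  // other (scalar) operand into a v1i64 with SCALAR_TO_VECTOR.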
+ if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0).getValueType() == MVT::v1i64) {
+ Op0 = Op0.getOperand(0);
+ Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
+ } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op1.getOperand(0).getValueType() == MVT::v1i64) {
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
+ Op1 = Op1.getOperand(0);
+ } else
+ return SDValue();
+
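+  // Perform the add/sub as v1i64 and extract lane 0 as the i64 result.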
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+ DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
static SDValue performAddSubCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- if (SDValue Val = performMulAddSubCombine(N, DCI, DAG))
- return Val;
+ TargetLowering::DAGCombinerInfo &DCI) {
// Try to change sum of two reductions.
- if (SDValue Val = performAddUADDVCombine(N, DAG))
+ if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
+ return Val;
+ if (SDValue Val = performAddDotCombine(N, DCI.DAG))
+ return Val;
+ if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
return Val;
- if (SDValue Val = performAddDotCombine(N, DAG))
+ if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
return Val;
- if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
+ if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
return Val;
- if (SDValue Val = performNegCSelCombine(N, DAG))
+ if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
return Val;
- if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
+ if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
return Val;
- if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
+ if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
return Val;
- if (SDValue Val = performSubAddMULCombine(N, DAG))
+ if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
return Val;
- return performAddSubLongCombine(N, DCI, DAG);
+ return performAddSubLongCombine(N, DCI);
}
// Massage DAGs which we can use the high-half "long" operations on into
return performVecReduceBitwiseCombine(N, DCI, DAG);
case ISD::ADD:
case ISD::SUB:
- return performAddSubCombine(N, DCI, DAG);
+ return performAddSubCombine(N, DCI);
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
case ISD::TRUNCATE:
define i64 @add_i64_ext_load(<1 x i64> %A, ptr %B) nounwind {
; CHECK-LABEL: add_i64_ext_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%a = extractelement <1 x i64> %A, i32 0
%b = load i64, ptr %B
define i64 @sub_i64_ext_load(<1 x i64> %A, ptr %B) nounwind {
; CHECK-LABEL: sub_i64_ext_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: sub x0, x9, x8
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%a = extractelement <1 x i64> %A, i32 0
%b = load i64, ptr %B
define void @add_i64_ext_load_store(<1 x i64> %A, ptr %B) nounwind {
; CHECK-LABEL: add_i64_ext_load_store:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: add x8, x9, x8
-; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
%a = extractelement <1 x i64> %A, i32 0
%b = load i64, ptr %B
define i64 @add_i64_ext_ext(<1 x i64> %A, <1 x i64> %B) nounwind {
; CHECK-LABEL: add_i64_ext_ext:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%a = extractelement <1 x i64> %A, i32 0
%b = extractelement <1 x i64> %B, i32 0
define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
; CHECK-LABEL: add_i64_ext_ext_test1:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: add x9, x9, x10
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: dup v1.2d, v1.d[1]
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%a = extractelement <1 x i64> %A, i32 0
%b = extractelement <2 x i64> %B, i32 0
define i64 @sub_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
; CHECK-LABEL: sub_i64_ext_ext_test1:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: sub x9, x9, x10
-; CHECK-NEXT: sub x0, x9, x8
+; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: dup v1.2d, v1.d[1]
+; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
%a = extractelement <1 x i64> %A, i32 0
%b = extractelement <2 x i64> %B, i32 0
; CHECK-LABEL: ursra_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: urshr d0, d0, #1
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ursra d1, d0, #1
+; CHECK-NEXT: fmov x0, d1
; CHECK-NEXT: ret
%tmp1 = load i64, ptr %A
%tmp3 = call i64 @llvm.aarch64.neon.urshl.i64(i64 %tmp1, i64 -1)
; CHECK-LABEL: srsra_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: srshr d0, d0, #1
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: srsra d1, d0, #1
+; CHECK-NEXT: fmov x0, d1
; CHECK-NEXT: ret
%tmp1 = load i64, ptr %A
%tmp3 = call i64 @llvm.aarch64.neon.srshl.i64(i64 %tmp1, i64 -1)