// If the destination is FPR, preserve that.
if (OpRegBankIdx[0] != PMI_FirstGPR)
break;
+
+ // If we're taking in vectors, we have no choice but to put everything on
+ // FPRs.
LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
- if (SrcTy.isVector() ||
- any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
- [&](MachineInstr &MI) { return HasFPConstraints(MI); })) {
- // Set the register bank of every operand to FPR.
- for (unsigned Idx = 0, NumOperands = MI.getNumOperands();
- Idx < NumOperands; ++Idx)
+ if (SrcTy.isVector()) {
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
OpRegBankIdx[Idx] = PMI_FirstFPR;
+ break;
+ }
+
+ // Try to minimize the number of copies. If we have more floating point
+ // constrained values than not, then we'll put everything on FPR. Otherwise,
+ // everything has to be on GPR.
+ unsigned NumFP = 0;
+
+ // Check if any use of the result has floating point constraints.
+ //
+ // For example:
+ //
+ // %z = G_SELECT %cond %x %y
+ // fpr = G_FOO %z ...
+ if (any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
+ [&](MachineInstr &MI) { return HasFPConstraints(MI); }))
+ ++NumFP;
+
+ // Check if the defs of the source values always produce floating point
+ // values.
+ //
+ // For example:
+ //
+ // %x = G_SOMETHING_ALWAYS_FLOAT %a ...
+ // %z = G_SELECT %cond %x %y
+ //
+ // Also check whether or not the sources have already been decided to be
+ // FPR. Keep track of this.
+ //
+ // This doesn't check the condition, since it's just whatever is in NZCV.
+ // This isn't passed explicitly in a register to fcsel/csel.
+ for (unsigned Idx = 2; Idx < 4; ++Idx) {
+ unsigned VReg = MI.getOperand(Idx).getReg();
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ HasFPConstraints(*DefMI))
+ ++NumFP;
}
+
+ // If we have more FP constraints than not, then move everything over to
+ // FPR.
+ if (NumFP >= 2)
+ for (unsigned Idx = 0; Idx < 4; ++Idx)
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
+
break;
}
case TargetOpcode::G_UNMERGE_VALUES: {
%4:_(s64) = G_SELECT %0(s1), %1, %2
$d0 = COPY %4(s64)
RET_ReallyLR implicit $d0
+
+...
+---
+name: two_fpr_inputs_gpr_output
+alignment: 2
+legalized: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $d0, $d1, $w0
+ ; CHECK-LABEL: name: two_fpr_inputs_gpr_output
+ ; CHECK: liveins: $d0, $d1, $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[COPY1:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[COPY2:%[0-9]+]]:fpr(s64) = COPY $d1
+ ; CHECK: [[COPY3:%[0-9]+]]:fpr(s1) = COPY [[TRUNC]](s1)
+ ; CHECK: [[SELECT:%[0-9]+]]:fpr(s64) = G_SELECT [[COPY3]](s1), [[COPY1]], [[COPY2]]
+ ; CHECK: $x0 = COPY [[SELECT]](s64)
+ ; CHECK: RET_ReallyLR implicit $x0
+
+ ; Verify that the G_SELECT only has FPRs.
+ ; The only difference between fcsel and csel is their register banks. So,
+ ; if we have two FPR inputs and a GPR output, we should do a floating point
+ ; select anyway. This will cost one copy for the output, but that's less
+ ; than doing two to put the inputs on GPRs.
+
+ %3:_(s32) = COPY $w0
+ %0:_(s1) = G_TRUNC %3(s32)
+ %1:_(s64) = COPY $d0
+ %2:_(s64) = COPY $d1
+ %4:_(s64) = G_SELECT %0(s1), %1, %2
+ $x0 = COPY %4(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: one_fpr_input_fpr_output
+alignment: 2
+legalized: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $d0, $x1, $w0
+ ; CHECK-LABEL: name: one_fpr_input_fpr_output
+ ; CHECK: liveins: $d0, $x1, $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[COPY1:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[COPY2:%[0-9]+]]:gpr(s64) = COPY $x1
+ ; CHECK: [[COPY3:%[0-9]+]]:fpr(s1) = COPY [[TRUNC]](s1)
+ ; CHECK: [[COPY4:%[0-9]+]]:fpr(s64) = COPY [[COPY2]](s64)
+ ; CHECK: [[SELECT:%[0-9]+]]:fpr(s64) = G_SELECT [[COPY3]](s1), [[COPY1]], [[COPY4]]
+ ; CHECK: $d0 = COPY [[SELECT]](s64)
+ ; CHECK: RET_ReallyLR implicit $d0
+
+ ; Same idea as the above test. If the output is an FPR, and one of the
+ ; inputs is an FPR, then it's fewer copies to just do an FCSEL.
+
+ %3:_(s32) = COPY $w0
+ %0:_(s1) = G_TRUNC %3(s32)
+ %1:_(s64) = COPY $d0
+ %2:_(s64) = COPY $x1
+ %4:_(s64) = G_SELECT %0(s1), %1, %2
+ $d0 = COPY %4(s64)
+ RET_ReallyLR implicit $d0
+
+...
+---
+name: one_fpr_input_gpr_output
+alignment: 2
+legalized: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $d0, $x1, $w0
+ ; CHECK-LABEL: name: one_fpr_input_gpr_output
+ ; CHECK: liveins: $d0, $x1, $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[COPY1:%[0-9]+]]:fpr(s64) = COPY $d0
+ ; CHECK: [[COPY2:%[0-9]+]]:gpr(s64) = COPY $x1
+ ; CHECK: [[COPY3:%[0-9]+]]:gpr(s64) = COPY [[COPY1]](s64)
+ ; CHECK: [[SELECT:%[0-9]+]]:gpr(s64) = G_SELECT [[TRUNC]](s1), [[COPY3]], [[COPY2]]
+ ; CHECK: $x0 = COPY [[SELECT]](s64)
+ ; CHECK: RET_ReallyLR implicit $x0
+
+ ; Now we have more GPR registers on the G_SELECT. It's cheaper here to put
+ ; everything on GPR.
+
+ %3:_(s32) = COPY $w0
+ %0:_(s1) = G_TRUNC %3(s32)
+ %1:_(s64) = COPY $d0
+ %2:_(s64) = COPY $x1
+ %4:_(s64) = G_SELECT %0(s1), %1, %2
+ $x0 = COPY %4(s64)
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: two_gpr_input_fpr_output
+alignment: 2
+legalized: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $x0, $x1, $w0
+ ; CHECK-LABEL: name: two_gpr_input_fpr_output
+ ; CHECK: liveins: $x0, $x1, $w0
+ ; CHECK: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
+ ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr(s64) = COPY $x0
+ ; CHECK: [[COPY2:%[0-9]+]]:gpr(s64) = COPY $x1
+ ; CHECK: [[SELECT:%[0-9]+]]:gpr(s64) = G_SELECT [[TRUNC]](s1), [[COPY1]], [[COPY2]]
+ ; CHECK: $d0 = COPY [[SELECT]](s64)
+ ; CHECK: RET_ReallyLR implicit $d0
+
+ ; Same as above. The G_SELECT should get all GPRs.
+
+ %3:_(s32) = COPY $w0
+ %0:_(s1) = G_TRUNC %3(s32)
+ %1:_(s64) = COPY $x0
+ %2:_(s64) = COPY $x1
+ %4:_(s64) = G_SELECT %0(s1), %1, %2
+ $d0 = COPY %4(s64)
+ RET_ReallyLR implicit $d0