}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
SDLoc dl(N);
// Promote to the larger FP type.
- Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
- Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
- Op2 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op2);
+ auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
+ Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);
SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
// Convert back to FP16 as an integer.
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) {
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
SDValue Op1 = N->getOperand(1);
SDLoc dl(N);
- Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT, Op0);
SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
// Convert back to FP16 as an integer.
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ EVT SVT = N->getOperand(0).getValueType();
+
if (N->isStrictFPOpcode()) {
+ assert(RVT == MVT::f16);
SDValue Res =
DAG.getNode(ISD::STRICT_FP_TO_FP16, SDLoc(N), {MVT::i16, MVT::Other},
{N->getOperand(0), N->getOperand(1)});
return Res;
}
- return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, N->getOperand(0));
+ return DAG.getNode(GetPromotionOpcode(SVT, RVT), SDLoc(N), MVT::i16,
+ N->getOperand(0));
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_XINT_TO_FP(SDNode *N) {
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
SDLoc dl(N);
SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
// Round the value to the softened type.
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UNDEF(SDNode *N) {
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) {
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
SDLoc dl(N);
// Promote to the larger FP type.
- Op = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+ Op = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT, Op);
SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op);
// Convert back to FP16 as an integer.
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) {
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
SDLoc dl(N);
// Promote to the larger FP type.
- Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
- Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+ auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
// Convert back to FP16 as an integer.
- return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE(SDNode *N) {
unsigned OpNo) {
assert(OpNo == 1 && "Only Operand 1 must need promotion here");
SDValue Op1 = N->getOperand(1);
+ EVT RVT = Op1.getValueType();
SDLoc dl(N);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op1.getValueType());
Op1 = GetSoftPromotedHalf(Op1);
- Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+ Op1 = DAG.getNode(GetPromotionOpcode(RVT, NVT), dl, NVT, Op1);
return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), N->getOperand(0),
Op1);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) {
+ EVT RVT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
- SDValue Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 1 : 0));
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ EVT SVT = Op.getValueType();
+ Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 1 : 0));
if (IsStrict) {
+ assert(SVT == MVT::f16);
SDValue Res =
DAG.getNode(ISD::STRICT_FP16_TO_FP, SDLoc(N),
{N->getValueType(0), MVT::Other}, {N->getOperand(0), Op});
return SDValue();
}
- return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), Op);
+ return DAG.getNode(GetPromotionOpcode(SVT, RVT), SDLoc(N), RVT, Op);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT(SDNode *N) {
SDValue Op = N->getOperand(0);
+ EVT SVT = Op.getValueType();
SDLoc dl(N);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
Op = GetSoftPromotedHalf(Op);
- SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+ // Promote to the larger FP type. The conversion node built here produces
+ // NVT, so NVT (not N's own result type) is the destination handed to
+ // GetPromotionOpcode, as the SoftPromoteHalfRes_* routines do.
+ SDValue Res = DAG.getNode(GetPromotionOpcode(SVT, NVT), dl, NVT, Op);
return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT_SAT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
SDValue Op = N->getOperand(0);
+ EVT SVT = Op.getValueType();
SDLoc dl(N);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
Op = GetSoftPromotedHalf(Op);
- SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+ SDValue Res = DAG.getNode(GetPromotionOpcode(SVT, RVT), dl, NVT, Op);
return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res,
N->getOperand(1));
SDValue Op1 = N->getOperand(1);
SDLoc dl(N);
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+ EVT SVT = Op0.getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), SVT);
Op0 = GetSoftPromotedHalf(Op0);
Op1 = GetSoftPromotedHalf(Op1);
// Promote to the larger FP type.
- Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
- Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+ auto PromotionOpcode = GetPromotionOpcode(SVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), Op0, Op1,
N->getOperand(2), N->getOperand(3), N->getOperand(4));
ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDLoc dl(N);
+ EVT SVT = Op0.getValueType();
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
Op0 = GetSoftPromotedHalf(Op0);
Op1 = GetSoftPromotedHalf(Op1);
// Promote to the larger FP type.
- Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
- Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+ auto PromotionOpcode = GetPromotionOpcode(SVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
return DAG.getSetCC(SDLoc(N), N->getValueType(0), Op0, Op1, CCCode);
}
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: movzwl (%rsi), %eax
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: movzwl (%rsi), %eax
+; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-LABEL: add2:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: movd %xmm1, %eax
-; CHECK-NEXT: shll $16, %eax
-; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: shll $16, %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; CHECK-LABEL: add_double:
; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movq %rdx, %r14
; CHECK-NEXT: movq %rsi, %rbx
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq __truncdfbf2@PLT
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: shll $16, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, %ebp
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq __truncdfbf2@PLT
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: shll $16, %ebp
+; CHECK-NEXT: movd %ebp, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
+; CHECK-NEXT: callq __truncsfbf2@PLT
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
; CHECK-NEXT: movsd %xmm0, (%r14)
-; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
%la = load double, ptr %pa
%a = fptrunc double %la to bfloat
define double @add_double2(double %da, double %db) nounwind {
; CHECK-LABEL: add_double2:
; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $16, %rsp
; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: callq __truncdfbf2@PLT
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: shll $16, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, %ebx
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: callq __truncdfbf2@PLT
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: shll $16, %ebx
+; CHECK-NEXT: movd %ebx, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
+; CHECK-NEXT: callq __truncsfbf2@PLT
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: addq $16, %rsp
+; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
%a = fptrunc double %da to bfloat
%b = fptrunc double %db to bfloat
; CHECK-LABEL: addv:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: subq $32, %rsp
-; CHECK-NEXT: movq %xmm1, %rax
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: movd %ecx, %xmm2
+; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: movq %xmm0, %rcx
-; CHECK-NEXT: movq %rcx, %rdx
-; CHECK-NEXT: shrq $32, %rdx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm3
-; CHECK-NEXT: addss %xmm2, %xmm3
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: movq %rcx, %rdx
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm3
-; CHECK-NEXT: addss %xmm2, %xmm3
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm2
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm3
-; CHECK-NEXT: addss %xmm2, %xmm3
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
-; CHECK-NEXT: movd %eax, %xmm2
-; CHECK-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
-; CHECK-NEXT: movd %ecx, %xmm3
-; CHECK-NEXT: addss %xmm2, %xmm3
-; CHECK-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; CHECK-NEXT: movq %xmm1, %rax
-; CHECK-NEXT: movq %rax, %rcx
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: shll $16, %ecx
-; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %xmm1, %rdx
+; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: shrq $48, %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: shrq $48, %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; CHECK-NEXT: movq %xmm0, %rcx
-; CHECK-NEXT: movq %rcx, %rdx
-; CHECK-NEXT: shrq $32, %rdx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm0
-; CHECK-NEXT: addss %xmm1, %xmm0
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm0
-; CHECK-NEXT: movq %rcx, %rdx
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm1
-; CHECK-NEXT: addss %xmm0, %xmm1
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm0
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: movd %edx, %xmm1
-; CHECK-NEXT: addss %xmm0, %xmm1
-; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movq %xmm0, %rbx
+; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm0, %rbp
+; CHECK-NEXT: movq %rbp, %r15
+; CHECK-NEXT: shrq $32, %r15
+; CHECK-NEXT: movq %rbx, %r13
+; CHECK-NEXT: shrq $48, %r13
+; CHECK-NEXT: movq %rbp, %r12
+; CHECK-NEXT: shrq $48, %r12
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
-; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
-; CHECK-NEXT: movd %xmm0, %ebx
+; CHECK-NEXT: movd %xmm0, %r14d
+; CHECK-NEXT: shll $16, %r14d
+; CHECK-NEXT: shll $16, %ebp
+; CHECK-NEXT: movd %ebp, %xmm1
; CHECK-NEXT: shll $16, %ebx
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd %ebx, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %r14d
-; CHECK-NEXT: orl %ebx, %r14d
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movzwl %ax, %ebx
+; CHECK-NEXT: orl %r14d, %ebx
+; CHECK-NEXT: shll $16, %r12d
+; CHECK-NEXT: movd %r12d, %xmm1
+; CHECK-NEXT: shll $16, %r13d
+; CHECK-NEXT: movd %r13d, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %ebp
; CHECK-NEXT: shll $16, %ebp
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: shll $16, %r15d
+; CHECK-NEXT: movd %r15d, %xmm1
+; CHECK-NEXT: movq (%rsp), %rax # 8-byte Reload
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %ebx
-; CHECK-NEXT: orl %ebp, %ebx
-; CHECK-NEXT: shlq $32, %rbx
-; CHECK-NEXT: orq %r14, %rbx
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movzwl %ax, %r14d
+; CHECK-NEXT: orl %ebp, %r14d
+; CHECK-NEXT: shlq $32, %r14
+; CHECK-NEXT: orq %rbx, %r14
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %ebp
; CHECK-NEXT: shll $16, %ebp
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %r15, %rax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movq %rbx, %rax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: movzwl %ax, %r14d
-; CHECK-NEXT: orl %ebp, %r14d
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movzwl %ax, %ebx
+; CHECK-NEXT: orl %ebp, %ebx
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %ebp
; CHECK-NEXT: shll $16, %ebp
-; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm1
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: callq __truncsfbf2@PLT
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: orl %ebp, %eax
; CHECK-NEXT: shlq $32, %rax
-; CHECK-NEXT: orq %r14, %rax
+; CHECK-NEXT: orq %rbx, %rax
; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: movq %rbx, %xmm1
+; CHECK-NEXT: movq %r14, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: addq $32, %rsp
+; CHECK-NEXT: addq $56, %rsp
; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
%add = fadd <8 x bfloat> %a, %b