From 0c011335c9c6b0c0598f8fa7393af5e8aad428a2 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 22 Nov 2022 12:29:44 -0800 Subject: [PATCH] [X86] Don't lower f16->f80 fpext to libcall on darwin. We don't provide __extendhfxf2, and only have the soft-float __extendhfsf2 in compiler-rt. The f16->f80 lowering only changed recently with 655ba9c8a1d2, so this patch restores the previous behavior. The f80->f16 fptrunc, however, is not easily implementable without the compiler-rt __truncxfhf2; that has always been true, so it isn't an immediate regression. Patch by Ahmed Bougacha. rdar://102194995 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 ++- llvm/test/CodeGen/X86/half-fp80-darwin.ll | 71 +++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/half-fp80-darwin.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2e64c20..5b27710 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23154,7 +23154,10 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDValue In = Op.getOperand(IsStrict ? 
1 : 0); MVT SVT = In.getSimpleValueType(); - if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80)) + // Let f16->f80 get lowered to a libcall, except for darwin, where we should + // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available) + if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 && + !Subtarget.getTargetTriple().isOSDarwin())) return SDValue(); if (SVT == MVT::f16) { diff --git a/llvm/test/CodeGen/X86/half-fp80-darwin.ll b/llvm/test/CodeGen/X86/half-fp80-darwin.ll new file mode 100644 index 0000000..0ba734e --- /dev/null +++ b/llvm/test/CodeGen/X86/half-fp80-darwin.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-apple-macosx %s -o - | FileCheck %s --check-prefix=CHECK-SOFT +; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+f16c %s -o - | FileCheck %s --check-prefix=CHECK-F16C + +define void @extendhfxf(ptr %outptr, ptr %inptr) nounwind { +; CHECK-SOFT-LABEL: extendhfxf: +; CHECK-SOFT: ## %bb.0: +; CHECK-SOFT-NEXT: pushq %rbx +; CHECK-SOFT-NEXT: subq $16, %rsp +; CHECK-SOFT-NEXT: movq %rdi, %rbx +; CHECK-SOFT-NEXT: movzwl (%rsi), %edi +; CHECK-SOFT-NEXT: callq ___extendhfsf2 +; CHECK-SOFT-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) +; CHECK-SOFT-NEXT: flds {{[0-9]+}}(%rsp) +; CHECK-SOFT-NEXT: fstpt (%rbx) +; CHECK-SOFT-NEXT: addq $16, %rsp +; CHECK-SOFT-NEXT: popq %rbx +; CHECK-SOFT-NEXT: retq +; +; CHECK-F16C-LABEL: extendhfxf: +; CHECK-F16C: ## %bb.0: +; CHECK-F16C-NEXT: movzwl (%rsi), %eax +; CHECK-F16C-NEXT: vmovd %eax, %xmm0 +; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-F16C-NEXT: flds -{{[0-9]+}}(%rsp) +; CHECK-F16C-NEXT: fstpt (%rdi) +; CHECK-F16C-NEXT: retq + %in = load half, ptr %inptr + %fp80 = fpext half %in to x86_fp80 + store x86_fp80 %fp80, ptr %outptr + ret void +} + +; FIXME: We don't currently provide __truncxfhf2, but we can't lower this as +; successive fptruncs (like we do 
for fpext) because of double rounding. +; We also don't currently soft-float this call, like we do for e.g. __truncsfhf2: +; the latter long predates the fp16 parameter passing ABI, so it can't change. +; If we ever add a __truncxfhf2, we're not bound by existing ABI. +define void @truncxfhf(ptr %outptr, ptr %inptr) nounwind { +; CHECK-SOFT-LABEL: truncxfhf: +; CHECK-SOFT: ## %bb.0: +; CHECK-SOFT-NEXT: pushq %rbx +; CHECK-SOFT-NEXT: subq $16, %rsp +; CHECK-SOFT-NEXT: movq %rdi, %rbx +; CHECK-SOFT-NEXT: fldt (%rsi) +; CHECK-SOFT-NEXT: fstpt (%rsp) +; CHECK-SOFT-NEXT: callq ___truncxfhf2 +; CHECK-SOFT-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SOFT-NEXT: movw %ax, (%rbx) +; CHECK-SOFT-NEXT: addq $16, %rsp +; CHECK-SOFT-NEXT: popq %rbx +; CHECK-SOFT-NEXT: retq +; +; CHECK-F16C-LABEL: truncxfhf: +; CHECK-F16C: ## %bb.0: +; CHECK-F16C-NEXT: pushq %rbx +; CHECK-F16C-NEXT: subq $16, %rsp +; CHECK-F16C-NEXT: movq %rdi, %rbx +; CHECK-F16C-NEXT: fldt (%rsi) +; CHECK-F16C-NEXT: fstpt (%rsp) +; CHECK-F16C-NEXT: callq ___truncxfhf2 +; CHECK-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; CHECK-F16C-NEXT: addq $16, %rsp +; CHECK-F16C-NEXT: popq %rbx +; CHECK-F16C-NEXT: retq + %in = load x86_fp80, ptr %inptr + %half = fptrunc x86_fp80 %in to half + store half %half, ptr %outptr + ret void +} -- 2.7.4