From 0c011335c9c6b0c0598f8fa7393af5e8aad428a2 Mon Sep 17 00:00:00 2001
From: Davide Italiano <ditaliano@apple.com>
Date: Tue, 22 Nov 2022 12:29:44 -0800
Subject: [PATCH] [X86] Don't lower f16->f80 fpext to libcall on darwin.

We don't provide __extendhfxf2, and only have the soft-float
__extendhfsf2 in compiler-rt.  The lowering only changed to a libcall
recently, with 655ba9c8a1d2, so this patch restores the previous
behavior on darwin.

The f80->f16 fptrunc, on the other hand, is not easily implementable
without the compiler-rt __truncxfhf2.  That has always been the case,
though, so it isn't an immediate regression.

Patch by Ahmed Bougacha.

rdar://102194995
---
 llvm/lib/Target/X86/X86ISelLowering.cpp   |  5 ++-
 llvm/test/CodeGen/X86/half-fp80-darwin.ll | 71 +++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/X86/half-fp80-darwin.ll

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2e64c20..5b27710 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23154,7 +23154,10 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
   MVT SVT = In.getSimpleValueType();
 
-  if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80))
+  // Let f16->f80 get lowered to a libcall, except for darwin, where we should
+  // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
+  if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
+                          !Subtarget.getTargetTriple().isOSDarwin()))
     return SDValue();
 
   if (SVT == MVT::f16) {
diff --git a/llvm/test/CodeGen/X86/half-fp80-darwin.ll b/llvm/test/CodeGen/X86/half-fp80-darwin.ll
new file mode 100644
index 0000000..0ba734e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/half-fp80-darwin.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-macosx %s -o - | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+f16c %s -o - | FileCheck %s --check-prefix=CHECK-F16C
+
+define void @extendhfxf(ptr %outptr, ptr %inptr) nounwind {
+; CHECK-SOFT-LABEL: extendhfxf:
+; CHECK-SOFT:       ## %bb.0:
+; CHECK-SOFT-NEXT:    pushq %rbx
+; CHECK-SOFT-NEXT:    subq $16, %rsp
+; CHECK-SOFT-NEXT:    movq %rdi, %rbx
+; CHECK-SOFT-NEXT:    movzwl (%rsi), %edi
+; CHECK-SOFT-NEXT:    callq ___extendhfsf2
+; CHECK-SOFT-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-SOFT-NEXT:    flds {{[0-9]+}}(%rsp)
+; CHECK-SOFT-NEXT:    fstpt (%rbx)
+; CHECK-SOFT-NEXT:    addq $16, %rsp
+; CHECK-SOFT-NEXT:    popq %rbx
+; CHECK-SOFT-NEXT:    retq
+;
+; CHECK-F16C-LABEL: extendhfxf:
+; CHECK-F16C:       ## %bb.0:
+; CHECK-F16C-NEXT:    movzwl (%rsi), %eax
+; CHECK-F16C-NEXT:    vmovd %eax, %xmm0
+; CHECK-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-F16C-NEXT:    flds -{{[0-9]+}}(%rsp)
+; CHECK-F16C-NEXT:    fstpt (%rdi)
+; CHECK-F16C-NEXT:    retq
+  %in = load half, ptr %inptr
+  %fp80 = fpext half %in to x86_fp80
+  store x86_fp80 %fp80, ptr %outptr
+  ret void
+}
+
+; FIXME: We don't currently provide __truncxfhf2, but we can't lower this as
+; successive fptruncs (the way we lower fpext) because of double rounding.
+; We also don't currently soft-float this call, as we do for e.g. __truncsfhf2:
+; the latter long predates the fp16 parameter-passing ABI, so it can't change.
+; If we ever add a __truncxfhf2, we won't be bound by any existing ABI.
+define void @truncxfhf(ptr %outptr, ptr %inptr) nounwind {
+; CHECK-SOFT-LABEL: truncxfhf:
+; CHECK-SOFT:       ## %bb.0:
+; CHECK-SOFT-NEXT:    pushq %rbx
+; CHECK-SOFT-NEXT:    subq $16, %rsp
+; CHECK-SOFT-NEXT:    movq %rdi, %rbx
+; CHECK-SOFT-NEXT:    fldt (%rsi)
+; CHECK-SOFT-NEXT:    fstpt (%rsp)
+; CHECK-SOFT-NEXT:    callq ___truncxfhf2
+; CHECK-SOFT-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-SOFT-NEXT:    movw %ax, (%rbx)
+; CHECK-SOFT-NEXT:    addq $16, %rsp
+; CHECK-SOFT-NEXT:    popq %rbx
+; CHECK-SOFT-NEXT:    retq
+;
+; CHECK-F16C-LABEL: truncxfhf:
+; CHECK-F16C:       ## %bb.0:
+; CHECK-F16C-NEXT:    pushq %rbx
+; CHECK-F16C-NEXT:    subq $16, %rsp
+; CHECK-F16C-NEXT:    movq %rdi, %rbx
+; CHECK-F16C-NEXT:    fldt (%rsi)
+; CHECK-F16C-NEXT:    fstpt (%rsp)
+; CHECK-F16C-NEXT:    callq ___truncxfhf2
+; CHECK-F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
+; CHECK-F16C-NEXT:    addq $16, %rsp
+; CHECK-F16C-NEXT:    popq %rbx
+; CHECK-F16C-NEXT:    retq
+  %in = load x86_fp80, ptr %inptr
+  %half = fptrunc x86_fp80 %in to half
+  store half %half, ptr %outptr
+  ret void
+}
-- 
2.7.4