From: Tue Ly Date: Thu, 7 Apr 2022 20:02:10 +0000 (-0400) Subject: [libc] Add support for x86-64 targets that do not have FMA instructions. X-Git-Tag: upstream/15.0.7~10974 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c5f8a0a1e929ea41bd3e0d781c6b394a3f619427;p=platform%2Fupstream%2Fllvm.git [libc] Add support for x86-64 targets that do not have FMA instructions. Make FMA flag checks more accurate for x86-64 targets, and refactor polyeval to use multiply and add instead when FMA instructions are not available. Reviewed By: michaelrj, sivachandra Differential Revision: https://reviews.llvm.org/D123335 --- diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt index f1cd0b5..7f1cecc 100644 --- a/libc/src/__support/FPUtil/CMakeLists.txt +++ b/libc/src/__support/FPUtil/CMakeLists.txt @@ -12,7 +12,6 @@ add_header_library( NearestIntegerOperations.h NormalFloat.h PlatformDefs.h - PolyEval.h UInt.h XFloat.h DEPENDS @@ -34,4 +33,29 @@ add_header_library( libc.src.__support.FPUtil.generic.sqrt ) +add_header_library( + fma + HDRS + FMA.h + DEPENDS + .fputil + libc.src.__support.FPUtil.generic.fma +) + +add_header_library( + multiply_add + HDRS + multiply_add.h + DEPENDS + .fma +) + +add_header_library( + polyeval + HDRS + PolyEval.h + DEPENDS + .multiply_add +) + add_subdirectory(generic) diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h index c735c06..6823dd0 100644 --- a/libc/src/__support/FPUtil/FMA.h +++ b/libc/src/__support/FPUtil/FMA.h @@ -11,11 +11,16 @@ #include "src/__support/architectures.h" +#if defined(LIBC_TARGET_HAS_FMA) + #if defined(LLVM_LIBC_ARCH_X86_64) #include "x86_64/FMA.h" #elif defined(LLVM_LIBC_ARCH_AARCH64) #include "aarch64/FMA.h" +#endif + #else +// FMA instructions are not available #include "generic/FMA.h" #include "src/__support/CPP/TypeTraits.h" diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h index 368ee38..c9e818a 100644 --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -9,19 +9,15 @@ #ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H -#include "src/__support/CPP/TypeTraits.h" -#include "src/__support/architectures.h" +#include "multiply_add.h" // Evaluate polynomial using Horner's Scheme: // With polyeval(x, a_0, a_1, ..., a_n) = a_n * x^n + ... + a_1 * x + a_0, we // evaluated it as: a_0 + x * (a_1 + x * ( ... (a_(n-1) + x * a_n) ... ) ) ). -// We will use fma instructions if available. +// We will use FMA instructions if available. // Example: to evaluate x^3 + 2*x^2 + 3*x + 4, call // polyeval( x, 4.0, 3.0, 2.0, 1.0 ) -#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) -#include "FMA.h" - namespace __llvm_libc { namespace fputil { @@ -29,35 +25,10 @@ template static inline T polyeval(T x, T a0) { return a0; } template INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) { - return fma(x, polyeval(x, a...), a0); + return multiply_add(x, polyeval(x, a...), a0); } } // namespace fputil } // namespace __llvm_libc -#ifdef LLVM_LIBC_ARCH_X86_64 - -// [DISABLED] There is a regression with using vectorized version for polyeval -// compared to the naive Horner's scheme with fma. Need further investigation -// #include "x86_64/PolyEval.h" - -#endif // LLVM_LIBC_ARCH_X86_64 - -#else - -namespace __llvm_libc { -namespace fputil { - -template static inline T polyeval(T x, T a0) { return a0; } - -template -static inline T polyeval(T x, T a0, Ts... a) { - return x * polyeval(x, a...) + a0; -} - -} // namespace fputil -} // namespace __llvm_libc - -#endif - -#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_FMA_H +#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H diff --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h index c236c9a..ed637c8 100644 --- a/libc/src/__support/FPUtil/aarch64/FMA.h +++ b/libc/src/__support/FPUtil/aarch64/FMA.h @@ -15,6 +15,10 @@ #error "Invalid include" #endif +#if !defined(LIBC_TARGET_HAS_FMA) +#error "FMA instructions are not supported" +#endif + #include "src/__support/CPP/TypeTraits.h" namespace __llvm_libc { diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt index bf69e7d..a755e76 100644 --- a/libc/src/__support/FPUtil/generic/CMakeLists.txt +++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt @@ -4,3 +4,9 @@ add_header_library( sqrt.h sqrt_80_bit_long_double.h ) + +add_header_library( + fma + HDRS + FMA.h +) diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index efdd8b7..78b640c 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H #include "src/__support/CPP/TypeTraits.h" +#include "src/__support/FPUtil/FPBits.h" namespace __llvm_libc { namespace fputil { diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h new file mode 100644 index 0000000..8f5da22 --- /dev/null +++ b/libc/src/__support/FPUtil/multiply_add.h @@ -0,0 +1,41 @@ +//===-- Common header for multiply-add implementations ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H +#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H + +#include "src/__support/architectures.h" + +namespace __llvm_libc { +namespace fputil { + +// Implement a simple wrapper for multiply-add operation: +// multiply_add(x, y, z) = x*y + z +// which uses FMA instructions to speed up if available. + +template static inline T multiply_add(T x, T y, T z) { + return x * y + z; +} + +#if defined(LIBC_TARGET_HAS_FMA) +// FMA instructions are available. +#include "FMA.h" + +template <> inline float multiply_add(float x, float y, float z) { + return fma(x, y, z); +} + +template <> inline double multiply_add(double x, double y, double z) { + return fma(x, y, z); +} +#endif // LIBC_TARGET_HAS_FMA + +} // namespace fputil +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h index 70ebe38..08de6da 100644 --- a/libc/src/__support/FPUtil/x86_64/FMA.h +++ b/libc/src/__support/FPUtil/x86_64/FMA.h @@ -15,6 +15,10 @@ #error "Invalid include" #endif +#if !defined(LIBC_TARGET_HAS_FMA) +#error "FMA instructions are not supported" +#endif + #include "src/__support/CPP/TypeTraits.h" #include diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h index 14eb1a5..70eeb99 100644 --- a/libc/src/__support/architectures.h +++ b/libc/src/__support/architectures.h @@ -37,7 +37,15 @@ #define LLVM_LIBC_ARCH_ANY_ARM #endif -#if defined(LLVM_LIBC_ARCH_X86_64) +#if defined(LLVM_LIBC_ARCH_AARCH64) +#define LIBC_TARGET_HAS_FMA +#elif defined(LLVM_LIBC_ARCH_X86_64) +#if (defined(__AVX2__) || defined(__FMA__)) +#define LIBC_TARGET_HAS_FMA +#endif +#endif + +#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LIBC_TARGET_HAS_FMA)) #define INLINE_FMA __attribute__((target("fma"))) #else #define INLINE_FMA diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 9737f4a..8fc550d 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -48,8 +48,9 @@ add_entrypoint_object( fmaf.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma COMPILE_OPTIONS - -O2 + -O3 -mfma ) @@ -61,8 +62,9 @@ add_entrypoint_object( fma.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.fma COMPILE_OPTIONS - -O2 + -O3 -mfma ) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 73957cf..6a96b55 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -478,6 +478,7 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -492,6 +493,7 @@ add_entrypoint_object( ../exp2f.h DEPENDS libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -507,6 +509,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval libc.include.math COMPILE_OPTIONS -O3 @@ -674,6 +678,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -688,6 +694,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -702,6 +710,7 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma @@ -716,6 +725,8 @@ add_entrypoint_object( DEPENDS .common_constants libc.src.__support.FPUtil.fputil + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval COMPILE_OPTIONS -O3 -mfma diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index b0544b7..76232d6 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -83,7 +83,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { // = x otherwise. // To simplify the rounding decision and make it more efficient, we use // fma(x, x, x) ~ x + x^2 instead. - return fputil::fma(x, x, x); + return fputil::multiply_add(x, x, x); } // 2^-25 <= |x| < 2^-4 @@ -96,7 +96,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { fputil::polyeval(xd, 0x1p-1, 0x1.55555555557ddp-3, 0x1.55555555552fap-5, 0x1.111110fcd58b7p-7, 0x1.6c16c1717660bp-10, 0x1.a0241f0006d62p-13, 0x1.a01e3f8d3c06p-16); - return static_cast(fputil::fma(r, xsq, xd)); + return static_cast(fputil::multiply_add(r, xsq, xd)); } // For -18 < x < 89, to compute expm1(x), we perform the following range @@ -132,7 +132,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { double exp_lo = fputil::polyeval(xd, 0x1.0p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1, 0x1.555566668e5e7p-3, 0x1.55555555ef243p-5); - return static_cast(fputil::fma(exp_hi_mid, exp_lo, -1.0)); + return static_cast(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index 59ca659..878ae68 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -170,7 +170,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { double d = static_cast(xbits) - static_cast(f); d *= ONE_OVER_F[f_index]; - double extra_factor = fputil::fma(m, LOG10_2, LOG10_F[f_index]); + double extra_factor = fputil::multiply_add(m, LOG10_2, LOG10_F[f_index]); double r = fputil::polyeval(d, extra_factor, 0x1.bcb7b1526e4c5p-2, -0x1.bcb7b1518a5e9p-3, 0x1.287a72a6f716p-3, diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 7d1e71e..6e8c678 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -66,7 +66,7 @@ INLINE_FMA static inline float log(double x) { double d = static_cast(xbits) - static_cast(f); d *= ONE_OVER_F[f_index]; - double extra_factor = fputil::fma(m, LOG_2, LOG_F[f_index]); + double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]); double r = fputil::polyeval(d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2, 0x1.5555513bc679ap-2, @@ -161,7 +161,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { // > fpminimax(log(1 + x)/x, 5, [|D...|], [-2^-8; 2^-8]); r = fputil::polyeval(xd, -0x1p-1, 0x1.5555555515551p-2, -0x1.ffffffff82bdap-3, 0x1.999b33348d3aep-3, -0x1.5556cae3adcc3p-3); - return static_cast(fputil::fma(r, xd * xd, xd)); + return static_cast(fputil::multiply_add(r, xd * xd, xd)); } } // namespace __llvm_libc diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 3e71237..747f8c7 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -120,7 +120,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { d *= ONE_OVER_F[f_index]; double extra_factor = - fputil::fma(static_cast(m), LOG_2, LOG_F[f_index]); + fputil::multiply_add(static_cast(m), LOG_2, LOG_F[f_index]); double r = __llvm_libc::fputil::polyeval( d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2, diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index ad8bcd0..93c45b9 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -196,6 +196,54 @@ cc_library( ], ) +fma_common_hdrs = [ + "src/__support/FPUtil/FMA.h", + "src/__support/FPUtil/generic/FMA.h", +] + +fma_hdrs = selects.with_or({ + "//conditions:default": fma_common_hdrs, + PLATFORM_CPU_X86_64: fma_common_hdrs + [ + "src/__support/FPUtil/x86_64/FMA.h", + ], + PLATFORM_CPU_ARM64: fma_common_hdrs + [ + "src/__support/FPUtil/aarch64/FMA.h", + ], +}) + +cc_library( + name = "__support_fputil_fma", + hdrs = fma_hdrs, + deps = [ + ":__support_common", + ":__support_cpp_bit", + ":__support_cpp_type_traits", + ":__support_fputil", + ":libc_root", + ], +) + +cc_library( + name = "__support_fputil_multiply_add", + hdrs = [ + "src/__support/FPUtil/multiply_add.h", + ], + deps = [ + ":__support_common", + ":__support_fputil_fma", + ], +) + +cc_library( + name = "__support_fputil_polyeval", + hdrs = [ + "src/__support/FPUtil/PolyEval.h", + ], + deps = [ + ":__support_fputil_multiply_add", + ], +) + ################################ fenv targets ################################ libc_function(