From f85ae058f580e9d74c4a8f2f0de168c18da6150f Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes@jdoerfert.de>
Date: Fri, 27 Mar 2020 20:36:30 -0500
Subject: [PATCH] [OpenMP] Provide math functions in OpenMP device code via
 OpenMP variants

For OpenMP target regions to piggy back on the CUDA/AMDGPU/... implementation of math functions,
we include the appropriate definitions inside of an `omp begin/end declare variant match(device={arch(nvptx)})` scope.
This way, the vendor specific math functions will become specialized versions of the system math functions.
When a system math function is called and specialized version is available the selection logic introduced in D75779
instead call the specialized version. In contrast to the code path we used so far, the system header is actually included.
This means functions without specialized versions are available and so are macro definitions.

This should address PR42061, PR42798, and PR42799.

Reviewed By: ye-luo

Differential Revision: https://reviews.llvm.org/D75788
---
 clang/lib/Driver/ToolChains/Clang.cpp              |   2 +-
 clang/lib/Headers/CMakeLists.txt                   |   3 +-
 clang/lib/Headers/__clang_cuda_cmath.h             |  39 ++--
 clang/lib/Headers/__clang_cuda_device_functions.h  |   2 +-
 clang/lib/Headers/__clang_cuda_math.h              |  50 ++---
 .../Headers/__clang_cuda_math_forward_declares.h   |  37 +---
 ...eclares.h => __clang_openmp_device_functions.h} |  24 ++-
 .../Headers/openmp_wrappers/__clang_openmp_math.h  |  35 ----
 clang/lib/Headers/openmp_wrappers/cmath            |  67 +++++-
 clang/lib/Headers/openmp_wrappers/math.h           |  31 ++-
 clang/lib/Headers/openmp_wrappers/time.h           |  32 +++
 clang/test/Headers/Inputs/include/climits          |   4 +
 clang/test/Headers/Inputs/include/cmath            | 226 ++++++++++++++++++++-
 clang/test/Headers/Inputs/include/cstdlib          |   4 +
 clang/test/Headers/Inputs/include/math.h           | 200 +++++++++++++++++-
 clang/test/Headers/Inputs/include/stdlib.h         |   4 +
 clang/test/Headers/nvptx_device_cmath_functions.c  |   7 +-
 .../test/Headers/nvptx_device_cmath_functions.cpp  |   4 +-
 .../Headers/nvptx_device_cmath_functions_cxx17.cpp |   4 +-
 clang/test/Headers/nvptx_device_math_complex.c     |  23 +++
 clang/test/Headers/nvptx_device_math_functions.c   |  22 +-
 clang/test/Headers/nvptx_device_math_functions.cpp |   6 +-
 .../Headers/nvptx_device_math_functions_cxx17.cpp  |   6 +-
 clang/test/Headers/nvptx_device_math_macro.cpp     |  17 ++
 clang/test/Headers/nvptx_device_math_modf.cpp      |  53 +++++
 clang/test/Headers/nvptx_device_math_sin.c         |  27 +++
 clang/test/Headers/nvptx_device_math_sin.cpp       |  27 +++
 clang/test/Headers/nvptx_device_math_sin_cos.cpp   |  63 ++++++
 clang/test/Headers/nvptx_device_math_sincos.cpp    |  58 ++++++
 29 files changed, 909 insertions(+), 168 deletions(-)
 rename clang/lib/Headers/openmp_wrappers/{__clang_openmp_math_declares.h => __clang_openmp_device_functions.h} (58%)
 delete mode 100644 clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h
 create mode 100644 clang/lib/Headers/openmp_wrappers/time.h
 create mode 100644 clang/test/Headers/Inputs/include/climits
 create mode 100644 clang/test/Headers/nvptx_device_math_complex.c
 create mode 100644 clang/test/Headers/nvptx_device_math_macro.cpp
 create mode 100644 clang/test/Headers/nvptx_device_math_modf.cpp
 create mode 100644 clang/test/Headers/nvptx_device_math_sin.c
 create mode 100644 clang/test/Headers/nvptx_device_math_sin.cpp
 create mode 100644 clang/test/Headers/nvptx_device_math_sin_cos.cpp
 create mode 100644 clang/test/Headers/nvptx_device_math_sincos.cpp

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4d82530..2b36813 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1216,7 +1216,7 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
     }
 
     CmdArgs.push_back("-include");
-    CmdArgs.push_back("__clang_openmp_math_declares.h");
+    CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
   // Add -i* options, and automatically translate to
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 6851957..d6c8ed5 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -145,8 +145,7 @@ set(ppc_wrapper_files
 set(openmp_wrapper_files
   openmp_wrappers/math.h
   openmp_wrappers/cmath
-  openmp_wrappers/__clang_openmp_math.h
-  openmp_wrappers/__clang_openmp_math_declares.h
+  openmp_wrappers/__clang_openmp_device_functions.h
   openmp_wrappers/new
 )
 
diff --git a/clang/lib/Headers/__clang_cuda_cmath.h b/clang/lib/Headers/__clang_cuda_cmath.h
index 834a2e3..f406112 100644
--- a/clang/lib/Headers/__clang_cuda_cmath.h
+++ b/clang/lib/Headers/__clang_cuda_cmath.h
@@ -12,7 +12,9 @@
 #error "This file is for CUDA compilation only."
 #endif
 
+#ifndef _OPENMP
 #include <limits>
+#endif
 
 // CUDA lets us use various std math functions on the device side.  This file
 // works in concert with __clang_cuda_math_forward_declares.h to make this work.
@@ -31,31 +33,15 @@
 // std covers all of the known knowns.
 
 #ifdef _OPENMP
-#define __DEVICE__ static __attribute__((always_inline))
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
 #endif
 
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
-#if !(defined(_OPENMP) && defined(__cplusplus))
 __DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
 __DEVICE__ long abs(long __n) { return ::labs(__n); }
 __DEVICE__ float abs(float __x) { return ::fabsf(__x); }
 __DEVICE__ double abs(double __x) { return ::fabs(__x); }
-#endif
-// TODO: remove once variat is supported.
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); }
-__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); }
-#endif
 __DEVICE__ float acos(float __x) { return ::acosf(__x); }
 __DEVICE__ float asin(float __x) { return ::asinf(__x); }
 __DEVICE__ float atan(float __x) { return ::atanf(__x); }
@@ -64,11 +50,9 @@ __DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
 __DEVICE__ float cos(float __x) { return ::cosf(__x); }
 __DEVICE__ float cosh(float __x) { return ::coshf(__x); }
 __DEVICE__ float exp(float __x) { return ::expf(__x); }
-__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
 __DEVICE__ float floor(float __x) { return ::floorf(__x); }
 __DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
-// TODO: remove when variant is supported
-#ifndef _OPENMP
 __DEVICE__ int fpclassify(float __x) {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
@@ -77,14 +61,15 @@ __DEVICE__ int fpclassify(double __x) {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
 }
-#endif
 __DEVICE__ float frexp(float __arg, int *__exp) {
   return ::frexpf(__arg, __exp);
 }
 
 // For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows.
-#ifndef _MSC_VER
+// Windows. For OpenMP we omit these as some old system headers have
+// non-conforming `isinf(float)` and `isnan(float)` implementations that return
+// an `int`. The system versions of these functions should be fine anyway.
+#if !defined(_MSC_VER) && !defined(_OPENMP)
 __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
 __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
@@ -161,6 +146,8 @@ __DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
 // libdevice doesn't provide an implementation, and we don't want to be in the
 // business of implementing tricky libm functions in this header.
 
+#ifndef _OPENMP
+
 // Now we've defined everything we promised we'd define in
 // __clang_cuda_math_forward_declares.h.  We need to do two additional things to
 // fix up our math functions.
@@ -457,10 +444,7 @@ using ::remainderf;
 using ::remquof;
 using ::rintf;
 using ::roundf;
-// TODO: remove once variant is supported
-#ifndef _OPENMP
 using ::scalblnf;
-#endif
 using ::scalbnf;
 using ::sinf;
 using ::sinhf;
@@ -479,7 +463,8 @@ _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif
 
-#undef __NOEXCEPT
+#endif // _OPENMP
+
 #undef __DEVICE__
 
 #endif
diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
index d15f6b6..76c5889 100644
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -21,7 +21,7 @@
 // functions and __forceinline__ helps inlining these wrappers at -O1.
 #pragma push_macro("__DEVICE__")
 #ifdef _OPENMP
-#define __DEVICE__ static __attribute__((always_inline))
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE__ static __device__ __forceinline__
 #endif
diff --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h
index 7956135..01db2f2 100644
--- a/clang/lib/Headers/__clang_cuda_math.h
+++ b/clang/lib/Headers/__clang_cuda_math.h
@@ -23,11 +23,25 @@
 // functions and __forceinline__ helps inlining these wrappers at -O1.
 #pragma push_macro("__DEVICE__")
 #ifdef _OPENMP
-#define __DEVICE__ static __inline__ __attribute__((always_inline))
+#if defined(__cplusplus)
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#endif
 #else
 #define __DEVICE__ static __device__ __forceinline__
 #endif
 
+// Specialized version of __DEVICE__ for functions with void return type. Needed
+// because the OpenMP overlay requires constexpr functions here but prior to
+// c++14 void return functions could not be constexpr.
+#pragma push_macro("__DEVICE_VOID__")
+#ifdef _OPENMP && defined(__cplusplus) && __cplusplus < 201402L
+#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE_VOID__ __DEVICE__
+#endif
+
 // libdevice provides fast low precision and slow full-recision implementations
 // for some functions. Which one gets selected depends on
 // __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
@@ -39,17 +53,8 @@
 #define __FAST_OR_SLOW(fast, slow) slow
 #endif
 
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
-
-__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); }
-__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); }
+__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
+__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
 __DEVICE__ double acos(double __a) { return __nv_acos(__a); }
 __DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
 __DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
@@ -104,7 +109,7 @@ __DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
 __DEVICE__ float expf(float __a) { return __nv_expf(__a); }
 __DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
 __DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
-__DEVICE__ float fabsf(float __a) __NOEXCEPT { return __nv_fabsf(__a); }
+__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
 __DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
 __DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
 __DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
@@ -142,15 +147,15 @@ __DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
 __DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
 __DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
 #if defined(__LP64__) || defined(_WIN64)
-__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); };
+__DEVICE__ long labs(long __a) { return __nv_llabs(__a); };
 #else
-__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); };
+__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
 #endif
 __DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
 __DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
 __DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
 __DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
-__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); }
+__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
 __DEVICE__ long long llmax(long long __a, long long __b) {
   return __nv_llmax(__a, __b);
 }
@@ -270,8 +275,6 @@ __DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
 __DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
 __DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
 __DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
-// TODO: remove once variant is supported
-#ifndef _OPENMP
 __DEVICE__ double scalbln(double __a, long __b) {
   if (__b > INT_MAX)
     return __a > 0 ? HUGE_VAL : -HUGE_VAL;
@@ -286,18 +289,17 @@ __DEVICE__ float scalblnf(float __a, long __b) {
     return __a > 0 ? 0.f : -0.f;
   return scalbnf(__a, (int)__b);
 }
-#endif
 __DEVICE__ double sin(double __a) { return __nv_sin(__a); }
-__DEVICE__ void sincos(double __a, double *__s, double *__c) {
+__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
   return __nv_sincos(__a, __s, __c);
 }
-__DEVICE__ void sincosf(float __a, float *__s, float *__c) {
+__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
   return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
 }
-__DEVICE__ void sincospi(double __a, double *__s, double *__c) {
+__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
   return __nv_sincospi(__a, __s, __c);
 }
-__DEVICE__ void sincospif(float __a, float *__s, float *__c) {
+__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
   return __nv_sincospif(__a, __s, __c);
 }
 __DEVICE__ float sinf(float __a) {
@@ -339,7 +341,7 @@ __DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
 __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
 
 #pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__DEVICE_VOID__")
 #pragma pop_macro("__FAST_OR_SLOW")
-#undef __NOEXCEPT
 
 #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
diff --git a/clang/lib/Headers/__clang_cuda_math_forward_declares.h b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
index 0afe4db..3d6d0b9 100644
--- a/clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -20,37 +20,14 @@
 // would preclude the use of our own __device__ overloads for these functions.
 
 #pragma push_macro("__DEVICE__")
-#ifdef _OPENMP
-#define __DEVICE__ static __inline__ __attribute__((always_inline))
-#else
 #define __DEVICE__                                                             \
   static __inline__ __attribute__((always_inline)) __attribute__((device))
-#endif
-
-// For C++ 17 we need to include noexcept attribute to be compatible
-// with the header-defined version. This may be removed once
-// variant is supported.
-#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
-#define __NOEXCEPT noexcept
-#else
-#define __NOEXCEPT
-#endif
 
-#if !(defined(_OPENMP) && defined(__cplusplus))
 __DEVICE__ long abs(long);
 __DEVICE__ long long abs(long long);
 __DEVICE__ double abs(double);
 __DEVICE__ float abs(float);
-#endif
-// While providing the CUDA declarations and definitions for math functions,
-// we may manually define additional functions.
-// TODO: Once variant is supported the additional functions will have
-// to be removed.
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ const double abs(const double);
-__DEVICE__ const float abs(const float);
-#endif
-__DEVICE__ int abs(int) __NOEXCEPT;
+__DEVICE__ int abs(int);
 __DEVICE__ double acos(double);
 __DEVICE__ float acos(float);
 __DEVICE__ double acosh(double);
@@ -85,8 +62,8 @@ __DEVICE__ double exp(double);
 __DEVICE__ float exp(float);
 __DEVICE__ double expm1(double);
 __DEVICE__ float expm1(float);
-__DEVICE__ double fabs(double) __NOEXCEPT;
-__DEVICE__ float fabs(float) __NOEXCEPT;
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
 __DEVICE__ double fdim(double, double);
 __DEVICE__ float fdim(float, float);
 __DEVICE__ double floor(double);
@@ -136,12 +113,12 @@ __DEVICE__ bool isnormal(double);
 __DEVICE__ bool isnormal(float);
 __DEVICE__ bool isunordered(double, double);
 __DEVICE__ bool isunordered(float, float);
-__DEVICE__ long labs(long) __NOEXCEPT;
+__DEVICE__ long labs(long);
 __DEVICE__ double ldexp(double, int);
 __DEVICE__ float ldexp(float, int);
 __DEVICE__ double lgamma(double);
 __DEVICE__ float lgamma(float);
-__DEVICE__ long long llabs(long long) __NOEXCEPT;
+__DEVICE__ long long llabs(long long);
 __DEVICE__ long long llrint(double);
 __DEVICE__ long long llrint(float);
 __DEVICE__ double log10(double);
@@ -152,9 +129,6 @@ __DEVICE__ double log2(double);
 __DEVICE__ float log2(float);
 __DEVICE__ double logb(double);
 __DEVICE__ float logb(float);
-#if defined(_OPENMP) && defined(__cplusplus)
-__DEVICE__ long double log(long double);
-#endif
 __DEVICE__ double log(double);
 __DEVICE__ float log(float);
 __DEVICE__ long lrint(double);
@@ -302,7 +276,6 @@ _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif
 
-#undef __NOEXCEPT
 #pragma pop_macro("__DEVICE__")
 
 #endif
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math_declares.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
similarity index 58%
rename from clang/lib/Headers/openmp_wrappers/__clang_openmp_math_declares.h
rename to clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
index dd97fac..9ff0a18 100644
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math_declares.h
+++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -1,4 +1,4 @@
-/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------===
+/*===- __clang_openmp_device_functions.h - OpenMP device function declares -===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,28 +7,32 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef __CLANG_OPENMP_MATH_DECLARES_H__
-#define __CLANG_OPENMP_MATH_DECLARES_H__
+#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
+#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
 
 #ifndef _OPENMP
 #error "This file is for OpenMP compilation only."
 #endif
 
-#if defined(__NVPTX__) && defined(_OPENMP)
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
-#define __CUDA__
-
-#if defined(__cplusplus)
-  #include <__clang_cuda_math_forward_declares.h>
+#ifdef __cplusplus
+extern "C" {
 #endif
 
+#define __CUDA__
 /// Include declarations for libdevice functions.
 #include <__clang_cuda_libdevice_declares.h>
+
 /// Provide definitions for these functions.
 #include <__clang_cuda_device_functions.h>
-#include <__clang_cuda_math.h>
-
 #undef __CUDA__
 
+#ifdef __cplusplus
+} // extern "C"
 #endif
+
+#pragma omp end declare variant
+
 #endif
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h
deleted file mode 100644
index 5d7ce9a..0000000
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*===---- __clang_openmp_math.h - OpenMP target math support ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if defined(__NVPTX__) && defined(_OPENMP)
-/// TODO:
-/// We are currently reusing the functionality of the Clang-CUDA code path
-/// as an alternative to the host declarations provided by math.h and cmath.
-/// This is suboptimal.
-///
-/// We should instead declare the device functions in a similar way, e.g.,
-/// through OpenMP 5.0 variants, and afterwards populate the module with the
-/// host declarations by unconditionally including the host math.h or cmath,
-/// respectively. This is actually what the Clang-CUDA code path does, using
-/// __device__ instead of variants to avoid redeclarations and get the desired
-/// overload resolution.
-
-#define __CUDA__
-
-#if defined(__cplusplus)
-  #include <__clang_cuda_cmath.h>
-#endif
-
-#undef __CUDA__
-
-/// Magic macro for stopping the math.h/cmath host header from being included.
-#define __CLANG_NO_HOST_MATH__
-
-#endif
-
diff --git a/clang/lib/Headers/openmp_wrappers/cmath b/clang/lib/Headers/openmp_wrappers/cmath
index a5183a1..05be252 100644
--- a/clang/lib/Headers/openmp_wrappers/cmath
+++ b/clang/lib/Headers/openmp_wrappers/cmath
@@ -1,4 +1,4 @@
-/*===-------------- cmath - Alternative cmath header -----------------------===
+/*===-- __clang_openmp_device_functions.h - OpenMP math declares ------ c++ -===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,10 +7,67 @@
  *===-----------------------------------------------------------------------===
  */
 
-#include <__clang_openmp_math.h>
+#ifndef __CLANG_OPENMP_CMATH_H__
+#define __CLANG_OPENMP_CMATH_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
 
-#ifndef __CLANG_NO_HOST_MATH__
 #include_next <cmath>
-#else
-#undef __CLANG_NO_HOST_MATH__
+
+// Make sure we include our math.h overlay, it probably happend already but we
+// need to be sure.
+#include <math.h>
+
+// We (might) need cstdlib because __clang_cuda_cmath.h below declares `abs`
+// which might live in cstdlib.
+#include <cstdlib>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#define __CUDA__
+#include <__clang_cuda_cmath.h>
+#undef __CUDA__
+
+// Overloads not provided by the CUDA wrappers but by the CUDA system headers.
+// Since we do not include the latter we define them ourselves.
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+
+__DEVICE__ float acosh(float __x) { return ::acoshf(__x); }
+__DEVICE__ float asinh(float __x) { return ::asinhf(__x); }
+__DEVICE__ float atanh(float __x) { return ::atanhf(__x); }
+__DEVICE__ float cbrt(float __x) { return ::cbrtf(__x); }
+__DEVICE__ float erf(float __x) { return ::erff(__x); }
+__DEVICE__ float erfc(float __x) { return ::erfcf(__x); }
+__DEVICE__ float exp2(float __x) { return ::exp2f(__x); }
+__DEVICE__ float expm1(float __x) { return ::expm1f(__x); }
+__DEVICE__ float fdim(float __x, float __y) { return ::fdimf(__x, __y); }
+__DEVICE__ float hypot(float __x, float __y) { return ::hypotf(__x, __y); }
+__DEVICE__ int ilogb(float __x) { return ::ilogbf(__x); }
+__DEVICE__ float lgamma(float __x) { return ::lgammaf(__x); }
+__DEVICE__ long long int llrint(float __x) { return ::llrintf(__x); }
+__DEVICE__ long long int llround(float __x) { return ::llroundf(__x); }
+__DEVICE__ float log1p(float __x) { return ::log1pf(__x); }
+__DEVICE__ float log2(float __x) { return ::log2f(__x); }
+__DEVICE__ float logb(float __x) { return ::logbf(__x); }
+__DEVICE__ long int lrint(float __x) { return ::lrintf(__x); }
+__DEVICE__ long int lround(float __x) { return ::lroundf(__x); }
+__DEVICE__ float nextafter(float __x, float __y) {
+  return ::nextafterf(__x, __y);
+}
+__DEVICE__ float remainder(float __x, float __y) {
+  return ::remainderf(__x, __y);
+}
+__DEVICE__ float scalbln(float __x, long int __y) {
+  return ::scalblnf(__x, __y);
+}
+__DEVICE__ float scalbn(float __x, int __y) { return ::scalbnf(__x, __y); }
+__DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }
+
+#undef __DEVICE__
+
+#pragma omp end declare variant
+
 #endif
diff --git a/clang/lib/Headers/openmp_wrappers/math.h b/clang/lib/Headers/openmp_wrappers/math.h
index d2786ec..1ce22e0 100644
--- a/clang/lib/Headers/openmp_wrappers/math.h
+++ b/clang/lib/Headers/openmp_wrappers/math.h
@@ -1,4 +1,4 @@
-/*===------------- math.h - Alternative math.h header ----------------------===
+/*===---- openmp_wrapper/math.h -------- OpenMP math.h intercept ------ c++ -===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -7,11 +7,30 @@
  *===-----------------------------------------------------------------------===
  */
 
-#include <__clang_openmp_math.h>
+#ifndef __CLANG_OPENMP_MATH_H__
+#define __CLANG_OPENMP_MATH_H__
 
-#ifndef __CLANG_NO_HOST_MATH__
-#include_next <math.h>
-#else
-#undef __CLANG_NO_HOST_MATH__
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
 #endif
 
+#include_next <math.h>
+
+// We need limits.h for __clang_cuda_math.h below and because it should not hurt
+// we include it eagerly here.
+#include <limits.h>
+
+// We need stdlib.h because (for now) __clang_cuda_math.h below declares `abs`
+// which should live in stdlib.h.
+#include <stdlib.h>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+#define __CUDA__
+#include <__clang_cuda_math.h>
+#undef __CUDA__
+
+#pragma omp end declare variant
+
+#endif
diff --git a/clang/lib/Headers/openmp_wrappers/time.h b/clang/lib/Headers/openmp_wrappers/time.h
new file mode 100644
index 0000000..c760dd1
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/time.h
@@ -0,0 +1,32 @@
+/*===---- time.h - OpenMP time header wrapper ------------------------ c ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_OPENMP_TIME_H__
+#define __CLANG_OPENMP_TIME_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#if defined(__cplusplus)
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#endif
+
+#include_next <time.h>
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+__DEVICE__ clock_t clock() { return __nvvm_read_ptx_sreg_clock(); }
+
+#pragma omp end declare variant
+
+#endif
diff --git a/clang/test/Headers/Inputs/include/climits b/clang/test/Headers/Inputs/include/climits
new file mode 100644
index 0000000..929762e
--- /dev/null
+++ b/clang/test/Headers/Inputs/include/climits
@@ -0,0 +1,4 @@
+#pragma once
+
+#define INT_MIN  -2147483648
+#define INT_MAX   2147483647
diff --git a/clang/test/Headers/Inputs/include/cmath b/clang/test/Headers/Inputs/include/cmath
index 4ba1795..0cadc13 100644
--- a/clang/test/Headers/Inputs/include/cmath
+++ b/clang/test/Headers/Inputs/include/cmath
@@ -1,5 +1,227 @@
 #pragma once
 
-double sqrt(double);
+// __clang_cuda_(c)math(.h) also provide `abs` which actually belong in
+// cstdlib. We could split them out but for now we just include cstdlib from
+// cmath.h which is what the systems I've seen do as well.
+#include <cstdlib>
+
+#include <math.h>
+
+double acos(double);
+float acos(float);
+double acosh(double);
+float acosh(float);
+double asin(double);
+float asin(float);
+double asinh(double);
+float asinh(float);
+double atan2(double, double);
+float atan2(float, float);
+double atan(double);
+float atan(float);
+double atanh(double);
+float atanh(float);
+double cbrt(double);
+float cbrt(float);
+double ceil(double);
+float ceil(float);
+double copysign(double, double);
+float copysign(float, float);
+double cos(double);
+float cos(float);
+double cosh(double);
+float cosh(float);
+double erfc(double);
+float erfc(float);
+double erf(double);
+float erf(float);
+double exp2(double);
+float exp2(float);
+double exp(double);
+float exp(float);
+double expm1(double);
+float expm1(float);
+double fdim(double, double);
+float fdim(float, float);
+double floor(double);
+float floor(float);
+double fma(double, double, double);
+float fma(float, float, float);
+double fmax(double, double);
+float fmax(float, float);
+double fmin(double, double);
+float fmin(float, float);
+double fmod(double, double);
+float fmod(float, float);
+int fpclassify(double);
+int fpclassify(float);
+double frexp(double, int *);
+float frexp(float, int *);
+double hypot(double, double);
+float hypot(float, float);
+int ilogb(double);
+int ilogb(float);
+bool isfinite(long double);
+bool isfinite(double);
+bool isfinite(float);
+bool isgreater(double, double);
+bool isgreaterequal(double, double);
+bool isgreaterequal(float, float);
+bool isgreater(float, float);
+bool isinf(long double);
+bool isinf(double);
+bool isinf(float);
+bool isless(double, double);
+bool islessequal(double, double);
+bool islessequal(float, float);
+bool isless(float, float);
+bool islessgreater(double, double);
+bool islessgreater(float, float);
+bool isnan(long double);
+bool isnan(double);
+bool isnan(float);
+bool isnormal(double);
+bool isnormal(float);
+bool isunordered(double, double);
+bool isunordered(float, float);
+double ldexp(double, int);
+float ldexp(float, int);
+double lgamma(double);
+float lgamma(float);
+long long llrint(double);
+long long llrint(float);
+double log10(double);
+float log10(float);
+double log1p(double);
+float log1p(float);
+double log2(double);
+float log2(float);
+double logb(double);
+float logb(float);
+double log(double);
+float log(float);
+long lrint(double);
+long lrint(float);
+long lround(double);
+long lround(float);
+long long llround(float); // No llround(double).
+double modf(double, double *);
+float modf(float, float *);
+double nan(const char *);
+float nanf(const char *);
+double nearbyint(double);
+float nearbyint(float);
+double nextafter(double, double);
+float nextafter(float, float);
 double pow(double, double);
-double modf(double, double*);
+double pow(double, int);
+float pow(float, float);
+float pow(float, int);
+double remainder(double, double);
+float remainder(float, float);
+double remquo(double, double, int *);
+float remquo(float, float, int *);
+double rint(double);
+float rint(float);
+double round(double);
+float round(float);
+double scalbln(double, long);
+float scalbln(float, long);
+double scalbn(double, int);
+float scalbn(float, int);
+bool signbit(double);
+bool signbit(float);
+long double sin(long double);
+double sin(double);
+float sin(float);
+double sinh(double);
+float sinh(float);
+double sqrt(double);
+float sqrt(float);
+double tan(double);
+float tan(float);
+double tanh(double);
+float tanh(float);
+double tgamma(double);
+float tgamma(float);
+double trunc(double);
+float trunc(float);
+
+namespace std {
+
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::llround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+} // namespace std
+
+#define FP_NAN 0
+#define FP_INFINITE 1
+#define FP_ZERO 2
+#define FP_SUBNORMAL 3
+#define FP_NORMAL 4
diff --git a/clang/test/Headers/Inputs/include/cstdlib b/clang/test/Headers/Inputs/include/cstdlib
index f038a6d..00e81e8 100644
--- a/clang/test/Headers/Inputs/include/cstdlib
+++ b/clang/test/Headers/Inputs/include/cstdlib
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <stdlib.h>
+
 #if __cplusplus >= 201703L
 extern int abs (int __x) throw()  __attribute__ ((__const__)) ;
 extern long int labs (long int __x) throw() __attribute__ ((__const__)) ;
@@ -20,4 +22,6 @@ abs(long __i) { return __builtin_labs(__i); }
 
 inline long long
 abs(long long __x) { return __builtin_llabs (__x); }
+
+float fabs(float __x) { return __builtin_fabs(__x); }
 }
diff --git a/clang/test/Headers/Inputs/include/math.h b/clang/test/Headers/Inputs/include/math.h
index 4ba1795..a60ad45 100644
--- a/clang/test/Headers/Inputs/include/math.h
+++ b/clang/test/Headers/Inputs/include/math.h
@@ -1,5 +1,199 @@
 #pragma once
 
-double sqrt(double);
-double pow(double, double);
-double modf(double, double*);
+// __clang_cuda_(c)math(.h) also provide `abs` which actually belong in
+// cstdlib. We could split them out but for now we just include cstdlib from
+// cmath.h which is what the systems I've seen do as well.
+#include <stdlib.h>
+
+double fabs(double __a);
+double acos(double __a);
+float acosf(float __a);
+double acosh(double __a);
+float acoshf(float __a);
+double asin(double __a);
+float asinf(float __a);
+double asinh(double __a);
+float asinhf(float __a);
+double atan(double __a);
+double atan2(double __a, double __b);
+float atan2f(float __a, float __b);
+float atanf(float __a);
+double atanh(double __a);
+float atanhf(float __a);
+double cbrt(double __a);
+float cbrtf(float __a);
+double ceil(double __a);
+float ceilf(float __a);
+double copysign(double __a, double __b);
+float copysignf(float __a, float __b);
+double cos(double __a);
+float cosf(float __a);
+double cosh(double __a);
+float coshf(float __a);
+double cospi(double __a);
+float cospif(float __a);
+double cyl_bessel_i0(double __a);
+float cyl_bessel_i0f(float __a);
+double cyl_bessel_i1(double __a);
+float cyl_bessel_i1f(float __a);
+double erf(double __a);
+double erfc(double __a);
+float erfcf(float __a);
+double erfcinv(double __a);
+float erfcinvf(float __a);
+double erfcx(double __a);
+float erfcxf(float __a);
+float erff(float __a);
+double erfinv(double __a);
+float erfinvf(float __a);
+double exp(double __a);
+double exp10(double __a);
+float exp10f(float __a);
+double exp2(double __a);
+float exp2f(float __a);
+float expf(float __a);
+double expm1(double __a);
+float expm1f(float __a);
+float fabsf(float __a);
+double fdim(double __a, double __b);
+float fdimf(float __a, float __b);
+double fdivide(double __a, double __b);
+float fdividef(float __a, float __b);
+double floor(double __f);
+float floorf(float __f);
+double fma(double __a, double __b, double __c);
+float fmaf(float __a, float __b, float __c);
+double fmax(double __a, double __b);
+float fmaxf(float __a, float __b);
+double fmin(double __a, double __b);
+float fminf(float __a, float __b);
+double fmod(double __a, double __b);
+float fmodf(float __a, float __b);
+double frexp(double __a, int *__b);
+float frexpf(float __a, int *__b);
+double hypot(double __a, double __b);
+float hypotf(float __a, float __b);
+int ilogb(double __a);
+int ilogbf(float __a);
+double j0(double __a);
+float j0f(float __a);
+double j1(double __a);
+float j1f(float __a);
+double jn(int __n, double __a);
+float jnf(int __n, float __a);
+double ldexp(double __a, int __b);
+float ldexpf(float __a, int __b);
+double lgamma(double __a);
+float lgammaf(float __a);
+long long llmax(long long __a, long long __b);
+long long llmin(long long __a, long long __b);
+long long llrint(double __a);
+long long llrintf(float __a);
+long long llround(double __a);
+long long llroundf(float __a);
+double log(double __a);
+double log10(double __a);
+float log10f(float __a);
+double log1p(double __a);
+float log1pf(float __a);
+double log2(double __a);
+float log2f(float __a);
+double logb(double __a);
+float logbf(float __a);
+float logf(float __a);
+long lrint(double __a);
+long lrintf(float __a);
+long lround(double __a);
+long lroundf(float __a);
+int max(int __a, int __b);
+int min(int __a, int __b);
+double modf(double __a, double *__b);
+float modff(float __a, float *__b);
+double nearbyint(double __a);
+float nearbyintf(float __a);
+double nextafter(double __a, double __b);
+float nextafterf(float __a, float __b);
+double norm(int __dim, const double *__t);
+double norm3d(double __a, double __b, double __c);
+float norm3df(float __a, float __b, float __c);
+double norm4d(double __a, double __b, double __c, double __d);
+float norm4df(float __a, float __b, float __c, float __d);
+double normcdf(double __a);
+float normcdff(float __a);
+double normcdfinv(double __a);
+float normcdfinvf(float __a);
+float normf(int __dim, const float *__t);
+double pow(double __a, double __b);
+float powf(float __a, float __b);
+double powi(double __a, int __b);
+float powif(float __a, int __b);
+double rcbrt(double __a);
+float rcbrtf(float __a);
+double remainder(double __a, double __b);
+float remainderf(float __a, float __b);
+double remquo(double __a, double __b, int *__c);
+float remquof(float __a, float __b, int *__c);
+double rhypot(double __a, double __b);
+float rhypotf(float __a, float __b);
+double rint(double __a);
+float rintf(float __a);
+double rnorm(int __a, const double *__b);
+double rnorm3d(double __a, double __b, double __c);
+float rnorm3df(float __a, float __b, float __c);
+double rnorm4d(double __a, double __b, double __c, double __d);
+float rnorm4df(float __a, float __b, float __c, float __d);
+float rnormf(int __dim, const float *__t);
+double round(double __a);
+float roundf(float __a);
+double rsqrt(double __a);
+float rsqrtf(float __a);
+double scalbn(double __a, int __b);
+float scalbnf(float __a, int __b);
+double scalbln(double __a, long __b);
+float scalblnf(float __a, long __b);
+double sin(double __a);
+void sincos(double __a, double *__s, double *__c);
+void sincosf(float __a, float *__s, float *__c);
+void sincospi(double __a, double *__s, double *__c);
+void sincospif(float __a, float *__s, float *__c);
+float sinf(float __a);
+double sinh(double __a);
+float sinhf(float __a);
+double sinpi(double __a);
+float sinpif(float __a);
+double sqrt(double __a);
+float sqrtf(float __a);
+double tan(double __a);
+float tanf(float __a);
+double tanh(double __a);
+float tanhf(float __a);
+double tgamma(double __a);
+float tgammaf(float __a);
+double trunc(double __a);
+float truncf(float __a);
+unsigned long long ullmax(unsigned long long __a,
+                          unsigned long long __b);
+unsigned long long ullmin(unsigned long long __a,
+                          unsigned long long __b);
+unsigned int umax(unsigned int __a, unsigned int __b);
+unsigned int umin(unsigned int __a, unsigned int __b);
+double y0(double __a);
+float y0f(float __a);
+double y1(double __a);
+float y1f(float __a);
+double yn(int __a, double __b);
+float ynf(int __a, float __b);
+
+/**
+ * A positive float constant expression. HUGE_VALF evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VALF (__builtin_huge_valf())
+
+/**
+ * A positive double constant expression. HUGE_VAL evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VAL (__builtin_huge_val())
diff --git a/clang/test/Headers/Inputs/include/stdlib.h b/clang/test/Headers/Inputs/include/stdlib.h
index 296b623..516e521 100644
--- a/clang/test/Headers/Inputs/include/stdlib.h
+++ b/clang/test/Headers/Inputs/include/stdlib.h
@@ -1,2 +1,6 @@
 #pragma once
 typedef __SIZE_TYPE__ size_t;
+
+#ifndef __cplusplus
+extern int abs(int __x) __attribute__((__const__));
+#endif
diff --git a/clang/test/Headers/nvptx_device_cmath_functions.c b/clang/test/Headers/nvptx_device_cmath_functions.c
index 23265d00f..7ba2844 100644
--- a/clang/test/Headers/nvptx_device_cmath_functions.c
+++ b/clang/test/Headers/nvptx_device_cmath_functions.c
@@ -3,10 +3,11 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
 
-#include <cmath>
+#include <stdlib.h>
+#include <math.h>
 
 void test_sqrt(double a1) {
   #pragma omp target
diff --git a/clang/test/Headers/nvptx_device_cmath_functions.cpp b/clang/test/Headers/nvptx_device_cmath_functions.cpp
index 0787b94..35e33f9 100644
--- a/clang/test/Headers/nvptx_device_cmath_functions.cpp
+++ b/clang/test/Headers/nvptx_device_cmath_functions.cpp
@@ -3,8 +3,8 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -internal-isystem %S/Inputs/include -include stdlib.h -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cmath>
 #include <cstdlib>
diff --git a/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp b/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
index 0b715fe..62ae1e2 100644
--- a/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
+++ b/clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp
@@ -3,8 +3,8 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -internal-isystem %S/Inputs/include -include stdlib.h -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cmath>
 #include <cstdlib>
diff --git a/clang/test/Headers/nvptx_device_math_complex.c b/clang/test/Headers/nvptx_device_math_complex.c
new file mode 100644
index 0000000..43f4ec6
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_complex.c
@@ -0,0 +1,23 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+// CHECK-DAG: call { float, float } @__divsc3(
+// CHECK-DAG: call { float, float } @__mulsc3(
+void test_scmplx(float _Complex a) {
+#pragma omp target
+  {
+    (void)(a * (a / a));
+  }
+}
+
+
+// CHECK-DAG: call { double, double } @__divdc3(
+// CHECK-DAG: call { double, double } @__muldc3(
+void test_dcmplx(double _Complex a) {
+#pragma omp target
+  {
+    (void)(a * (a / a));
+  }
+}
diff --git a/clang/test/Headers/nvptx_device_math_functions.c b/clang/test/Headers/nvptx_device_math_functions.c
index 3cc1be5..7e37e3f 100644
--- a/clang/test/Headers/nvptx_device_math_functions.c
+++ b/clang/test/Headers/nvptx_device_math_functions.c
@@ -3,23 +3,31 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
 
+#ifdef __cplusplus
+#include <cstdlib>
+#include <cmath>
+#else
+#include <stdlib.h>
 #include <math.h>
+#endif
 
 void test_sqrt(double a1) {
   #pragma omp target
   {
-    // CHECK-YES: call double @__nv_sqrt(double
+    // CHECK: call double @__nv_sqrt(double
     double l1 = sqrt(a1);
-    // CHECK-YES: call double @__nv_pow(double
+    // CHECK: call double @__nv_pow(double
     double l2 = pow(a1, a1);
-    // CHECK-YES: call double @__nv_modf(double
+    // CHECK: call double @__nv_modf(double
     double l3 = modf(a1 + 3.5, &a1);
-    // CHECK-YES: call double @__nv_fabs(double
+    // CHECK: call double @__nv_fabs(double
     double l4 = fabs(a1);
-    // CHECK-YES: call i32 @__nv_abs(i32
+    // CHECK: call i32 @__nv_abs(i32
     double l5 = abs((int)a1);
   }
 }
diff --git a/clang/test/Headers/nvptx_device_math_functions.cpp b/clang/test/Headers/nvptx_device_math_functions.cpp
index e0f1826..6ace3c5 100644
--- a/clang/test/Headers/nvptx_device_math_functions.cpp
+++ b/clang/test/Headers/nvptx_device_math_functions.cpp
@@ -3,11 +3,11 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -internal-isystem %S/Inputs/include -include stdlib.h -include limits -include cstdlib -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cstdlib>
-#include <math.h>
+#include <cmath>
 
 void test_sqrt(double a1) {
   #pragma omp target
diff --git a/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp b/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
index e3c0b12..5220f44 100644
--- a/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
+++ b/clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
@@ -3,11 +3,11 @@
 
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
-// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math_declares.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -internal-isystem %S/Inputs/include -include stdlib.h -include limits -include cstdlib -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s
 
 #include <cstdlib>
-#include <math.h>
+#include <cmath>
 
 void test_sqrt(double a1) {
   #pragma omp target
diff --git a/clang/test/Headers/nvptx_device_math_macro.cpp b/clang/test/Headers/nvptx_device_math_macro.cpp
new file mode 100644
index 0000000..e21aa2b
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_macro.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+#include <cmath>
+
+#pragma omp declare target
+int use_macro() {
+  double a(0);
+// CHECK-NOT:  call
+// CHECK:  call double @llvm.fabs.f64(double
+// CHECK-NOT:  call
+// CHECK:      ret i32 %conv
+  return (std::fpclassify(a) != FP_ZERO);
+}
+#pragma omp end declare target
diff --git a/clang/test/Headers/nvptx_device_math_modf.cpp b/clang/test/Headers/nvptx_device_math_modf.cpp
new file mode 100644
index 0000000..fcfe20f
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_modf.cpp
@@ -0,0 +1,53 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+#include <cmath>
+
+// 4 calls to modf(f), all translated to __nv_modf calls:
+
+// CHECK-NOT: _Z.modf
+// CHECK: call double @__nv_modf(double
+// CHECK-NOT: _Z.modf
+// CHECK: call float @__nv_modff(float
+// CHECK-NOT: _Z.modf
+// CHECK: call double @__nv_modf(double
+// CHECK-NOT: _Z.modf
+// CHECK: call float @__nv_modff(float
+// CHECK-NOT: _Z.modf
+
+template<typename T>
+void test_modf(T x)
+{
+  T dx;
+  int intx;
+
+  #pragma omp target map(from: intx, dx)
+  {
+    T ipart;
+    dx = std::modf(x, &ipart);
+    intx = static_cast<int>(ipart);
+  }
+}
+
+int main()
+{
+
+#if !defined(C_ONLY)
+  test_modf<double>(1.0);
+  test_modf<float>(1.0);
+#endif
+
+  #pragma omp target
+  {
+    double intpart, res;
+    res = modf(1.1, &intpart);
+  }
+
+  #pragma omp target
+  {
+    float intpart, res;
+    res = modff(1.1f, &intpart);
+  }
+
+}
diff --git a/clang/test/Headers/nvptx_device_math_sin.c b/clang/test/Headers/nvptx_device_math_sin.c
new file mode 100644
index 0000000..75b998d
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sin.c
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SLOW
+// RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math
+// RUN: %clang_cc1 -x c -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math | FileCheck %s --check-prefix=FAST
+// expected-no-diagnostics
+
+#include <math.h>
+
+double math(float f, double d, long double ld) {
+  double r = 0;
+// SLOW:  call float @__nv_sinf(float
+// FAST:  call fast float @__nv_fast_sinf(float
+  r += sinf(f);
+// SLOW:  call double @__nv_sin(double
+// FAST:  call fast double @__nv_sin(double
+  r += sin(d);
+  return r;
+}
+
+long double foo(float f, double d, long double ld) {
+  double r = ld;
+  r += math(f, d, ld);
+#pragma omp target map(r)
+  { r += math(f, d, ld); }
+  return r;
+}
diff --git a/clang/test/Headers/nvptx_device_math_sin.cpp b/clang/test/Headers/nvptx_device_math_sin.cpp
new file mode 100644
index 0000000..e4d25b4
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sin.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SLOW
+// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math
+// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math | FileCheck %s --check-prefix=FAST
+// expected-no-diagnostics
+
+#include <cmath>
+
+double math(float f, double d, long double ld) {
+  double r = 0;
+// SLOW:  call float @__nv_sinf(float
+// FAST:  call fast float @__nv_fast_sinf(float
+  r += sin(f);
+// SLOW:  call double @__nv_sin(double
+// FAST:  call fast double @__nv_sin(double
+  r += sin(d);
+  return r;
+}
+
+long double foo(float f, double d, long double ld) {
+  double r = ld;
+  r += math(f, d, ld);
+#pragma omp target map(r)
+  { r += math(f, d, ld); }
+  return r;
+}
diff --git a/clang/test/Headers/nvptx_device_math_sin_cos.cpp b/clang/test/Headers/nvptx_device_math_sin_cos.cpp
new file mode 100644
index 0000000..dbb2e71
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sin_cos.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+#include <cmath>
+
+// 6 calls to sin/cos(f), all translated to __nv_sin/__nv_cos calls:
+
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call double @__nv_sin(double
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call float @__nv_sinf(float
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call double @__nv_sin(double
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call double @__nv_cos(double
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call float @__nv_sinf(float
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+// CHECK: call float @__nv_cosf(float
+// CHECK-NOT: _Z.sin
+// CHECK-NOT: _Z.cos
+
+template<typename T>
+void test_sin_cos(T x)
+{
+  T res_sin, res_cos;
+
+  #pragma omp target map(from: res_sin, res_cos)
+  {
+    res_sin = std::sin(x);
+    res_cos = std::cos(x);
+  }
+}
+
+int main()
+{
+
+#if !defined(C_ONLY)
+  test_sin_cos<double>(0.0);
+  test_sin_cos<float>(0.0);
+#endif
+
+  #pragma omp target
+  {
+    double res;
+    res = sin(1.0);
+  }
+
+  #pragma omp target
+  {
+    float res;
+    res = sinf(1.0f);
+  }
+
+  return 0;
+}
diff --git a/clang/test/Headers/nvptx_device_math_sincos.cpp b/clang/test/Headers/nvptx_device_math_sincos.cpp
new file mode 100644
index 0000000..5419ee2
--- /dev/null
+++ b/clang/test/Headers/nvptx_device_math_sincos.cpp
@@ -0,0 +1,58 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+
+#include <cmath>
+
+// 4 calls to sincos(f), all translated to __nv_sincos calls:
+
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincos(double
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincosf(float
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincos(double
+// CHECK-NOT: _Z.sincos
+// CHECK: call void @__nv_sincosf(float
+// CHECK-NOT: _Z.sincos
+
+// single precision wrapper
+inline void sincos(float x, float* __restrict__ sin, float* __restrict__ cos)
+{
+  sincosf(x, sin, cos);
+}
+
+template<typename T>
+void test_sincos(T x)
+{
+  T res_sin, res_cos;
+
+  #pragma omp target map(from: res_sin, res_cos)
+  {
+    sincos(x, &res_sin, &res_cos);
+  }
+
+}
+
+int main(int argc, char **argv)
+{
+
+#if !defined(C_ONLY)
+  test_sincos<double>(0.0);
+  test_sincos<float>(0.0);
+#endif
+
+  #pragma omp target
+  {
+    double s, c;
+    sincos(0, &s, &c);
+  }
+
+  #pragma omp target
+  {
+    float s, c;
+    sincosf(0.f, &s, &c);
+  }
+
+  return 0;
+}
-- 
2.7.4