From dfc0d9475556cb04f443f728e68cf8c7afa904eb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 10 Jun 2020 17:40:04 -0700 Subject: [PATCH] Revert D80450 "[CUDA][HIP] Fix implicit HD function resolution" This reverts commit 263390d4f5f23967a31af09eb6e0c12e633d6104. This can still cause bogus errors: eigen3/Eigen/src/Core/CoreEvaluators.h:94:38: error: call to implicitly-deleted copy constructor of 'unary_evaluator>>' thrust/system/detail/generic/for_each.h:49:3: error: implicit instantiation of undefined template 'thrust::detail::STATIC_ASSERTION_FAILURE' --- clang/include/clang/Sema/Sema.h | 2 - clang/lib/Sema/SemaCUDA.cpp | 14 --- clang/lib/Sema/SemaOverload.cpp | 143 ++++++++--------------- clang/test/SemaCUDA/function-overload.cu | 194 ++----------------------------- 4 files changed, 58 insertions(+), 295 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 33be6c6..e7e6dc4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11698,8 +11698,6 @@ public: return IdentifyCUDATarget(dyn_cast(CurContext)); } - static bool IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D); - // CUDA function call preference. Must be ordered numerically from // worst to best. enum CUDAFunctionPreference { diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 1106cef..5d6c151 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -211,20 +211,6 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller, llvm_unreachable("All cases should've been handled by now."); } -template static bool hasImplicitAttr(const FunctionDecl *D) { - if (!D) - return false; - if (auto *A = D->getAttr()) - return A->isImplicit(); - return D->isImplicit(); -} - -bool Sema::IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D) { - bool IsImplicitDevAttr = hasImplicitAttr(D); - bool IsImplicitHostAttr = hasImplicitAttr(D); - return IsImplicitDevAttr && IsImplicitHostAttr; -} - void Sema::EraseUnwantedCUDAMatches( const FunctionDecl *Caller, SmallVectorImpl> &Matches) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 5e5f539..319a4b5 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -9397,22 +9397,16 @@ static Comparison compareEnableIfAttrs(const Sema &S, const FunctionDecl *Cand1, return Comparison::Equal; } -static Comparison -isBetterMultiversionCandidate(const OverloadCandidate &Cand1, - const OverloadCandidate &Cand2) { +static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, + const OverloadCandidate &Cand2) { if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function || !Cand2.Function->isMultiVersion()) - return Comparison::Equal; + return false; - // If both are invalid, they are equal. If one of them is invalid, the other - // is better. - if (Cand1.Function->isInvalidDecl()) { - if (Cand2.Function->isInvalidDecl()) - return Comparison::Equal; - return Comparison::Worse; - } - if (Cand2.Function->isInvalidDecl()) - return Comparison::Better; + // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this + // is obviously better. + if (Cand1.Function->isInvalidDecl()) return false; + if (Cand2.Function->isInvalidDecl()) return true; // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer // cpu_dispatch, else arbitrarily based on the identifiers. @@ -9422,18 +9416,16 @@ isBetterMultiversionCandidate(const OverloadCandidate &Cand1, const auto *Cand2CPUSpec = Cand2.Function->getAttr(); if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec) - return Comparison::Equal; + return false; if (Cand1CPUDisp && !Cand2CPUDisp) - return Comparison::Better; + return true; if (Cand2CPUDisp && !Cand1CPUDisp) - return Comparison::Worse; + return false; if (Cand1CPUSpec && Cand2CPUSpec) { if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size()) - return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size() - ? Comparison::Better - : Comparison::Worse; + return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size(); std::pair FirstDiff = std::mismatch( @@ -9446,9 +9438,7 @@ isBetterMultiversionCandidate(const OverloadCandidate &Cand1, assert(FirstDiff.first != Cand1CPUSpec->cpus_end() && "Two different cpu-specific versions should not have the same " "identifier list, otherwise they'd be the same decl!"); - return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName() - ? Comparison::Better - : Comparison::Worse; + return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName(); } llvm_unreachable("No way to get here unless both had cpu_dispatch"); } @@ -9508,66 +9498,6 @@ bool clang::isBetterOverloadCandidate( else if (!Cand1.Viable) return false; - // [CUDA] A function with 'never' preference is marked not viable, therefore - // is never shown up here. The worst preference shown up here is 'wrong side', - // e.g. a host function called by a device host function in device - // compilation. This is valid AST as long as the host device function is not - // emitted, e.g. it is an inline function which is called only by a host - // function. A deferred diagnostic will be triggered if it is emitted. - // However a wrong-sided function is still a viable candidate here. - // - // If Cand1 can be emitted and Cand2 cannot be emitted in the current - // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2 - // can be emitted, Cand1 is not better than Cand2. This rule should have - // precedence over other rules. - // - // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then - // other rules should be used to determine which is better. This is because - // host/device based overloading resolution is mostly for determining - // viability of a function. If two functions are both viable, other factors - // should take precedence in preference, e.g. the standard-defined preferences - // like argument conversion ranks or enable_if partial-ordering. The - // preference for pass-object-size parameters is probably most similar to a - // type-based-overloading decision and so should take priority. - // - // If other rules cannot determine which is better, CUDA preference will be - // used again to determine which is better. - // - // TODO: Currently IdentifyCUDAPreference does not return correct values - // for functions called in global variable initializers due to missing - // correct context about device/host. Therefore we can only enforce this - // rule when there is a caller. We should enforce this rule for functions - // in global variable initializers once proper context is added. - if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { - if (FunctionDecl *Caller = dyn_cast(S.CurContext)) { - bool IsCallerImplicitHD = Sema::IsCUDAImplicitHostDeviceFunction(Caller); - bool IsCand1ImplicitHD = - Sema::IsCUDAImplicitHostDeviceFunction(Cand1.Function); - bool IsCand2ImplicitHD = - Sema::IsCUDAImplicitHostDeviceFunction(Cand2.Function); - auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function); - auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function); - assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never); - // The implicit HD function may be a function in a system header which - // is forced by pragma. In device compilation, if we prefer HD candidates - // over wrong-sided candidates, overloading resolution may change, which - // may result in non-deferrable diagnostics. As a workaround, we let - // implicit HD candidates take equal preference as wrong-sided candidates. - // This will preserve the overloading resolution. - auto EmitThreshold = - (S.getLangOpts().CUDAIsDevice && IsCallerImplicitHD && - (IsCand1ImplicitHD || IsCand2ImplicitHD)) - ? Sema::CFP_Never - : Sema::CFP_WrongSide; - auto Cand1Emittable = P1 > EmitThreshold; - auto Cand2Emittable = P2 > EmitThreshold; - if (Cand1Emittable && !Cand2Emittable) - return true; - if (!Cand1Emittable && Cand2Emittable) - return false; - } - } - // C++ [over.match.best]p1: // // -- if F is a static member function, ICS1(F) is defined such @@ -9802,6 +9732,12 @@ bool clang::isBetterOverloadCandidate( return Cmp == Comparison::Better; } + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + FunctionDecl *Caller = dyn_cast(S.CurContext); + return S.IdentifyCUDAPreference(Caller, Cand1.Function) > + S.IdentifyCUDAPreference(Caller, Cand2.Function); + } + bool HasPS1 = Cand1.Function != nullptr && functionHasPassObjectSizeParams(Cand1.Function); bool HasPS2 = Cand2.Function != nullptr && @@ -9809,21 +9745,7 @@ bool clang::isBetterOverloadCandidate( if (HasPS1 != HasPS2 && HasPS1) return true; - auto MV = isBetterMultiversionCandidate(Cand1, Cand2); - if (MV == Comparison::Better) - return true; - if (MV == Comparison::Worse) - return false; - - // If other rules cannot determine which is better, CUDA preference is used - // to determine which is better. - if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { - FunctionDecl *Caller = dyn_cast(S.CurContext); - return S.IdentifyCUDAPreference(Caller, Cand1.Function) > - S.IdentifyCUDAPreference(Caller, Cand2.Function); - } - - return false; + return isBetterMultiversionCandidate(Cand1, Cand2); } /// Determine whether two declarations are "equivalent" for the purposes of @@ -9909,6 +9831,33 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, std::transform(begin(), end(), std::back_inserter(Candidates), [](OverloadCandidate &Cand) { return &Cand; }); + // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but + // are accepted by both clang and NVCC. However, during a particular + // compilation mode only one call variant is viable. We need to + // exclude non-viable overload candidates from consideration based + // only on their host/device attributes. Specifically, if one + // candidate call is WrongSide and the other is SameSide, we ignore + // the WrongSide candidate. + if (S.getLangOpts().CUDA) { + const FunctionDecl *Caller = dyn_cast(S.CurContext); + bool ContainsSameSideCandidate = + llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { + // Check viable function only. + return Cand->Viable && Cand->Function && + S.IdentifyCUDAPreference(Caller, Cand->Function) == + Sema::CFP_SameSide; + }); + if (ContainsSameSideCandidate) { + auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { + // Check viable function only to avoid unnecessary data copying/moving. + return Cand->Viable && Cand->Function && + S.IdentifyCUDAPreference(Caller, Cand->Function) == + Sema::CFP_WrongSide; + }; + llvm::erase_if(Candidates, IsWrongSideCandidate); + } + } + // Find the best viable function. Best = end(); for (auto *Cand : Candidates) { diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu index da4c491..b9efd1c 100644 --- a/clang/test/SemaCUDA/function-overload.cu +++ b/clang/test/SemaCUDA/function-overload.cu @@ -1,8 +1,8 @@ // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s -// RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s #include "Inputs/cuda.h" @@ -14,13 +14,6 @@ struct DeviceReturnTy2 {}; struct HostDeviceReturnTy {}; struct TemplateReturnTy {}; -struct CorrectOverloadRetTy{}; -#if __CUDA_ARCH__ -// expected-note@-2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'IncorrectOverloadRetTy' to 'const CorrectOverloadRetTy &' for 1st argument}} -// expected-note@-3 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'IncorrectOverloadRetTy' to 'CorrectOverloadRetTy &&' for 1st argument}} -#endif -struct IncorrectOverloadRetTy{}; - typedef HostReturnTy (*HostFnPtr)(); typedef DeviceReturnTy (*DeviceFnPtr)(); typedef HostDeviceReturnTy (*HostDeviceFnPtr)(); @@ -338,6 +331,9 @@ __device__ void test_device_calls_template_fn() { // If we have a mix of HD and H-only or D-only candidates in the overload set, // normal C++ overload resolution rules apply first. template TemplateReturnTy template_vs_hd_function(T arg) +#ifdef __CUDA_ARCH__ +//expected-note@-2 {{declared here}} +#endif { return TemplateReturnTy(); } @@ -346,13 +342,11 @@ __host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) { } __host__ __device__ void test_host_device_calls_hd_template() { + HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); + TemplateReturnTy ret2 = template_vs_hd_function(1); #ifdef __CUDA_ARCH__ - typedef HostDeviceReturnTy ExpectedReturnTy; -#else - typedef TemplateReturnTy ExpectedReturnTy; + // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function' in __host__ __device__ function}} #endif - HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); - ExpectedReturnTy ret2 = template_vs_hd_function(1); } __host__ void test_host_calls_hd_template() { @@ -373,14 +367,14 @@ __device__ void test_device_calls_hd_template() { __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); } __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); } #ifndef __CUDA_ARCH__ - // expected-note@-3 2{{'device_only_function' declared here}} - // expected-note@-3 2{{'device_only_function' declared here}} + // expected-note@-3 {{'device_only_function' declared here}} + // expected-note@-3 {{'device_only_function' declared here}} #endif __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); } __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); } #ifdef __CUDA_ARCH__ - // expected-note@-3 2{{'host_only_function' declared here}} - // expected-note@-3 2{{'host_only_function' declared here}} + // expected-note@-3 {{'host_only_function' declared here}} + // expected-note@-3 {{'host_only_function' declared here}} #endif __host__ __device__ void test_host_device_single_side_overloading() { @@ -398,37 +392,6 @@ __host__ __device__ void test_host_device_single_side_overloading() { #endif } -// wrong-sided overloading should not cause diagnostic unless it is emitted. -// This inline function is not emitted. -inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() { - DeviceReturnTy ret1 = device_only_function(1); - DeviceReturnTy2 ret2 = device_only_function(1.0f); - HostReturnTy ret3 = host_only_function(1); - HostReturnTy2 ret4 = host_only_function(1.0f); -} - -// wrong-sided overloading should cause diagnostic if it is emitted. -// This inline function is emitted since it is called by an emitted function. -inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() { - DeviceReturnTy ret1 = device_only_function(1); - DeviceReturnTy2 ret2 = device_only_function(1.0f); -#ifndef __CUDA_ARCH__ - // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} - // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} -#endif - HostReturnTy ret3 = host_only_function(1); - HostReturnTy2 ret4 = host_only_function(1.0f); -#ifdef __CUDA_ARCH__ - // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} - // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} -#endif -} - -__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() { - test_host_device_wrong_side_overloading_inline_diag(); - // expected-note@-1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}} -} - // Verify that we allow overloading function templates. template __host__ T template_overload(const T &a) { return a; }; template __device__ T template_overload(const T &a) { return a; }; @@ -456,136 +419,3 @@ __host__ __device__ int constexpr_overload(const T &x, const T &y) { int test_constexpr_overload(C2 &x, C2 &y) { return constexpr_overload(x, y); } - -// Verify no ambiguity for new operator. -void *a = new int; -__device__ void *b = new int; -// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}} - -// Verify no ambiguity for new operator. -template _Tp&& f(); -template()))> -void __test(); - -void foo() { - __test(); -} - -// Test resolving implicit host device candidate vs wrong-sided candidate. -// In device compilation, implicit host device caller choose implicit host -// device candidate and wrong-sided candidate with equal preference. -// Resolution result should not change with/without pragma. -namespace ImplicitHostDeviceVsWrongSided { -CorrectOverloadRetTy callee(double x); -#pragma clang force_cuda_host_device begin -IncorrectOverloadRetTy callee(int x); -inline CorrectOverloadRetTy implicit_hd_caller() { - return callee(1.0); -} -#pragma clang force_cuda_host_device end -} - -// Test resolving implicit host device candidate vs same-sided candidate. -// In host compilation, implicit host device caller choose implicit host -// device candidate and same-sided candidate with equal preference. -// Resolution result should not change with/without pragma. -namespace ImplicitHostDeviceVsSameSide { -IncorrectOverloadRetTy callee(int x); -#pragma clang force_cuda_host_device begin -CorrectOverloadRetTy callee(double x); -inline CorrectOverloadRetTy implicit_hd_caller() { - return callee(1.0); -} -#pragma clang force_cuda_host_device end -} - -// Test resolving explicit host device candidate vs. wrong-sided candidate. -// Explicit host device caller favors host device candidate against wrong-sided -// candidate. -namespace ExplicitHostDeviceVsWrongSided { -CorrectOverloadRetTy callee(double x); -__host__ __device__ IncorrectOverloadRetTy callee(int x); -inline __host__ __device__ CorrectOverloadRetTy explicit_hd_caller() { - return callee(1.0); -#if __CUDA_ARCH__ - // expected-error@-2 {{no viable conversion from returned value of type 'IncorrectOverloadRetTy' to function return type 'CorrectOverloadRetTy'}} -#endif -} -} - -// In the implicit host device function 'caller', the second 'callee' should be -// chosen since it has better match, even though it is an implicit host device -// function whereas the first 'callee' is a host function. A diagnostic will be -// emitted if the first 'callee' is chosen since deduced return type cannot be -// used before it is defined. -namespace ImplicitHostDeviceByConstExpr { -template a b; -auto callee(...); -template constexpr auto callee(d) -> decltype(0); -struct e { - template static auto g(ad, f...) { - return h)...>; - } - struct i { - template static constexpr auto caller(f... k) { - return callee(k...); - } - }; - template static auto h() { - return i::caller; - } -}; -class l { - l() { - e::g([] {}, this); - } -}; -} - -// Implicit HD candidate competes with device candidate. -// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved. -// copy ctor of a should win over a(short), otherwise there will be ambiguity -// due to conversion operator. -namespace TestImplicitHDWithD { - struct a { - __device__ a(short); - __device__ operator unsigned() const; - __device__ operator int() const; - }; - struct b { - a d; - }; - void f(b g) { b e = g; } -} - -// Implicit HD candidate competes with host candidate. -// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved. -// copy ctor of a should win over a(short), otherwise there will be ambiguity -// due to conversion operator. -namespace TestImplicitHDWithH { - struct a { - a(short); - __device__ operator unsigned() const; - __device__ operator int() const; - }; - struct b { - a d; - }; - void f(b g) { b e = g; } -} - -// Implicit HD candidate comptes with HD candidate. -// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved. -// copy ctor of a should win over a(short), otherwise there will be ambiguity -// due to conversion operator. -namespace TestImplicitHDWithHD { - struct a { - __host__ __device__ a(short); - __device__ operator unsigned() const; - __device__ operator int() const; - }; - struct b { - a d; - }; - void f(b g) { b e = g; } -} -- 2.7.4