From 0f53df864eb3b97fba74feb78132a38c87fa1db6 Mon Sep 17 00:00:00 2001 From: wlei Date: Mon, 13 Dec 2021 20:33:33 -0800 Subject: [PATCH] [CSSPGO][llvm-profgen] Fix external address issues of perf reader (return to external addr part) Before we have an issue with artificial LBR whose source is a return, recalling that "an internal code(A) can return to external address, then from the external address call a new internal code(B), making an artificial branch that looks like a return from A to B can confuse the unwinder". We just ignore the LBRs after this artificial LBR which can miss some samples. This change aims at fixing this by correctly unwinding them instead of ignoring them. List some typical scenarios covered by this change. 1) multiple sequential call back happen in external address, e.g. ``` [ext, call, foo] [foo, return, ext] [ext, call, bar] ``` Unwinder should avoid having foo return from bar. Wrong call stack is like [foo, bar] 2) the call stack before and after external call should be correctly unwinded. ``` {call stack1} {call stack2} [foo, call, ext] [ext, call, bar] [bar, return, ext] [ext, return, foo ] ``` call stack 1 should be the same to call stack2. Both shouldn't be truncated 3) call stack should be truncated after call into external code since we can't do inlining with external code. ``` [foo, call, ext] [ext, call, bar] [bar, call, baz] [baz, return, bar ] [bar, return, ext] ``` the call stack of code in baz should not include foo. ### Implementation: We leverage artificial frame to fix #2 and #3: when we got a return artificial LBR, push an extra artificial frame to the stack. when we pop frame, check if the parent is an artificial frame to pop(fix #2). Therefore, call/ return artificial LBR is just the same as regular LBR which can keep the call stack. While recording context on the trie, artificial frame is used as a tag indicating that we should truncate the call stack(fix #3). To differentiate #1 and #2, we leverage `getCallAddrFromFrameAddr`. Normally the target of the return should be the next inst of a call inst and `getCallAddrFromFrameAddr` will return the address of call inst. Otherwise, getCallAddrFromFrameAddr will return to 0 which is the case of #1. Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D115550 --- .../Inputs/callback-external-addr.perfbin | Bin 0 -> 19296 bytes .../Inputs/callback-external-addr.perfscript | 28 +++++ .../tools/llvm-profgen/callback-external-addr.test | 114 +++++++++++++++++++++ llvm/test/tools/llvm-profgen/inline-noprobe2.test | 7 +- llvm/tools/llvm-profgen/PerfReader.cpp | 82 ++++++++++++--- llvm/tools/llvm-profgen/PerfReader.h | 42 +++++--- llvm/tools/llvm-profgen/ProfiledBinary.h | 12 +++ 7 files changed, 251 insertions(+), 34 deletions(-) create mode 100755 llvm/test/tools/llvm-profgen/Inputs/callback-external-addr.perfbin create mode 100644 llvm/test/tools/llvm-profgen/Inputs/callback-external-addr.perfscript create mode 100644 llvm/test/tools/llvm-profgen/callback-external-addr.test diff --git a/llvm/test/tools/llvm-profgen/Inputs/callback-external-addr.perfbin b/llvm/test/tools/llvm-profgen/Inputs/callback-external-addr.perfbin new file mode 100755 index 0000000000000000000000000000000000000000..8076ddc5421984f77938d79baf41be2cec3b4f2e GIT binary patch literal 19296 zcmeHPeQX@Zb)Wm>hp6LG9~C91*4Z&@>VwA*krZW1(H*IijxP04iIQm~^744MB(FZ+ z(RX_a<=CkbI*J)ti5$d%ll-F~fsvpMU=&HAr1_8`J8FTTl~D&xVHc&-xTRASWl+VX zZCUJlv+v#Q?(NF1i#REI1MahfvQPNCnNzVfm?fTq0(C3cG2-6Sfuwcq1MCv1% zP9ntV^g_MPG8G~u0kp%i&aad#&`}v-Iz;6dADFc$o5Y@=J)y z&y=bu+B4~~-tP8Hsx6bw7H8UK`g+@XyTgTCxJ%X>(?{KX2ad{|u)RyJg*fuqR9I$g z&pNn_0c~sGLN{<7jcH&U<*0{gJQfLIv=%W=t3USBtAF{=pMUrt9vtCr1X6Qo0yboZ zB$tNYRW|sv1KtLB$VPvs1AYu}yYxQ^cu=epTZ5wzD~Bf`8Whc<&w|epZq*miKFLHT zGnPm`&TsN=nVE6X}Ac=7;wWWO7+`I5Cz{g`!MM<+4gaOXM|0(N%2~IYpT| zHg>O=&Zo25I3$4?QW8^%bXF*8DxoDnFjgq&I6&Ey*f%uTKcIAlyTU!PdhSxca6xNg zBVxdq0+jn{aYGuK)h)g(rEniNj>;IeJ=-9#V8F4Da9A|p+@DakWWbHjm1goY_9tV% z4HVT2lBKLA>1Z1^8{;7=ak_ zlPG&_ei=f`XOPD#y>0&pAC=jii=W*!2!i-oUq6x^OWIl};=j~E9Ew=CJm0wS6lz`~3i|7% ze$Of-n{LFPz2S+^EykZ+*d2HMIR3&7Z4D&w14^KFd49ZcYz4^|L~kYf|VEX@13XE!pCmtijOQUoF|*u60~FR zIJhVxusnwJ7dZV#L08u**TtxRId&a0x*S{5$u%UOc;s57zfm-vy$hH#`>z>`)&Y%* zA^trs_L24cs#qyY||5A>T9IEm);9N1<+y(R<<jR9G|`9itqTx;xDnEfB1OxBhlm0`=g4njb(p?Y9N*O2i*}a(_JM5 zKUqS_TrsQdYDsMk6^c_p;U(d^#dG&0q|pG^Pp&U7{~6HhK(R(!p@>fby$bY2#DSu3 z8=x82z+LnKpxr>X0v!hWI8Y7f_kn%^=n(kt+sFgG4s;FJVsJflSh!|_u3H*vYR>_V z&mnw?OhdlysIaFdh*$?)j|0Zfjyq5US2JKJ6M^8~K=VC~D?VC#O6=Zn_vUSP+>S7& zm4Pb=CeK{{-aoH<4cj`4y0V1ZXLL_r0IZ<$D9c&$(lP=FfU!fzVlREYSM6 zFCOSP?T-igW^48ZBH2J+G|&+ZwDt!={ekBGK(IehEB%h`bOF*~A7gUWQ1w971MlpC zQ~rolIz$w1B^j&~$66Q~bRy!uh>X#+M@-UtWj)Dx-gFD`_{_qvK__B?=1KS*!N78_ z&Wag#M8ySB3+@P#(%kN69-Zeg)`DB-;dLOx(?nMgZ6>;wD3j6{kly(`mXptB zP~zu3j`J|A(}`%N`ZIpP8<7&O$0+IXT!Zt6*NUvqQ=|{C70I)|7m5Ek$tjuN8sT}Y zwE(B7{$C`D@_hpXcZFJEMz<-{7w!pngt|JqIy-tgyF;yq)l?{+&~&_Q+a{6BB(f8s z6KcMY&SgWLUEz*!XQ;JMn7pgKeIl)ePo$^A$y>Wn`O6@@9|O@w!m~O z`G5kBEKCZq+N_@(SHK$@cmv#vHy7Whl+WIShldCE-Y1sP?DtHzTjT_6WJHM`IT8b{ zg9na)+)|zzyl+bHTj`VAWY@Q(m77Xy%6L98r7F`om96-STQ8 zlb*;T?$l1_shyZweP3;-YCBc$r`xLclYFjhu0Hp=tM8}sduGdT{QY!=-cLUzT75rB z@AJ6keIG~QI0?sl_(^MDbaTZx$=Y5#ZX$i#=Z7w{A2(IUE!FWQ&l#%ofa*M;I$w*( z7eIBs_Pa5!#!0(r{Ugy?+S|cvA9gw{V``_vV;-hoNyb~^K9 zf11`Yo!-0nNfd)hY5lgxW4Mm;8(7zo>qVBen0K|VgNE>)(8uL9?{`0}w$pnXr&rs_e!c$x`*Xc|KmFgh zpX|m`{O)7=UUhg(m%Pc?j>mLGdcS`Pp8xtc;97{`_vX_`Abb`s-s&_0GWP`lcEjcJ z)_Hi#mghMDd|odr;?T-eu)i zGFeGZcyYrYhR5J?9eV?RTTly0N_I_R=`J0Zx{?#`6JDq07~y%k9y#5pKud2 z2tzM_w~4J;A&nsWH{tTi52r4d>#e{_Sf9gDoUFXi)PlAxoCFp^otxWsy|2E$=xgxr z^m%srgn#${jP=!j%XjM&zLj8JGveFq-{q^rsQZJfeCwGR@zq1r1Gw;i)rS^=Cjf#B zM*yka?W?JO)z?t;trc#s+pqaNNHt%Ze?k_k4ijqu%^cAIJ>!hGB>5Re>9_}+cP=Xch|1vPNIxy8}G#{ZOR+SY63p)im^lS0wY=Ccxrj>Feq;vX!|i1A!b$c@0$BCl!!cJqm}-o#G79_S8z?ns3g++Nx@ zZxni97x|2>zPGfW#H{ZV3%2@x!TU|j`ZdCCpN0oMxbyxFvwof6eIjPOzO>H)CkGAN31RF+cpY4xI-amr@v0L+TzU>`{OY^kKo-&#?%qt0Q=_8n?b41 zyO+!&PwCW&75{xRK%7V1T7l!k%`1MjY!&Q6z`e4x|AgiH9qT6MEX}qU+&e1^#2KPEWh>rcopzaWpTt?fH%1t#VN~p zpccmcO|U}E;{^84I>51A_E_&y?79He}WQy-KK1K#9bS889@9|b(bjDbER(=h-Py$x}<&3LT6SjgjH z$9C8W+7722mEP{QyqcO!X!^;=_P+LTxcmp2a4a&J*E+*F5vDJ~N;(T$UXxZ{9*48& zgeP;Fot_;YE@m_72h_|-TLULySRV5$PTLwe$inJO+P+w30hnv)q>@)*VqvVW88_G4 zEHl^BEJM4S$!+{$lR%|k%u1F1G0SBs0Lp7dADVS6{xSG!firO(SU zXAi zWn6Ng6U4acpr101M2v@rahbxn=zzRwRiffdve_ONDWC=5+yNg?wj6_2QiYr{naHN_ zi1gq=&`PDVO0l4VZ_3ren@%ovINY6}TRs7;pNv-E^fh}Ok7eKdthERiPEKiwF`!yr zr;{9$Tkhdeo`tgj;b{M0o0gam;fZW9Jc-8!!l{#4FxIJ-*Cli)!^9{cgQs=^4JbC9 zfhWGKTv&s17$Pj&FPzWGW8GnOk~-OB3T&CvE%e@|TW}l_PE4iY9XNq6IQR+64D`gn zK>|;KOhI4P%QaR^XHwyWrsdOPMd+|l1$C@A0ey5F-m7{%F*cT0PcR4nhd|{xxTo?J z|DF(BA%LL`%n|g!>r?!HEBJg9iKQ;bk$w=a8o1bg*c}0`{C^=rk}Lo4_=^!dtAb-D zw&(S(83J~aB9E(TiRW}Wfqxy&9I^cccSLGjAfQ8X<#9LL6D;nA2%ceKdtN_VAYhRc zVfZBL&i2iO4*-=@IO2Jo?Pmx=cv>f=U-Y#x{2?%ShKlp&^|+5y1^N6I*Pr#6j(|O$ z$zq<@?Vck05M^d9Kfz)GBB;vtyuNpV>_wq{EX<#?;d#mo*>7P(qU=A0pM;3fp8t;u|9_PXD-yNl|22y} zua5@l;NLeeaYL}?|1FRO;G*tcT1TCyid*%WXZl^R#W^V3^Zb0_UD`hnw%B*rp4Vkd z|IdcX&;5(@<@Wy@utgQN=l{DgPZQ%1$c%yQnf@IJjrOH=Udo`=Xbf?-WBRhip69!* zWN&RROLK^yQ<_|Uo>y)qdp_^V#jvLLuatiWD-dP-KeynVKhs5!buS3sM{^u+= zur+@_B*bZP8c3(N>fD?~Xko=F2m3eLbo;DMO21b7kc0h1SU16st;S%r$Nv$4nALwO zP5A3A2Fw~M-}}aVrYXT^>UA^(6d0:5 +; CHECK-UNWINDER: [foo:2 @ qux] +; CHECK-UNWINDER: 4 +; CHECK-UNWINDER: 6d0-6e7:5 +; CHECK-UNWINDER: 6ec-710:6 +; CHECK-UNWINDER: 715-71b:7 +; CHECK-UNWINDER: 720-72b:6 +; CHECK-UNWINDER: 3 +; CHECK-UNWINDER: 6e7->6b0:6 +; CHECK-UNWINDER: 71b->6c0:7 +; CHECK-UNWINDER: 72b->74c:6 +; CHECK-UNWINDER: [foo:2 @ qux:2 @ callBeforeReturn] +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6b0-6be:6 +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6be->6ec:7 +; CHECK-UNWINDER: [foo:2 @ qux:4 @ callAfterReturn] +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6c0-6ce:7 +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6ce->720:7 +; CHECK-UNWINDER: [main] +; CHECK-UNWINDER: 2 +; CHECK-UNWINDER: 77d-7ab:5 +; CHECK-UNWINDER: 7b0-7bf:5 +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 7bf->77d:5 + +; libcallback.c +; clang -shared -fPIC -o libcallback.so libcallback.c + +int callback(int *cnt, int (*func1)(int), int (*func2)(int), int p) { + (*cnt)++; + return func1(p) + func2(p); +} + +; test.c +; clang test.c -O0 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling -L $PWD -lcallback -fno-inline + +#include + +int callbackCnt = 0; + +int callback(int *cnt, int (*func1)(int), int (*func2)(int), int p); + +int bar(int p) { + return p + 1; +} + +int baz(int p) { + return p - 1; +} + +int callBeforeReturn(int p) { + return p + 10; +} + +int callAfterReturn(int p) { + return p - 10; +} + +int qux(int p) { + p += 10; + int ret = callBeforeReturn(p); + ret = callback(&callbackCnt, bar, baz, ret); + ret = callAfterReturn(ret); + return ret; +} + +int foo (int p) { + p -= 10; + return qux(p); +} + +int main(void) { + int sum = 0; + for (int i = 0; i < 1000 * 1000; i++) { + sum += callback(&callbackCnt, foo, bar, i); + } + printf("callback count=%d, sum=%d\n", callbackCnt, sum); +} diff --git a/llvm/test/tools/llvm-profgen/inline-noprobe2.test b/llvm/test/tools/llvm-profgen/inline-noprobe2.test index 3c110f2..122bfe1 100644 --- a/llvm/test/tools/llvm-profgen/inline-noprobe2.test +++ b/llvm/test/tools/llvm-profgen/inline-noprobe2.test @@ -8,8 +8,11 @@ ; RUN: llvm-profgen --format=extbinary --perfscript=%S/Inputs/inline-noprobe2.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --populate-profile-symbol-list=1 ; RUN: llvm-profdata show -show-prof-sym-list -sample %t | FileCheck %s --check-prefix=CHECK-SYM-LIST -; CHECK-ARTIFICIAL-BRANCH: 0 -; CHECK-ARTIFICIAL-BRANCH: 0 +; CHECK-ARTIFICIAL-BRANCH: 2 +; CHECK-ARTIFICIAL-BRANCH: 400870-400870:2 +; CHECK-ARTIFICIAL-BRANCH: 400875-4008bf:1 +; CHECK-ARTIFICIAL-BRANCH: 1 +; CHECK-ARTIFICIAL-BRANCH: 4008bf->400870:2 ; CHECK-SYM-LIST: Dump profile symbol list ; CHECK-SYM-LIST: main diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index df75909..c8edc9b 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -51,18 +51,44 @@ namespace llvm { namespace sampleprof { void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + // An artificial return should push an external frame and an artificial call + // will match it and pop the external frame so that the context before and + // after the external call will be the same. + if (State.getCurrentLBR().IsArtificial) { + NumExtCallBranch++; + // A return is matched and pop the external frame. + if (State.getParentFrame()->isExternalFrame()) { + State.popFrame(); + } else { + // An artificial return is missing, it happens that the sample is just hit + // in the middle of the external code. In this case, the leading branch is + // a call to external, we just keep unwinding use a context-less stack. + if (State.getParentFrame() != State.getDummyRootPtr()) + NumMissingExternalFrame++; + State.clearCallStack(); + State.pushFrame(Source); + State.InstPtr.update(Source); + return; + } + } + + auto *ParentFrame = State.getParentFrame(); // The 2nd frame after leaf could be missing if stack sample is // taken when IP is within prolog/epilog, as frame chain isn't // setup yet. Fill in the missing frame in that case. // TODO: Currently we just assume all the addr that can't match the // 2nd frame is in prolog/epilog. In the future, we will switch to // pro/epi tracker(Dwarf CFI) for the precise check. - uint64_t Source = State.getCurrentLBRSource(); - auto *ParentFrame = State.getParentFrame(); - if (ParentFrame == State.getDummyRootPtr() || ParentFrame->Address != Source) { State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (State.getCurrentLBR().IsArtificial) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } } else { State.popFrame(); } @@ -118,6 +144,19 @@ void VirtualUnwinder::unwindReturn(UnwindState &State) { const LBREntry &LBR = State.getCurrentLBR(); uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); State.switchToFrame(CallAddr); + // Push an external frame for the case of returning to external + // address(callback), later if an aitificial call is matched and it will be + // popped up. This is to 1)avoid context being interrupted by callback, + // context before or after the callback should be the same. 2) the call stack + // of function called by callback should be truncated which is done during + // recording the context on trie. For example: + // main (call)--> foo (call)--> callback (call)--> bar (return)--> callback + // (return)--> foo (return)--> main + // Context for bar should not include main and foo. + // For the code of foo, the context of before and after callback should both + // be [foo, main]. + if (LBR.IsArtificial) + State.pushFrame(ExternalAddr); State.pushFrame(LBR.Source); State.InstPtr.update(LBR.Source); } @@ -180,7 +219,9 @@ template void VirtualUnwinder::collectSamplesFromFrameTrie( UnwindState::ProfiledFrame *Cur, T &Stack) { if (!Cur->isDummyRoot()) { - if (!Stack.pushFrame(Cur)) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { // Process truncated context // Start a new traversal ignoring its bottom context T EmptyStack(Binary); @@ -453,6 +494,21 @@ void HybridPerfReader::unwindSamples() { SampleCounters.size(), "of profiled contexts are truncated due to missing probe " "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); } bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, @@ -538,15 +594,6 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, break; } - if (Binary->addressIsReturn(Src)) { - // In a callback case, a return from internal code, say A, to external - // runtime can happen. The external runtime can then call back to - // another internal routine, say B. Making an artificial branch that - // looks like a return from A to B can confuse the unwinder to treat - // the instruction before B as the call instruction. - break; - } - // For transition to external code, group the Source with the next // availabe transition target. Dst = PrevTrDst; @@ -854,10 +901,15 @@ void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, SampleCounter &Counter = SampleCounters.begin()->second; uint64_t EndOffeset = 0; for (const LBREntry &LBR : Sample->LBRStack) { + assert(LBR.Source != ExternalAddr && + "Branch' source should not be an external address, it should be " + "converted to aritificial branch."); uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); - uint64_t TargetOffset = Binary->virtualAddrToOffset(LBR.Target); + uint64_t TargetOffset = LBR.Target == ExternalAddr + ? ExternalAddr + : Binary->virtualAddrToOffset(LBR.Target); - if (!LBR.IsArtificial) { + if (!LBR.IsArtificial && TargetOffset != ExternalAddr) { Counter.recordBranchCount(SourceOffset, TargetOffset, Repeat); } diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h index a133001..9d84ad3 100644 --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -214,14 +214,6 @@ using AggregatedCounter = using SampleVector = SmallVector, 16>; -// The special frame addresses. -enum SpecialFrameAddr { - // Dummy root of frame trie. - DummyRoot = 0, - // Represent all the addresses outside of current binary. - ExternalAddr = 1, -}; - // The state for the unwinder, it doesn't hold the data but only keep the // pointer/index of the data, While unwinding, the CallStack is changed // dynamicially and will be recorded as the context of the sample @@ -313,6 +305,8 @@ struct UnwindState { void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + void initFrameTrie(const SmallVectorImpl &CallStack) { ProfiledFrame *Cur = &DummyTrieRoot; for (auto Address : reverse(CallStack)) { @@ -426,10 +420,8 @@ struct FrameStack { ProfiledBinary *Binary; FrameStack(ProfiledBinary *B) : Binary(B) {} bool pushFrame(UnwindState::ProfiledFrame *Cur) { - // Truncate the context for external frame since this isn't a real call - // context the compiler will see - if (Cur->isExternalFrame()) - return false; + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); Stack.push_back(Cur->Address); return true; } @@ -446,10 +438,8 @@ struct ProbeStack { ProfiledBinary *Binary; ProbeStack(ProfiledBinary *B) : Binary(B) {} bool pushFrame(UnwindState::ProfiledFrame *Cur) { - // Truncate the context for external frame since this isn't a real call - // context the compiler will see - if (Cur->isExternalFrame()) - return false; + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); const MCDecodedPseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address); // We may not find a probe for a merged or external callsite. @@ -506,6 +496,12 @@ public: bool unwind(const PerfSample *Sample, uint64_t Repeat); std::set &getUntrackedCallsites() { return UntrackedCallsites; } + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + private: bool isCallState(UnwindState &State) const { // The tail call frame is always missing here in stack sample, we will @@ -516,7 +512,19 @@ private: bool isReturnState(UnwindState &State) const { // Simply check addressIsReturn, as ret is always reliable, both for // regular call and tail call. - return Binary->addressIsReturn(State.getCurrentLBRSource()); + if (!Binary->addressIsReturn(State.getCurrentLBRSource())) + return false; + + // In a callback case, a return from internal code, say A, to external + // runtime can happen. The external runtime can then call back to + // another internal routine, say B. Making an artificial branch that + // looks like a return from A to B can confuse the unwinder to treat + // the instruction before B as the call instruction. Here we detect this + // case if the return target is not the next inst of call inst, then we just + // do not treat it as a return. + uint64_t CallAddr = + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()); + return (CallAddr != 0); } void unwindCall(UnwindState &State); diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index fcc3a4f..b5c985f 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -70,6 +70,16 @@ struct InstructionPointer { void update(uint64_t Addr); }; +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + using RangesTy = std::vector>; struct BinaryFunction { @@ -386,6 +396,8 @@ public: } uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return ExternalAddr; auto I = getIndexForAddr(FrameAddr); FrameAddr = I ? getAddressforIndex(I - 1) : 0; if (FrameAddr && addressIsCall(FrameAddr)) -- 2.7.4