From bad8f0feb47dbd648ebee5208ae5d966ed22ad50 Mon Sep 17 00:00:00 2001 From: Dean Michael Berris Date: Mon, 21 Nov 2016 03:20:43 +0000 Subject: [PATCH] [XRay] Support AArch64 in compiler-rt This patch adds XRay support in compiler-rt for AArch64 targets. This patch is one of a series: LLVM: https://reviews.llvm.org/D26412 Clang: https://reviews.llvm.org/D26415 Author: rSerge Reviewers: rengolin, dberris Subscribers: aemerson, mgorny, llvm-commits, iid_iunknown Differential Revision: https://reviews.llvm.org/D26413 llvm-svn: 287517 --- compiler-rt/cmake/config-ix.cmake | 2 +- compiler-rt/include/xray/xray_interface.h | 7 ++ compiler-rt/lib/xray/CMakeLists.txt | 5 ++ compiler-rt/lib/xray/xray_AArch64.cc | 105 +++++++++++++++++++++++++ compiler-rt/lib/xray/xray_inmemory_log.cc | 6 +- compiler-rt/lib/xray/xray_interface.cc | 2 + compiler-rt/lib/xray/xray_trampoline_AArch64.S | 89 +++++++++++++++++++++ 7 files changed, 212 insertions(+), 4 deletions(-) create mode 100644 compiler-rt/lib/xray/xray_AArch64.cc create mode 100644 compiler-rt/lib/xray/xray_trampoline_AArch64.S diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 0795a63..7b18584 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -161,7 +161,7 @@ set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64}) set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64}) set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64}) set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64}) -set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32}) +set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64}) if(APPLE) include(CompilerRTDarwinUtils) diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 680fcfd..9e712b1 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -32,6 +32,13 @@ enum XRayEntryType { ENTRY = 0, EXIT = 1, TAIL = 2 }; // (function entry, function 
exit, etc.). See the enum // XRayEntryType for more details. // +// The user handler must correctly handle spurious calls after this handler is +// removed or replaced with another handler, because it would be too costly for +// the XRay runtime to avoid spurious calls. +// To prevent circular calling, the handler function itself and all its +// direct and indirect callees must not be instrumented with XRay, which can be +// achieved by marking them all with: __attribute__((xray_never_instrument)) +// // Returns 1 on success, 0 on error. extern int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)); diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index c9f5105..bab84d8 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -19,6 +19,11 @@ set(arm_SOURCES set(armhf_SOURCES ${arm_SOURCES}) +set(aarch64_SOURCES + xray_AArch64.cc + xray_trampoline_AArch64.S + ${XRAY_SOURCES}) + include_directories(..) include_directories(../../include) diff --git a/compiler-rt/lib/xray/xray_AArch64.cc b/compiler-rt/lib/xray/xray_AArch64.cc new file mode 100644 index 0000000..c2d33a2 --- /dev/null +++ b/compiler-rt/lib/xray/xray_AArch64.cc @@ -0,0 +1,105 @@ +//===-- xray_AArch64.cc -----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of AArch64-specific routines (64-bit). 
+// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> +#include <cassert> + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum class PatchOpcodes : uint32_t { + PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]! + PO_LdrW0_12 = 0x18000060, // LDR W0, #12 + PO_LdrX16_12 = 0x58000070, // LDR X16, #12 + PO_BlrX16 = 0xD63F0200, // BLR X16 + PO_LdpX0X30SP_16 = 0xA8C17BE0, // LDP X0, X30, [SP], #16 + PO_B32 = 0x14000008 // B #32 +}; + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled, + void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { + // When |Enable| == true, + // We replace the following compile-time stub (sled): + // + // xray_sled_n: + // B #32 + // 7 NOPs (28 bytes) + // + // With the following runtime patch: + // + // xray_sled_n: + // STP X0, X30, [SP, #-16]! ; PUSH {r0, lr} + // LDR W0, #12 ; W0 := function ID + // LDR X16,#12 ; X16 := address of the trampoline + // BLR X16 + // ;DATA: 32 bits of function ID + // ;DATA: lower 32 bits of the address of the trampoline + // ;DATA: higher 32 bits of the address of the trampoline + // LDP X0, X30, [SP], #16 ; POP {r0, lr} + // + // Replacement of the first 4-byte instruction should be the last and atomic + // operation, so that the user code which reaches the sled concurrently + // either jumps over the whole sled, or executes the whole sled when the + // latter is ready. 
+ // + // When |Enable|==false, we set back the first instruction in the sled to be + // B #32 + + uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address); + if (Enable) { + uint32_t *CurAddress = FirstAddress + 1; + *CurAddress = uint32_t(PatchOpcodes::PO_LdrW0_12); + CurAddress++; + *CurAddress = uint32_t(PatchOpcodes::PO_LdrX16_12); + CurAddress++; + *CurAddress = uint32_t(PatchOpcodes::PO_BlrX16); + CurAddress++; + *CurAddress = FuncId; + CurAddress++; + *reinterpret_cast<void (**)()>(CurAddress) = TracingHook; + CurAddress += 2; + *CurAddress = uint32_t(PatchOpcodes::PO_LdpX0X30SP_16); + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), + uint32_t(PatchOpcodes::PO_StpX0X30SP_m16e), std::memory_order_release); + } else { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), + uint32_t(PatchOpcodes::PO_B32), std::memory_order_release); + } + return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: In the future we'd need to distinguish between non-tail exits and + // tail exits for better information preservation. 
+ return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +} // namespace __xray diff --git a/compiler-rt/lib/xray/xray_inmemory_log.cc b/compiler-rt/lib/xray/xray_inmemory_log.cc index 17275cc..d00e393 100644 --- a/compiler-rt/lib/xray/xray_inmemory_log.cc +++ b/compiler-rt/lib/xray/xray_inmemory_log.cc @@ -27,7 +27,7 @@ #if defined(__x86_64__) #include <x86intrin.h> -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) static const int64_t NanosecondsPerSecond = 1000LL * 1000 * 1000; #else #error "Unsupported CPU Architecture" @@ -195,7 +195,7 @@ void __xray_InMemoryRawLog(int32_t FuncId, } else { Report("Unable to determine CPU frequency for TSC accounting."); } -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does // not have a constant frequency like TSC on x86(_64), it may go faster // or slower depending on CPU turbo or power saving mode. Furthermore, @@ -243,7 +243,7 @@ void __xray_InMemoryRawLog(int32_t FuncId, R.TSC = __rdtscp(&CPU); R.CPU = CPU; } -#elif defined(__arm__) +#elif defined(__arm__) || defined(__aarch64__) { timespec TS; int result = clock_gettime(CLOCK_REALTIME, &TS); diff --git a/compiler-rt/lib/xray/xray_interface.cc b/compiler-rt/lib/xray/xray_interface.cc index bfee1b8..60a5c77 100644 --- a/compiler-rt/lib/xray/xray_interface.cc +++ b/compiler-rt/lib/xray/xray_interface.cc @@ -33,6 +33,8 @@ namespace __xray { static const int16_t cSledLength = 12; #elif defined(__arm__) static const int16_t cSledLength = 28; +#elif defined(__aarch64__) +static const int16_t cSledLength = 32; #else #error "Unsupported CPU Architecture" #endif /* CPU architecture */ diff --git a/compiler-rt/lib/xray/xray_trampoline_AArch64.S b/compiler-rt/lib/xray/xray_trampoline_AArch64.S new file mode 100644 index 0000000..f1a471c --- /dev/null +++ b/compiler-rt/lib/xray/xray_trampoline_AArch64.S @@ -0,0 +1,89 @@ + .text + /* The variable containing the handler 
function pointer */ + .global _ZN6__xray19XRayPatchedFunctionE + /* Word-aligned function entry point */ + .p2align 2 + /* Let C/C++ see the symbol */ + .global __xray_FunctionEntry + .type __xray_FunctionEntry, %function + /* In C++ it is void extern "C" __xray_FunctionEntry(uint32_t FuncId) with + FuncId passed in W0 register. */ +__xray_FunctionEntry: + /* Move the return address beyond the end of sled data. The 12 bytes of + data are inserted in the code of the runtime patch, between the call + instruction and the instruction returned into. The data contains 32 + bits of instrumented function ID and 64 bits of the address of + the current trampoline. */ + ADD X30, X30, #12 + /* Push the registers which may be modified by the handler function */ + STP X1, X2, [SP, #-16]! + STP X3, X4, [SP, #-16]! + STP X5, X6, [SP, #-16]! + STP X7, X30, [SP, #-16]! + STP Q0, Q1, [SP, #-32]! + STP Q2, Q3, [SP, #-32]! + STP Q4, Q5, [SP, #-32]! + STP Q6, Q7, [SP, #-32]! + /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */ + LDR X1, =_ZN6__xray19XRayPatchedFunctionE + /* Load the handler function pointer into X2 */ + LDR X2, [X1] + /* Handler address is nullptr if handler is not set */ + CMP X2, #0 + BEQ FunctionEntry_restore + /* Function ID is already in W0 (the first parameter). + X1=0 means that we are tracing an entry event */ + MOV X1, #0 + /* Call the handler with 2 parameters in W0 and X1 */ + BLR X2 +FunctionEntry_restore: + /* Pop the saved registers */ + LDP Q6, Q7, [SP], #32 + LDP Q4, Q5, [SP], #32 + LDP Q2, Q3, [SP], #32 + LDP Q0, Q1, [SP], #32 + LDP X7, X30, [SP], #16 + LDP X5, X6, [SP], #16 + LDP X3, X4, [SP], #16 + LDP X1, X2, [SP], #16 + RET + + /* Word-aligned function entry point */ + .p2align 2 + /* Let C/C++ see the symbol */ + .global __xray_FunctionExit + .type __xray_FunctionExit, %function + /* In C++ it is void extern "C" __xray_FunctionExit(uint32_t FuncId) with + FuncId passed in W0 register. 
*/ +__xray_FunctionExit: + /* Move the return address beyond the end of sled data. The 12 bytes of + data are inserted in the code of the runtime patch, between the call + instruction and the instruction returned into. The data contains 32 + bits of instrumented function ID and 64 bits of the address of + the current trampoline. */ + ADD X30, X30, #12 + /* Push the registers which may be modified by the handler function */ + STP X1, X2, [SP, #-16]! + STP X3, X4, [SP, #-16]! + STP X5, X6, [SP, #-16]! + STP X7, X30, [SP, #-16]! + STR Q0, [SP, #-16]! + /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */ + LDR X1, =_ZN6__xray19XRayPatchedFunctionE + /* Load the handler function pointer into X2 */ + LDR X2, [X1] + /* Handler address is nullptr if handler is not set */ + CMP X2, #0 + BEQ FunctionExit_restore + /* Function ID is already in W0 (the first parameter). + X1=1 means that we are tracing an exit event */ + MOV X1, #1 + /* Call the handler with 2 parameters in W0 and X1 */ + BLR X2 +FunctionExit_restore: + LDR Q0, [SP], #16 + LDP X7, X30, [SP], #16 + LDP X5, X6, [SP], #16 + LDP X3, X4, [SP], #16 + LDP X1, X2, [SP], #16 + RET -- 2.7.4