From bad8f0feb47dbd648ebee5208ae5d966ed22ad50 Mon Sep 17 00:00:00 2001
From: Dean Michael Berris <dberris@google.com>
Date: Mon, 21 Nov 2016 03:20:43 +0000
Subject: [PATCH] [XRay] Support AArch64 in compiler-rt

This patch adds XRay support in compiler-rt for AArch64 targets.
This patch is one of a series:

LLVM: https://reviews.llvm.org/D26412
Clang: https://reviews.llvm.org/D26415

Author: rSerge

Reviewers: rengolin, dberris

Subscribers: aemerson, mgorny, llvm-commits, iid_iunknown

Differential Revision: https://reviews.llvm.org/D26413

llvm-svn: 287517
---
 compiler-rt/cmake/config-ix.cmake              |   2 +-
 compiler-rt/include/xray/xray_interface.h      |   7 ++
 compiler-rt/lib/xray/CMakeLists.txt            |   5 ++
 compiler-rt/lib/xray/xray_AArch64.cc           | 105 +++++++++++++++++++++++++
 compiler-rt/lib/xray/xray_inmemory_log.cc      |   6 +-
 compiler-rt/lib/xray/xray_interface.cc         |   2 +
 compiler-rt/lib/xray/xray_trampoline_AArch64.S |  89 +++++++++++++++++++++
 7 files changed, 212 insertions(+), 4 deletions(-)
 create mode 100644 compiler-rt/lib/xray/xray_AArch64.cc
 create mode 100644 compiler-rt/lib/xray/xray_trampoline_AArch64.S

diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 0795a63..7b18584 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -161,7 +161,7 @@ set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64})
 set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64})
 set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
 set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64})
-set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32})
+set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64})
 
 if(APPLE)
   include(CompilerRTDarwinUtils)
diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h
index 680fcfd..9e712b1 100644
--- a/compiler-rt/include/xray/xray_interface.h
+++ b/compiler-rt/include/xray/xray_interface.h
@@ -32,6 +32,13 @@ enum XRayEntryType { ENTRY = 0, EXIT = 1, TAIL = 2 };
 //                 (function entry, function exit, etc.). See the enum
 //                 XRayEntryType for more details.
 //
+// The user handler must correctly handle spurious calls made after it has been
+// removed or replaced with another handler, because it would be too costly for
+// the XRay runtime to avoid such calls.
+// To prevent circular calls, the handler function itself and all of its direct
+// and indirect callees must not be instrumented with XRay, which can be
+// achieved by marking them all with: __attribute__((xray_never_instrument))
+//
 // Returns 1 on success, 0 on error.
 extern int __xray_set_handler(void (*entry)(int32_t, XRayEntryType));
 
diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt
index c9f5105..bab84d8 100644
--- a/compiler-rt/lib/xray/CMakeLists.txt
+++ b/compiler-rt/lib/xray/CMakeLists.txt
@@ -19,6 +19,11 @@ set(arm_SOURCES
 
 set(armhf_SOURCES ${arm_SOURCES})
 
+set(aarch64_SOURCES
+        xray_AArch64.cc
+        xray_trampoline_AArch64.S
+        ${XRAY_SOURCES})
+
 include_directories(..)
 include_directories(../../include)
 
diff --git a/compiler-rt/lib/xray/xray_AArch64.cc b/compiler-rt/lib/xray/xray_AArch64.cc
new file mode 100644
index 0000000..c2d33a2
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_AArch64.cc
@@ -0,0 +1,105 @@
+//===-- xray_AArch64.cc -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of AArch64-specific routines (64-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include <atomic>
+#include <cassert>
+
+namespace __xray {
+
+// AArch64 instruction encodings that patchSled() writes into a sled.
+enum class PatchOpcodes : uint32_t {
+  PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]! (push X0, X30)
+  PO_LdrW0_12 = 0x18000060,        // LDR W0, #12 (literal at PC + 12)
+  PO_LdrX16_12 = 0x58000070,       // LDR X16, #12 (literal at PC + 12)
+  PO_BlrX16 = 0xD63F0200,          // BLR X16 (call through X16)
+  PO_LdpX0X30SP_16 = 0xA8C17BE0,   // LDP X0, X30, [SP], #16 (pop X0, X30)
+  PO_B32 = 0x14000008              // B #32 (branch 32 bytes ahead)
+};
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // we replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //   B #32
+  //   7 NOPs (28 bytes)
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n:
+  //   STP X0, X30, [SP, #-16]! ; PUSH {X0, X30}
+  //   LDR W0, #12 ; W0 := function ID
+  //   LDR X16,#12 ; X16 := address of the trampoline
+  //   BLR X16
+  //   ;DATA: 32 bits of function ID
+  //   ;DATA: lower 32 bits of the address of the trampoline
+  //   ;DATA: higher 32 bits of the address of the trampoline
+  //   LDP X0, X30, [SP], #16 ; POP {X0, X30}
+  //
+  // The first 4-byte instruction is replaced last, with an atomic store, so
+  // that user code reaching the sled concurrently either jumps over the whole
+  // sled or executes the whole sled once it is ready.
+  // NOTE(review): no instruction-cache maintenance is done here -- confirm.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #32
+
+  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+  if (Enable) {
+    uint32_t *CurAddress = FirstAddress + 1;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdrW0_12);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdrX16_12);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_BlrX16);
+    CurAddress++;
+    *CurAddress = FuncId;
+    CurAddress++;
+    *reinterpret_cast<void (**)()>(CurAddress) = TracingHook;
+    CurAddress += 2;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdpX0X30SP_16);
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_StpX0X30SP_m16e), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
+  }
+  return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry);
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Tail exits are currently patched with the regular exit trampoline;
+  // distinguish them from non-tail exits for better information preservation.
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+} // namespace __xray
diff --git a/compiler-rt/lib/xray/xray_inmemory_log.cc b/compiler-rt/lib/xray/xray_inmemory_log.cc
index 17275cc..d00e393 100644
--- a/compiler-rt/lib/xray/xray_inmemory_log.cc
+++ b/compiler-rt/lib/xray/xray_inmemory_log.cc
@@ -27,7 +27,7 @@
 
 #if defined(__x86_64__)
 #include <x86intrin.h>
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
 static const int64_t NanosecondsPerSecond = 1000LL * 1000 * 1000;
 #else
 #error "Unsupported CPU Architecture"
@@ -195,7 +195,7 @@ void __xray_InMemoryRawLog(int32_t FuncId,
     } else {
       Report("Unable to determine CPU frequency for TSC accounting.");
     }
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
     // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does
     //   not have a constant frequency like TSC on x86(_64), it may go faster
     //   or slower depending on CPU turbo or power saving mode. Furthermore,
@@ -243,7 +243,7 @@ void __xray_InMemoryRawLog(int32_t FuncId,
     R.TSC = __rdtscp(&CPU);
     R.CPU = CPU;
   }
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
   {
     timespec TS;
     int result = clock_gettime(CLOCK_REALTIME, &TS);
diff --git a/compiler-rt/lib/xray/xray_interface.cc b/compiler-rt/lib/xray/xray_interface.cc
index bfee1b8..60a5c77 100644
--- a/compiler-rt/lib/xray/xray_interface.cc
+++ b/compiler-rt/lib/xray/xray_interface.cc
@@ -33,6 +33,8 @@ namespace __xray {
 static const int16_t cSledLength = 12;
 #elif defined(__arm__)
 static const int16_t cSledLength = 28;
+#elif defined(__aarch64__)
+static const int16_t cSledLength = 32;
 #else
 #error "Unsupported CPU Architecture"
 #endif /* CPU architecture */
diff --git a/compiler-rt/lib/xray/xray_trampoline_AArch64.S b/compiler-rt/lib/xray/xray_trampoline_AArch64.S
new file mode 100644
index 0000000..f1a471c
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_trampoline_AArch64.S
@@ -0,0 +1,89 @@
+    .text
+    /* The variable containing the handler function pointer */
+    .global _ZN6__xray19XRayPatchedFunctionE
+    /* 4-byte-aligned function entry point */
+    .p2align 2
+    /* Let C/C++ see the symbol */
+    .global __xray_FunctionEntry
+    .type __xray_FunctionEntry, %function
+    /* In C++ it is extern "C" void __xray_FunctionEntry(uint32_t FuncId) with
+         FuncId passed in W0 register. */
+__xray_FunctionEntry:
+    /* Move the return address beyond the end of sled data. The 12 bytes of
+         data are inserted in the code of the runtime patch, between the call
+         instruction and the instruction returned into. The data contains 32
+         bits of instrumented function ID and 64 bits of the address of
+         the current trampoline. */
+    ADD X30, X30, #12
+    /* Save X1-X7, X30, Q0-Q7. NOTE(review): X8 is not saved; confirm OK. */
+    STP X1, X2, [SP, #-16]!
+    STP X3, X4, [SP, #-16]!
+    STP X5, X6, [SP, #-16]!
+    STP X7, X30, [SP, #-16]!
+    STP Q0, Q1, [SP, #-32]!
+    STP Q2, Q3, [SP, #-32]!
+    STP Q4, Q5, [SP, #-32]!
+    STP Q6, Q7, [SP, #-32]!
+    /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */
+    LDR X1, =_ZN6__xray19XRayPatchedFunctionE
+    /* Load the handler function pointer into X2 */
+    LDR X2, [X1]
+    /* Handler address is nullptr if handler is not set */
+    CMP X2, #0
+    BEQ FunctionEntry_restore
+    /* Function ID is already in W0 (the first parameter).
+         X1=0 means that we are tracing an entry event */
+    MOV X1, #0
+    /* Call the handler with 2 parameters in W0 and X1 */
+    BLR X2
+FunctionEntry_restore:
+    /* Restore the saved registers in reverse order */
+    LDP Q6, Q7, [SP], #32
+    LDP Q4, Q5, [SP], #32
+    LDP Q2, Q3, [SP], #32
+    LDP Q0, Q1, [SP], #32
+    LDP X7, X30, [SP], #16
+    LDP X5, X6, [SP], #16
+    LDP X3, X4, [SP], #16
+    LDP X1, X2, [SP], #16
+    RET
+
+    /* 4-byte-aligned function entry point */
+    .p2align 2
+    /* Let C/C++ see the symbol */
+    .global __xray_FunctionExit
+    .type __xray_FunctionExit, %function
+    /* In C++ it is extern "C" void __xray_FunctionExit(uint32_t FuncId) with
+         FuncId passed in W0 register. */
+__xray_FunctionExit:
+    /* Move the return address beyond the end of sled data. The 12 bytes of
+         data are inserted in the code of the runtime patch, between the call
+         instruction and the instruction returned into. The data contains 32
+         bits of instrumented function ID and 64 bits of the address of
+         the current trampoline. */
+    ADD X30, X30, #12
+    /* Save X1-X7, X30, Q0. NOTE(review): Q1-Q7 are not saved; confirm OK. */
+    STP X1, X2, [SP, #-16]!
+    STP X3, X4, [SP, #-16]!
+    STP X5, X6, [SP, #-16]!
+    STP X7, X30, [SP, #-16]!
+    STR Q0, [SP, #-16]!
+    /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */
+    LDR X1, =_ZN6__xray19XRayPatchedFunctionE
+    /* Load the handler function pointer into X2 */
+    LDR X2, [X1]
+    /* Handler address is nullptr if handler is not set */
+    CMP X2, #0
+    BEQ FunctionExit_restore
+    /* Function ID is already in W0 (the first parameter).
+         X1=1 means that we are tracing an exit event */
+    MOV X1, #1
+    /* Call the handler with 2 parameters in W0 and X1 */
+    BLR X2
+FunctionExit_restore:
+    LDR Q0, [SP], #16
+    LDP X7, X30, [SP], #16
+    LDP X5, X6, [SP], #16
+    LDP X3, X4, [SP], #16
+    LDP X1, X2, [SP], #16
+    RET
-- 
2.7.4