From: Martin Liska
Date: Tue, 3 May 2022 10:56:26 +0000 (+0200)
Subject: libsanitizer: merge from upstream (0a1bcab9f3bf75c4c5d3e53bafb3eeb80320af46).
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f732bf6a603721f61102a08ad2d023c7c2670870;p=platform%2Fupstream%2Fgcc.git

libsanitizer: merge from upstream (0a1bcab9f3bf75c4c5d3e53bafb3eeb80320af46).
---

diff --git a/libsanitizer/MERGE b/libsanitizer/MERGE
index 01913de..b92d082 100644
--- a/libsanitizer/MERGE
+++ b/libsanitizer/MERGE
@@ -1,4 +1,4 @@
-82bc6a094e85014f1891ef9407496f44af8fe442
+0a1bcab9f3bf75c4c5d3e53bafb3eeb80320af46
 
 The first line of this file holds the git revision number of the last
 merge done from the master library sources.
diff --git a/libsanitizer/asan/asan_activation.cpp b/libsanitizer/asan/asan_activation.cpp
index 795df95..1757838 100644
--- a/libsanitizer/asan/asan_activation.cpp
+++ b/libsanitizer/asan/asan_activation.cpp
@@ -112,7 +112,7 @@ void AsanDeactivate() {
   disabled.quarantine_size_mb = 0;
   disabled.thread_local_quarantine_size_kb = 0;
   // Redzone must be at least Max(16, granularity) bytes long.
-  disabled.min_redzone = Max(16, (int)SHADOW_GRANULARITY);
+  disabled.min_redzone = Max(16, (int)ASAN_SHADOW_GRANULARITY);
   disabled.max_redzone = disabled.min_redzone;
   disabled.alloc_dealloc_mismatch = false;
   disabled.may_return_null = true;
diff --git a/libsanitizer/asan/asan_allocator.cpp b/libsanitizer/asan/asan_allocator.cpp
index 3fa3674..7b7a289 100644
--- a/libsanitizer/asan/asan_allocator.cpp
+++ b/libsanitizer/asan/asan_allocator.cpp
@@ -210,8 +210,7 @@ struct QuarantineCallback {
       CHECK_EQ(old_chunk_state, CHUNK_QUARANTINE);
     }
 
-    PoisonShadow(m->Beg(),
-                 RoundUpTo(m->UsedSize(), SHADOW_GRANULARITY),
+    PoisonShadow(m->Beg(), RoundUpTo(m->UsedSize(), ASAN_SHADOW_GRANULARITY),
                  kAsanHeapLeftRedzoneMagic);
 
     // Statistics.
@@ -305,7 +304,6 @@ struct Allocator {
   QuarantineCache fallback_quarantine_cache;
 
   uptr max_user_defined_malloc_size;
-  atomic_uint8_t rss_limit_exceeded;
 
   // ------------------- Options --------------------------
   atomic_uint16_t min_redzone;
@@ -345,14 +343,6 @@ struct Allocator {
                                       : kMaxAllowedMallocSize;
   }
 
-  bool RssLimitExceeded() {
-    return atomic_load(&rss_limit_exceeded, memory_order_relaxed);
-  }
-
-  void SetRssLimitExceeded(bool limit_exceeded) {
-    atomic_store(&rss_limit_exceeded, limit_exceeded, memory_order_relaxed);
-  }
-
   void RePoisonChunk(uptr chunk) {
     // This could be a user-facing chunk (with redzones), or some internal
     // housekeeping chunk, like TransferBatch. Start by assuming the former.
@@ -366,7 +356,7 @@ struct Allocator {
     if (chunk < beg && beg < end && end <= chunk_end) {
       // Looks like a valid AsanChunk in use, poison redzones only.
PoisonShadow(chunk, beg - chunk, kAsanHeapLeftRedzoneMagic); - uptr end_aligned_down = RoundDownTo(end, SHADOW_GRANULARITY); + uptr end_aligned_down = RoundDownTo(end, ASAN_SHADOW_GRANULARITY); FastPoisonShadowPartialRightRedzone( end_aligned_down, end - end_aligned_down, chunk_end - end_aligned_down, kAsanHeapLeftRedzoneMagic); @@ -484,14 +474,14 @@ struct Allocator { AllocType alloc_type, bool can_fill) { if (UNLIKELY(!asan_inited)) AsanInitFromRtl(); - if (RssLimitExceeded()) { + if (UNLIKELY(IsRssLimitExceeded())) { if (AllocatorMayReturnNull()) return nullptr; ReportRssLimitExceeded(stack); } Flags &fl = *flags(); CHECK(stack); - const uptr min_alignment = SHADOW_GRANULARITY; + const uptr min_alignment = ASAN_SHADOW_GRANULARITY; const uptr user_requested_alignment_log = ComputeUserRequestedAlignmentLog(alignment); if (alignment < min_alignment) @@ -572,7 +562,7 @@ struct Allocator { m->SetAllocContext(t ? t->tid() : kMainTid, StackDepotPut(*stack)); uptr size_rounded_down_to_granularity = - RoundDownTo(size, SHADOW_GRANULARITY); + RoundDownTo(size, ASAN_SHADOW_GRANULARITY); // Unpoison the bulk of the memory region. if (size_rounded_down_to_granularity) PoisonShadow(user_beg, size_rounded_down_to_granularity, 0); @@ -580,7 +570,7 @@ struct Allocator { if (size != size_rounded_down_to_granularity && CanPoisonMemory()) { u8 *shadow = (u8 *)MemToShadow(user_beg + size_rounded_down_to_granularity); - *shadow = fl.poison_partial ? (size & (SHADOW_GRANULARITY - 1)) : 0; + *shadow = fl.poison_partial ? (size & (ASAN_SHADOW_GRANULARITY - 1)) : 0; } AsanStats &thread_stats = GetCurrentThreadStats(); @@ -607,7 +597,7 @@ struct Allocator { CHECK_LE(alloc_beg + sizeof(LargeChunkHeader), chunk_beg); reinterpret_cast(alloc_beg)->Set(m); } - ASAN_MALLOC_HOOK(res, size); + RunMallocHooks(res, size); return res; } @@ -650,8 +640,7 @@ struct Allocator { } // Poison the region. - PoisonShadow(m->Beg(), - RoundUpTo(m->UsedSize(), SHADOW_GRANULARITY), + PoisonShadow(m->Beg(), RoundUpTo(m->UsedSize(), ASAN_SHADOW_GRANULARITY), kAsanHeapFreeMagic); AsanStats &thread_stats = GetCurrentThreadStats(); @@ -689,7 +678,7 @@ struct Allocator { return; } - ASAN_FREE_HOOK(ptr); + RunFreeHooks(ptr); // Must mark the chunk as quarantined before any changes to its metadata. // Do not quarantine given chunk if we failed to set CHUNK_QUARANTINE flag. 
@@ -851,12 +840,12 @@ struct Allocator { quarantine.PrintStats(); } - void ForceLock() ACQUIRE(fallback_mutex) { + void ForceLock() SANITIZER_ACQUIRE(fallback_mutex) { allocator.ForceLock(); fallback_mutex.Lock(); } - void ForceUnlock() RELEASE(fallback_mutex) { + void ForceUnlock() SANITIZER_RELEASE(fallback_mutex) { fallback_mutex.Unlock(); allocator.ForceUnlock(); } @@ -1065,14 +1054,12 @@ uptr asan_mz_size(const void *ptr) { return instance.AllocationSize(reinterpret_cast(ptr)); } -void asan_mz_force_lock() NO_THREAD_SAFETY_ANALYSIS { instance.ForceLock(); } - -void asan_mz_force_unlock() NO_THREAD_SAFETY_ANALYSIS { - instance.ForceUnlock(); +void asan_mz_force_lock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + instance.ForceLock(); } -void AsanSoftRssLimitExceededCallback(bool limit_exceeded) { - instance.SetRssLimitExceeded(limit_exceeded); +void asan_mz_force_unlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + instance.ForceUnlock(); } } // namespace __asan @@ -1230,16 +1217,3 @@ int __asan_update_allocation_context(void* addr) { GET_STACK_TRACE_MALLOC; return instance.UpdateAllocationStack((uptr)addr, &stack); } - -#if !SANITIZER_SUPPORTS_WEAK_HOOKS -// Provide default (no-op) implementation of malloc hooks. -SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_malloc_hook, - void *ptr, uptr size) { - (void)ptr; - (void)size; -} - -SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_free_hook, void *ptr) { - (void)ptr; -} -#endif diff --git a/libsanitizer/asan/asan_debugging.cpp b/libsanitizer/asan/asan_debugging.cpp index 0b4bf52..f078f10 100644 --- a/libsanitizer/asan/asan_debugging.cpp +++ b/libsanitizer/asan/asan_debugging.cpp @@ -141,7 +141,7 @@ uptr __asan_get_free_stack(uptr addr, uptr *trace, uptr size, u32 *thread_id) { SANITIZER_INTERFACE_ATTRIBUTE void __asan_get_shadow_mapping(uptr *shadow_scale, uptr *shadow_offset) { if (shadow_scale) - *shadow_scale = SHADOW_SCALE; + *shadow_scale = ASAN_SHADOW_SCALE; if (shadow_offset) - *shadow_offset = SHADOW_OFFSET; + *shadow_offset = ASAN_SHADOW_OFFSET; } diff --git a/libsanitizer/asan/asan_errors.cpp b/libsanitizer/asan/asan_errors.cpp index 7cd9fe9..a22bf13 100644 --- a/libsanitizer/asan/asan_errors.cpp +++ b/libsanitizer/asan/asan_errors.cpp @@ -329,7 +329,7 @@ void ErrorBadParamsToAnnotateContiguousContainer::Print() { " old_mid : %p\n" " new_mid : %p\n", (void *)beg, (void *)end, (void *)old_mid, (void *)new_mid); - uptr granularity = SHADOW_GRANULARITY; + uptr granularity = ASAN_SHADOW_GRANULARITY; if (!IsAligned(beg, granularity)) Report("ERROR: beg is not aligned by %zu\n", granularity); stack->Print(); @@ -410,7 +410,8 @@ ErrorGeneric::ErrorGeneric(u32 tid, uptr pc_, uptr bp_, uptr sp_, uptr addr, if (AddrIsInMem(addr)) { u8 *shadow_addr = (u8 *)MemToShadow(addr); // If we are accessing 16 bytes, look at the second shadow byte. - if (*shadow_addr == 0 && access_size > SHADOW_GRANULARITY) shadow_addr++; + if (*shadow_addr == 0 && access_size > ASAN_SHADOW_GRANULARITY) + shadow_addr++; // If we are in the partial right redzone, look at the next shadow byte. 
if (*shadow_addr > 0 && *shadow_addr < 128) shadow_addr++; bool far_from_bounds = false; @@ -501,10 +502,11 @@ static void PrintLegend(InternalScopedString *str) { str->append( "Shadow byte legend (one shadow byte represents %d " "application bytes):\n", - (int)SHADOW_GRANULARITY); + (int)ASAN_SHADOW_GRANULARITY); PrintShadowByte(str, " Addressable: ", 0); str->append(" Partially addressable: "); - for (u8 i = 1; i < SHADOW_GRANULARITY; i++) PrintShadowByte(str, "", i, " "); + for (u8 i = 1; i < ASAN_SHADOW_GRANULARITY; i++) + PrintShadowByte(str, "", i, " "); str->append("\n"); PrintShadowByte(str, " Heap left redzone: ", kAsanHeapLeftRedzoneMagic); diff --git a/libsanitizer/asan/asan_errors.h b/libsanitizer/asan/asan_errors.h index a7fda2f..c6ac88f 100644 --- a/libsanitizer/asan/asan_errors.h +++ b/libsanitizer/asan/asan_errors.h @@ -53,9 +53,9 @@ struct ErrorDeadlySignal : ErrorBase { scariness.Scare(10, "null-deref"); } else if (signal.addr == signal.pc) { scariness.Scare(60, "wild-jump"); - } else if (signal.write_flag == SignalContext::WRITE) { + } else if (signal.write_flag == SignalContext::Write) { scariness.Scare(30, "wild-addr-write"); - } else if (signal.write_flag == SignalContext::READ) { + } else if (signal.write_flag == SignalContext::Read) { scariness.Scare(20, "wild-addr-read"); } else { scariness.Scare(25, "wild-addr"); @@ -372,7 +372,7 @@ struct ErrorGeneric : ErrorBase { u8 shadow_val; ErrorGeneric() = default; // (*) - ErrorGeneric(u32 tid, uptr addr, uptr pc_, uptr bp_, uptr sp_, bool is_write_, + ErrorGeneric(u32 tid, uptr pc_, uptr bp_, uptr sp_, uptr addr, bool is_write_, uptr access_size_); void Print(); }; diff --git a/libsanitizer/asan/asan_fake_stack.cpp b/libsanitizer/asan/asan_fake_stack.cpp index 07681c1..74a039b 100644 --- a/libsanitizer/asan/asan_fake_stack.cpp +++ b/libsanitizer/asan/asan_fake_stack.cpp @@ -28,8 +28,8 @@ static const u64 kAllocaRedzoneMask = 31UL; // For small size classes inline PoisonShadow for better performance. ALWAYS_INLINE void SetShadow(uptr ptr, uptr size, uptr class_id, u64 magic) { u64 *shadow = reinterpret_cast(MemToShadow(ptr)); - if (SHADOW_SCALE == 3 && class_id <= 6) { - // This code expects SHADOW_SCALE=3. + if (ASAN_SHADOW_SCALE == 3 && class_id <= 6) { + // This code expects ASAN_SHADOW_SCALE=3. for (uptr i = 0; i < (((uptr)1) << class_id); i++) { shadow[i] = magic; // Make sure this does not become memset. @@ -140,7 +140,6 @@ void FakeStack::HandleNoReturn() { // We do it based on their 'real_stack' values -- everything that is lower // than the current real_stack is garbage. 
NOINLINE void FakeStack::GC(uptr real_stack) { - uptr collected = 0; for (uptr class_id = 0; class_id < kNumberOfSizeClasses; class_id++) { u8 *flags = GetFlags(stack_size_log(), class_id); for (uptr i = 0, n = NumberOfFrames(stack_size_log(), class_id); i < n; @@ -150,7 +149,6 @@ NOINLINE void FakeStack::GC(uptr real_stack) { GetFrame(stack_size_log(), class_id, i)); if (ff->real_stack < real_stack) { flags[i] = 0; - collected++; } } } @@ -294,10 +292,10 @@ void __asan_alloca_poison(uptr addr, uptr size) { uptr LeftRedzoneAddr = addr - kAllocaRedzoneSize; uptr PartialRzAddr = addr + size; uptr RightRzAddr = (PartialRzAddr + kAllocaRedzoneMask) & ~kAllocaRedzoneMask; - uptr PartialRzAligned = PartialRzAddr & ~(SHADOW_GRANULARITY - 1); + uptr PartialRzAligned = PartialRzAddr & ~(ASAN_SHADOW_GRANULARITY - 1); FastPoisonShadow(LeftRedzoneAddr, kAllocaRedzoneSize, kAsanAllocaLeftMagic); FastPoisonShadowPartialRightRedzone( - PartialRzAligned, PartialRzAddr % SHADOW_GRANULARITY, + PartialRzAligned, PartialRzAddr % ASAN_SHADOW_GRANULARITY, RightRzAddr - PartialRzAligned, kAsanAllocaRightMagic); FastPoisonShadow(RightRzAddr, kAllocaRedzoneSize, kAsanAllocaRightMagic); } @@ -305,7 +303,8 @@ void __asan_alloca_poison(uptr addr, uptr size) { SANITIZER_INTERFACE_ATTRIBUTE void __asan_allocas_unpoison(uptr top, uptr bottom) { if ((!top) || (top > bottom)) return; - REAL(memset)(reinterpret_cast(MemToShadow(top)), 0, - (bottom - top) / SHADOW_GRANULARITY); + REAL(memset) + (reinterpret_cast(MemToShadow(top)), 0, + (bottom - top) / ASAN_SHADOW_GRANULARITY); } } // extern "C" diff --git a/libsanitizer/asan/asan_flags.cpp b/libsanitizer/asan/asan_flags.cpp index c64e464..9ea899f 100644 --- a/libsanitizer/asan/asan_flags.cpp +++ b/libsanitizer/asan/asan_flags.cpp @@ -140,9 +140,9 @@ void InitializeFlags() { SanitizerToolName); Die(); } - // Ensure that redzone is at least SHADOW_GRANULARITY. - if (f->redzone < (int)SHADOW_GRANULARITY) - f->redzone = SHADOW_GRANULARITY; + // Ensure that redzone is at least ASAN_SHADOW_GRANULARITY. + if (f->redzone < (int)ASAN_SHADOW_GRANULARITY) + f->redzone = ASAN_SHADOW_GRANULARITY; // Make "strict_init_order" imply "check_initialization_order". // TODO(samsonov): Use a single runtime flag for an init-order checker. if (f->strict_init_order) { diff --git a/libsanitizer/asan/asan_flags.inc b/libsanitizer/asan/asan_flags.inc index 514b225..314ed19 100644 --- a/libsanitizer/asan/asan_flags.inc +++ b/libsanitizer/asan/asan_flags.inc @@ -49,9 +49,10 @@ ASAN_FLAG( "to find more errors.") ASAN_FLAG(bool, replace_intrin, true, "If set, uses custom wrappers for memset/memcpy/memmove intrinsics.") -ASAN_FLAG(bool, detect_stack_use_after_return, false, +ASAN_FLAG(bool, detect_stack_use_after_return, + SANITIZER_LINUX && !SANITIZER_ANDROID, "Enables stack-use-after-return checking at run-time.") -ASAN_FLAG(int, min_uar_stack_size_log, 16, // We can't do smaller anyway. +ASAN_FLAG(int, min_uar_stack_size_log, 16, // We can't do smaller anyway. "Minimum fake stack size log.") ASAN_FLAG(int, max_uar_stack_size_log, 20, // 1Mb per size class, i.e. 
~11Mb per thread diff --git a/libsanitizer/asan/asan_fuchsia.cpp b/libsanitizer/asan/asan_fuchsia.cpp index b419019..2b15504 100644 --- a/libsanitizer/asan/asan_fuchsia.cpp +++ b/libsanitizer/asan/asan_fuchsia.cpp @@ -14,16 +14,17 @@ #include "sanitizer_common/sanitizer_fuchsia.h" #if SANITIZER_FUCHSIA -#include "asan_interceptors.h" -#include "asan_internal.h" -#include "asan_stack.h" -#include "asan_thread.h" - #include #include #include #include +# include "asan_interceptors.h" +# include "asan_internal.h" +# include "asan_stack.h" +# include "asan_thread.h" +# include "lsan/lsan_common.h" + namespace __asan { // The system already set up the shadow memory for us. @@ -118,14 +119,12 @@ struct AsanThread::InitOptions { // Shared setup between thread creation and startup for the initial thread. static AsanThread *CreateAsanThread(StackTrace *stack, u32 parent_tid, - uptr user_id, bool detached, - const char *name) { + bool detached, const char *name) { // In lieu of AsanThread::Create. AsanThread *thread = (AsanThread *)MmapOrDie(AsanThreadMmapSize(), __func__); AsanThreadContext::CreateThreadContextArgs args = {thread, stack}; - u32 tid = - asanThreadRegistry().CreateThread(user_id, detached, parent_tid, &args); + u32 tid = asanThreadRegistry().CreateThread(0, detached, parent_tid, &args); asanThreadRegistry().SetThreadName(tid, name); return thread; @@ -152,7 +151,7 @@ AsanThread *CreateMainThread() { CHECK_NE(__sanitizer::MainThreadStackBase, 0); CHECK_GT(__sanitizer::MainThreadStackSize, 0); AsanThread *t = CreateAsanThread( - nullptr, 0, reinterpret_cast(self), true, + nullptr, 0, true, _zx_object_get_property(thrd_get_zx_handle(self), ZX_PROP_NAME, name, sizeof(name)) == ZX_OK ? name @@ -182,8 +181,7 @@ static void *BeforeThreadCreateHook(uptr user_id, bool detached, GET_STACK_TRACE_THREAD; u32 parent_tid = GetCurrentTidOrInvalid(); - AsanThread *thread = - CreateAsanThread(&stack, parent_tid, user_id, detached, name); + AsanThread *thread = CreateAsanThread(&stack, parent_tid, detached, name); // On other systems, AsanThread::Init() is called from the new // thread itself. But on Fuchsia we already know the stack address @@ -238,8 +236,18 @@ void FlushUnneededASanShadowMemory(uptr p, uptr size) { __sanitizer_fill_shadow(p, size, 0, 0); } +// On Fuchsia, leak detection is done by a special hook after atexit hooks. +// So this doesn't install any atexit hook like on other platforms. +void InstallAtExitCheckLeaks() {} + } // namespace __asan +namespace __lsan { + +bool UseExitcodeOnLeak() { return __asan::flags()->halt_on_error; } + +} // namespace __lsan + // These are declared (in extern "C") by . // The system runtime will call our definitions directly. 
diff --git a/libsanitizer/asan/asan_globals.cpp b/libsanitizer/asan/asan_globals.cpp index 9400487..ecc2600 100644 --- a/libsanitizer/asan/asan_globals.cpp +++ b/libsanitizer/asan/asan_globals.cpp @@ -61,14 +61,13 @@ ALWAYS_INLINE void PoisonShadowForGlobal(const Global *g, u8 value) { } ALWAYS_INLINE void PoisonRedZones(const Global &g) { - uptr aligned_size = RoundUpTo(g.size, SHADOW_GRANULARITY); + uptr aligned_size = RoundUpTo(g.size, ASAN_SHADOW_GRANULARITY); FastPoisonShadow(g.beg + aligned_size, g.size_with_redzone - aligned_size, kAsanGlobalRedzoneMagic); if (g.size != aligned_size) { FastPoisonShadowPartialRightRedzone( - g.beg + RoundDownTo(g.size, SHADOW_GRANULARITY), - g.size % SHADOW_GRANULARITY, - SHADOW_GRANULARITY, + g.beg + RoundDownTo(g.size, ASAN_SHADOW_GRANULARITY), + g.size % ASAN_SHADOW_GRANULARITY, ASAN_SHADOW_GRANULARITY, kAsanGlobalRedzoneMagic); } } @@ -154,6 +153,23 @@ static void CheckODRViolationViaIndicator(const Global *g) { } } +// Check ODR violation for given global G by checking if it's already poisoned. +// We use this method in case compiler doesn't use private aliases for global +// variables. +static void CheckODRViolationViaPoisoning(const Global *g) { + if (__asan_region_is_poisoned(g->beg, g->size_with_redzone)) { + // This check may not be enough: if the first global is much larger + // the entire redzone of the second global may be within the first global. + for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) { + if (g->beg == l->g->beg && + (flags()->detect_odr_violation >= 2 || g->size != l->g->size) && + !IsODRViolationSuppressed(g->name)) + ReportODRViolation(g, FindRegistrationSite(g), + l->g, FindRegistrationSite(l->g)); + } + } +} + // Clang provides two different ways for global variables protection: // it can poison the global itself or its private alias. In former // case we may poison same symbol multiple times, that can help us to @@ -199,6 +215,8 @@ static void RegisterGlobal(const Global *g) { // where two globals with the same name are defined in different modules. 
if (UseODRIndicator(g)) CheckODRViolationViaIndicator(g); + else + CheckODRViolationViaPoisoning(g); } if (CanPoisonMemory()) PoisonRedZones(*g); diff --git a/libsanitizer/asan/asan_interceptors.cpp b/libsanitizer/asan/asan_interceptors.cpp index b289091..2ff314a 100644 --- a/libsanitizer/asan/asan_interceptors.cpp +++ b/libsanitizer/asan/asan_interceptors.cpp @@ -130,23 +130,24 @@ DECLARE_REAL_AND_INTERCEPTOR(void, free, void *) #define COMMON_INTERCEPTOR_BLOCK_REAL(name) REAL(name) // Strict init-order checking is dlopen-hostile: // https://github.com/google/sanitizers/issues/178 -#define COMMON_INTERCEPTOR_ON_DLOPEN(filename, flag) \ - do { \ - if (flags()->strict_init_order) \ - StopInitOrderChecking(); \ - CheckNoDeepBind(filename, flag); \ - } while (false) -#define COMMON_INTERCEPTOR_ON_EXIT(ctx) OnExit() -#define COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, handle) -#define COMMON_INTERCEPTOR_LIBRARY_UNLOADED() -#define COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED (!asan_inited) -#define COMMON_INTERCEPTOR_GET_TLS_RANGE(begin, end) \ - if (AsanThread *t = GetCurrentThread()) { \ - *begin = t->tls_begin(); \ - *end = t->tls_end(); \ - } else { \ - *begin = *end = 0; \ - } +# define COMMON_INTERCEPTOR_DLOPEN(filename, flag) \ + ({ \ + if (flags()->strict_init_order) \ + StopInitOrderChecking(); \ + CheckNoDeepBind(filename, flag); \ + REAL(dlopen)(filename, flag); \ + }) +# define COMMON_INTERCEPTOR_ON_EXIT(ctx) OnExit() +# define COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, handle) +# define COMMON_INTERCEPTOR_LIBRARY_UNLOADED() +# define COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED (!asan_inited) +# define COMMON_INTERCEPTOR_GET_TLS_RANGE(begin, end) \ + if (AsanThread *t = GetCurrentThread()) { \ + *begin = t->tls_begin(); \ + *end = t->tls_end(); \ + } else { \ + *begin = *end = 0; \ + } #define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ do { \ diff --git a/libsanitizer/asan/asan_interceptors.h b/libsanitizer/asan/asan_interceptors.h index 105c672..047b044 100644 --- a/libsanitizer/asan/asan_interceptors.h +++ b/libsanitizer/asan/asan_interceptors.h @@ -81,12 +81,7 @@ void InitializePlatformInterceptors(); #if ASAN_HAS_EXCEPTIONS && !SANITIZER_WINDOWS && !SANITIZER_SOLARIS && \ !SANITIZER_NETBSD # define ASAN_INTERCEPT___CXA_THROW 1 -# if ! defined(ASAN_HAS_CXA_RETHROW_PRIMARY_EXCEPTION) \ - || ASAN_HAS_CXA_RETHROW_PRIMARY_EXCEPTION -# define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 1 -# else -# define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 0 -# endif +# define ASAN_INTERCEPT___CXA_RETHROW_PRIMARY_EXCEPTION 1 # if defined(_GLIBCXX_SJLJ_EXCEPTIONS) || (SANITIZER_IOS && defined(__arm__)) # define ASAN_INTERCEPT__UNWIND_SJLJ_RAISEEXCEPTION 1 # else diff --git a/libsanitizer/asan/asan_interface.inc b/libsanitizer/asan/asan_interface.inc index ea28fc8..89ef552 100644 --- a/libsanitizer/asan/asan_interface.inc +++ b/libsanitizer/asan/asan_interface.inc @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // Asan interface list. 
//===----------------------------------------------------------------------===// + INTERFACE_FUNCTION(__asan_addr_is_in_fake_stack) INTERFACE_FUNCTION(__asan_address_is_poisoned) INTERFACE_FUNCTION(__asan_after_dynamic_init) diff --git a/libsanitizer/asan/asan_internal.h b/libsanitizer/asan/asan_internal.h index ad33203..7468f12 100644 --- a/libsanitizer/asan/asan_internal.h +++ b/libsanitizer/asan/asan_internal.h @@ -17,19 +17,19 @@ #include "asan_interface_internal.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_internal_defs.h" -#include "sanitizer_common/sanitizer_stacktrace.h" #include "sanitizer_common/sanitizer_libc.h" +#include "sanitizer_common/sanitizer_stacktrace.h" #if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) -# error "The AddressSanitizer run-time should not be" - " instrumented by AddressSanitizer" +# error \ + "The AddressSanitizer run-time should not be instrumented by AddressSanitizer" #endif // Build-time configuration options. // If set, asan will intercept C++ exception api call(s). #ifndef ASAN_HAS_EXCEPTIONS -# define ASAN_HAS_EXCEPTIONS 1 +# define ASAN_HAS_EXCEPTIONS 1 #endif // If set, values like allocator chunk size, as well as defaults for some flags @@ -43,11 +43,11 @@ #endif #ifndef ASAN_DYNAMIC -# ifdef PIC -# define ASAN_DYNAMIC 1 -# else -# define ASAN_DYNAMIC 0 -# endif +# ifdef PIC +# define ASAN_DYNAMIC 1 +# else +# define ASAN_DYNAMIC 0 +# endif #endif // All internal functions in asan reside inside the __asan namespace @@ -123,26 +123,18 @@ void *AsanDlSymNext(const char *sym); // `dlopen()` specific initialization inside this function. bool HandleDlopenInit(); -// Add convenient macro for interface functions that may be represented as -// weak hooks. -#define ASAN_MALLOC_HOOK(ptr, size) \ - do { \ - if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(ptr, size); \ - RunMallocHooks(ptr, size); \ - } while (false) -#define ASAN_FREE_HOOK(ptr) \ - do { \ - if (&__sanitizer_free_hook) __sanitizer_free_hook(ptr); \ - RunFreeHooks(ptr); \ - } while (false) +void InstallAtExitCheckLeaks(); + #define ASAN_ON_ERROR() \ - if (&__asan_on_error) __asan_on_error() + if (&__asan_on_error) \ + __asan_on_error() extern int asan_inited; // Used to avoid infinite recursion in __asan_init(). extern bool asan_init_is_running; extern void (*death_callback)(void); -// These magic values are written to shadow for better error reporting. +// These magic values are written to shadow for better error +// reporting. 
const int kAsanHeapLeftRedzoneMagic = 0xfa; const int kAsanHeapFreeMagic = 0xfd; const int kAsanStackLeftRedzoneMagic = 0xf1; diff --git a/libsanitizer/asan/asan_linux.cpp b/libsanitizer/asan/asan_linux.cpp index ad3693d..defd81b 100644 --- a/libsanitizer/asan/asan_linux.cpp +++ b/libsanitizer/asan/asan_linux.cpp @@ -107,7 +107,7 @@ uptr FindDynamicShadowStart() { return FindPremappedShadowStart(shadow_size_bytes); #endif - return MapDynamicShadow(shadow_size_bytes, SHADOW_SCALE, + return MapDynamicShadow(shadow_size_bytes, ASAN_SHADOW_SCALE, /*min_shadow_base_alignment*/ 0, kHighMemEnd); } @@ -131,30 +131,24 @@ static int FindFirstDSOCallback(struct dl_phdr_info *info, size_t size, VReport(2, "info->dlpi_name = %s\tinfo->dlpi_addr = %p\n", info->dlpi_name, (void *)info->dlpi_addr); - // Continue until the first dynamic library is found - if (!info->dlpi_name || info->dlpi_name[0] == 0) - return 0; - - // Ignore vDSO - if (internal_strncmp(info->dlpi_name, "linux-", sizeof("linux-") - 1) == 0) - return 0; + const char **name = (const char **)data; -#if SANITIZER_FREEBSD || SANITIZER_NETBSD // Ignore first entry (the main program) - char **p = (char **)data; - if (!(*p)) { - *p = (char *)-1; + if (!*name) { + *name = ""; return 0; } -#endif -#if SANITIZER_SOLARIS - // Ignore executable on Solaris - if (info->dlpi_addr == 0) +# if SANITIZER_LINUX + // Ignore vDSO. glibc versions earlier than 2.15 (and some patched + // by distributors) return an empty name for the vDSO entry, so + // detect this as well. + if (!info->dlpi_name[0] || + internal_strncmp(info->dlpi_name, "linux-", sizeof("linux-") - 1) == 0) return 0; -#endif +# endif - *(const char **)data = info->dlpi_name; + *name = info->dlpi_name; return 1; } @@ -175,7 +169,7 @@ void AsanCheckDynamicRTPrereqs() { // Ensure that dynamic RT is the first DSO in the list const char *first_dso_name = nullptr; dl_iterate_phdr(FindFirstDSOCallback, &first_dso_name); - if (first_dso_name && !IsDynamicRTName(first_dso_name)) { + if (first_dso_name && first_dso_name[0] && !IsDynamicRTName(first_dso_name)) { Report("ASan runtime does not come first in initial library list; " "you should either link runtime to your application or " "manually preload it with LD_PRELOAD.\n"); diff --git a/libsanitizer/asan/asan_mac.cpp b/libsanitizer/asan/asan_mac.cpp index c695054..9161f72 100644 --- a/libsanitizer/asan/asan_mac.cpp +++ b/libsanitizer/asan/asan_mac.cpp @@ -55,7 +55,7 @@ void *AsanDoesNotSupportStaticLinkage() { } uptr FindDynamicShadowStart() { - return MapDynamicShadow(MemToShadowSize(kHighMemEnd), SHADOW_SCALE, + return MapDynamicShadow(MemToShadowSize(kHighMemEnd), ASAN_SHADOW_SCALE, /*min_shadow_base_alignment*/ 0, kHighMemEnd); } diff --git a/libsanitizer/asan/asan_mapping.h b/libsanitizer/asan/asan_mapping.h index 4b0037f..4ff09b1 100644 --- a/libsanitizer/asan/asan_mapping.h +++ b/libsanitizer/asan/asan_mapping.h @@ -13,7 +13,7 @@ #ifndef ASAN_MAPPING_H #define ASAN_MAPPING_H -#include "asan_internal.h" +#include "sanitizer_common/sanitizer_platform.h" // The full explanation of the memory mapping could be found here: // https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm @@ -151,149 +151,145 @@ // || `[0x30000000, 0x35ffffff]` || LowShadow || // || `[0x00000000, 0x2fffffff]` || LowMem || -#if defined(ASAN_SHADOW_SCALE) -static const u64 kDefaultShadowScale = ASAN_SHADOW_SCALE; -#else -static const u64 kDefaultShadowScale = 3; -#endif -static const u64 kDefaultShadowSentinel = ~(uptr)0; -static const u64 kDefaultShadowOffset32 = 
1ULL << 29; // 0x20000000 -static const u64 kDefaultShadowOffset64 = 1ULL << 44; -static const u64 kDefaultShort64bitShadowOffset = - 0x7FFFFFFF & (~0xFFFULL << kDefaultShadowScale); // < 2G. -static const u64 kAArch64_ShadowOffset64 = 1ULL << 36; -static const u64 kRiscv64_ShadowOffset64 = 0xd55550000; -static const u64 kMIPS32_ShadowOffset32 = 0x0aaa0000; -static const u64 kMIPS64_ShadowOffset64 = 1ULL << 37; -static const u64 kPPC64_ShadowOffset64 = 1ULL << 41; -static const u64 kSystemZ_ShadowOffset64 = 1ULL << 52; -static const u64 kSPARC64_ShadowOffset64 = 1ULL << 43; // 0x80000000000 -static const u64 kFreeBSD_ShadowOffset32 = 1ULL << 30; // 0x40000000 -static const u64 kFreeBSD_ShadowOffset64 = 1ULL << 46; // 0x400000000000 -static const u64 kNetBSD_ShadowOffset32 = 1ULL << 30; // 0x40000000 -static const u64 kNetBSD_ShadowOffset64 = 1ULL << 46; // 0x400000000000 -static const u64 kWindowsShadowOffset32 = 3ULL << 28; // 0x30000000 - -#define SHADOW_SCALE kDefaultShadowScale +#define ASAN_SHADOW_SCALE 3 #if SANITIZER_FUCHSIA -# define SHADOW_OFFSET (0) +# define ASAN_SHADOW_OFFSET_CONST (0) #elif SANITIZER_WORDSIZE == 32 # if SANITIZER_ANDROID -# define SHADOW_OFFSET __asan_shadow_memory_dynamic_address +# define ASAN_SHADOW_OFFSET_DYNAMIC # elif defined(__mips__) -# define SHADOW_OFFSET kMIPS32_ShadowOffset32 +# define ASAN_SHADOW_OFFSET_CONST 0x0aaa0000 # elif SANITIZER_FREEBSD -# define SHADOW_OFFSET kFreeBSD_ShadowOffset32 +# define ASAN_SHADOW_OFFSET_CONST 0x40000000 # elif SANITIZER_NETBSD -# define SHADOW_OFFSET kNetBSD_ShadowOffset32 +# define ASAN_SHADOW_OFFSET_CONST 0x40000000 # elif SANITIZER_WINDOWS -# define SHADOW_OFFSET kWindowsShadowOffset32 +# define ASAN_SHADOW_OFFSET_CONST 0x30000000 # elif SANITIZER_IOS -# define SHADOW_OFFSET __asan_shadow_memory_dynamic_address +# define ASAN_SHADOW_OFFSET_DYNAMIC # else -# define SHADOW_OFFSET kDefaultShadowOffset32 +# define ASAN_SHADOW_OFFSET_CONST 0x20000000 # endif #else # if SANITIZER_IOS -# define SHADOW_OFFSET __asan_shadow_memory_dynamic_address +# define ASAN_SHADOW_OFFSET_DYNAMIC # elif SANITIZER_MAC && defined(__aarch64__) -# define SHADOW_OFFSET __asan_shadow_memory_dynamic_address -#elif SANITIZER_RISCV64 -#define SHADOW_OFFSET kRiscv64_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_DYNAMIC +# elif SANITIZER_RISCV64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000000d55550000 # elif defined(__aarch64__) -# define SHADOW_OFFSET kAArch64_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000001000000000 # elif defined(__powerpc64__) -# define SHADOW_OFFSET kPPC64_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000100000000000 # elif defined(__s390x__) -# define SHADOW_OFFSET kSystemZ_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0010000000000000 # elif SANITIZER_FREEBSD -# define SHADOW_OFFSET kFreeBSD_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000400000000000 # elif SANITIZER_NETBSD -# define SHADOW_OFFSET kNetBSD_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000400000000000 # elif SANITIZER_MAC -# define SHADOW_OFFSET kDefaultShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000100000000000 # elif defined(__mips64) -# define SHADOW_OFFSET kMIPS64_ShadowOffset64 -#elif defined(__sparc__) -#define SHADOW_OFFSET kSPARC64_ShadowOffset64 +# define ASAN_SHADOW_OFFSET_CONST 0x0000002000000000 +# elif defined(__sparc__) +# define ASAN_SHADOW_OFFSET_CONST 0x0000080000000000 # elif SANITIZER_WINDOWS64 -# define SHADOW_OFFSET __asan_shadow_memory_dynamic_address +# define ASAN_SHADOW_OFFSET_DYNAMIC # 
else -# define SHADOW_OFFSET kDefaultShort64bitShadowOffset +# if ASAN_SHADOW_SCALE != 3 +# error "Value below is based on shadow scale = 3." +# error "Original formula was: 0x7FFFFFFF & (~0xFFFULL << SHADOW_SCALE)." +# endif +# define ASAN_SHADOW_OFFSET_CONST 0x000000007fff8000 # endif #endif -#if SANITIZER_ANDROID && defined(__arm__) -# define ASAN_PREMAP_SHADOW 1 -#else -# define ASAN_PREMAP_SHADOW 0 -#endif +#if defined(__cplusplus) +# include "asan_internal.h" + +static const u64 kDefaultShadowSentinel = ~(uptr)0; -#define SHADOW_GRANULARITY (1ULL << SHADOW_SCALE) +# if defined(ASAN_SHADOW_OFFSET_CONST) +static const u64 kConstShadowOffset = ASAN_SHADOW_OFFSET_CONST; +# define ASAN_SHADOW_OFFSET kConstShadowOffset +# elif defined(ASAN_SHADOW_OFFSET_DYNAMIC) +# define ASAN_SHADOW_OFFSET __asan_shadow_memory_dynamic_address +# else +# error "ASAN_SHADOW_OFFSET can't be determined." +# endif -#define DO_ASAN_MAPPING_PROFILE 0 // Set to 1 to profile the functions below. +# if SANITIZER_ANDROID && defined(__arm__) +# define ASAN_PREMAP_SHADOW 1 +# else +# define ASAN_PREMAP_SHADOW 0 +# endif -#if DO_ASAN_MAPPING_PROFILE -# define PROFILE_ASAN_MAPPING() AsanMappingProfile[__LINE__]++; -#else -# define PROFILE_ASAN_MAPPING() -#endif +# define ASAN_SHADOW_GRANULARITY (1ULL << ASAN_SHADOW_SCALE) + +# define DO_ASAN_MAPPING_PROFILE 0 // Set to 1 to profile the functions below. + +# if DO_ASAN_MAPPING_PROFILE +# define PROFILE_ASAN_MAPPING() AsanMappingProfile[__LINE__]++; +# else +# define PROFILE_ASAN_MAPPING() +# endif // If 1, all shadow boundaries are constants. // Don't set to 1 other than for testing. -#define ASAN_FIXED_MAPPING 0 +# define ASAN_FIXED_MAPPING 0 namespace __asan { extern uptr AsanMappingProfile[]; -#if ASAN_FIXED_MAPPING +# if ASAN_FIXED_MAPPING // Fixed mapping for 64-bit Linux. Mostly used for performance comparison // with non-fixed mapping. As of r175253 (Feb 2013) the performance // difference between fixed and non-fixed mapping is below the noise level. static uptr kHighMemEnd = 0x7fffffffffffULL; -static uptr kMidMemBeg = 0x3000000000ULL; -static uptr kMidMemEnd = 0x4fffffffffULL; -#else +static uptr kMidMemBeg = 0x3000000000ULL; +static uptr kMidMemEnd = 0x4fffffffffULL; +# else extern uptr kHighMemEnd, kMidMemBeg, kMidMemEnd; // Initialized in __asan_init. -#endif +# endif } // namespace __asan -#if defined(__sparc__) && SANITIZER_WORDSIZE == 64 -# include "asan_mapping_sparc64.h" -#else -#define MEM_TO_SHADOW(mem) (((mem) >> SHADOW_SCALE) + (SHADOW_OFFSET)) +# if defined(__sparc__) && SANITIZER_WORDSIZE == 64 +# include "asan_mapping_sparc64.h" +# else +# define MEM_TO_SHADOW(mem) \ + (((mem) >> ASAN_SHADOW_SCALE) + (ASAN_SHADOW_OFFSET)) -#define kLowMemBeg 0 -#define kLowMemEnd (SHADOW_OFFSET ? SHADOW_OFFSET - 1 : 0) +# define kLowMemBeg 0 +# define kLowMemEnd (ASAN_SHADOW_OFFSET ? 
ASAN_SHADOW_OFFSET - 1 : 0) -#define kLowShadowBeg SHADOW_OFFSET -#define kLowShadowEnd MEM_TO_SHADOW(kLowMemEnd) +# define kLowShadowBeg ASAN_SHADOW_OFFSET +# define kLowShadowEnd MEM_TO_SHADOW(kLowMemEnd) -#define kHighMemBeg (MEM_TO_SHADOW(kHighMemEnd) + 1) +# define kHighMemBeg (MEM_TO_SHADOW(kHighMemEnd) + 1) -#define kHighShadowBeg MEM_TO_SHADOW(kHighMemBeg) -#define kHighShadowEnd MEM_TO_SHADOW(kHighMemEnd) +# define kHighShadowBeg MEM_TO_SHADOW(kHighMemBeg) +# define kHighShadowEnd MEM_TO_SHADOW(kHighMemEnd) -# define kMidShadowBeg MEM_TO_SHADOW(kMidMemBeg) -# define kMidShadowEnd MEM_TO_SHADOW(kMidMemEnd) +# define kMidShadowBeg MEM_TO_SHADOW(kMidMemBeg) +# define kMidShadowEnd MEM_TO_SHADOW(kMidMemEnd) // With the zero shadow base we can not actually map pages starting from 0. // This constant is somewhat arbitrary. -#define kZeroBaseShadowStart 0 -#define kZeroBaseMaxShadowStart (1 << 18) +# define kZeroBaseShadowStart 0 +# define kZeroBaseMaxShadowStart (1 << 18) -#define kShadowGapBeg (kLowShadowEnd ? kLowShadowEnd + 1 \ - : kZeroBaseShadowStart) -#define kShadowGapEnd ((kMidMemBeg ? kMidShadowBeg : kHighShadowBeg) - 1) +# define kShadowGapBeg \ + (kLowShadowEnd ? kLowShadowEnd + 1 : kZeroBaseShadowStart) +# define kShadowGapEnd ((kMidMemBeg ? kMidShadowBeg : kHighShadowBeg) - 1) -#define kShadowGap2Beg (kMidMemBeg ? kMidShadowEnd + 1 : 0) -#define kShadowGap2End (kMidMemBeg ? kMidMemBeg - 1 : 0) +# define kShadowGap2Beg (kMidMemBeg ? kMidShadowEnd + 1 : 0) +# define kShadowGap2End (kMidMemBeg ? kMidMemBeg - 1 : 0) -#define kShadowGap3Beg (kMidMemBeg ? kMidMemEnd + 1 : 0) -#define kShadowGap3End (kMidMemBeg ? kHighShadowBeg - 1 : 0) +# define kShadowGap3Beg (kMidMemBeg ? kMidMemEnd + 1 : 0) +# define kShadowGap3End (kMidMemBeg ? kHighShadowBeg - 1 : 0) namespace __asan { @@ -331,29 +327,31 @@ static inline bool AddrIsInShadowGap(uptr a) { PROFILE_ASAN_MAPPING(); if (kMidMemBeg) { if (a <= kShadowGapEnd) - return SHADOW_OFFSET == 0 || a >= kShadowGapBeg; + return ASAN_SHADOW_OFFSET == 0 || a >= kShadowGapBeg; return (a >= kShadowGap2Beg && a <= kShadowGap2End) || (a >= kShadowGap3Beg && a <= kShadowGap3End); } // In zero-based shadow mode we treat addresses near zero as addresses // in shadow gap as well. 
- if (SHADOW_OFFSET == 0) + if (ASAN_SHADOW_OFFSET == 0) return a <= kShadowGapEnd; return a >= kShadowGapBeg && a <= kShadowGapEnd; } } // namespace __asan -#endif +# endif namespace __asan { -static inline uptr MemToShadowSize(uptr size) { return size >> SHADOW_SCALE; } +static inline uptr MemToShadowSize(uptr size) { + return size >> ASAN_SHADOW_SCALE; +} static inline bool AddrIsInMem(uptr a) { PROFILE_ASAN_MAPPING(); return AddrIsInLowMem(a) || AddrIsInMidMem(a) || AddrIsInHighMem(a) || - (flags()->protect_shadow_gap == 0 && AddrIsInShadowGap(a)); + (flags()->protect_shadow_gap == 0 && AddrIsInShadowGap(a)); } static inline uptr MemToShadow(uptr p) { @@ -369,17 +367,17 @@ static inline bool AddrIsInShadow(uptr a) { static inline bool AddrIsAlignedByGranularity(uptr a) { PROFILE_ASAN_MAPPING(); - return (a & (SHADOW_GRANULARITY - 1)) == 0; + return (a & (ASAN_SHADOW_GRANULARITY - 1)) == 0; } static inline bool AddressIsPoisoned(uptr a) { PROFILE_ASAN_MAPPING(); const uptr kAccessSize = 1; - u8 *shadow_address = (u8*)MEM_TO_SHADOW(a); + u8 *shadow_address = (u8 *)MEM_TO_SHADOW(a); s8 shadow_value = *shadow_address; if (shadow_value) { - u8 last_accessed_byte = (a & (SHADOW_GRANULARITY - 1)) - + kAccessSize - 1; + u8 last_accessed_byte = + (a & (ASAN_SHADOW_GRANULARITY - 1)) + kAccessSize - 1; return (last_accessed_byte >= shadow_value); } return false; @@ -390,4 +388,6 @@ static const uptr kAsanMappingProfileSize = __LINE__; } // namespace __asan +#endif // __cplusplus + #endif // ASAN_MAPPING_H diff --git a/libsanitizer/asan/asan_mapping_sparc64.h b/libsanitizer/asan/asan_mapping_sparc64.h index 432a181..90261d3 100644 --- a/libsanitizer/asan/asan_mapping_sparc64.h +++ b/libsanitizer/asan/asan_mapping_sparc64.h @@ -25,13 +25,14 @@ // The idea is to chop the high bits before doing the scaling, so the two // parts become contiguous again and the usual scheme can be applied. 
-#define MEM_TO_SHADOW(mem) \ - ((((mem) << HIGH_BITS) >> (HIGH_BITS + (SHADOW_SCALE))) + (SHADOW_OFFSET)) +#define MEM_TO_SHADOW(mem) \ + ((((mem) << HIGH_BITS) >> (HIGH_BITS + (ASAN_SHADOW_SCALE))) + \ + (ASAN_SHADOW_OFFSET)) #define kLowMemBeg 0 -#define kLowMemEnd (SHADOW_OFFSET - 1) +#define kLowMemEnd (ASAN_SHADOW_OFFSET - 1) -#define kLowShadowBeg SHADOW_OFFSET +#define kLowShadowBeg ASAN_SHADOW_OFFSET #define kLowShadowEnd MEM_TO_SHADOW(kLowMemEnd) // But of course there is the huge hole between the high shadow memory, diff --git a/libsanitizer/asan/asan_poisoning.cpp b/libsanitizer/asan/asan_poisoning.cpp index d97af91..3b7c9d1 100644 --- a/libsanitizer/asan/asan_poisoning.cpp +++ b/libsanitizer/asan/asan_poisoning.cpp @@ -12,11 +12,13 @@ //===----------------------------------------------------------------------===// #include "asan_poisoning.h" + #include "asan_report.h" #include "asan_stack.h" #include "sanitizer_common/sanitizer_atomic.h" -#include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_flags.h" +#include "sanitizer_common/sanitizer_interface_internal.h" +#include "sanitizer_common/sanitizer_libc.h" namespace __asan { @@ -35,7 +37,7 @@ void PoisonShadow(uptr addr, uptr size, u8 value) { CHECK(AddrIsAlignedByGranularity(addr)); CHECK(AddrIsInMem(addr)); CHECK(AddrIsAlignedByGranularity(addr + size)); - CHECK(AddrIsInMem(addr + size - SHADOW_GRANULARITY)); + CHECK(AddrIsInMem(addr + size - ASAN_SHADOW_GRANULARITY)); CHECK(REAL(memset)); FastPoisonShadow(addr, size, value); } @@ -52,12 +54,12 @@ void PoisonShadowPartialRightRedzone(uptr addr, struct ShadowSegmentEndpoint { u8 *chunk; - s8 offset; // in [0, SHADOW_GRANULARITY) + s8 offset; // in [0, ASAN_SHADOW_GRANULARITY) s8 value; // = *chunk; explicit ShadowSegmentEndpoint(uptr address) { chunk = (u8*)MemToShadow(address); - offset = address & (SHADOW_GRANULARITY - 1); + offset = address & (ASAN_SHADOW_GRANULARITY - 1); value = *chunk; } }; @@ -72,14 +74,14 @@ void AsanPoisonOrUnpoisonIntraObjectRedzone(uptr ptr, uptr size, bool poison) { } CHECK(size); CHECK_LE(size, 4096); - CHECK(IsAligned(end, SHADOW_GRANULARITY)); - if (!IsAligned(ptr, SHADOW_GRANULARITY)) { + CHECK(IsAligned(end, ASAN_SHADOW_GRANULARITY)); + if (!IsAligned(ptr, ASAN_SHADOW_GRANULARITY)) { *(u8 *)MemToShadow(ptr) = - poison ? static_cast(ptr % SHADOW_GRANULARITY) : 0; - ptr |= SHADOW_GRANULARITY - 1; + poison ? static_cast(ptr % ASAN_SHADOW_GRANULARITY) : 0; + ptr |= ASAN_SHADOW_GRANULARITY - 1; ptr++; } - for (; ptr < end; ptr += SHADOW_GRANULARITY) + for (; ptr < end; ptr += ASAN_SHADOW_GRANULARITY) *(u8*)MemToShadow(ptr) = poison ? kAsanIntraObjectRedzone : 0; } @@ -181,12 +183,12 @@ uptr __asan_region_is_poisoned(uptr beg, uptr size) { if (!AddrIsInMem(end)) return end; CHECK_LT(beg, end); - uptr aligned_b = RoundUpTo(beg, SHADOW_GRANULARITY); - uptr aligned_e = RoundDownTo(end, SHADOW_GRANULARITY); + uptr aligned_b = RoundUpTo(beg, ASAN_SHADOW_GRANULARITY); + uptr aligned_e = RoundDownTo(end, ASAN_SHADOW_GRANULARITY); uptr shadow_beg = MemToShadow(aligned_b); uptr shadow_end = MemToShadow(aligned_e); // First check the first and the last application bytes, - // then check the SHADOW_GRANULARITY-aligned region by calling + // then check the ASAN_SHADOW_GRANULARITY-aligned region by calling // mem_is_zero on the corresponding shadow. 
if (!__asan::AddressIsPoisoned(beg) && !__asan::AddressIsPoisoned(end - 1) && (shadow_end <= shadow_beg || @@ -285,7 +287,7 @@ uptr __asan_load_cxx_array_cookie(uptr *p) { // assumes that left border of region to be poisoned is properly aligned. static void PoisonAlignedStackMemory(uptr addr, uptr size, bool do_poison) { if (size == 0) return; - uptr aligned_size = size & ~(SHADOW_GRANULARITY - 1); + uptr aligned_size = size & ~(ASAN_SHADOW_GRANULARITY - 1); PoisonShadow(addr, aligned_size, do_poison ? kAsanStackUseAfterScopeMagic : 0); if (size == aligned_size) @@ -351,7 +353,7 @@ void __sanitizer_annotate_contiguous_container(const void *beg_p, uptr end = reinterpret_cast(end_p); uptr old_mid = reinterpret_cast(old_mid_p); uptr new_mid = reinterpret_cast(new_mid_p); - uptr granularity = SHADOW_GRANULARITY; + uptr granularity = ASAN_SHADOW_GRANULARITY; if (!(beg <= old_mid && beg <= new_mid && old_mid <= end && new_mid <= end && IsAligned(beg, granularity))) { GET_STACK_TRACE_FATAL_HERE; diff --git a/libsanitizer/asan/asan_poisoning.h b/libsanitizer/asan/asan_poisoning.h index 3d536f2..600bd01 100644 --- a/libsanitizer/asan/asan_poisoning.h +++ b/libsanitizer/asan/asan_poisoning.h @@ -44,8 +44,8 @@ ALWAYS_INLINE void FastPoisonShadow(uptr aligned_beg, uptr aligned_size, common_flags()->clear_shadow_mmap_threshold); #else uptr shadow_beg = MEM_TO_SHADOW(aligned_beg); - uptr shadow_end = MEM_TO_SHADOW( - aligned_beg + aligned_size - SHADOW_GRANULARITY) + 1; + uptr shadow_end = + MEM_TO_SHADOW(aligned_beg + aligned_size - ASAN_SHADOW_GRANULARITY) + 1; // FIXME: Page states are different on Windows, so using the same interface // for mapping shadow and zeroing out pages doesn't "just work", so we should // probably provide higher-level interface for these operations. @@ -78,11 +78,12 @@ ALWAYS_INLINE void FastPoisonShadowPartialRightRedzone( DCHECK(CanPoisonMemory()); bool poison_partial = flags()->poison_partial; u8 *shadow = (u8*)MEM_TO_SHADOW(aligned_addr); - for (uptr i = 0; i < redzone_size; i += SHADOW_GRANULARITY, shadow++) { - if (i + SHADOW_GRANULARITY <= size) { + for (uptr i = 0; i < redzone_size; i += ASAN_SHADOW_GRANULARITY, shadow++) { + if (i + ASAN_SHADOW_GRANULARITY <= size) { *shadow = 0; // fully addressable } else if (i >= size) { - *shadow = (SHADOW_GRANULARITY == 128) ? 0xff : value; // unaddressable + *shadow = + (ASAN_SHADOW_GRANULARITY == 128) ? 0xff : value; // unaddressable } else { // first size-i bytes are addressable *shadow = poison_partial ? 
static_cast(size - i) : 0; diff --git a/libsanitizer/asan/asan_posix.cpp b/libsanitizer/asan/asan_posix.cpp index 63ad735..765f4a2 100644 --- a/libsanitizer/asan/asan_posix.cpp +++ b/libsanitizer/asan/asan_posix.cpp @@ -14,22 +14,23 @@ #include "sanitizer_common/sanitizer_platform.h" #if SANITIZER_POSIX -#include "asan_internal.h" -#include "asan_interceptors.h" -#include "asan_mapping.h" -#include "asan_poisoning.h" -#include "asan_report.h" -#include "asan_stack.h" -#include "sanitizer_common/sanitizer_libc.h" -#include "sanitizer_common/sanitizer_posix.h" -#include "sanitizer_common/sanitizer_procmaps.h" - -#include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include + +# include "asan_interceptors.h" +# include "asan_internal.h" +# include "asan_mapping.h" +# include "asan_poisoning.h" +# include "asan_report.h" +# include "asan_stack.h" +# include "lsan/lsan_common.h" +# include "sanitizer_common/sanitizer_libc.h" +# include "sanitizer_common/sanitizer_posix.h" +# include "sanitizer_common/sanitizer_procmaps.h" namespace __asan { @@ -131,7 +132,7 @@ void AsanTSDSet(void *tsd) { } void PlatformTSDDtor(void *tsd) { - AsanThreadContext *context = (AsanThreadContext*)tsd; + AsanThreadContext *context = (AsanThreadContext *)tsd; if (context->destructor_iterations > 1) { context->destructor_iterations--; CHECK_EQ(0, pthread_setspecific(tsd_key, tsd)); @@ -140,6 +141,18 @@ void PlatformTSDDtor(void *tsd) { AsanThread::TSDDtor(tsd); } #endif + +void InstallAtExitCheckLeaks() { + if (CAN_SANITIZE_LEAKS) { + if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) { + if (flags()->halt_on_error) + Atexit(__lsan::DoLeakCheck); + else + Atexit(__lsan::DoRecoverableLeakCheckVoid); + } + } +} + } // namespace __asan #endif // SANITIZER_POSIX diff --git a/libsanitizer/asan/asan_premap_shadow.cpp b/libsanitizer/asan/asan_premap_shadow.cpp index 666bb9b..bed2f62 100644 --- a/libsanitizer/asan/asan_premap_shadow.cpp +++ b/libsanitizer/asan/asan_premap_shadow.cpp @@ -26,7 +26,7 @@ namespace __asan { // Conservative upper limit. uptr PremapShadowSize() { uptr granularity = GetMmapGranularity(); - return RoundUpTo(GetMaxVirtualAddress() >> SHADOW_SCALE, granularity); + return RoundUpTo(GetMaxVirtualAddress() >> ASAN_SHADOW_SCALE, granularity); } // Returns an address aligned to 8 pages, such that one page on the left and diff --git a/libsanitizer/asan/asan_report.cpp b/libsanitizer/asan/asan_report.cpp index 1f26633..2a55d6c 100644 --- a/libsanitizer/asan/asan_report.cpp +++ b/libsanitizer/asan/asan_report.cpp @@ -11,17 +11,19 @@ // This file contains error reporting code. 
//===----------------------------------------------------------------------===// +#include "asan_report.h" + +#include "asan_descriptions.h" #include "asan_errors.h" #include "asan_flags.h" -#include "asan_descriptions.h" #include "asan_internal.h" #include "asan_mapping.h" -#include "asan_report.h" #include "asan_scariness_score.h" #include "asan_stack.h" #include "asan_thread.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_flags.h" +#include "sanitizer_common/sanitizer_interface_internal.h" #include "sanitizer_common/sanitizer_report_decorator.h" #include "sanitizer_common/sanitizer_stackdepot.h" #include "sanitizer_common/sanitizer_symbolizer.h" @@ -460,6 +462,10 @@ static bool SuppressErrorReport(uptr pc) { void ReportGenericError(uptr pc, uptr bp, uptr sp, uptr addr, bool is_write, uptr access_size, u32 exp, bool fatal) { + if (__asan_test_only_reported_buggy_pointer) { + *__asan_test_only_reported_buggy_pointer = addr; + return; + } if (!fatal && SuppressErrorReport(pc)) return; ENABLE_FRAME_POINTER; diff --git a/libsanitizer/asan/asan_rtl.cpp b/libsanitizer/asan/asan_rtl.cpp index 1b150b3..3a52614 100644 --- a/libsanitizer/asan/asan_rtl.cpp +++ b/libsanitizer/asan/asan_rtl.cpp @@ -27,6 +27,7 @@ #include "lsan/lsan_common.h" #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_flags.h" +#include "sanitizer_common/sanitizer_interface_internal.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_symbolizer.h" #include "ubsan/ubsan_init.h" @@ -44,7 +45,9 @@ static void AsanDie() { static atomic_uint32_t num_calls; if (atomic_fetch_add(&num_calls, 1, memory_order_relaxed) != 0) { // Don't die twice - run a busy loop. - while (1) { } + while (1) { + internal_sched_yield(); + } } if (common_flags()->print_module_map >= 1) DumpProcessMap(); @@ -85,12 +88,8 @@ void ShowStatsAndAbort() { NOINLINE static void ReportGenericErrorWrapper(uptr addr, bool is_write, int size, int exp_arg, bool fatal) { - if (__asan_test_only_reported_buggy_pointer) { - *__asan_test_only_reported_buggy_pointer = addr; - } else { - GET_CALLER_PC_BP_SP; - ReportGenericError(pc, bp, sp, addr, is_write, size, exp_arg, fatal); - } + GET_CALLER_PC_BP_SP; + ReportGenericError(pc, bp, sp, addr, is_write, size, exp_arg, fatal); } // --------------- LowLevelAllocateCallbac ---------- {{{1 @@ -150,11 +149,11 @@ ASAN_REPORT_ERROR_N(store, true) #define ASAN_MEMORY_ACCESS_CALLBACK_BODY(type, is_write, size, exp_arg, fatal) \ uptr sp = MEM_TO_SHADOW(addr); \ - uptr s = size <= SHADOW_GRANULARITY ? *reinterpret_cast(sp) \ - : *reinterpret_cast(sp); \ + uptr s = size <= ASAN_SHADOW_GRANULARITY ? 
*reinterpret_cast(sp) \ + : *reinterpret_cast(sp); \ if (UNLIKELY(s)) { \ - if (UNLIKELY(size >= SHADOW_GRANULARITY || \ - ((s8)((addr & (SHADOW_GRANULARITY - 1)) + size - 1)) >= \ + if (UNLIKELY(size >= ASAN_SHADOW_GRANULARITY || \ + ((s8)((addr & (ASAN_SHADOW_GRANULARITY - 1)) + size - 1)) >= \ (s8)s)) { \ ReportGenericErrorWrapper(addr, is_write, size, exp_arg, fatal); \ } \ @@ -188,7 +187,7 @@ ASAN_MEMORY_ACCESS_CALLBACK(store, true, 16) extern "C" NOINLINE INTERFACE_ATTRIBUTE void __asan_loadN(uptr addr, uptr size) { - if (__asan_region_is_poisoned(addr, size)) { + if ((addr = __asan_region_is_poisoned(addr, size))) { GET_CALLER_PC_BP_SP; ReportGenericError(pc, bp, sp, addr, false, size, 0, true); } @@ -197,7 +196,7 @@ void __asan_loadN(uptr addr, uptr size) { extern "C" NOINLINE INTERFACE_ATTRIBUTE void __asan_exp_loadN(uptr addr, uptr size, u32 exp) { - if (__asan_region_is_poisoned(addr, size)) { + if ((addr = __asan_region_is_poisoned(addr, size))) { GET_CALLER_PC_BP_SP; ReportGenericError(pc, bp, sp, addr, false, size, exp, true); } @@ -206,7 +205,7 @@ void __asan_exp_loadN(uptr addr, uptr size, u32 exp) { extern "C" NOINLINE INTERFACE_ATTRIBUTE void __asan_loadN_noabort(uptr addr, uptr size) { - if (__asan_region_is_poisoned(addr, size)) { + if ((addr = __asan_region_is_poisoned(addr, size))) { GET_CALLER_PC_BP_SP; ReportGenericError(pc, bp, sp, addr, false, size, 0, false); } @@ -215,7 +214,7 @@ void __asan_loadN_noabort(uptr addr, uptr size) { extern "C" NOINLINE INTERFACE_ATTRIBUTE void __asan_storeN(uptr addr, uptr size) { - if (__asan_region_is_poisoned(addr, size)) { + if ((addr = __asan_region_is_poisoned(addr, size))) { GET_CALLER_PC_BP_SP; ReportGenericError(pc, bp, sp, addr, true, size, 0, true); } @@ -224,7 +223,7 @@ void __asan_storeN(uptr addr, uptr size) { extern "C" NOINLINE INTERFACE_ATTRIBUTE void __asan_exp_storeN(uptr addr, uptr size, u32 exp) { - if (__asan_region_is_poisoned(addr, size)) { + if ((addr = __asan_region_is_poisoned(addr, size))) { GET_CALLER_PC_BP_SP; ReportGenericError(pc, bp, sp, addr, true, size, exp, true); } @@ -233,7 +232,7 @@ void __asan_exp_storeN(uptr addr, uptr size, u32 exp) { extern "C" NOINLINE INTERFACE_ATTRIBUTE void __asan_storeN_noabort(uptr addr, uptr size) { - if (__asan_region_is_poisoned(addr, size)) { + if ((addr = __asan_region_is_poisoned(addr, size))) { GET_CALLER_PC_BP_SP; ReportGenericError(pc, bp, sp, addr, true, size, 0, false); } @@ -313,7 +312,7 @@ static void InitializeHighMemEnd() { kHighMemEnd = GetMaxUserVirtualAddress(); // Increase kHighMemEnd to make sure it's properly // aligned together with kHighMemBeg: - kHighMemEnd |= (GetMmapGranularity() << SHADOW_SCALE) - 1; + kHighMemEnd |= (GetMmapGranularity() << ASAN_SHADOW_SCALE) - 1; #endif // !ASAN_FIXED_MAPPING CHECK_EQ((kHighMemBeg % GetMmapGranularity()), 0); } @@ -365,29 +364,16 @@ void PrintAddressSpaceLayout() { Printf("malloc_context_size=%zu\n", (uptr)common_flags()->malloc_context_size); - Printf("SHADOW_SCALE: %d\n", (int)SHADOW_SCALE); - Printf("SHADOW_GRANULARITY: %d\n", (int)SHADOW_GRANULARITY); - Printf("SHADOW_OFFSET: 0x%zx\n", (uptr)SHADOW_OFFSET); - CHECK(SHADOW_SCALE >= 3 && SHADOW_SCALE <= 7); + Printf("SHADOW_SCALE: %d\n", (int)ASAN_SHADOW_SCALE); + Printf("SHADOW_GRANULARITY: %d\n", (int)ASAN_SHADOW_GRANULARITY); + Printf("SHADOW_OFFSET: 0x%zx\n", (uptr)ASAN_SHADOW_OFFSET); + CHECK(ASAN_SHADOW_SCALE >= 3 && ASAN_SHADOW_SCALE <= 7); if (kMidMemBeg) CHECK(kMidShadowBeg > kLowShadowEnd && kMidMemBeg > kMidShadowEnd && kHighShadowBeg > 
kMidMemEnd); } -#if defined(__thumb__) && defined(__linux__) -#define START_BACKGROUND_THREAD_IN_ASAN_INTERNAL -#endif - -#ifndef START_BACKGROUND_THREAD_IN_ASAN_INTERNAL -static bool UNUSED __local_asan_dyninit = [] { - MaybeStartBackgroudThread(); - SetSoftRssLimitExceededCallback(AsanSoftRssLimitExceededCallback); - - return false; -}(); -#endif - static void AsanInitInternal() { if (LIKELY(asan_inited)) return; SanitizerToolName = "AddressSanitizer"; @@ -438,7 +424,7 @@ static void AsanInitInternal() { MaybeReexec(); // Setup internal allocator callback. - SetLowLevelAllocateMinAlignment(SHADOW_GRANULARITY); + SetLowLevelAllocateMinAlignment(ASAN_SHADOW_GRANULARITY); SetLowLevelAllocateCallback(OnLowLevelAllocate); InitializeAsanInterceptors(); @@ -462,10 +448,8 @@ static void AsanInitInternal() { allocator_options.SetFrom(flags(), common_flags()); InitializeAllocator(allocator_options); -#ifdef START_BACKGROUND_THREAD_IN_ASAN_INTERNAL - MaybeStartBackgroudThread(); - SetSoftRssLimitExceededCallback(AsanSoftRssLimitExceededCallback); -#endif + if (SANITIZER_START_BACKGROUND_THREAD_IN_ASAN_INTERNAL) + MaybeStartBackgroudThread(); // On Linux AsanThread::ThreadStart() calls malloc() that's why asan_inited // should be set to 1 prior to initializing the threads. @@ -493,12 +477,7 @@ static void AsanInitInternal() { if (CAN_SANITIZE_LEAKS) { __lsan::InitCommonLsan(); - if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) { - if (flags()->halt_on_error) - Atexit(__lsan::DoLeakCheck); - else - Atexit(__lsan::DoRecoverableLeakCheckVoid); - } + InstallAtExitCheckLeaks(); } #if CAN_SANITIZE_UB @@ -561,7 +540,7 @@ void UnpoisonStack(uptr bottom, uptr top, const char *type) { top - bottom); return; } - PoisonShadow(bottom, RoundUpTo(top - bottom, SHADOW_GRANULARITY), 0); + PoisonShadow(bottom, RoundUpTo(top - bottom, ASAN_SHADOW_GRANULARITY), 0); } static void UnpoisonDefaultStack() { diff --git a/libsanitizer/asan/asan_rtl_static.cpp b/libsanitizer/asan/asan_rtl_static.cpp new file mode 100644 index 0000000..a6f812b --- /dev/null +++ b/libsanitizer/asan/asan_rtl_static.cpp @@ -0,0 +1,36 @@ +//===-- asan_static_rtl.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of AddressSanitizer, an address sanity checker. +// +// Main file of the ASan run-time library. +//===----------------------------------------------------------------------===// + +// This file is empty for now. Main reason to have it is workaround for Windows +// build, which complains because no files are part of the asan_static lib. 
+ +#include "sanitizer_common/sanitizer_common.h" + +#define REPORT_FUNCTION(Name) \ + extern "C" SANITIZER_WEAK_ATTRIBUTE void Name(__asan::uptr addr); \ + extern "C" void Name##_asm(uptr addr) { Name(addr); } + +namespace __asan { + +REPORT_FUNCTION(__asan_report_load1) +REPORT_FUNCTION(__asan_report_load2) +REPORT_FUNCTION(__asan_report_load4) +REPORT_FUNCTION(__asan_report_load8) +REPORT_FUNCTION(__asan_report_load16) +REPORT_FUNCTION(__asan_report_store1) +REPORT_FUNCTION(__asan_report_store2) +REPORT_FUNCTION(__asan_report_store4) +REPORT_FUNCTION(__asan_report_store8) +REPORT_FUNCTION(__asan_report_store16) + +} // namespace __asan diff --git a/libsanitizer/asan/asan_rtl_x86_64.S b/libsanitizer/asan/asan_rtl_x86_64.S new file mode 100644 index 0000000..d93b5ed --- /dev/null +++ b/libsanitizer/asan/asan_rtl_x86_64.S @@ -0,0 +1,146 @@ +#include "asan_mapping.h" +#include "sanitizer_common/sanitizer_asm.h" + +#if defined(__x86_64__) +#include "sanitizer_common/sanitizer_platform.h" + +.file "asan_rtl_x86_64.S" + +#define NAME(n, reg, op, s, i) n##_##op##_##i##_##s##_##reg + +#define FNAME(reg, op, s, i) NAME(__asan_check, reg, op, s, i) +#define RLABEL(reg, op, s, i) NAME(.return, reg, op, s, i) +#define CLABEL(reg, op, s, i) NAME(.check, reg, op, s, i) +#define FLABEL(reg, op, s, i) NAME(.fail, reg, op, s, i) + +#define BEGINF(reg, op, s, i) \ +.section .text.FNAME(reg, op, s, i),"ax",@progbits ;\ +.globl FNAME(reg, op, s, i) ;\ +.hidden FNAME(reg, op, s, i) ;\ +ASM_TYPE_FUNCTION(FNAME(reg, op, s, i)) ;\ +.cfi_startproc ;\ +FNAME(reg, op, s, i): ;\ + +#define ENDF .cfi_endproc ;\ + +// Access check functions for 1,2 and 4 byte types, which require extra checks. +#define ASAN_MEMORY_ACCESS_INITIAL_CHECK_ADD(reg, op, s) \ + mov %##reg,%r10 ;\ + shr $0x3,%r10 ;\ + movsbl ASAN_SHADOW_OFFSET_CONST(%r10),%r10d ;\ + test %r10d,%r10d ;\ + jne CLABEL(reg, op, s, add) ;\ +RLABEL(reg, op, s, add): ;\ + retq ;\ + +#define ASAN_MEMORY_ACCESS_EXTRA_CHECK_1(reg, op, i) \ +CLABEL(reg, op, 1, i): ;\ + push %rcx ;\ + mov %##reg,%rcx ;\ + and $0x7,%ecx ;\ + cmp %r10d,%ecx ;\ + pop %rcx ;\ + jl RLABEL(reg, op, 1, i);\ + mov %##reg,%rdi ;\ + jmp __asan_report_##op##1_asm ;\ + +#define ASAN_MEMORY_ACCESS_EXTRA_CHECK_2(reg, op, i) \ +CLABEL(reg, op, 2, i): ;\ + push %rcx ;\ + mov %##reg,%rcx ;\ + and $0x7,%ecx ;\ + add $0x1,%ecx ;\ + cmp %r10d,%ecx ;\ + pop %rcx ;\ + jl RLABEL(reg, op, 2, i);\ + mov %##reg,%rdi ;\ + jmp __asan_report_##op##2_asm ;\ + +#define ASAN_MEMORY_ACCESS_EXTRA_CHECK_4(reg, op, i) \ +CLABEL(reg, op, 4, i): ;\ + push %rcx ;\ + mov %##reg,%rcx ;\ + and $0x7,%ecx ;\ + add $0x3,%ecx ;\ + cmp %r10d,%ecx ;\ + pop %rcx ;\ + jl RLABEL(reg, op, 4, i);\ + mov %##reg,%rdi ;\ + jmp __asan_report_##op##4_asm ;\ + +#define ASAN_MEMORY_ACCESS_CALLBACK_ADD_1(reg, op) \ +BEGINF(reg, op, 1, add) ;\ + ASAN_MEMORY_ACCESS_INITIAL_CHECK_ADD(reg, op, 1) ;\ + ASAN_MEMORY_ACCESS_EXTRA_CHECK_1(reg, op, add) ;\ +ENDF + +#define ASAN_MEMORY_ACCESS_CALLBACK_ADD_2(reg, op) \ +BEGINF(reg, op, 2, add) ;\ + ASAN_MEMORY_ACCESS_INITIAL_CHECK_ADD(reg, op, 2) ;\ + ASAN_MEMORY_ACCESS_EXTRA_CHECK_2(reg, op, add) ;\ +ENDF + +#define ASAN_MEMORY_ACCESS_CALLBACK_ADD_4(reg, op) \ +BEGINF(reg, op, 4, add) ;\ + ASAN_MEMORY_ACCESS_INITIAL_CHECK_ADD(reg, op, 4) ;\ + ASAN_MEMORY_ACCESS_EXTRA_CHECK_4(reg, op, add) ;\ +ENDF + +// Access check functions for 8 and 16 byte types: no extra checks required. 
+#define ASAN_MEMORY_ACCESS_CHECK_ADD(reg, op, s, c) \ + mov %##reg,%r10 ;\ + shr $0x3,%r10 ;\ + ##c $0x0,ASAN_SHADOW_OFFSET_CONST(%r10) ;\ + jne FLABEL(reg, op, s, add) ;\ + retq ;\ + +#define ASAN_MEMORY_ACCESS_FAIL(reg, op, s, i) \ +FLABEL(reg, op, s, i): ;\ + mov %##reg,%rdi ;\ + jmp __asan_report_##op##s##_asm;\ + +#define ASAN_MEMORY_ACCESS_CALLBACK_ADD_8(reg, op) \ +BEGINF(reg, op, 8, add) ;\ + ASAN_MEMORY_ACCESS_CHECK_ADD(reg, op, 8, cmpb) ;\ + ASAN_MEMORY_ACCESS_FAIL(reg, op, 8, add) ;\ +ENDF + +#define ASAN_MEMORY_ACCESS_CALLBACK_ADD_16(reg, op) \ +BEGINF(reg, op, 16, add) ;\ + ASAN_MEMORY_ACCESS_CHECK_ADD(reg, op, 16, cmpw) ;\ + ASAN_MEMORY_ACCESS_FAIL(reg, op, 16, add) ;\ +ENDF + +#define ASAN_MEMORY_ACCESS_CALLBACKS_ADD(reg) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_1(reg, load) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_1(reg, store) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_2(reg, load) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_2(reg, store) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_4(reg, load) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_4(reg, store) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_8(reg, load) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_8(reg, store) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_16(reg, load) \ +ASAN_MEMORY_ACCESS_CALLBACK_ADD_16(reg, store) \ + + +// Instantiate all but R10 and R11 callbacks. We are using PLTSafe class with +// the intrinsic, which guarantees that the code generation will never emit +// R10 or R11 callback. +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RAX) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RBX) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RCX) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RDX) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RSI) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RDI) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(RBP) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(R8) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(R9) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(R12) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(R13) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(R14) +ASAN_MEMORY_ACCESS_CALLBACKS_ADD(R15) + +#endif + +NO_EXEC_STACK_DIRECTIVE diff --git a/libsanitizer/asan/asan_thread.cpp b/libsanitizer/asan/asan_thread.cpp index 8af7425..c15963e1 100644 --- a/libsanitizer/asan/asan_thread.cpp +++ b/libsanitizer/asan/asan_thread.cpp @@ -83,8 +83,7 @@ AsanThread *AsanThread::Create(thread_callback_t start_routine, void *arg, thread->start_routine_ = start_routine; thread->arg_ = arg; AsanThreadContext::CreateThreadContextArgs args = {thread, stack}; - asanThreadRegistry().CreateThread(*reinterpret_cast(thread), detached, - parent_tid, &args); + asanThreadRegistry().CreateThread(0, detached, parent_tid, &args); return thread; } @@ -306,7 +305,7 @@ void AsanThread::SetThreadStackAndTls(const InitOptions *options) { uptr stack_size = 0; GetThreadStackAndTls(tid() == kMainTid, &stack_bottom_, &stack_size, &tls_begin_, &tls_size); - stack_top_ = RoundDownTo(stack_bottom_ + stack_size, SHADOW_GRANULARITY); + stack_top_ = RoundDownTo(stack_bottom_ + stack_size, ASAN_SHADOW_GRANULARITY); tls_end_ = tls_begin_ + tls_size; dtls_ = DTLS_Get(); @@ -322,11 +321,9 @@ void AsanThread::ClearShadowForThreadStackAndTLS() { if (stack_top_ != stack_bottom_) PoisonShadow(stack_bottom_, stack_top_ - stack_bottom_, 0); if (tls_begin_ != tls_end_) { - uptr tls_begin_aligned = RoundDownTo(tls_begin_, SHADOW_GRANULARITY); - uptr tls_end_aligned = RoundUpTo(tls_end_, SHADOW_GRANULARITY); - FastPoisonShadowPartialRightRedzone(tls_begin_aligned, - tls_end_ - tls_begin_aligned, - tls_end_aligned - tls_end_, 0); + uptr tls_begin_aligned = RoundDownTo(tls_begin_, ASAN_SHADOW_GRANULARITY); + uptr tls_end_aligned = RoundUpTo(tls_end_, 
ASAN_SHADOW_GRANULARITY); + FastPoisonShadow(tls_begin_aligned, tls_end_aligned - tls_begin_aligned, 0); } } @@ -347,27 +344,27 @@ bool AsanThread::GetStackFrameAccessByAddr(uptr addr, return true; } uptr aligned_addr = RoundDownTo(addr, SANITIZER_WORDSIZE / 8); // align addr. - uptr mem_ptr = RoundDownTo(aligned_addr, SHADOW_GRANULARITY); + uptr mem_ptr = RoundDownTo(aligned_addr, ASAN_SHADOW_GRANULARITY); u8 *shadow_ptr = (u8*)MemToShadow(aligned_addr); u8 *shadow_bottom = (u8*)MemToShadow(bottom); while (shadow_ptr >= shadow_bottom && *shadow_ptr != kAsanStackLeftRedzoneMagic) { shadow_ptr--; - mem_ptr -= SHADOW_GRANULARITY; + mem_ptr -= ASAN_SHADOW_GRANULARITY; } while (shadow_ptr >= shadow_bottom && *shadow_ptr == kAsanStackLeftRedzoneMagic) { shadow_ptr--; - mem_ptr -= SHADOW_GRANULARITY; + mem_ptr -= ASAN_SHADOW_GRANULARITY; } if (shadow_ptr < shadow_bottom) { return false; } - uptr* ptr = (uptr*)(mem_ptr + SHADOW_GRANULARITY); + uptr *ptr = (uptr *)(mem_ptr + ASAN_SHADOW_GRANULARITY); CHECK(ptr[0] == kCurrentStackFrameMagic); access->offset = addr - (uptr)ptr; access->frame_pc = ptr[2]; diff --git a/libsanitizer/asan/asan_win.cpp b/libsanitizer/asan/asan_win.cpp index 1577c83..8195803 100644 --- a/libsanitizer/asan/asan_win.cpp +++ b/libsanitizer/asan/asan_win.cpp @@ -1,4 +1,5 @@ -//===-- asan_win.cpp ------------------------------------------------------===// +//===-- asan_win.cpp +//------------------------------------------------------===//> // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,21 +14,20 @@ #include "sanitizer_common/sanitizer_platform.h" #if SANITIZER_WINDOWS -#define WIN32_LEAN_AND_MEAN -#include - -#include - -#include "asan_interceptors.h" -#include "asan_internal.h" -#include "asan_mapping.h" -#include "asan_report.h" -#include "asan_stack.h" -#include "asan_thread.h" -#include "sanitizer_common/sanitizer_libc.h" -#include "sanitizer_common/sanitizer_mutex.h" -#include "sanitizer_common/sanitizer_win.h" -#include "sanitizer_common/sanitizer_win_defs.h" +# define WIN32_LEAN_AND_MEAN +# include +# include + +# include "asan_interceptors.h" +# include "asan_internal.h" +# include "asan_mapping.h" +# include "asan_report.h" +# include "asan_stack.h" +# include "asan_thread.h" +# include "sanitizer_common/sanitizer_libc.h" +# include "sanitizer_common/sanitizer_mutex.h" +# include "sanitizer_common/sanitizer_win.h" +# include "sanitizer_common/sanitizer_win_defs.h" using namespace __asan; @@ -49,8 +49,8 @@ uptr __asan_get_shadow_memory_dynamic_address() { static LPTOP_LEVEL_EXCEPTION_FILTER default_seh_handler; static LPTOP_LEVEL_EXCEPTION_FILTER user_seh_handler; -extern "C" SANITIZER_INTERFACE_ATTRIBUTE -long __asan_unhandled_exception_filter(EXCEPTION_POINTERS *info) { +extern "C" SANITIZER_INTERFACE_ATTRIBUTE long __asan_unhandled_exception_filter( + EXCEPTION_POINTERS *info) { EXCEPTION_RECORD *exception_record = info->ExceptionRecord; CONTEXT *context = info->ContextRecord; @@ -187,6 +187,8 @@ void InitializePlatformInterceptors() { } } +void InstallAtExitCheckLeaks() {} + void AsanApplyToGlobals(globals_op_fptr op, const void *needle) { UNIMPLEMENTED(); } @@ -253,7 +255,7 @@ void *AsanDoesNotSupportStaticLinkage() { } uptr FindDynamicShadowStart() { - return MapDynamicShadow(MemToShadowSize(kHighMemEnd), SHADOW_SCALE, + return MapDynamicShadow(MemToShadowSize(kHighMemEnd), ASAN_SHADOW_SCALE, /*min_shadow_base_alignment*/ 0, kHighMemEnd); } diff --git 
a/libsanitizer/asan/asan_win_dll_thunk.cpp b/libsanitizer/asan/asan_win_dll_thunk.cpp index a5671cc..e3a90f1 100644 --- a/libsanitizer/asan/asan_win_dll_thunk.cpp +++ b/libsanitizer/asan/asan_win_dll_thunk.cpp @@ -56,6 +56,13 @@ INTERCEPT_WRAP_W_W(_expand_dbg) // TODO(timurrrr): Do we need to add _Crt* stuff here? (see asan_malloc_win.cpp) +# if defined(_MSC_VER) && !defined(__clang__) +// Disable warnings such as: 'void memchr(void)': incorrect number of arguments +// for intrinsic function, expected '3' arguments. +# pragma warning(push) +# pragma warning(disable : 4392) +# endif + INTERCEPT_LIBRARY_FUNCTION(atoi); INTERCEPT_LIBRARY_FUNCTION(atol); INTERCEPT_LIBRARY_FUNCTION(frexp); @@ -87,6 +94,10 @@ INTERCEPT_LIBRARY_FUNCTION(strtol); INTERCEPT_LIBRARY_FUNCTION(wcslen); INTERCEPT_LIBRARY_FUNCTION(wcsnlen); +# if defined(_MSC_VER) && !defined(__clang__) +# pragma warning(pop) +# endif + #ifdef _WIN64 INTERCEPT_LIBRARY_FUNCTION(__C_specific_handler); #else diff --git a/libsanitizer/builtins/assembly.h b/libsanitizer/builtins/assembly.h index 9c01505..69a3d86 100644 --- a/libsanitizer/builtins/assembly.h +++ b/libsanitizer/builtins/assembly.h @@ -14,6 +14,12 @@ #ifndef COMPILERRT_ASSEMBLY_H #define COMPILERRT_ASSEMBLY_H +#if defined(__linux__) && defined(__CET__) +#if __has_include() +#include +#endif +#endif + #if defined(__APPLE__) && defined(__aarch64__) #define SEPARATOR %% #else diff --git a/libsanitizer/hwasan/hwasan.cpp b/libsanitizer/hwasan/hwasan.cpp index 6f0ea64..f8725a1 100644 --- a/libsanitizer/hwasan/hwasan.cpp +++ b/libsanitizer/hwasan/hwasan.cpp @@ -25,6 +25,7 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_flag_parser.h" #include "sanitizer_common/sanitizer_flags.h" +#include "sanitizer_common/sanitizer_interface_internal.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_procmaps.h" #include "sanitizer_common/sanitizer_stackdepot.h" diff --git a/libsanitizer/hwasan/hwasan.h b/libsanitizer/hwasan/hwasan.h index 371c43f..3cc2fc4 100644 --- a/libsanitizer/hwasan/hwasan.h +++ b/libsanitizer/hwasan/hwasan.h @@ -172,21 +172,6 @@ void HwasanTagMismatch(uptr addr, uptr access_info, uptr *registers_frame, } // namespace __hwasan -#define HWASAN_MALLOC_HOOK(ptr, size) \ - do { \ - if (&__sanitizer_malloc_hook) { \ - __sanitizer_malloc_hook(ptr, size); \ - } \ - RunMallocHooks(ptr, size); \ - } while (false) -#define HWASAN_FREE_HOOK(ptr) \ - do { \ - if (&__sanitizer_free_hook) { \ - __sanitizer_free_hook(ptr); \ - } \ - RunFreeHooks(ptr); \ - } while (false) - #if HWASAN_WITH_INTERCEPTORS // For both bionic and glibc __sigset_t is an unsigned long. 
typedef unsigned long __hw_sigset_t; diff --git a/libsanitizer/hwasan/hwasan_allocator.cpp b/libsanitizer/hwasan/hwasan_allocator.cpp index 9e17299..8424551 100644 --- a/libsanitizer/hwasan/hwasan_allocator.cpp +++ b/libsanitizer/hwasan/hwasan_allocator.cpp @@ -132,6 +132,11 @@ static void *HwasanAllocate(StackTrace *stack, uptr orig_size, uptr alignment, } ReportAllocationSizeTooBig(orig_size, kMaxAllowedMallocSize, stack); } + if (UNLIKELY(IsRssLimitExceeded())) { + if (AllocatorMayReturnNull()) + return nullptr; + ReportRssLimitExceeded(stack); + } alignment = Max(alignment, kShadowAlignment); uptr size = TaggedSize(orig_size); @@ -194,7 +199,7 @@ static void *HwasanAllocate(StackTrace *stack, uptr orig_size, uptr alignment, } } - HWASAN_MALLOC_HOOK(user_ptr, size); + RunMallocHooks(user_ptr, size); return user_ptr; } @@ -221,7 +226,7 @@ static bool CheckInvalidFree(StackTrace *stack, void *untagged_ptr, static void HwasanDeallocate(StackTrace *stack, void *tagged_ptr) { CHECK(tagged_ptr); - HWASAN_FREE_HOOK(tagged_ptr); + RunFreeHooks(tagged_ptr); bool in_taggable_region = InTaggableRegion(reinterpret_cast(tagged_ptr)); diff --git a/libsanitizer/hwasan/hwasan_interceptors.cpp b/libsanitizer/hwasan/hwasan_interceptors.cpp index f96ed88..8dc886e 100644 --- a/libsanitizer/hwasan/hwasan_interceptors.cpp +++ b/libsanitizer/hwasan/hwasan_interceptors.cpp @@ -47,6 +47,12 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr, void *(*callback)(void*), return res; } +INTERCEPTOR(int, pthread_join, void *t, void **arg) { + return REAL(pthread_join)(t, arg); +} + +DEFINE_REAL_PTHREAD_FUNCTIONS + DEFINE_REAL(int, vfork) DECLARE_EXTERN_INTERCEPTOR_AND_WRAPPER(int, vfork) @@ -189,7 +195,8 @@ void InitializeInterceptors() { INTERCEPT_FUNCTION(vfork); #endif // __linux__ INTERCEPT_FUNCTION(pthread_create); -#endif + INTERCEPT_FUNCTION(pthread_join); +# endif inited = 1; } diff --git a/libsanitizer/hwasan/hwasan_memintrinsics.cpp b/libsanitizer/hwasan/hwasan_memintrinsics.cpp index fab017a..ea7f5ce 100644 --- a/libsanitizer/hwasan/hwasan_memintrinsics.cpp +++ b/libsanitizer/hwasan/hwasan_memintrinsics.cpp @@ -40,5 +40,5 @@ void *__hwasan_memmove(void *to, const void *from, uptr size) { reinterpret_cast(to), size); CheckAddressSized( reinterpret_cast(from), size); - return memmove(UntagPtr(to), UntagPtr(from), size); + return memmove(to, from, size); } diff --git a/libsanitizer/hwasan/hwasan_new_delete.cpp b/libsanitizer/hwasan/hwasan_new_delete.cpp index 4e057a6..495046a 100644 --- a/libsanitizer/hwasan/hwasan_new_delete.cpp +++ b/libsanitizer/hwasan/hwasan_new_delete.cpp @@ -22,21 +22,23 @@ #if HWASAN_REPLACE_OPERATORS_NEW_AND_DELETE // TODO(alekseys): throw std::bad_alloc instead of dying on OOM. 
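
The HWASAN_MALLOC_HOOK/HWASAN_FREE_HOOK removal above, together with the switch to plain RunMallocHooks()/RunFreeHooks() in the allocator hunks, suggests that the hook dispatch (including the weak __sanitizer_malloc_hook/__sanitizer_free_hook) is now centralized in the common RunMallocHooks()/RunFreeHooks() helpers. From the user's side, the public API in <sanitizer/allocator_interface.h> is unchanged; a hypothetical example of installing hooks, assuming a build with a sanitizer that provides the allocator interface:

  #include <sanitizer/allocator_interface.h>
  #include <cstddef>
  #include <cstdio>

  static void MyMallocHook(const volatile void *ptr, size_t size) {
    std::fprintf(stderr, "alloc %p (%zu bytes)\n", (void *)ptr, size);
  }

  static void MyFreeHook(const volatile void *ptr) {
    std::fprintf(stderr, "free  %p\n", (void *)ptr);
  }

  int main() {
    // Returns 0 when the hook table is full; the hooks fire from the
    // runtime's RunMallocHooks()/RunFreeHooks() shown in these hunks.
    __sanitizer_install_malloc_and_free_hooks(MyMallocHook, MyFreeHook);
    char *p = new char[16];
    delete[] p;
    return 0;
  }
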
-#define OPERATOR_NEW_BODY(nothrow) \ - GET_MALLOC_STACK_TRACE; \ - void *res = hwasan_malloc(size, &stack);\ - if (!nothrow && UNLIKELY(!res)) ReportOutOfMemory(size, &stack);\ - return res -#define OPERATOR_NEW_ALIGN_BODY(nothrow) \ - GET_MALLOC_STACK_TRACE; \ - void *res = hwasan_aligned_alloc(static_cast(align), size, &stack); \ - if (!nothrow && UNLIKELY(!res)) \ - ReportOutOfMemory(size, &stack); \ - return res - -#define OPERATOR_DELETE_BODY \ - GET_MALLOC_STACK_TRACE; \ - if (ptr) hwasan_free(ptr, &stack) +# define OPERATOR_NEW_BODY(nothrow) \ + GET_MALLOC_STACK_TRACE; \ + void *res = hwasan_malloc(size, &stack); \ + if (!nothrow && UNLIKELY(!res)) \ + ReportOutOfMemory(size, &stack); \ + return res +# define OPERATOR_NEW_ALIGN_BODY(nothrow) \ + GET_MALLOC_STACK_TRACE; \ + void *res = hwasan_memalign(static_cast(align), size, &stack); \ + if (!nothrow && UNLIKELY(!res)) \ + ReportOutOfMemory(size, &stack); \ + return res + +# define OPERATOR_DELETE_BODY \ + GET_MALLOC_STACK_TRACE; \ + if (ptr) \ + hwasan_free(ptr, &stack) #elif defined(__ANDROID__) @@ -44,8 +46,8 @@ // since we previously released a runtime that intercepted these functions, // removing the interceptors would break ABI. Therefore we simply forward to // malloc and free. -#define OPERATOR_NEW_BODY(nothrow) return malloc(size) -#define OPERATOR_DELETE_BODY free(ptr) +# define OPERATOR_NEW_BODY(nothrow) return malloc(size) +# define OPERATOR_DELETE_BODY free(ptr) #endif @@ -55,26 +57,27 @@ using namespace __hwasan; // Fake std::nothrow_t to avoid including . namespace std { - struct nothrow_t {}; +struct nothrow_t {}; } // namespace std - - -INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void *operator new(size_t size) { OPERATOR_NEW_BODY(false /*nothrow*/); } -INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void *operator new[](size_t size) { OPERATOR_NEW_BODY(false /*nothrow*/); } -INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void *operator new(size_t size, std::nothrow_t const&) { +INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void *operator new(size_t size) { + OPERATOR_NEW_BODY(false /*nothrow*/); +} +INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void *operator new[]( + size_t size) { + OPERATOR_NEW_BODY(false /*nothrow*/); +} +INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void *operator new( + size_t size, std::nothrow_t const &) { OPERATOR_NEW_BODY(true /*nothrow*/); } -INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void *operator new[](size_t size, std::nothrow_t const&) { +INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void *operator new[]( + size_t size, std::nothrow_t const &) { OPERATOR_NEW_BODY(true /*nothrow*/); } -INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void operator delete(void *ptr) - NOEXCEPT { +INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void operator delete( + void *ptr) NOEXCEPT { OPERATOR_DELETE_BODY; } INTERCEPTOR_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void operator delete[]( diff --git a/libsanitizer/hwasan/hwasan_preinit.cpp b/libsanitizer/hwasan/hwasan_preinit.cpp new file mode 100644 index 0000000..8c9c95f --- /dev/null +++ b/libsanitizer/hwasan/hwasan_preinit.cpp @@ -0,0 +1,23 @@ +//===-- hwasan_preinit.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of HWAddressSanitizer, an address sanity checker. +// +// Call __hwasan_init at the very early stage of process startup. +//===----------------------------------------------------------------------===// +#include "hwasan_interface_internal.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +#if SANITIZER_CAN_USE_PREINIT_ARRAY +// The symbol is called __local_hwasan_preinit, because it's not intended to +// be exported. +// This code linked into the main executable when -fsanitize=hwaddress is in +// the link flags. It can only use exported interface functions. +__attribute__((section(".preinit_array"), used)) static void ( + *__local_hwasan_preinit)(void) = __hwasan_init; +#endif diff --git a/libsanitizer/include/sanitizer/common_interface_defs.h b/libsanitizer/include/sanitizer/common_interface_defs.h index 692b8f7..ba58ad4 100644 --- a/libsanitizer/include/sanitizer/common_interface_defs.h +++ b/libsanitizer/include/sanitizer/common_interface_defs.h @@ -211,6 +211,15 @@ void __sanitizer_symbolize_pc(void *pc, const char *fmt, char *out_buf, // Same as __sanitizer_symbolize_pc, but for data section (i.e. globals). void __sanitizer_symbolize_global(void *data_ptr, const char *fmt, char *out_buf, size_t out_buf_size); +// Determine the return address. +#if !defined(_MSC_VER) || defined(__clang__) +#define __sanitizer_return_address() \ + __builtin_extract_return_addr(__builtin_return_address(0)) +#else +extern "C" void *_ReturnAddress(void); +#pragma intrinsic(_ReturnAddress) +#define __sanitizer_return_address() _ReturnAddress() +#endif /// Sets the callback to be called immediately before death on error. /// diff --git a/libsanitizer/include/sanitizer/dfsan_interface.h b/libsanitizer/include/sanitizer/dfsan_interface.h index d6209a3..8e581a6 100644 --- a/libsanitizer/include/sanitizer/dfsan_interface.h +++ b/libsanitizer/include/sanitizer/dfsan_interface.h @@ -27,6 +27,10 @@ typedef uint32_t dfsan_origin; /// Signature of the callback argument to dfsan_set_write_callback(). typedef void (*dfsan_write_callback_t)(int fd, const void *buf, size_t count); +/// Signature of the callback argument to dfsan_set_conditional_callback(). +typedef void (*dfsan_conditional_callback_t)(dfsan_label label, + dfsan_origin origin); + /// Computes the union of \c l1 and \c l2, resulting in a union label. dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2); @@ -54,6 +58,10 @@ dfsan_origin dfsan_get_origin(long data); /// Retrieves the label associated with the data at the given address. dfsan_label dfsan_read_label(const void *addr, size_t size); +/// Return the origin associated with the first taint byte in the size bytes +/// from the address addr. +dfsan_origin dfsan_read_origin_of_first_taint(const void *addr, size_t size); + /// Returns whether the given label label contains the label elem. int dfsan_has_label(dfsan_label label, dfsan_label elem); @@ -70,6 +78,19 @@ void dfsan_flush(void); /// callback executes. Pass in NULL to remove any callback. void dfsan_set_write_callback(dfsan_write_callback_t labeled_write_callback); +/// Sets a callback to be invoked on any conditional expressions which have a +/// taint label set. This can be used to find where tainted data influences +/// the behavior of the program. +/// These callbacks will only be added when -dfsan-conditional-callbacks=true. 
+void dfsan_set_conditional_callback(dfsan_conditional_callback_t callback); + +/// Conditional expressions occur during signal handlers. +/// Making callbacks that handle signals well is tricky, so when +/// -dfsan-conditional-callbacks=true, conditional expressions used in signal +/// handlers will add the labels they see into a global (bitwise-or together). +/// This function returns all label bits seen in signal handler conditions. +dfsan_label dfsan_get_labels_in_signal_conditional(); + /// Interceptor hooks. /// Whenever a dfsan's custom function is called the corresponding /// hook is called it non-zero. The hooks should be defined by the user. @@ -87,6 +108,9 @@ void dfsan_weak_hook_strncmp(void *caller_pc, const char *s1, const char *s2, /// prints description at the beginning of the trace. If origin tracking is not /// on, or the address is not labeled, it prints nothing. void dfsan_print_origin_trace(const void *addr, const char *description); +/// As above, but use an origin id from dfsan_get_origin() instead of address. +/// Does not include header line with taint label and address information. +void dfsan_print_origin_id_trace(dfsan_origin origin); /// Prints the origin trace of the label at the address \p addr to a /// pre-allocated output buffer. If origin tracking is not on, or the address is @@ -124,6 +148,10 @@ void dfsan_print_origin_trace(const void *addr, const char *description); /// return value is not less than \p out_buf_size. size_t dfsan_sprint_origin_trace(const void *addr, const char *description, char *out_buf, size_t out_buf_size); +/// As above, but use an origin id from dfsan_get_origin() instead of address. +/// Does not include header line with taint label and address information. +size_t dfsan_sprint_origin_id_trace(dfsan_origin origin, char *out_buf, + size_t out_buf_size); /// Prints the stack trace leading to this call to a pre-allocated output /// buffer. diff --git a/libsanitizer/interception/interception_win.cpp b/libsanitizer/interception/interception_win.cpp index 38b8c05..10b8933 100644 --- a/libsanitizer/interception/interception_win.cpp +++ b/libsanitizer/interception/interception_win.cpp @@ -401,6 +401,7 @@ static uptr AllocateMemoryForTrampoline(uptr image_address, size_t size) { // The following prologues cannot be patched because of the short jump // jumping to the patching region. +#if SANITIZER_WINDOWS64 // ntdll!wcslen in Win11 // 488bc1 mov rax,rcx // 0fb710 movzx edx,word ptr [rax] @@ -422,6 +423,7 @@ static const u8 kPrologueWithShortJump2[] = { 0x4c, 0x8b, 0xc1, 0x8a, 0x01, 0x48, 0xff, 0xc1, 0x84, 0xc0, 0x75, 0xf7, }; +#endif // Returns 0 on error. 
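
A hypothetical use of the conditional-callback API declared above. This assumes a clang build with -fsanitize=dataflow, origin tracking enabled for the trace call, and the -dfsan-conditional-callbacks instrumentation flag mentioned in the comment (exact flag spelling per the DFSan documentation):

  #include <sanitizer/dfsan_interface.h>
  #include <cstdio>

  static void OnTaintedBranch(dfsan_label label, dfsan_origin origin) {
    std::fprintf(stderr, "branch depends on tainted data (label %u)\n", (unsigned)label);
    if (origin)  // only meaningful when origin tracking is enabled
      dfsan_print_origin_id_trace(origin);
  }

  int main(int argc, char **) {
    dfsan_set_conditional_callback(OnTaintedBranch);
    int guarded = argc;
    dfsan_set_label(/*label=*/1, &guarded, sizeof(guarded));  // taint the value
    if (guarded > 1)  // condition on tainted data -> OnTaintedBranch fires
      std::puts("have arguments");
    return 0;
  }
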
static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { @@ -602,6 +604,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) { case 0x246c8948: // 48 89 6C 24 XX : mov QWORD ptr [rsp + XX], rbp case 0x245c8948: // 48 89 5c 24 XX : mov QWORD PTR [rsp + XX], rbx case 0x24748948: // 48 89 74 24 XX : mov QWORD PTR [rsp + XX], rsi + case 0x247c8948: // 48 89 7c 24 XX : mov QWORD PTR [rsp + XX], rdi case 0x244C8948: // 48 89 4C 24 XX : mov QWORD PTR [rsp + XX], rcx case 0x24548948: // 48 89 54 24 XX : mov QWORD PTR [rsp + XX], rdx case 0x244c894c: // 4c 89 4c 24 XX : mov QWORD PTR [rsp + XX], r9 diff --git a/libsanitizer/lsan/lsan.cpp b/libsanitizer/lsan/lsan.cpp index b6adc24..489c5ca 100644 --- a/libsanitizer/lsan/lsan.cpp +++ b/libsanitizer/lsan/lsan.cpp @@ -13,11 +13,12 @@ #include "lsan.h" -#include "sanitizer_common/sanitizer_flags.h" -#include "sanitizer_common/sanitizer_flag_parser.h" #include "lsan_allocator.h" #include "lsan_common.h" #include "lsan_thread.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_flags.h" +#include "sanitizer_common/sanitizer_interface_internal.h" bool lsan_inited; bool lsan_init_is_running; @@ -99,9 +100,7 @@ extern "C" void __lsan_init() { InitializeThreadRegistry(); InstallDeadlySignalHandlers(LsanOnDeadlySignal); InitializeMainThread(); - - if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) - Atexit(DoLeakCheck); + InstallAtExitCheckLeaks(); InitializeCoverage(common_flags()->coverage, common_flags()->coverage_dir); diff --git a/libsanitizer/lsan/lsan.h b/libsanitizer/lsan/lsan.h index 1e82ad7..757edec 100644 --- a/libsanitizer/lsan/lsan.h +++ b/libsanitizer/lsan/lsan.h @@ -13,17 +13,17 @@ #include "lsan_thread.h" #if SANITIZER_POSIX -#include "lsan_posix.h" +# include "lsan_posix.h" #elif SANITIZER_FUCHSIA -#include "lsan_fuchsia.h" +# include "lsan_fuchsia.h" #endif #include "sanitizer_common/sanitizer_flags.h" #include "sanitizer_common/sanitizer_stacktrace.h" -#define GET_STACK_TRACE(max_size, fast) \ - __sanitizer::BufferedStackTrace stack; \ - stack.Unwind(StackTrace::GetCurrentPc(), \ - GET_CURRENT_FRAME(), nullptr, fast, max_size); +#define GET_STACK_TRACE(max_size, fast) \ + __sanitizer::BufferedStackTrace stack; \ + stack.Unwind(StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(), nullptr, fast, \ + max_size); #define GET_STACK_TRACE_FATAL \ GET_STACK_TRACE(kStackTraceMax, common_flags()->fast_unwind_on_fatal) @@ -39,12 +39,14 @@ namespace __lsan { void InitializeInterceptors(); void ReplaceSystemMalloc(); void LsanOnDeadlySignal(int signo, void *siginfo, void *context); - -#define ENSURE_LSAN_INITED do { \ - CHECK(!lsan_init_is_running); \ - if (!lsan_inited) \ - __lsan_init(); \ -} while (0) +void InstallAtExitCheckLeaks(); + +#define ENSURE_LSAN_INITED \ + do { \ + CHECK(!lsan_init_is_running); \ + if (!lsan_inited) \ + __lsan_init(); \ + } while (0) } // namespace __lsan diff --git a/libsanitizer/lsan/lsan_allocator.cpp b/libsanitizer/lsan/lsan_allocator.cpp index 91e34eb..b4fd7e9 100644 --- a/libsanitizer/lsan/lsan_allocator.cpp +++ b/libsanitizer/lsan/lsan_allocator.cpp @@ -27,11 +27,11 @@ extern "C" void *memset(void *ptr, int value, uptr num); namespace __lsan { #if defined(__i386__) || defined(__arm__) -static const uptr kMaxAllowedMallocSize = 1UL << 30; +static const uptr kMaxAllowedMallocSize = 1ULL << 30; #elif defined(__mips64) || defined(__aarch64__) -static const uptr kMaxAllowedMallocSize = 4UL << 30; +static const uptr 
kMaxAllowedMallocSize = 4ULL << 30; #else -static const uptr kMaxAllowedMallocSize = 8UL << 30; +static const uptr kMaxAllowedMallocSize = 8ULL << 30; #endif static Allocator allocator; @@ -88,6 +88,11 @@ void *Allocate(const StackTrace &stack, uptr size, uptr alignment, size = 1; if (size > max_malloc_size) return ReportAllocationSizeTooBig(size, stack); + if (UNLIKELY(IsRssLimitExceeded())) { + if (AllocatorMayReturnNull()) + return nullptr; + ReportRssLimitExceeded(&stack); + } void *p = allocator.Allocate(GetAllocatorCache(), size, alignment); if (UNLIKELY(!p)) { SetAllocatorOutOfMemory(); @@ -99,7 +104,6 @@ void *Allocate(const StackTrace &stack, uptr size, uptr alignment, if (cleared && allocator.FromPrimary(p)) memset(p, 0, size); RegisterAllocation(stack, p, size); - if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(p, size); RunMallocHooks(p, size); return p; } @@ -115,7 +119,6 @@ static void *Calloc(uptr nmemb, uptr size, const StackTrace &stack) { } void Deallocate(void *p) { - if (&__sanitizer_free_hook) __sanitizer_free_hook(p); RunFreeHooks(p); RegisterDeallocation(p); allocator.Deallocate(GetAllocatorCache(), p); @@ -359,16 +362,4 @@ uptr __sanitizer_get_allocated_size(const void *p) { return GetMallocUsableSize(p); } -#if !SANITIZER_SUPPORTS_WEAK_HOOKS -// Provide default (no-op) implementation of malloc hooks. -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void __sanitizer_malloc_hook(void *ptr, uptr size) { - (void)ptr; - (void)size; -} -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void __sanitizer_free_hook(void *ptr) { - (void)ptr; -} -#endif } // extern "C" diff --git a/libsanitizer/lsan/lsan_allocator.h b/libsanitizer/lsan/lsan_allocator.h index 45c6ac4..5393304 100644 --- a/libsanitizer/lsan/lsan_allocator.h +++ b/libsanitizer/lsan/lsan_allocator.h @@ -66,12 +66,9 @@ template using PrimaryAllocatorASVT = SizeClassAllocator32>; using PrimaryAllocator = PrimaryAllocatorASVT; #elif defined(__x86_64__) || defined(__powerpc64__) || defined(__s390x__) -# if SANITIZER_FUCHSIA +# if SANITIZER_FUCHSIA || defined(__powerpc64__) const uptr kAllocatorSpace = ~(uptr)0; const uptr kAllocatorSize = 0x40000000000ULL; // 4T. -# elif defined(__powerpc64__) -const uptr kAllocatorSpace = 0xa0000000000ULL; -const uptr kAllocatorSize = 0x20000000000ULL; // 2T. #elif defined(__s390x__) const uptr kAllocatorSpace = 0x40000000000ULL; const uptr kAllocatorSize = 0x40000000000ULL; // 4T. 
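
The IsRssLimitExceeded() check added to the hwasan and lsan allocation paths above follows the same pattern: a flag maintained in sanitizer_common (flipped when the configured soft RSS limit, soft_rss_limit_mb, is observed to be exceeded) makes allocations fail, either by returning null or by dying, depending on allocator_may_return_null. A minimal mock of that guard, not the runtime's implementation:

  #include <atomic>
  #include <cstdio>
  #include <cstdlib>

  static std::atomic<bool> rss_limit_exceeded{false};  // set by a watcher in the real runtime
  static bool allocator_may_return_null = true;        // mirrors the common flag

  void *GuardedAllocate(std::size_t size) {
    if (rss_limit_exceeded.load(std::memory_order_relaxed)) {
      if (allocator_may_return_null)
        return nullptr;                                 // caller handles the failure
      std::fprintf(stderr, "allocator: RSS limit exceeded\n");
      std::abort();                                     // ReportRssLimitExceeded() is fatal
    }
    return std::malloc(size);
  }

  int main() {
    rss_limit_exceeded = true;                          // pretend the watcher fired
    if (!GuardedAllocate(1024))
      std::puts("allocation refused under RSS pressure");
    return 0;
  }
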
diff --git a/libsanitizer/lsan/lsan_common.cpp b/libsanitizer/lsan/lsan_common.cpp index 308dbb3..8d1bf11 100644 --- a/libsanitizer/lsan/lsan_common.cpp +++ b/libsanitizer/lsan/lsan_common.cpp @@ -34,7 +34,6 @@ Mutex global_mutex; Flags lsan_flags; - void DisableCounterUnderflow() { if (common_flags()->detect_leaks) { Report("Unmatched call to __lsan_enable().\n"); @@ -43,44 +42,48 @@ void DisableCounterUnderflow() { } void Flags::SetDefaults() { -#define LSAN_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; -#include "lsan_flags.inc" -#undef LSAN_FLAG +# define LSAN_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +# include "lsan_flags.inc" +# undef LSAN_FLAG } void RegisterLsanFlags(FlagParser *parser, Flags *f) { -#define LSAN_FLAG(Type, Name, DefaultValue, Description) \ - RegisterFlag(parser, #Name, Description, &f->Name); -#include "lsan_flags.inc" -#undef LSAN_FLAG +# define LSAN_FLAG(Type, Name, DefaultValue, Description) \ + RegisterFlag(parser, #Name, Description, &f->Name); +# include "lsan_flags.inc" +# undef LSAN_FLAG } -#define LOG_POINTERS(...) \ - do { \ - if (flags()->log_pointers) Report(__VA_ARGS__); \ - } while (0) +# define LOG_POINTERS(...) \ + do { \ + if (flags()->log_pointers) \ + Report(__VA_ARGS__); \ + } while (0) -#define LOG_THREADS(...) \ - do { \ - if (flags()->log_threads) Report(__VA_ARGS__); \ - } while (0) +# define LOG_THREADS(...) \ + do { \ + if (flags()->log_threads) \ + Report(__VA_ARGS__); \ + } while (0) class LeakSuppressionContext { bool parsed = false; SuppressionContext context; bool suppressed_stacks_sorted = true; InternalMmapVector suppressed_stacks; + const LoadedModule *suppress_module = nullptr; - Suppression *GetSuppressionForAddr(uptr addr); void LazyInit(); + Suppression *GetSuppressionForAddr(uptr addr); + bool SuppressInvalid(const StackTrace &stack); + bool SuppressByRule(const StackTrace &stack, uptr hit_count, uptr total_size); public: LeakSuppressionContext(const char *supprression_types[], int suppression_types_num) : context(supprression_types, suppression_types_num) {} - Suppression *GetSuppressionForStack(u32 stack_trace_id, - const StackTrace &stack); + bool Suppress(u32 stack_trace_id, uptr hit_count, uptr total_size); const InternalMmapVector &GetSortedSuppressedStacks() { if (!suppressed_stacks_sorted) { @@ -95,17 +98,17 @@ class LeakSuppressionContext { ALIGNED(64) static char suppression_placeholder[sizeof(LeakSuppressionContext)]; static LeakSuppressionContext *suppression_ctx = nullptr; static const char kSuppressionLeak[] = "leak"; -static const char *kSuppressionTypes[] = { kSuppressionLeak }; +static const char *kSuppressionTypes[] = {kSuppressionLeak}; static const char kStdSuppressions[] = -#if SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT +# if SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT // For more details refer to the SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT // definition. "leak:*pthread_exit*\n" -#endif // SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT -#if SANITIZER_MAC +# endif // SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT +# if SANITIZER_MAC // For Darwin and os_log/os_trace: https://reviews.llvm.org/D35173 "leak:*_os_trace*\n" -#endif +# endif // TLS leak in some glibc versions, described in // https://sourceware.org/bugzilla/show_bug.cgi?id=12650. 
"leak:*tls_get_addr*\n"; @@ -123,9 +126,93 @@ void LeakSuppressionContext::LazyInit() { if (&__lsan_default_suppressions) context.Parse(__lsan_default_suppressions()); context.Parse(kStdSuppressions); + if (flags()->use_tls && flags()->use_ld_allocations) + suppress_module = GetLinker(); } } +Suppression *LeakSuppressionContext::GetSuppressionForAddr(uptr addr) { + Suppression *s = nullptr; + + // Suppress by module name. + const char *module_name = Symbolizer::GetOrInit()->GetModuleNameForPc(addr); + if (!module_name) + module_name = ""; + if (context.Match(module_name, kSuppressionLeak, &s)) + return s; + + // Suppress by file or function name. + SymbolizedStack *frames = Symbolizer::GetOrInit()->SymbolizePC(addr); + for (SymbolizedStack *cur = frames; cur; cur = cur->next) { + if (context.Match(cur->info.function, kSuppressionLeak, &s) || + context.Match(cur->info.file, kSuppressionLeak, &s)) { + break; + } + } + frames->ClearAll(); + return s; +} + +static uptr GetCallerPC(const StackTrace &stack) { + // The top frame is our malloc/calloc/etc. The next frame is the caller. + if (stack.size >= 2) + return stack.trace[1]; + return 0; +} + +// On Linux, treats all chunks allocated from ld-linux.so as reachable, which +// covers dynamically allocated TLS blocks, internal dynamic loader's loaded +// modules accounting etc. +// Dynamic TLS blocks contain the TLS variables of dynamically loaded modules. +// They are allocated with a __libc_memalign() call in allocate_and_init() +// (elf/dl-tls.c). Glibc won't tell us the address ranges occupied by those +// blocks, but we can make sure they come from our own allocator by intercepting +// __libc_memalign(). On top of that, there is no easy way to reach them. Their +// addresses are stored in a dynamically allocated array (the DTV) which is +// referenced from the static TLS. Unfortunately, we can't just rely on the DTV +// being reachable from the static TLS, and the dynamic TLS being reachable from +// the DTV. This is because the initial DTV is allocated before our interception +// mechanism kicks in, and thus we don't recognize it as allocated memory. We +// can't special-case it either, since we don't know its size. +// Our solution is to include in the root set all allocations made from +// ld-linux.so (which is where allocate_and_init() is implemented). This is +// guaranteed to include all dynamic TLS blocks (and possibly other allocations +// which we don't care about). +// On all other platforms, this simply checks to ensure that the caller pc is +// valid before reporting chunks as leaked. +bool LeakSuppressionContext::SuppressInvalid(const StackTrace &stack) { + uptr caller_pc = GetCallerPC(stack); + // If caller_pc is unknown, this chunk may be allocated in a coroutine. Mark + // it as reachable, as we can't properly report its allocation stack anyway. 
+ return !caller_pc || + (suppress_module && suppress_module->containsAddress(caller_pc)); +} + +bool LeakSuppressionContext::SuppressByRule(const StackTrace &stack, + uptr hit_count, uptr total_size) { + for (uptr i = 0; i < stack.size; i++) { + Suppression *s = GetSuppressionForAddr( + StackTrace::GetPreviousInstructionPc(stack.trace[i])); + if (s) { + s->weight += total_size; + atomic_fetch_add(&s->hit_count, hit_count, memory_order_relaxed); + return true; + } + } + return false; +} + +bool LeakSuppressionContext::Suppress(u32 stack_trace_id, uptr hit_count, + uptr total_size) { + LazyInit(); + StackTrace stack = StackDepotGet(stack_trace_id); + if (!SuppressInvalid(stack) && !SuppressByRule(stack, hit_count, total_size)) + return false; + suppressed_stacks_sorted = false; + suppressed_stacks.push_back(stack_trace_id); + return true; +} + static LeakSuppressionContext *GetSuppressionContext() { CHECK(suppression_ctx); return suppression_ctx; @@ -146,9 +233,9 @@ void InitCommonLsan() { } } -class Decorator: public __sanitizer::SanitizerCommonDecorator { +class Decorator : public __sanitizer::SanitizerCommonDecorator { public: - Decorator() : SanitizerCommonDecorator() { } + Decorator() : SanitizerCommonDecorator() {} const char *Error() { return Red(); } const char *Leak() { return Blue(); } }; @@ -157,19 +244,19 @@ static inline bool CanBeAHeapPointer(uptr p) { // Since our heap is located in mmap-ed memory, we can assume a sensible lower // bound on heap addresses. const uptr kMinAddress = 4 * 4096; - if (p < kMinAddress) return false; -#if defined(__x86_64__) + if (p < kMinAddress) + return false; +# if defined(__x86_64__) // Accept only canonical form user-space addresses. return ((p >> 47) == 0); -#elif defined(__mips64) +# elif defined(__mips64) return ((p >> 40) == 0); -#elif defined(__aarch64__) - unsigned runtimeVMA = - (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1); +# elif defined(__aarch64__) + unsigned runtimeVMA = (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1); return ((p >> runtimeVMA) == 0); -#else +# else return true; -#endif +# endif } // Scans the memory range, looking for byte patterns that point into allocator @@ -178,8 +265,7 @@ static inline bool CanBeAHeapPointer(uptr p) { // (|tag| = kReachable) and finding indirectly leaked chunks // (|tag| = kIndirectlyLeaked). In the second case, there's no flood fill, // so |frontier| = 0. -void ScanRangeForPointers(uptr begin, uptr end, - Frontier *frontier, +void ScanRangeForPointers(uptr begin, uptr end, Frontier *frontier, const char *region_type, ChunkTag tag) { CHECK(tag == kReachable || tag == kIndirectlyLeaked); const uptr alignment = flags()->pointer_alignment(); @@ -190,13 +276,17 @@ void ScanRangeForPointers(uptr begin, uptr end, pp = pp + alignment - pp % alignment; for (; pp + sizeof(void *) <= end; pp += alignment) { void *p = *reinterpret_cast(pp); - if (!CanBeAHeapPointer(reinterpret_cast(p))) continue; + if (!CanBeAHeapPointer(reinterpret_cast(p))) + continue; uptr chunk = PointsIntoChunk(p); - if (!chunk) continue; + if (!chunk) + continue; // Pointers to self don't count. This matters when tag == kIndirectlyLeaked. - if (chunk == begin) continue; + if (chunk == begin) + continue; LsanMetadata m(chunk); - if (m.tag() == kReachable || m.tag() == kIgnored) continue; + if (m.tag() == kReachable || m.tag() == kIgnored) + continue; // Do this check relatively late so we can log only the interesting cases. 
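
For users, the suppression machinery refactored above is still driven by "leak:<pattern>" rules, matched against the module, function, or file names of frames in the allocation stack. Rules are typically supplied via LSAN_OPTIONS=suppressions=<file>, or baked in through the weak hook parsed by LazyInit(); a hypothetical rule set (the names are made up):

  // Overrides the weak default and is parsed by LeakSuppressionContext::LazyInit().
  extern "C" const char *__lsan_default_suppressions() {
    return
        "leak:libthirdparty.so\n"   // hypothetical module name
        "leak:LegacyPoolInit\n";    // hypothetical function name
  }
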
if (!flags()->use_poisoned && WordIsPoisoned(pp)) { @@ -234,23 +324,23 @@ void ScanGlobalRange(uptr begin, uptr end, Frontier *frontier) { } } -void ForEachExtraStackRangeCb(uptr begin, uptr end, void* arg) { +void ForEachExtraStackRangeCb(uptr begin, uptr end, void *arg) { Frontier *frontier = reinterpret_cast(arg); ScanRangeForPointers(begin, end, frontier, "FAKE STACK", kReachable); } -#if SANITIZER_FUCHSIA +# if SANITIZER_FUCHSIA // Fuchsia handles all threads together with its own callback. static void ProcessThreads(SuspendedThreadsList const &, Frontier *) {} -#else +# else -#if SANITIZER_ANDROID +# if SANITIZER_ANDROID // FIXME: Move this out into *libcdep.cpp extern "C" SANITIZER_WEAK_ATTRIBUTE void __libc_iterate_dynamic_tls( pid_t, void (*cb)(void *, void *, uptr, void *), void *); -#endif +# endif static void ProcessThreadRegistry(Frontier *frontier) { InternalMmapVector ptrs; @@ -282,9 +372,9 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, LOG_THREADS("Processing thread %llu.\n", os_id); uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end; DTLS *dtls; - bool thread_found = GetThreadRangesLocked(os_id, &stack_begin, &stack_end, - &tls_begin, &tls_end, - &cache_begin, &cache_end, &dtls); + bool thread_found = + GetThreadRangesLocked(os_id, &stack_begin, &stack_end, &tls_begin, + &tls_end, &cache_begin, &cache_end, &dtls); if (!thread_found) { // If a thread can't be found in the thread registry, it's probably in the // process of destruction. Log this event and move on. @@ -298,7 +388,8 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, Report("Unable to get registers from thread %llu.\n", os_id); // If unable to get SP, consider the entire stack to be reachable unless // GetRegistersAndSP failed with ESRCH. - if (have_registers == REGISTERS_UNAVAILABLE_FATAL) continue; + if (have_registers == REGISTERS_UNAVAILABLE_FATAL) + continue; sp = stack_begin; } @@ -353,7 +444,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, kReachable); } } -#if SANITIZER_ANDROID +# if SANITIZER_ANDROID auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/, void *arg) -> void { ScanRangeForPointers(reinterpret_cast(dtls_begin), @@ -366,7 +457,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, // thread is suspended in the middle of updating its DTLS. IOWs, we // could scan already freed memory. (probably fine for now) __libc_iterate_dynamic_tls(os_id, cb, frontier); -#else +# else if (dtls && !DTLSInDestruction(dtls)) { ForEachDVT(dtls, [&](const DTLS::DTV &dtv, int id) { uptr dtls_beg = dtv.beg; @@ -383,7 +474,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, // this and continue. 
LOG_THREADS("Thread %llu has DTLS under destruction.\n", os_id); } -#endif +# endif } } @@ -391,13 +482,14 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, ProcessThreadRegistry(frontier); } -#endif // SANITIZER_FUCHSIA +# endif // SANITIZER_FUCHSIA void ScanRootRegion(Frontier *frontier, const RootRegion &root_region, uptr region_begin, uptr region_end, bool is_readable) { uptr intersection_begin = Max(root_region.begin, region_begin); uptr intersection_end = Min(region_end, root_region.begin + root_region.size); - if (intersection_begin >= intersection_end) return; + if (intersection_begin >= intersection_end) + return; LOG_POINTERS("Root region %p-%p intersects with mapped region %p-%p (%s)\n", (void *)root_region.begin, (void *)(root_region.begin + root_region.size), @@ -420,7 +512,8 @@ static void ProcessRootRegion(Frontier *frontier, // Scans root regions for heap pointers. static void ProcessRootRegions(Frontier *frontier) { - if (!flags()->use_root_regions) return; + if (!flags()->use_root_regions) + return; for (uptr i = 0; i < root_regions.size(); i++) ProcessRootRegion(frontier, root_regions[i]); } @@ -477,68 +570,6 @@ static void CollectIgnoredCb(uptr chunk, void *arg) { } } -static uptr GetCallerPC(const StackTrace &stack) { - // The top frame is our malloc/calloc/etc. The next frame is the caller. - if (stack.size >= 2) - return stack.trace[1]; - return 0; -} - -struct InvalidPCParam { - Frontier *frontier; - bool skip_linker_allocations; -}; - -// ForEachChunk callback. If the caller pc is invalid or is within the linker, -// mark as reachable. Called by ProcessPlatformSpecificAllocations. -static void MarkInvalidPCCb(uptr chunk, void *arg) { - CHECK(arg); - InvalidPCParam *param = reinterpret_cast(arg); - chunk = GetUserBegin(chunk); - LsanMetadata m(chunk); - if (m.allocated() && m.tag() != kReachable && m.tag() != kIgnored) { - u32 stack_id = m.stack_trace_id(); - uptr caller_pc = 0; - if (stack_id > 0) - caller_pc = GetCallerPC(StackDepotGet(stack_id)); - // If caller_pc is unknown, this chunk may be allocated in a coroutine. Mark - // it as reachable, as we can't properly report its allocation stack anyway. - if (caller_pc == 0 || (param->skip_linker_allocations && - GetLinker()->containsAddress(caller_pc))) { - m.set_tag(kReachable); - param->frontier->push_back(chunk); - } - } -} - -// On Linux, treats all chunks allocated from ld-linux.so as reachable, which -// covers dynamically allocated TLS blocks, internal dynamic loader's loaded -// modules accounting etc. -// Dynamic TLS blocks contain the TLS variables of dynamically loaded modules. -// They are allocated with a __libc_memalign() call in allocate_and_init() -// (elf/dl-tls.c). Glibc won't tell us the address ranges occupied by those -// blocks, but we can make sure they come from our own allocator by intercepting -// __libc_memalign(). On top of that, there is no easy way to reach them. Their -// addresses are stored in a dynamically allocated array (the DTV) which is -// referenced from the static TLS. Unfortunately, we can't just rely on the DTV -// being reachable from the static TLS, and the dynamic TLS being reachable from -// the DTV. This is because the initial DTV is allocated before our interception -// mechanism kicks in, and thus we don't recognize it as allocated memory. We -// can't special-case it either, since we don't know its size. 
-// Our solution is to include in the root set all allocations made from -// ld-linux.so (which is where allocate_and_init() is implemented). This is -// guaranteed to include all dynamic TLS blocks (and possibly other allocations -// which we don't care about). -// On all other platforms, this simply checks to ensure that the caller pc is -// valid before reporting chunks as leaked. -static void ProcessPC(Frontier *frontier) { - InvalidPCParam arg; - arg.frontier = frontier; - arg.skip_linker_allocations = - flags()->use_tls && flags()->use_ld_allocations && GetLinker() != nullptr; - ForEachChunk(MarkInvalidPCCb, &arg); -} - // Sets the appropriate tag on each chunk. static void ClassifyAllChunks(SuspendedThreadsList const &suspended_threads, Frontier *frontier) { @@ -554,9 +585,6 @@ static void ClassifyAllChunks(SuspendedThreadsList const &suspended_threads, ProcessRootRegions(frontier); FloodFillTag(frontier, kReachable); - CHECK_EQ(0, frontier->size()); - ProcessPC(frontier); - // The check here is relatively expensive, so we do this in a separate flood // fill. That way we can skip the check for chunks that are reachable // otherwise. @@ -583,14 +611,13 @@ static void ResetTagsCb(uptr chunk, void *arg) { // a LeakReport. static void CollectLeaksCb(uptr chunk, void *arg) { CHECK(arg); - LeakReport *leak_report = reinterpret_cast(arg); + LeakedChunks *leaks = reinterpret_cast(arg); chunk = GetUserBegin(chunk); LsanMetadata m(chunk); - if (!m.allocated()) return; - if (m.tag() == kDirectlyLeaked || m.tag() == kIndirectlyLeaked) { - leak_report->AddLeakedChunk(chunk, m.stack_trace_id(), m.requested_size(), - m.tag()); - } + if (!m.allocated()) + return; + if (m.tag() == kDirectlyLeaked || m.tag() == kIndirectlyLeaked) + leaks->push_back({chunk, m.stack_trace_id(), m.requested_size(), m.tag()}); } void LeakSuppressionContext::PrintMatchedSuppressions() { @@ -622,13 +649,13 @@ static void ReportIfNotSuspended(ThreadContextBase *tctx, void *arg) { } } -#if SANITIZER_FUCHSIA +# if SANITIZER_FUCHSIA // Fuchsia provides a libc interface that guarantees all threads are // covered, and SuspendedThreadList is never really used. static void ReportUnsuspendedThreads(const SuspendedThreadsList &) {} -#else // !SANITIZER_FUCHSIA +# else // !SANITIZER_FUCHSIA static void ReportUnsuspendedThreads( const SuspendedThreadsList &suspended_threads) { @@ -642,7 +669,7 @@ static void ReportUnsuspendedThreads( &ReportIfNotSuspended, &threads); } -#endif // !SANITIZER_FUCHSIA +# endif // !SANITIZER_FUCHSIA static void CheckForLeaksCallback(const SuspendedThreadsList &suspended_threads, void *arg) { @@ -651,7 +678,7 @@ static void CheckForLeaksCallback(const SuspendedThreadsList &suspended_threads, CHECK(!param->success); ReportUnsuspendedThreads(suspended_threads); ClassifyAllChunks(suspended_threads, ¶m->frontier); - ForEachChunk(CollectLeaksCb, ¶m->leak_report); + ForEachChunk(CollectLeaksCb, ¶m->leaks); // Clean up for subsequent leak checks. This assumes we did not overwrite any // kIgnored tags. ForEachChunk(ResetTagsCb, nullptr); @@ -700,17 +727,20 @@ static bool CheckForLeaks() { "etc)\n"); Die(); } + LeakReport leak_report; + leak_report.AddLeakedChunks(param.leaks); + // No new suppressions stacks, so rerun will not help and we can report. - if (!param.leak_report.ApplySuppressions()) - return PrintResults(param.leak_report); + if (!leak_report.ApplySuppressions()) + return PrintResults(leak_report); // No indirect leaks to report, so we are done here. 
- if (!param.leak_report.IndirectUnsuppressedLeakCount()) - return PrintResults(param.leak_report); + if (!leak_report.IndirectUnsuppressedLeakCount()) + return PrintResults(leak_report); if (i >= 8) { Report("WARNING: LeakSanitizer gave up on indirect leaks suppression.\n"); - return PrintResults(param.leak_report); + return PrintResults(leak_report); } // We found a new previously unseen suppressed call stack. Rerun to make @@ -726,10 +756,12 @@ bool HasReportedLeaks() { return has_reported_leaks; } void DoLeakCheck() { Lock l(&global_mutex); static bool already_done; - if (already_done) return; + if (already_done) + return; already_done = true; has_reported_leaks = CheckForLeaks(); - if (has_reported_leaks) HandleLeaks(); + if (has_reported_leaks) + HandleLeaks(); } static int DoRecoverableLeakCheck() { @@ -740,80 +772,50 @@ static int DoRecoverableLeakCheck() { void DoRecoverableLeakCheckVoid() { DoRecoverableLeakCheck(); } -Suppression *LeakSuppressionContext::GetSuppressionForAddr(uptr addr) { - Suppression *s = nullptr; - - // Suppress by module name. - if (const char *module_name = - Symbolizer::GetOrInit()->GetModuleNameForPc(addr)) - if (context.Match(module_name, kSuppressionLeak, &s)) - return s; - - // Suppress by file or function name. - SymbolizedStack *frames = Symbolizer::GetOrInit()->SymbolizePC(addr); - for (SymbolizedStack *cur = frames; cur; cur = cur->next) { - if (context.Match(cur->info.function, kSuppressionLeak, &s) || - context.Match(cur->info.file, kSuppressionLeak, &s)) { - break; - } - } - frames->ClearAll(); - return s; -} - -Suppression *LeakSuppressionContext::GetSuppressionForStack( - u32 stack_trace_id, const StackTrace &stack) { - LazyInit(); - for (uptr i = 0; i < stack.size; i++) { - Suppression *s = GetSuppressionForAddr( - StackTrace::GetPreviousInstructionPc(stack.trace[i])); - if (s) { - suppressed_stacks_sorted = false; - suppressed_stacks.push_back(stack_trace_id); - return s; - } - } - return nullptr; -} - ///// LeakReport implementation. ///// // A hard limit on the number of distinct leaks, to avoid quadratic complexity // in LeakReport::AddLeakedChunk(). We don't expect to ever see this many leaks // in real-world applications. -// FIXME: Get rid of this limit by changing the implementation of LeakReport to -// use a hash table. +// FIXME: Get rid of this limit by moving logic into DedupLeaks. 
const uptr kMaxLeaksConsidered = 5000; -void LeakReport::AddLeakedChunk(uptr chunk, u32 stack_trace_id, - uptr leaked_size, ChunkTag tag) { - CHECK(tag == kDirectlyLeaked || tag == kIndirectlyLeaked); - - if (u32 resolution = flags()->resolution) { - StackTrace stack = StackDepotGet(stack_trace_id); - stack.size = Min(stack.size, resolution); - stack_trace_id = StackDepotPut(stack); - } +void LeakReport::AddLeakedChunks(const LeakedChunks &chunks) { + for (const LeakedChunk &leak : chunks) { + uptr chunk = leak.chunk; + u32 stack_trace_id = leak.stack_trace_id; + uptr leaked_size = leak.leaked_size; + ChunkTag tag = leak.tag; + CHECK(tag == kDirectlyLeaked || tag == kIndirectlyLeaked); + + if (u32 resolution = flags()->resolution) { + StackTrace stack = StackDepotGet(stack_trace_id); + stack.size = Min(stack.size, resolution); + stack_trace_id = StackDepotPut(stack); + } - bool is_directly_leaked = (tag == kDirectlyLeaked); - uptr i; - for (i = 0; i < leaks_.size(); i++) { - if (leaks_[i].stack_trace_id == stack_trace_id && - leaks_[i].is_directly_leaked == is_directly_leaked) { - leaks_[i].hit_count++; - leaks_[i].total_size += leaked_size; - break; + bool is_directly_leaked = (tag == kDirectlyLeaked); + uptr i; + for (i = 0; i < leaks_.size(); i++) { + if (leaks_[i].stack_trace_id == stack_trace_id && + leaks_[i].is_directly_leaked == is_directly_leaked) { + leaks_[i].hit_count++; + leaks_[i].total_size += leaked_size; + break; + } + } + if (i == leaks_.size()) { + if (leaks_.size() == kMaxLeaksConsidered) + return; + Leak leak = {next_id_++, /* hit_count */ 1, + leaked_size, stack_trace_id, + is_directly_leaked, /* is_suppressed */ false}; + leaks_.push_back(leak); + } + if (flags()->report_objects) { + LeakedObject obj = {leaks_[i].id, chunk, leaked_size}; + leaked_objects_.push_back(obj); } - } - if (i == leaks_.size()) { - if (leaks_.size() == kMaxLeaksConsidered) return; - Leak leak = { next_id_++, /* hit_count */ 1, leaked_size, stack_trace_id, - is_directly_leaked, /* is_suppressed */ false }; - leaks_.push_back(leak); - } - if (flags()->report_objects) { - LeakedObject obj = {leaks_[i].id, chunk, leaked_size}; - leaked_objects_.push_back(obj); } } @@ -828,9 +830,10 @@ void LeakReport::ReportTopLeaks(uptr num_leaks_to_report) { CHECK(leaks_.size() <= kMaxLeaksConsidered); Printf("\n"); if (leaks_.size() == kMaxLeaksConsidered) - Printf("Too many leaks! Only the first %zu leaks encountered will be " - "reported.\n", - kMaxLeaksConsidered); + Printf( + "Too many leaks! 
Only the first %zu leaks encountered will be " + "reported.\n", + kMaxLeaksConsidered); uptr unsuppressed_count = UnsuppressedLeakCount(); if (num_leaks_to_report > 0 && num_leaks_to_report < unsuppressed_count) @@ -838,10 +841,12 @@ void LeakReport::ReportTopLeaks(uptr num_leaks_to_report) { Sort(leaks_.data(), leaks_.size(), &LeakComparator); uptr leaks_reported = 0; for (uptr i = 0; i < leaks_.size(); i++) { - if (leaks_[i].is_suppressed) continue; + if (leaks_[i].is_suppressed) + continue; PrintReportForLeak(i); leaks_reported++; - if (leaks_reported == num_leaks_to_report) break; + if (leaks_reported == num_leaks_to_report) + break; } if (leaks_reported < unsuppressed_count) { uptr remaining = unsuppressed_count - leaks_reported; @@ -880,9 +885,10 @@ void LeakReport::PrintSummary() { CHECK(leaks_.size() <= kMaxLeaksConsidered); uptr bytes = 0, allocations = 0; for (uptr i = 0; i < leaks_.size(); i++) { - if (leaks_[i].is_suppressed) continue; - bytes += leaks_[i].total_size; - allocations += leaks_[i].hit_count; + if (leaks_[i].is_suppressed) + continue; + bytes += leaks_[i].total_size; + allocations += leaks_[i].hit_count; } InternalScopedString summary; summary.append("%zu byte(s) leaked in %zu allocation(s).", bytes, @@ -894,12 +900,8 @@ uptr LeakReport::ApplySuppressions() { LeakSuppressionContext *suppressions = GetSuppressionContext(); uptr new_suppressions = false; for (uptr i = 0; i < leaks_.size(); i++) { - Suppression *s = suppressions->GetSuppressionForStack( - leaks_[i].stack_trace_id, StackDepotGet(leaks_[i].stack_trace_id)); - if (s) { - s->weight += leaks_[i].total_size; - atomic_store_relaxed(&s->hit_count, atomic_load_relaxed(&s->hit_count) + - leaks_[i].hit_count); + if (suppressions->Suppress(leaks_[i].stack_trace_id, leaks_[i].hit_count, + leaks_[i].total_size)) { leaks_[i].is_suppressed = true; ++new_suppressions; } @@ -910,7 +912,8 @@ uptr LeakReport::ApplySuppressions() { uptr LeakReport::UnsuppressedLeakCount() { uptr result = 0; for (uptr i = 0; i < leaks_.size(); i++) - if (!leaks_[i].is_suppressed) result++; + if (!leaks_[i].is_suppressed) + result++; return result; } @@ -922,16 +925,16 @@ uptr LeakReport::IndirectUnsuppressedLeakCount() { return result; } -} // namespace __lsan -#else // CAN_SANITIZE_LEAKS +} // namespace __lsan +#else // CAN_SANITIZE_LEAKS namespace __lsan { -void InitCommonLsan() { } -void DoLeakCheck() { } -void DoRecoverableLeakCheckVoid() { } -void DisableInThisThread() { } -void EnableInThisThread() { } -} -#endif // CAN_SANITIZE_LEAKS +void InitCommonLsan() {} +void DoLeakCheck() {} +void DoRecoverableLeakCheckVoid() {} +void DisableInThisThread() {} +void EnableInThisThread() {} +} // namespace __lsan +#endif // CAN_SANITIZE_LEAKS using namespace __lsan; @@ -948,11 +951,13 @@ void __lsan_ignore_object(const void *p) { if (res == kIgnoreObjectInvalid) VReport(1, "__lsan_ignore_object(): no heap object found at %p", p); if (res == kIgnoreObjectAlreadyIgnored) - VReport(1, "__lsan_ignore_object(): " - "heap object at %p is already being ignored\n", p); + VReport(1, + "__lsan_ignore_object(): " + "heap object at %p is already being ignored\n", + p); if (res == kIgnoreObjectSuccess) VReport(1, "__lsan_ignore_object(): ignoring heap object at %p\n", p); -#endif // CAN_SANITIZE_LEAKS +#endif // CAN_SANITIZE_LEAKS } SANITIZER_INTERFACE_ATTRIBUTE @@ -962,7 +967,7 @@ void __lsan_register_root_region(const void *begin, uptr size) { RootRegion region = {reinterpret_cast(begin), size}; root_regions.push_back(region); VReport(1, "Registered root 
region at %p of size %zu\n", begin, size); -#endif // CAN_SANITIZE_LEAKS +#endif // CAN_SANITIZE_LEAKS } SANITIZER_INTERFACE_ATTRIBUTE @@ -988,7 +993,7 @@ void __lsan_unregister_root_region(const void *begin, uptr size) { begin, size); Die(); } -#endif // CAN_SANITIZE_LEAKS +#endif // CAN_SANITIZE_LEAKS } SANITIZER_INTERFACE_ATTRIBUTE @@ -1010,7 +1015,7 @@ void __lsan_do_leak_check() { #if CAN_SANITIZE_LEAKS if (common_flags()->detect_leaks) __lsan::DoLeakCheck(); -#endif // CAN_SANITIZE_LEAKS +#endif // CAN_SANITIZE_LEAKS } SANITIZER_INTERFACE_ATTRIBUTE @@ -1018,7 +1023,7 @@ int __lsan_do_recoverable_leak_check() { #if CAN_SANITIZE_LEAKS if (common_flags()->detect_leaks) return __lsan::DoRecoverableLeakCheck(); -#endif // CAN_SANITIZE_LEAKS +#endif // CAN_SANITIZE_LEAKS return 0; } @@ -1027,14 +1032,14 @@ SANITIZER_INTERFACE_WEAK_DEF(const char *, __lsan_default_options, void) { } #if !SANITIZER_SUPPORTS_WEAK_HOOKS -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -int __lsan_is_turned_off() { +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE int +__lsan_is_turned_off() { return 0; } -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -const char *__lsan_default_suppressions() { +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE const char * +__lsan_default_suppressions() { return ""; } #endif -} // extern "C" +} // extern "C" diff --git a/libsanitizer/lsan/lsan_common.h b/libsanitizer/lsan/lsan_common.h index f9b55e4..6b06c45 100644 --- a/libsanitizer/lsan/lsan_common.h +++ b/libsanitizer/lsan/lsan_common.h @@ -33,21 +33,21 @@ // Exclude leak-detection on arm32 for Android because `__aeabi_read_tp` // is missing. This caused a link error. #if SANITIZER_ANDROID && (__ANDROID_API__ < 28 || defined(__arm__)) -#define CAN_SANITIZE_LEAKS 0 +# define CAN_SANITIZE_LEAKS 0 #elif (SANITIZER_LINUX || SANITIZER_MAC) && (SANITIZER_WORDSIZE == 64) && \ (defined(__x86_64__) || defined(__mips64) || defined(__aarch64__) || \ defined(__powerpc64__) || defined(__s390x__)) -#define CAN_SANITIZE_LEAKS 1 +# define CAN_SANITIZE_LEAKS 1 #elif defined(__i386__) && (SANITIZER_LINUX || SANITIZER_MAC) -#define CAN_SANITIZE_LEAKS 1 +# define CAN_SANITIZE_LEAKS 1 #elif defined(__arm__) && SANITIZER_LINUX -#define CAN_SANITIZE_LEAKS 1 +# define CAN_SANITIZE_LEAKS 1 #elif SANITIZER_RISCV64 && SANITIZER_LINUX -#define CAN_SANITIZE_LEAKS 1 +# define CAN_SANITIZE_LEAKS 1 #elif SANITIZER_NETBSD || SANITIZER_FUCHSIA -#define CAN_SANITIZE_LEAKS 1 +# define CAN_SANITIZE_LEAKS 1 #else -#define CAN_SANITIZE_LEAKS 0 +# define CAN_SANITIZE_LEAKS 0 #endif namespace __sanitizer { @@ -82,6 +82,15 @@ extern Flags lsan_flags; inline Flags *flags() { return &lsan_flags; } void RegisterLsanFlags(FlagParser *parser, Flags *f); +struct LeakedChunk { + uptr chunk; + u32 stack_trace_id; + uptr leaked_size; + ChunkTag tag; +}; + +using LeakedChunks = InternalMmapVector; + struct Leak { u32 id; uptr hit_count; @@ -101,8 +110,7 @@ struct LeakedObject { class LeakReport { public: LeakReport() {} - void AddLeakedChunk(uptr chunk, u32 stack_trace_id, uptr leaked_size, - ChunkTag tag); + void AddLeakedChunks(const LeakedChunks &chunks); void ReportTopLeaks(uptr max_leaks); void PrintSummary(); uptr ApplySuppressions(); @@ -136,7 +144,7 @@ struct RootRegion { // threads and enumerating roots. struct CheckForLeaksParam { Frontier frontier; - LeakReport leak_report; + LeakedChunks leaks; bool success = false; }; @@ -222,8 +230,24 @@ void UnlockAllocator(); // Returns true if [addr, addr + sizeof(void *)) is poisoned. 
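
The exported __lsan_* functions above are declared in <sanitizer/lsan_interface.h>; a hypothetical program exercising them might look like this, assuming a build with -fsanitize=leak or -fsanitize=address:

  #include <sanitizer/lsan_interface.h>
  #include <cstdlib>

  int main() {
    // Ask LSan to scan this buffer for pointers when classifying chunks,
    // e.g. because it is handed to code the scanner cannot otherwise see.
    void *arena = std::malloc(1 << 20);
    __lsan_register_root_region(arena, 1 << 20);

    // An intentionally immortal object: mark it ignored rather than suppressed.
    void *singleton = std::malloc(64);
    __lsan_ignore_object(singleton);

    // Non-fatal, mid-run check; returns non-zero if leaks were reported.
    int leaked = __lsan_do_recoverable_leak_check();

    __lsan_unregister_root_region(arena, 1 << 20);
    std::free(arena);
    return leaked ? 1 : 0;
  }
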
bool WordIsPoisoned(uptr addr); // Wrappers for ThreadRegistry access. -void LockThreadRegistry() NO_THREAD_SAFETY_ANALYSIS; -void UnlockThreadRegistry() NO_THREAD_SAFETY_ANALYSIS; +void LockThreadRegistry() SANITIZER_NO_THREAD_SAFETY_ANALYSIS; +void UnlockThreadRegistry() SANITIZER_NO_THREAD_SAFETY_ANALYSIS; + +struct ScopedStopTheWorldLock { + ScopedStopTheWorldLock() { + LockThreadRegistry(); + LockAllocator(); + } + + ~ScopedStopTheWorldLock() { + UnlockAllocator(); + UnlockThreadRegistry(); + } + + ScopedStopTheWorldLock &operator=(const ScopedStopTheWorldLock &) = delete; + ScopedStopTheWorldLock(const ScopedStopTheWorldLock &) = delete; +}; + ThreadRegistry *GetThreadRegistryLocked(); bool GetThreadRangesLocked(tid_t os_id, uptr *stack_begin, uptr *stack_end, uptr *tls_begin, uptr *tls_end, uptr *cache_begin, diff --git a/libsanitizer/lsan/lsan_common_fuchsia.cpp b/libsanitizer/lsan/lsan_common_fuchsia.cpp index 2d35fa5..edb4ca6 100644 --- a/libsanitizer/lsan/lsan_common_fuchsia.cpp +++ b/libsanitizer/lsan/lsan_common_fuchsia.cpp @@ -52,14 +52,22 @@ void ProcessPlatformSpecificAllocations(Frontier *frontier) {} // behavior and causes rare race conditions. void HandleLeaks() {} +// This is defined differently in asan_fuchsia.cpp and lsan_fuchsia.cpp. +bool UseExitcodeOnLeak(); + int ExitHook(int status) { + if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) { + if (UseExitcodeOnLeak()) + DoLeakCheck(); + else + DoRecoverableLeakCheckVoid(); + } return status == 0 && HasReportedLeaks() ? common_flags()->exitcode : status; } void LockStuffAndStopTheWorld(StopTheWorldCallback callback, CheckForLeaksParam *argument) { - LockThreadRegistry(); - LockAllocator(); + ScopedStopTheWorldLock lock; struct Params { InternalMmapVector allocator_caches; @@ -149,9 +157,6 @@ void LockStuffAndStopTheWorld(StopTheWorldCallback callback, params->callback(SuspendedThreadsListFuchsia(), params->argument); }, ¶ms); - - UnlockAllocator(); - UnlockThreadRegistry(); } } // namespace __lsan diff --git a/libsanitizer/lsan/lsan_common_linux.cpp b/libsanitizer/lsan/lsan_common_linux.cpp index 3af586e..692ad35 100644 --- a/libsanitizer/lsan/lsan_common_linux.cpp +++ b/libsanitizer/lsan/lsan_common_linux.cpp @@ -122,12 +122,9 @@ void HandleLeaks() { static int LockStuffAndStopTheWorldCallback(struct dl_phdr_info *info, size_t size, void *data) { - LockThreadRegistry(); - LockAllocator(); + ScopedStopTheWorldLock lock; DoStopTheWorldParam *param = reinterpret_cast(data); StopTheWorld(param->callback, param->argument); - UnlockAllocator(); - UnlockThreadRegistry(); return 1; } diff --git a/libsanitizer/lsan/lsan_common_mac.cpp b/libsanitizer/lsan/lsan_common_mac.cpp index 4301dcc..a420474 100644 --- a/libsanitizer/lsan/lsan_common_mac.cpp +++ b/libsanitizer/lsan/lsan_common_mac.cpp @@ -143,16 +143,16 @@ void ProcessGlobalRegions(Frontier *frontier) { } void ProcessPlatformSpecificAllocations(Frontier *frontier) { - unsigned depth = 1; - vm_size_t size = 0; vm_address_t address = 0; kern_return_t err = KERN_SUCCESS; - mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; InternalMmapVectorNoCtor const *root_regions = GetRootRegions(); while (err == KERN_SUCCESS) { + vm_size_t size = 0; + unsigned depth = 1; struct vm_region_submap_info_64 info; + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; err = vm_region_recurse_64(mach_task_self(), &address, &size, &depth, (vm_region_info_t)&info, &count); @@ -195,11 +195,8 @@ void HandleLeaks() {} void 
LockStuffAndStopTheWorld(StopTheWorldCallback callback, CheckForLeaksParam *argument) { - LockThreadRegistry(); - LockAllocator(); + ScopedStopTheWorldLock lock; StopTheWorld(callback, argument); - UnlockAllocator(); - UnlockThreadRegistry(); } } // namespace __lsan diff --git a/libsanitizer/lsan/lsan_fuchsia.cpp b/libsanitizer/lsan/lsan_fuchsia.cpp index 40e65c6..2d96206 100644 --- a/libsanitizer/lsan/lsan_fuchsia.cpp +++ b/libsanitizer/lsan/lsan_fuchsia.cpp @@ -62,7 +62,7 @@ void InitializeMainThread() { OnCreatedArgs args; __sanitizer::GetThreadStackTopAndBottom(true, &args.stack_end, &args.stack_begin); - u32 tid = ThreadCreate(0, GetThreadSelf(), true, &args); + u32 tid = ThreadCreate(kMainTid, true, &args); CHECK_EQ(tid, 0); ThreadStart(tid); } @@ -76,6 +76,13 @@ void GetAllThreadAllocatorCachesLocked(InternalMmapVector *caches) { caches); } +// On Fuchsia, leak detection is done by a special hook after atexit hooks. +// So this doesn't install any atexit hook like on other platforms. +void InstallAtExitCheckLeaks() {} + +// ASan defines this to check its `halt_on_error` flag. +bool UseExitcodeOnLeak() { return true; } + } // namespace __lsan // These are declared (in extern "C") by . @@ -86,14 +93,13 @@ void GetAllThreadAllocatorCachesLocked(InternalMmapVector *caches) { void *__sanitizer_before_thread_create_hook(thrd_t thread, bool detached, const char *name, void *stack_base, size_t stack_size) { - uptr user_id = reinterpret_cast(thread); ENSURE_LSAN_INITED; EnsureMainThreadIDIsCorrect(); OnCreatedArgs args; args.stack_begin = reinterpret_cast(stack_base); args.stack_end = args.stack_begin + stack_size; u32 parent_tid = GetCurrentThread(); - u32 tid = ThreadCreate(parent_tid, user_id, detached, &args); + u32 tid = ThreadCreate(parent_tid, detached, &args); return reinterpret_cast(static_cast(tid)); } diff --git a/libsanitizer/lsan/lsan_interceptors.cpp b/libsanitizer/lsan/lsan_interceptors.cpp index 22999d5..205e856 100644 --- a/libsanitizer/lsan/lsan_interceptors.cpp +++ b/libsanitizer/lsan/lsan_interceptors.cpp @@ -468,8 +468,7 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr, res = REAL(pthread_create)(th, attr, __lsan_thread_start_func, &p); } if (res == 0) { - int tid = ThreadCreate(GetCurrentThread(), *(uptr *)th, - IsStateDetached(detached)); + int tid = ThreadCreate(GetCurrentThread(), IsStateDetached(detached)); CHECK_NE(tid, kMainTid); atomic_store(&p.tid, tid, memory_order_release); while (atomic_load(&p.tid, memory_order_acquire) != 0) @@ -480,23 +479,11 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr, return res; } -INTERCEPTOR(int, pthread_join, void *th, void **ret) { - ENSURE_LSAN_INITED; - int tid = ThreadTid((uptr)th); - int res = REAL(pthread_join)(th, ret); - if (res == 0) - ThreadJoin(tid); - return res; +INTERCEPTOR(int, pthread_join, void *t, void **arg) { + return REAL(pthread_join)(t, arg); } -INTERCEPTOR(int, pthread_detach, void *th) { - ENSURE_LSAN_INITED; - int tid = ThreadTid((uptr)th); - int res = REAL(pthread_detach)(th); - if (res == 0) - ThreadDetach(tid); - return res; -} +DEFINE_REAL_PTHREAD_FUNCTIONS INTERCEPTOR(void, _exit, int status) { if (status == 0 && HasReportedLeaks()) status = common_flags()->exitcode; @@ -530,7 +517,6 @@ void InitializeInterceptors() { LSAN_MAYBE_INTERCEPT_MALLINFO; LSAN_MAYBE_INTERCEPT_MALLOPT; INTERCEPT_FUNCTION(pthread_create); - INTERCEPT_FUNCTION(pthread_detach); INTERCEPT_FUNCTION(pthread_join); INTERCEPT_FUNCTION(_exit); diff --git a/libsanitizer/lsan/lsan_mac.cpp 
b/libsanitizer/lsan/lsan_mac.cpp index b96893e..10a73f8 100644 --- a/libsanitizer/lsan/lsan_mac.cpp +++ b/libsanitizer/lsan/lsan_mac.cpp @@ -68,7 +68,7 @@ typedef struct { ALWAYS_INLINE void lsan_register_worker_thread(int parent_tid) { if (GetCurrentThread() == kInvalidTid) { - u32 tid = ThreadCreate(parent_tid, 0, true); + u32 tid = ThreadCreate(parent_tid, true); ThreadStart(tid, GetTid()); SetCurrentThread(tid); } diff --git a/libsanitizer/lsan/lsan_posix.cpp b/libsanitizer/lsan/lsan_posix.cpp index 5d1c3f6..8f277db 100644 --- a/libsanitizer/lsan/lsan_posix.cpp +++ b/libsanitizer/lsan/lsan_posix.cpp @@ -75,7 +75,7 @@ bool GetThreadRangesLocked(tid_t os_id, uptr *stack_begin, uptr *stack_end, } void InitializeMainThread() { - u32 tid = ThreadCreate(kMainTid, 0, true); + u32 tid = ThreadCreate(kMainTid, true); CHECK_EQ(tid, kMainTid); ThreadStart(tid, GetTid()); } @@ -91,6 +91,11 @@ void LsanOnDeadlySignal(int signo, void *siginfo, void *context) { nullptr); } +void InstallAtExitCheckLeaks() { + if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) + Atexit(DoLeakCheck); +} + } // namespace __lsan #endif // SANITIZER_POSIX diff --git a/libsanitizer/lsan/lsan_thread.cpp b/libsanitizer/lsan/lsan_thread.cpp index 1d224eb..ca3dfd0 100644 --- a/libsanitizer/lsan/lsan_thread.cpp +++ b/libsanitizer/lsan/lsan_thread.cpp @@ -44,8 +44,8 @@ void ThreadContextLsanBase::OnFinished() { DTLS_Destroy(); } -u32 ThreadCreate(u32 parent_tid, uptr user_id, bool detached, void *arg) { - return thread_registry->CreateThread(user_id, detached, parent_tid, arg); +u32 ThreadCreate(u32 parent_tid, bool detached, void *arg) { + return thread_registry->CreateThread(0, detached, parent_tid, arg); } void ThreadContextLsanBase::ThreadStart(u32 tid, tid_t os_id, @@ -68,28 +68,6 @@ ThreadContext *CurrentThreadContext() { return (ThreadContext *)thread_registry->GetThreadLocked(GetCurrentThread()); } -static bool FindThreadByUid(ThreadContextBase *tctx, void *arg) { - uptr uid = (uptr)arg; - if (tctx->user_id == uid && tctx->status != ThreadStatusInvalid) { - return true; - } - return false; -} - -u32 ThreadTid(uptr uid) { - return thread_registry->FindThread(FindThreadByUid, (void *)uid); -} - -void ThreadDetach(u32 tid) { - CHECK_NE(tid, kInvalidTid); - thread_registry->DetachThread(tid, /* arg */ nullptr); -} - -void ThreadJoin(u32 tid) { - CHECK_NE(tid, kInvalidTid); - thread_registry->JoinThread(tid, /* arg */ nullptr); -} - void EnsureMainThreadIDIsCorrect() { if (GetCurrentThread() == kMainTid) CurrentThreadContext()->os_id = GetTid(); diff --git a/libsanitizer/lsan/lsan_thread.h b/libsanitizer/lsan/lsan_thread.h index 3664375..6ab4172 100644 --- a/libsanitizer/lsan/lsan_thread.h +++ b/libsanitizer/lsan/lsan_thread.h @@ -45,11 +45,8 @@ class ThreadContext; void InitializeThreadRegistry(); void InitializeMainThread(); -u32 ThreadCreate(u32 tid, uptr uid, bool detached, void *arg = nullptr); +u32 ThreadCreate(u32 tid, bool detached, void *arg = nullptr); void ThreadFinish(); -void ThreadDetach(u32 tid); -void ThreadJoin(u32 tid); -u32 ThreadTid(uptr uid); u32 GetCurrentThread(); void SetCurrentThread(u32 tid); diff --git a/libsanitizer/sanitizer_common/sanitizer_addrhashmap.h b/libsanitizer/sanitizer_common/sanitizer_addrhashmap.h index 7e2fa91..fe48b9c 100644 --- a/libsanitizer/sanitizer_common/sanitizer_addrhashmap.h +++ b/libsanitizer/sanitizer_common/sanitizer_addrhashmap.h @@ -201,7 +201,8 @@ AddrHashMap::AddrHashMap() { } template -void AddrHashMap::acquire(Handle *h) 
NO_THREAD_SAFETY_ANALYSIS { +void AddrHashMap::acquire(Handle *h) + SANITIZER_NO_THREAD_SAFETY_ANALYSIS { uptr addr = h->addr_; uptr hash = calcHash(addr); Bucket *b = &table_[hash]; @@ -330,7 +331,8 @@ void AddrHashMap::acquire(Handle *h) NO_THREAD_SAFETY_ANALYSIS { } template - void AddrHashMap::release(Handle *h) NO_THREAD_SAFETY_ANALYSIS { + void AddrHashMap::release(Handle *h) + SANITIZER_NO_THREAD_SAFETY_ANALYSIS { if (!h->cell_) return; Bucket *b = h->bucket_; diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator.cpp b/libsanitizer/sanitizer_common/sanitizer_allocator.cpp index bcb7370..25a43a59 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_allocator.cpp @@ -17,6 +17,7 @@ #include "sanitizer_allocator_internal.h" #include "sanitizer_atomic.h" #include "sanitizer_common.h" +#include "sanitizer_platform.h" namespace __sanitizer { @@ -24,66 +25,6 @@ namespace __sanitizer { const char *PrimaryAllocatorName = "SizeClassAllocator"; const char *SecondaryAllocatorName = "LargeMmapAllocator"; -// ThreadSanitizer for Go uses libc malloc/free. -#if defined(SANITIZER_USE_MALLOC) -# if SANITIZER_LINUX && !SANITIZER_ANDROID -extern "C" void *__libc_malloc(uptr size); -# if !SANITIZER_GO -extern "C" void *__libc_memalign(uptr alignment, uptr size); -# endif -extern "C" void *__libc_realloc(void *ptr, uptr size); -extern "C" void __libc_free(void *ptr); -# else -# include -# define __libc_malloc malloc -# if !SANITIZER_GO -static void *__libc_memalign(uptr alignment, uptr size) { - void *p; - uptr error = posix_memalign(&p, alignment, size); - if (error) return nullptr; - return p; -} -# endif -# define __libc_realloc realloc -# define __libc_free free -# endif - -static void *RawInternalAlloc(uptr size, InternalAllocatorCache *cache, - uptr alignment) { - (void)cache; -#if !SANITIZER_GO - if (alignment == 0) - return __libc_malloc(size); - else - return __libc_memalign(alignment, size); -#else - // Windows does not provide __libc_memalign/posix_memalign. It provides - // __aligned_malloc, but the allocated blocks can't be passed to free, - // they need to be passed to __aligned_free. InternalAlloc interface does - // not account for such requirement. Alignemnt does not seem to be used - // anywhere in runtime, so just call __libc_malloc for now. 
- DCHECK_EQ(alignment, 0); - return __libc_malloc(size); -#endif -} - -static void *RawInternalRealloc(void *ptr, uptr size, - InternalAllocatorCache *cache) { - (void)cache; - return __libc_realloc(ptr, size); -} - -static void RawInternalFree(void *ptr, InternalAllocatorCache *cache) { - (void)cache; - __libc_free(ptr); -} - -InternalAllocator *internal_allocator() { - return 0; -} - -#else // SANITIZER_GO || defined(SANITIZER_USE_MALLOC) - static ALIGNED(64) char internal_alloc_placeholder[sizeof(InternalAllocator)]; static atomic_uint8_t internal_allocator_initialized; static StaticSpinMutex internal_alloc_init_mu; @@ -135,8 +76,6 @@ static void RawInternalFree(void *ptr, InternalAllocatorCache *cache) { internal_allocator()->Deallocate(cache, ptr); } -#endif // SANITIZER_GO || defined(SANITIZER_USE_MALLOC) - static void NORETURN ReportInternalAllocatorOutOfMemory(uptr requested_size) { SetAllocatorOutOfMemory(); Report("FATAL: %s: internal allocator is out of memory trying to allocate " @@ -187,6 +126,16 @@ void InternalFree(void *addr, InternalAllocatorCache *cache) { RawInternalFree(addr, cache); } +void InternalAllocatorLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + internal_allocator_cache_mu.Lock(); + internal_allocator()->ForceLock(); +} + +void InternalAllocatorUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + internal_allocator()->ForceUnlock(); + internal_allocator_cache_mu.Unlock(); +} + // LowLevelAllocator constexpr uptr kLowLevelAllocatorDefaultAlignment = 8; static uptr low_level_alloc_min_alignment = kLowLevelAllocatorDefaultAlignment; @@ -247,4 +196,14 @@ void PrintHintAllocatorCannotReturnNull() { "allocator_may_return_null=1\n"); } +static atomic_uint8_t rss_limit_exceeded; + +bool IsRssLimitExceeded() { + return atomic_load(&rss_limit_exceeded, memory_order_relaxed); +} + +void SetRssLimitExceeded(bool limit_exceeded) { + atomic_store(&rss_limit_exceeded, limit_exceeded, memory_order_relaxed); +} + } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator.h b/libsanitizer/sanitizer_common/sanitizer_allocator.h index ec23465..76b936f 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator.h +++ b/libsanitizer/sanitizer_common/sanitizer_allocator.h @@ -70,6 +70,9 @@ inline void RandomShuffle(T *a, u32 n, u32 *rand_state) { #include "sanitizer_allocator_secondary.h" #include "sanitizer_allocator_combined.h" +bool IsRssLimitExceeded(); +void SetRssLimitExceeded(bool limit_exceeded); + } // namespace __sanitizer #endif // SANITIZER_ALLOCATOR_H diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator_combined.h b/libsanitizer/sanitizer_common/sanitizer_allocator_combined.h index 9a3602f..b92cfa5 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator_combined.h +++ b/libsanitizer/sanitizer_common/sanitizer_allocator_combined.h @@ -175,12 +175,12 @@ class CombinedAllocator { // ForceLock() and ForceUnlock() are needed to implement Darwin malloc zone // introspection API. 
- void ForceLock() NO_THREAD_SAFETY_ANALYSIS { + void ForceLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { primary_.ForceLock(); secondary_.ForceLock(); } - void ForceUnlock() NO_THREAD_SAFETY_ANALYSIS { + void ForceUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { secondary_.ForceUnlock(); primary_.ForceUnlock(); } diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator_internal.h b/libsanitizer/sanitizer_common/sanitizer_allocator_internal.h index 3284903..3899473 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator_internal.h +++ b/libsanitizer/sanitizer_common/sanitizer_allocator_internal.h @@ -48,6 +48,8 @@ void *InternalReallocArray(void *p, uptr count, uptr size, void *InternalCalloc(uptr count, uptr size, InternalAllocatorCache *cache = nullptr); void InternalFree(void *p, InternalAllocatorCache *cache = nullptr); +void InternalAllocatorLock(); +void InternalAllocatorUnlock(); InternalAllocator *internal_allocator(); } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator_primary32.h b/libsanitizer/sanitizer_common/sanitizer_allocator_primary32.h index ae1b7e0..f2471ef 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator_primary32.h +++ b/libsanitizer/sanitizer_common/sanitizer_allocator_primary32.h @@ -238,13 +238,13 @@ class SizeClassAllocator32 { // ForceLock() and ForceUnlock() are needed to implement Darwin malloc zone // introspection API. - void ForceLock() NO_THREAD_SAFETY_ANALYSIS { + void ForceLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { for (uptr i = 0; i < kNumClasses; i++) { GetSizeClassInfo(i)->mutex.Lock(); } } - void ForceUnlock() NO_THREAD_SAFETY_ANALYSIS { + void ForceUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { for (int i = kNumClasses - 1; i >= 0; i--) { GetSizeClassInfo(i)->mutex.Unlock(); } diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator_primary64.h b/libsanitizer/sanitizer_common/sanitizer_allocator_primary64.h index f917310..66ba71d 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator_primary64.h +++ b/libsanitizer/sanitizer_common/sanitizer_allocator_primary64.h @@ -354,13 +354,13 @@ class SizeClassAllocator64 { // ForceLock() and ForceUnlock() are needed to implement Darwin malloc zone // introspection API. - void ForceLock() NO_THREAD_SAFETY_ANALYSIS { + void ForceLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { for (uptr i = 0; i < kNumClasses; i++) { GetRegionInfo(i)->mutex.Lock(); } } - void ForceUnlock() NO_THREAD_SAFETY_ANALYSIS { + void ForceUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { for (int i = (int)kNumClasses - 1; i >= 0; i--) { GetRegionInfo(i)->mutex.Unlock(); } diff --git a/libsanitizer/sanitizer_common/sanitizer_allocator_secondary.h b/libsanitizer/sanitizer_common/sanitizer_allocator_secondary.h index c24354c..48afb2a 100644 --- a/libsanitizer/sanitizer_common/sanitizer_allocator_secondary.h +++ b/libsanitizer/sanitizer_common/sanitizer_allocator_secondary.h @@ -267,9 +267,9 @@ class LargeMmapAllocator { // ForceLock() and ForceUnlock() are needed to implement Darwin malloc zone // introspection API. - void ForceLock() ACQUIRE(mutex_) { mutex_.Lock(); } + void ForceLock() SANITIZER_ACQUIRE(mutex_) { mutex_.Lock(); } - void ForceUnlock() RELEASE(mutex_) { mutex_.Unlock(); } + void ForceUnlock() SANITIZER_RELEASE(mutex_) { mutex_.Unlock(); } // Iterate over all existing chunks. // The allocator must be locked when calling this function. 
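
The allocator and LSan hunks above follow one pattern: the bare thread-safety macros (NO_THREAD_SAFETY_ANALYSIS, ACQUIRE, RELEASE) gain a SANITIZER_ prefix, and the hand-rolled LockThreadRegistry()/LockAllocator() pairs around StopTheWorld() are replaced by the ScopedStopTheWorldLock guard added to lsan_common.h. For readability, here is the Mac call site from the hunk above consolidated into one sketch; the type and function names are taken verbatim from the patch, and only the framing comments are added.

// Sketch of the post-merge stop-the-world call sites
// (lsan_common_{linux,mac,fuchsia}.cpp); shown here for the Mac variant.
#include "lsan_common.h"                              // ScopedStopTheWorldLock
#include "sanitizer_common/sanitizer_stoptheworld.h"  // StopTheWorld()

namespace __lsan {

void LockStuffAndStopTheWorld(StopTheWorldCallback callback,
                              CheckForLeaksParam *argument) {
  ScopedStopTheWorldLock lock;  // LockThreadRegistry(); LockAllocator();
  StopTheWorld(callback, argument);
}  // ~ScopedStopTheWorldLock: UnlockAllocator(); UnlockThreadRegistry();

}  // namespace __lsan

Compared with the removed explicit Unlock calls, the guard releases the locks in reverse order on every return path, which is why the Linux dl_iterate_phdr callback and the Fuchsia variant could drop their trailing Unlock lines.
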
diff --git a/libsanitizer/sanitizer_common/sanitizer_atomic_clang.h b/libsanitizer/sanitizer_common/sanitizer_atomic_clang.h index ccf18f0..4318d64 100644 --- a/libsanitizer/sanitizer_common/sanitizer_atomic_clang.h +++ b/libsanitizer/sanitizer_common/sanitizer_atomic_clang.h @@ -74,13 +74,12 @@ template inline bool atomic_compare_exchange_strong(volatile T *a, typename T::Type *cmp, typename T::Type xchg, memory_order mo) { - typedef typename T::Type Type; - Type cmpv = *cmp; - Type prev; - prev = __sync_val_compare_and_swap(&a->val_dont_use, cmpv, xchg); - if (prev == cmpv) return true; - *cmp = prev; - return false; + // Transitioned from __sync_val_compare_and_swap to support targets like + // SPARC V8 that cannot inline atomic cmpxchg. __atomic_compare_exchange + // can then be resolved from libatomic. __ATOMIC_SEQ_CST is used to best + // match the __sync builtin memory order. + return __atomic_compare_exchange(&a->val_dont_use, cmp, &xchg, false, + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } template diff --git a/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp b/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp index 626777d..472b83d 100644 --- a/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_chained_origin_depot.cpp @@ -11,7 +11,6 @@ #include "sanitizer_chained_origin_depot.h" -#include "sanitizer_persistent_allocator.h" #include "sanitizer_stackdepotbase.h" namespace __sanitizer { diff --git a/libsanitizer/sanitizer_common/sanitizer_common.cpp b/libsanitizer/sanitizer_common/sanitizer_common.cpp index 5fae8e3..e30a93d 100644 --- a/libsanitizer/sanitizer_common/sanitizer_common.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_common.cpp @@ -11,10 +11,12 @@ //===----------------------------------------------------------------------===// #include "sanitizer_common.h" + #include "sanitizer_allocator_interface.h" #include "sanitizer_allocator_internal.h" #include "sanitizer_atomic.h" #include "sanitizer_flags.h" +#include "sanitizer_interface_internal.h" #include "sanitizer_libc.h" #include "sanitizer_placement_new.h" @@ -138,13 +140,21 @@ void LoadedModule::set(const char *module_name, uptr base_address, set(module_name, base_address); arch_ = arch; internal_memcpy(uuid_, uuid, sizeof(uuid_)); + uuid_size_ = kModuleUUIDSize; instrumented_ = instrumented; } +void LoadedModule::setUuid(const char *uuid, uptr size) { + if (size > kModuleUUIDSize) + size = kModuleUUIDSize; + internal_memcpy(uuid_, uuid, size); + uuid_size_ = size; +} + void LoadedModule::clear() { InternalFree(full_name_); base_address_ = 0; - max_executable_address_ = 0; + max_address_ = 0; full_name_ = nullptr; arch_ = kModuleArchUnknown; internal_memset(uuid_, 0, kModuleUUIDSize); @@ -162,8 +172,7 @@ void LoadedModule::addAddressRange(uptr beg, uptr end, bool executable, AddressRange *r = new(mem) AddressRange(beg, end, executable, writable, name); ranges_.push_back(r); - if (executable && end > max_executable_address_) - max_executable_address_ = end; + max_address_ = Max(max_address_, end); } bool LoadedModule::containsAddress(uptr address) const { @@ -301,18 +310,22 @@ struct MallocFreeHook { static MallocFreeHook MFHooks[kMaxMallocFreeHooks]; -void RunMallocHooks(const void *ptr, uptr size) { +void RunMallocHooks(void *ptr, uptr size) { + __sanitizer_malloc_hook(ptr, size); for (int i = 0; i < kMaxMallocFreeHooks; i++) { auto hook = MFHooks[i].malloc_hook; - if (!hook) return; + if (!hook) + break; hook(ptr, size); } } -void 
RunFreeHooks(const void *ptr) { +void RunFreeHooks(void *ptr) { + __sanitizer_free_hook(ptr); for (int i = 0; i < kMaxMallocFreeHooks; i++) { auto hook = MFHooks[i].free_hook; - if (!hook) return; + if (!hook) + break; hook(ptr); } } @@ -360,4 +373,16 @@ int __sanitizer_install_malloc_and_free_hooks(void (*malloc_hook)(const void *, void (*free_hook)(const void *)) { return InstallMallocFreeHooks(malloc_hook, free_hook); } + +// Provide default (no-op) implementation of malloc hooks. +SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_malloc_hook, void *ptr, + uptr size) { + (void)ptr; + (void)size; +} + +SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_free_hook, void *ptr) { + (void)ptr; +} + } // extern "C" diff --git a/libsanitizer/sanitizer_common/sanitizer_common.h b/libsanitizer/sanitizer_common/sanitizer_common.h index 0651544..17570d6 100644 --- a/libsanitizer/sanitizer_common/sanitizer_common.h +++ b/libsanitizer/sanitizer_common/sanitizer_common.h @@ -16,7 +16,6 @@ #define SANITIZER_COMMON_H #include "sanitizer_flags.h" -#include "sanitizer_interface_internal.h" #include "sanitizer_internal_defs.h" #include "sanitizer_libc.h" #include "sanitizer_list.h" @@ -171,8 +170,8 @@ void SetShadowRegionHugePageMode(uptr addr, uptr length); bool DontDumpShadowMemory(uptr addr, uptr length); // Check if the built VMA size matches the runtime one. void CheckVMASize(); -void RunMallocHooks(const void *ptr, uptr size); -void RunFreeHooks(const void *ptr); +void RunMallocHooks(void *ptr, uptr size); +void RunFreeHooks(void *ptr); class ReservedAddressRange { public: @@ -238,12 +237,12 @@ void SetPrintfAndReportCallback(void (*callback)(const char *)); // Lock sanitizer error reporting and protects against nested errors. class ScopedErrorReportLock { public: - ScopedErrorReportLock() ACQUIRE(mutex_) { Lock(); } - ~ScopedErrorReportLock() RELEASE(mutex_) { Unlock(); } + ScopedErrorReportLock() SANITIZER_ACQUIRE(mutex_) { Lock(); } + ~ScopedErrorReportLock() SANITIZER_RELEASE(mutex_) { Unlock(); } - static void Lock() ACQUIRE(mutex_); - static void Unlock() RELEASE(mutex_); - static void CheckLocked() CHECK_LOCKED(mutex_); + static void Lock() SANITIZER_ACQUIRE(mutex_); + static void Unlock() SANITIZER_RELEASE(mutex_); + static void CheckLocked() SANITIZER_CHECK_LOCKED(mutex_); private: static atomic_uintptr_t reporting_thread_; @@ -286,7 +285,7 @@ void SetStackSizeLimitInBytes(uptr limit); bool AddressSpaceIsUnlimited(); void SetAddressSpaceUnlimited(); void AdjustStackSize(void *attr); -void PlatformPrepareForSandboxing(__sanitizer_sandbox_arguments *args); +void PlatformPrepareForSandboxing(void *args); void SetSandboxingCallback(void (*f)()); void InitializeCoverage(bool enabled, const char *coverage_dir); @@ -326,12 +325,6 @@ void SetUserDieCallback(DieCallbackType callback); void SetCheckUnwindCallback(void (*callback)()); -// Callback will be called if soft_rss_limit_mb is given and the limit is -// exceeded (exceeded==true) or if rss went down below the limit -// (exceeded==false). -// The callback should be registered once at the tool init time. -void SetSoftRssLimitExceededCallback(void (*Callback)(bool exceeded)); - // Functions related to signal handling. typedef void (*SignalHandlerType)(int, void *, void *); HandleSignalMode GetHandleSignalMode(int signum); @@ -460,6 +453,10 @@ template constexpr T Max(T a, T b) { return a > b ? a : b; } +template +constexpr T Abs(T a) { + return a < 0 ? 
-a : a; +} template void Swap(T& a, T& b) { T tmp = a; a = b; @@ -669,11 +666,9 @@ void Sort(T *v, uptr size, Compare comp = {}) { // Works like std::lower_bound: finds the first element that is not less // than the val. -template > -uptr InternalLowerBound(const Container &v, - const typename Container::value_type &val, - Compare comp = {}) { +uptr InternalLowerBound(const Container &v, const T &val, Compare comp = {}) { uptr first = 0; uptr last = v.size(); while (last > first) { @@ -743,6 +738,9 @@ bool ReadFileToBuffer(const char *file_name, char **buff, uptr *buff_size, uptr *read_len, uptr max_len = kDefaultFileMaxSize, error_t *errno_p = nullptr); +int GetModuleAndOffsetForPc(uptr pc, char *module_name, uptr module_name_len, + uptr *pc_offset); + // When adding a new architecture, don't forget to also update // script/asan_symbolize.py and sanitizer_symbolizer_libcdep.cpp. inline const char *ModuleArchToString(ModuleArch arch) { @@ -774,7 +772,7 @@ inline const char *ModuleArchToString(ModuleArch arch) { return ""; } -const uptr kModuleUUIDSize = 16; +const uptr kModuleUUIDSize = 32; const uptr kMaxSegName = 16; // Represents a binary loaded into virtual memory (e.g. this can be an @@ -784,8 +782,9 @@ class LoadedModule { LoadedModule() : full_name_(nullptr), base_address_(0), - max_executable_address_(0), + max_address_(0), arch_(kModuleArchUnknown), + uuid_size_(0), instrumented_(false) { internal_memset(uuid_, 0, kModuleUUIDSize); ranges_.clear(); @@ -793,6 +792,7 @@ class LoadedModule { void set(const char *module_name, uptr base_address); void set(const char *module_name, uptr base_address, ModuleArch arch, u8 uuid[kModuleUUIDSize], bool instrumented); + void setUuid(const char *uuid, uptr size); void clear(); void addAddressRange(uptr beg, uptr end, bool executable, bool writable, const char *name = nullptr); @@ -800,9 +800,10 @@ class LoadedModule { const char *full_name() const { return full_name_; } uptr base_address() const { return base_address_; } - uptr max_executable_address() const { return max_executable_address_; } + uptr max_address() const { return max_address_; } ModuleArch arch() const { return arch_; } const u8 *uuid() const { return uuid_; } + uptr uuid_size() const { return uuid_size_; } bool instrumented() const { return instrumented_; } struct AddressRange { @@ -829,8 +830,9 @@ class LoadedModule { private: char *full_name_; // Owned. uptr base_address_; - uptr max_executable_address_; + uptr max_address_; ModuleArch arch_; + uptr uuid_size_; u8 uuid_[kModuleUUIDSize]; bool instrumented_; IntrusiveList ranges_; @@ -956,7 +958,7 @@ struct SignalContext { uptr sp; uptr bp; bool is_memory_access; - enum WriteFlag { UNKNOWN, READ, WRITE } write_flag; + enum WriteFlag { Unknown, Read, Write } write_flag; // In some cases the kernel cannot provide the true faulting address; `addr` // will be zero then. 
This field allows to distinguish between these cases diff --git a/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc b/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc index abb38cc..43296e6 100644 --- a/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc +++ b/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc @@ -21,7 +21,7 @@ // COMMON_INTERCEPTOR_FD_RELEASE // COMMON_INTERCEPTOR_FD_ACCESS // COMMON_INTERCEPTOR_SET_THREAD_NAME -// COMMON_INTERCEPTOR_ON_DLOPEN +// COMMON_INTERCEPTOR_DLOPEN // COMMON_INTERCEPTOR_ON_EXIT // COMMON_INTERCEPTOR_MUTEX_PRE_LOCK // COMMON_INTERCEPTOR_MUTEX_POST_LOCK @@ -132,6 +132,76 @@ extern const short *_toupper_tab_; extern const short *_tolower_tab_; #endif +#if SANITIZER_MUSL && \ + (defined(__i386__) || defined(__arm__) || SANITIZER_MIPS32 || SANITIZER_PPC32) +// musl 1.2.0 on existing 32-bit architectures uses new symbol names for the +// time-related functions that take 64-bit time_t values. See +// https://musl.libc.org/time64.html +#define adjtime __adjtime64 +#define adjtimex __adjtimex_time64 +#define aio_suspend __aio_suspend_time64 +#define clock_adjtime __clock_adjtime64 +#define clock_getres __clock_getres_time64 +#define clock_gettime __clock_gettime64 +#define clock_nanosleep __clock_nanosleep_time64 +#define clock_settime __clock_settime64 +#define cnd_timedwait __cnd_timedwait_time64 +#define ctime __ctime64 +#define ctime_r __ctime64_r +#define difftime __difftime64 +#define dlsym __dlsym_time64 +#define fstatat __fstatat_time64 +#define fstat __fstat_time64 +#define ftime __ftime64 +#define futimens __futimens_time64 +#define futimesat __futimesat_time64 +#define futimes __futimes_time64 +#define getitimer __getitimer_time64 +#define getrusage __getrusage_time64 +#define gettimeofday __gettimeofday_time64 +#define gmtime __gmtime64 +#define gmtime_r __gmtime64_r +#define localtime __localtime64 +#define localtime_r __localtime64_r +#define lstat __lstat_time64 +#define lutimes __lutimes_time64 +#define mktime __mktime64 +#define mq_timedreceive __mq_timedreceive_time64 +#define mq_timedsend __mq_timedsend_time64 +#define mtx_timedlock __mtx_timedlock_time64 +#define nanosleep __nanosleep_time64 +#define ppoll __ppoll_time64 +#define pselect __pselect_time64 +#define pthread_cond_timedwait __pthread_cond_timedwait_time64 +#define pthread_mutex_timedlock __pthread_mutex_timedlock_time64 +#define pthread_rwlock_timedrdlock __pthread_rwlock_timedrdlock_time64 +#define pthread_rwlock_timedwrlock __pthread_rwlock_timedwrlock_time64 +#define pthread_timedjoin_np __pthread_timedjoin_np_time64 +#define recvmmsg __recvmmsg_time64 +#define sched_rr_get_interval __sched_rr_get_interval_time64 +#define select __select_time64 +#define semtimedop __semtimedop_time64 +#define sem_timedwait __sem_timedwait_time64 +#define setitimer __setitimer_time64 +#define settimeofday __settimeofday_time64 +#define sigtimedwait __sigtimedwait_time64 +#define stat __stat_time64 +#define stime __stime64 +#define thrd_sleep __thrd_sleep_time64 +#define timegm __timegm_time64 +#define timerfd_gettime __timerfd_gettime64 +#define timerfd_settime __timerfd_settime64 +#define timer_gettime __timer_gettime64 +#define timer_settime __timer_settime64 +#define timespec_get __timespec_get_time64 +#define time __time64 +#define utimensat __utimensat_time64 +#define utimes __utimes_time64 +#define utime __utime64 +#define wait3 __wait3_time64 +#define wait4 __wait4_time64 +#endif + // Platform-specific options. 
#if SANITIZER_MAC #define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 @@ -206,9 +276,9 @@ extern const short *_tolower_tab_; COMMON_INTERCEPTOR_READ_RANGE((ctx), (s), \ common_flags()->strict_string_checks ? (internal_strlen(s)) + 1 : (n) ) -#ifndef COMMON_INTERCEPTOR_ON_DLOPEN -#define COMMON_INTERCEPTOR_ON_DLOPEN(filename, flag) \ - CheckNoDeepBind(filename, flag); +#ifndef COMMON_INTERCEPTOR_DLOPEN +#define COMMON_INTERCEPTOR_DLOPEN(filename, flag) \ + ({ CheckNoDeepBind(filename, flag); REAL(dlopen)(filename, flag); }) #endif #ifndef COMMON_INTERCEPTOR_GET_TLS_RANGE @@ -1295,12 +1365,16 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, prctl, option, arg2, arg3, arg4, arg5); static const int PR_SET_NAME = 15; + static const int PR_SCHED_CORE = 62; + static const int PR_SCHED_CORE_GET = 0; int res = REAL(prctl(option, arg2, arg3, arg4, arg5)); if (option == PR_SET_NAME) { char buff[16]; internal_strncpy(buff, (char *)arg2, 15); buff[15] = 0; COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, buff); + } else if (res != -1 && option == PR_SCHED_CORE && arg2 == PR_SCHED_CORE_GET) { + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64*)(arg5), sizeof(u64)); } return res; } @@ -2422,6 +2496,34 @@ INTERCEPTOR(int, glob64, const char *pattern, int flags, #define INIT_GLOB64 #endif // SANITIZER_INTERCEPT_GLOB64 +#if SANITIZER_INTERCEPT___B64_TO +INTERCEPTOR(int, __b64_ntop, unsigned char const *src, SIZE_T srclength, + char *target, SIZE_T targsize) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __b64_ntop, src, srclength, target, targsize); + COMMON_INTERCEPTOR_READ_RANGE(ctx, src, srclength); + int res = REAL(__b64_ntop)(src, srclength, target, targsize); + if (res >= 0) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, target, res + 1); + return res; +} +INTERCEPTOR(int, __b64_pton, char const *src, char *target, SIZE_T targsize) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __b64_pton, src, target, targsize); + COMMON_INTERCEPTOR_READ_RANGE(ctx, src, internal_strlen(src) + 1); + int res = REAL(__b64_pton)(src, target, targsize); + if (res >= 0) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, target, res); + return res; +} +# define INIT___B64_TO \ + COMMON_INTERCEPT_FUNCTION(__b64_ntop); \ + COMMON_INTERCEPT_FUNCTION(__b64_pton); +#else // SANITIZER_INTERCEPT___B64_TO +#define INIT___B64_TO +#endif // SANITIZER_INTERCEPT___B64_TO + + #if SANITIZER_INTERCEPT_POSIX_SPAWN template @@ -6380,8 +6482,7 @@ INTERCEPTOR(void*, dlopen, const char *filename, int flag) { void *ctx; COMMON_INTERCEPTOR_ENTER_NOIGNORE(ctx, dlopen, filename, flag); if (filename) COMMON_INTERCEPTOR_READ_STRING(ctx, filename, 0); - COMMON_INTERCEPTOR_ON_DLOPEN(filename, flag); - void *res = REAL(dlopen)(filename, flag); + void *res = COMMON_INTERCEPTOR_DLOPEN(filename, flag); Symbolizer::GetOrInit()->InvalidateModuleList(); COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, res); return res; @@ -6872,6 +6973,23 @@ INTERCEPTOR(int, stat, const char *path, void *buf) { #define INIT_STAT #endif +#if SANITIZER_INTERCEPT_STAT64 +INTERCEPTOR(int, stat64, const char *path, void *buf) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, stat64, path, buf); + if (common_flags()->intercept_stat) + COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0); + int res = REAL(stat64)(path, buf); + if (!res) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat64_sz); + return res; +} +#define INIT_STAT64 COMMON_INTERCEPT_FUNCTION(stat64) +#else +#define INIT_STAT64 +#endif + + #if SANITIZER_INTERCEPT_LSTAT INTERCEPTOR(int, 
lstat, const char *path, void *buf) { void *ctx; @@ -6888,6 +7006,22 @@ INTERCEPTOR(int, lstat, const char *path, void *buf) { #define INIT_LSTAT #endif +#if SANITIZER_INTERCEPT_STAT64 +INTERCEPTOR(int, lstat64, const char *path, void *buf) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, lstat64, path, buf); + if (common_flags()->intercept_stat) + COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0); + int res = REAL(lstat64)(path, buf); + if (!res) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat64_sz); + return res; +} +#define INIT_LSTAT64 COMMON_INTERCEPT_FUNCTION(lstat64) +#else +#define INIT_LSTAT64 +#endif + #if SANITIZER_INTERCEPT___XSTAT INTERCEPTOR(int, __xstat, int version, const char *path, void *buf) { void *ctx; @@ -7858,12 +7992,12 @@ INTERCEPTOR(void, setbuf, __sanitizer_FILE *stream, char *buf) { unpoison_file(stream); } -INTERCEPTOR(void, setbuffer, __sanitizer_FILE *stream, char *buf, int mode) { +INTERCEPTOR(void, setbuffer, __sanitizer_FILE *stream, char *buf, SIZE_T size) { void *ctx; - COMMON_INTERCEPTOR_ENTER(ctx, setbuffer, stream, buf, mode); - REAL(setbuffer)(stream, buf, mode); + COMMON_INTERCEPTOR_ENTER(ctx, setbuffer, stream, buf, size); + REAL(setbuffer)(stream, buf, size); if (buf) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer_bufsiz); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, size); } if (stream) unpoison_file(stream); @@ -7905,7 +8039,7 @@ INTERCEPTOR(int, regcomp, void *preg, const char *pattern, int cflags) { if (pattern) COMMON_INTERCEPTOR_READ_RANGE(ctx, pattern, internal_strlen(pattern) + 1); int res = REAL(regcomp)(preg, pattern, cflags); - if (!res) + if (preg) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, preg, struct_regex_sz); return res; } @@ -10290,6 +10424,7 @@ static void InitializeCommonInterceptors() { INIT_TIME; INIT_GLOB; INIT_GLOB64; + INIT___B64_TO; INIT_POSIX_SPAWN; INIT_WAIT; INIT_WAIT4; @@ -10447,8 +10582,10 @@ static void InitializeCommonInterceptors() { INIT_RECV_RECVFROM; INIT_SEND_SENDTO; INIT_STAT; + INIT_STAT64; INIT_EVENTFD_READ_WRITE; INIT_LSTAT; + INIT_LSTAT64; INIT___XSTAT; INIT___XSTAT64; INIT___LXSTAT; diff --git a/libsanitizer/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/libsanitizer/sanitizer_common/sanitizer_common_interceptors_ioctl.inc index b7da659..49ec409 100644 --- a/libsanitizer/sanitizer_common/sanitizer_common_interceptors_ioctl.inc +++ b/libsanitizer/sanitizer_common/sanitizer_common_interceptors_ioctl.inc @@ -115,11 +115,19 @@ static void ioctl_table_fill() { // _(SOUND_MIXER_WRITE_MUTE, WRITE, sizeof(int)); // same as ...WRITE_ENHANCE _(BLKFLSBUF, NONE, 0); _(BLKGETSIZE, WRITE, sizeof(uptr)); - _(BLKRAGET, WRITE, sizeof(int)); + _(BLKRAGET, WRITE, sizeof(uptr)); _(BLKRASET, NONE, 0); _(BLKROGET, WRITE, sizeof(int)); _(BLKROSET, READ, sizeof(int)); _(BLKRRPART, NONE, 0); + _(BLKFRASET, NONE, 0); + _(BLKFRAGET, WRITE, sizeof(uptr)); + _(BLKSECTSET, READ, sizeof(short)); + _(BLKSECTGET, WRITE, sizeof(short)); + _(BLKSSZGET, WRITE, sizeof(int)); + _(BLKBSZGET, WRITE, sizeof(int)); + _(BLKBSZSET, READ, sizeof(uptr)); + _(BLKGETSIZE64, WRITE, sizeof(u64)); _(CDROMEJECT, NONE, 0); _(CDROMEJECT_SW, NONE, 0); _(CDROMMULTISESSION, WRITE, struct_cdrom_multisession_sz); diff --git a/libsanitizer/sanitizer_common/sanitizer_common_interface_posix.inc b/libsanitizer/sanitizer_common/sanitizer_common_interface_posix.inc index 38f9531..a5259be 100644 --- a/libsanitizer/sanitizer_common/sanitizer_common_interface_posix.inc +++ b/libsanitizer/sanitizer_common/sanitizer_common_interface_posix.inc @@ 
-11,3 +11,5 @@ INTERFACE_WEAK_FUNCTION(__sanitizer_symbolize_code) INTERFACE_WEAK_FUNCTION(__sanitizer_symbolize_data) INTERFACE_WEAK_FUNCTION(__sanitizer_symbolize_demangle) INTERFACE_WEAK_FUNCTION(__sanitizer_symbolize_flush) +INTERFACE_WEAK_FUNCTION(__sanitizer_symbolize_set_demangle) +INTERFACE_WEAK_FUNCTION(__sanitizer_symbolize_set_inline_frames) diff --git a/libsanitizer/sanitizer_common/sanitizer_common_libcdep.cpp b/libsanitizer/sanitizer_common/sanitizer_common_libcdep.cpp index bc4b477..8fd3985 100644 --- a/libsanitizer/sanitizer_common/sanitizer_common_libcdep.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_common_libcdep.cpp @@ -10,25 +10,22 @@ // run-time libraries. //===----------------------------------------------------------------------===// +#include "sanitizer_allocator.h" #include "sanitizer_allocator_interface.h" #include "sanitizer_common.h" #include "sanitizer_flags.h" +#include "sanitizer_interface_internal.h" #include "sanitizer_procmaps.h" - +#include "sanitizer_stackdepot.h" namespace __sanitizer { -static void (*SoftRssLimitExceededCallback)(bool exceeded); -void SetSoftRssLimitExceededCallback(void (*Callback)(bool exceeded)) { - CHECK_EQ(SoftRssLimitExceededCallback, nullptr); - SoftRssLimitExceededCallback = Callback; -} - #if (SANITIZER_LINUX || SANITIZER_NETBSD) && !SANITIZER_GO // Weak default implementation for when sanitizer_stackdepot is not linked in. SANITIZER_WEAK_ATTRIBUTE StackDepotStats StackDepotGetStats() { return {}; } void *BackgroundThread(void *arg) { + VPrintf(1, "%s: Started BackgroundThread\n", SanitizerToolName); const uptr hard_rss_limit_mb = common_flags()->hard_rss_limit_mb; const uptr soft_rss_limit_mb = common_flags()->soft_rss_limit_mb; const bool heap_profile = common_flags()->heap_profile; @@ -66,13 +63,11 @@ void *BackgroundThread(void *arg) { reached_soft_rss_limit = true; Report("%s: soft rss limit exhausted (%zdMb vs %zdMb)\n", SanitizerToolName, soft_rss_limit_mb, current_rss_mb); - if (SoftRssLimitExceededCallback) - SoftRssLimitExceededCallback(true); + SetRssLimitExceeded(true); } else if (soft_rss_limit_mb >= current_rss_mb && reached_soft_rss_limit) { reached_soft_rss_limit = false; - if (SoftRssLimitExceededCallback) - SoftRssLimitExceededCallback(false); + SetRssLimitExceeded(false); } } if (heap_profile && @@ -83,6 +78,42 @@ void *BackgroundThread(void *arg) { } } } + +void MaybeStartBackgroudThread() { + // Need to implement/test on other platforms. + // Start the background thread if one of the rss limits is given. + if (!common_flags()->hard_rss_limit_mb && + !common_flags()->soft_rss_limit_mb && + !common_flags()->heap_profile) return; + if (!&real_pthread_create) { + VPrintf(1, "%s: real_pthread_create undefined\n", SanitizerToolName); + return; // Can't spawn the thread anyway. + } + + static bool started = false; + if (!started) { + started = true; + internal_start_thread(BackgroundThread, nullptr); + } +} + +# if !SANITIZER_START_BACKGROUND_THREAD_IN_ASAN_INTERNAL +# ifdef __clang__ +# pragma clang diagnostic push +// We avoid global-constructors to be sure that globals are ready when +// sanitizers need them. This can happend before global constructors executed. +// Here we don't mind if thread is started on later stages. 
+# pragma clang diagnostic ignored "-Wglobal-constructors" +# endif +static struct BackgroudThreadStarted { + BackgroudThreadStarted() { MaybeStartBackgroudThread(); } +} background_thread_strarter UNUSED; +# ifdef __clang__ +# pragma clang diagnostic pop +# endif +# endif +#else +void MaybeStartBackgroudThread() {} #endif void WriteToSyslog(const char *msg) { @@ -105,18 +136,6 @@ void WriteToSyslog(const char *msg) { WriteOneLineToSyslog(p); } -void MaybeStartBackgroudThread() { -#if (SANITIZER_LINUX || SANITIZER_NETBSD) && \ - !SANITIZER_GO // Need to implement/test on other platforms. - // Start the background thread if one of the rss limits is given. - if (!common_flags()->hard_rss_limit_mb && - !common_flags()->soft_rss_limit_mb && - !common_flags()->heap_profile) return; - if (!&real_pthread_create) return; // Can't spawn the thread anyway. - internal_start_thread(BackgroundThread, nullptr); -#endif -} - static void (*sandboxing_callback)(); void SetSandboxingCallback(void (*f)()) { sandboxing_callback = f; @@ -185,10 +204,22 @@ void ProtectGap(uptr addr, uptr size, uptr zero_base_shadow_start, #endif // !SANITIZER_FUCHSIA +#if !SANITIZER_WINDOWS && !SANITIZER_GO +// Weak default implementation for when sanitizer_stackdepot is not linked in. +SANITIZER_WEAK_ATTRIBUTE void StackDepotStopBackgroundThread() {} +static void StopStackDepotBackgroundThread() { + StackDepotStopBackgroundThread(); +} +#else +// SANITIZER_WEAK_ATTRIBUTE is unsupported. +static void StopStackDepotBackgroundThread() {} +#endif + } // namespace __sanitizer SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_sandbox_on_notify, __sanitizer_sandbox_arguments *args) { + __sanitizer::StopStackDepotBackgroundThread(); __sanitizer::PlatformPrepareForSandboxing(args); if (__sanitizer::sandboxing_callback) __sanitizer::sandboxing_callback(); diff --git a/libsanitizer/sanitizer_common/sanitizer_coverage_fuchsia.cpp b/libsanitizer/sanitizer_common/sanitizer_coverage_fuchsia.cpp index 1d0dbe5..35c3253 100644 --- a/libsanitizer/sanitizer_common/sanitizer_coverage_fuchsia.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_coverage_fuchsia.cpp @@ -33,6 +33,7 @@ #include "sanitizer_atomic.h" #include "sanitizer_common.h" +#include "sanitizer_interface_internal.h" #include "sanitizer_internal_defs.h" #include "sanitizer_symbolizer_fuchsia.h" diff --git a/libsanitizer/sanitizer_common/sanitizer_coverage_libcdep_new.cpp b/libsanitizer/sanitizer_common/sanitizer_coverage_libcdep_new.cpp index 56220df..3dcb39f 100644 --- a/libsanitizer/sanitizer_common/sanitizer_coverage_libcdep_new.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_coverage_libcdep_new.cpp @@ -10,11 +10,13 @@ #include "sanitizer_platform.h" #if !SANITIZER_FUCHSIA -#include "sancov_flags.h" -#include "sanitizer_allocator_internal.h" -#include "sanitizer_atomic.h" -#include "sanitizer_common.h" -#include "sanitizer_file.h" +# include "sancov_flags.h" +# include "sanitizer_allocator_internal.h" +# include "sanitizer_atomic.h" +# include "sanitizer_common.h" +# include "sanitizer_common/sanitizer_stacktrace.h" +# include "sanitizer_file.h" +# include "sanitizer_interface_internal.h" using namespace __sanitizer; @@ -72,7 +74,7 @@ static void SanitizerDumpCoverage(const uptr* unsorted_pcs, uptr len) { const uptr pc = pcs[i]; if (!pc) continue; - if (!__sanitizer_get_module_and_offset_for_pc(pc, nullptr, 0, &pcs[i])) { + if (!GetModuleAndOffsetForPc(pc, nullptr, 0, &pcs[i])) { Printf("ERROR: unknown pc 0x%zx (may happen if dlclose is used)\n", pc); continue; } @@ -87,8 +89,7 @@ 
static void SanitizerDumpCoverage(const uptr* unsorted_pcs, uptr len) { last_base = module_base; module_start_idx = i; module_found = true; - __sanitizer_get_module_and_offset_for_pc(pc, module_name, kMaxPathLength, - &pcs[i]); + GetModuleAndOffsetForPc(pc, module_name, kMaxPathLength, &pcs[i]); } } @@ -222,7 +223,8 @@ SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_dump_coverage(const uptr* pcs, SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_cov_trace_pc_guard, u32* guard) { if (!*guard) return; - __sancov::pc_guard_controller.TracePcGuard(guard, GET_CALLER_PC() - 1); + __sancov::pc_guard_controller.TracePcGuard( + guard, StackTrace::GetPreviousInstructionPc(GET_CALLER_PC())); } SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_cov_trace_pc_guard_init, diff --git a/libsanitizer/sanitizer_common/sanitizer_dense_map.h b/libsanitizer/sanitizer_common/sanitizer_dense_map.h new file mode 100644 index 0000000..046d77d --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_dense_map.h @@ -0,0 +1,705 @@ +//===- sanitizer_dense_map.h - Dense probed hash table ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is fork of llvm/ADT/DenseMap.h class with the following changes: +// * Use mmap to allocate. +// * No iterators. +// * Does not shrink. +// +//===----------------------------------------------------------------------===// + +#ifndef SANITIZER_DENSE_MAP_H +#define SANITIZER_DENSE_MAP_H + +#include "sanitizer_common.h" +#include "sanitizer_dense_map_info.h" +#include "sanitizer_internal_defs.h" +#include "sanitizer_type_traits.h" + +namespace __sanitizer { + +template +class DenseMapBase { + public: + using size_type = unsigned; + using key_type = KeyT; + using mapped_type = ValueT; + using value_type = BucketT; + + WARN_UNUSED_RESULT bool empty() const { return getNumEntries() == 0; } + unsigned size() const { return getNumEntries(); } + + /// Grow the densemap so that it can contain at least \p NumEntries items + /// before resizing again. + void reserve(size_type NumEntries) { + auto NumBuckets = getMinBucketToReserveForEntries(NumEntries); + if (NumBuckets > getNumBuckets()) + grow(NumBuckets); + } + + void clear() { + if (getNumEntries() == 0 && getNumTombstones() == 0) + return; + + const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey(); + if (__sanitizer::is_trivially_destructible::value) { + // Use a simpler loop when values don't need destruction. + for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) + P->getFirst() = EmptyKey; + } else { + unsigned NumEntries = getNumEntries(); + for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) { + if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) { + if (!KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) { + P->getSecond().~ValueT(); + --NumEntries; + } + P->getFirst() = EmptyKey; + } + } + CHECK_EQ(NumEntries, 0); + } + setNumEntries(0); + setNumTombstones(0); + } + + /// Return 1 if the specified key is in the map, 0 otherwise. + size_type count(const KeyT &Key) const { + const BucketT *TheBucket; + return LookupBucketFor(Key, TheBucket) ? 
1 : 0; + } + + value_type *find(const KeyT &Key) { + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket; + return nullptr; + } + const value_type *find(const KeyT &Key) const { + const BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket; + return nullptr; + } + + /// Alternate version of find() which allows a different, and possibly + /// less expensive, key type. + /// The DenseMapInfo is responsible for supplying methods + /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key + /// type used. + template + value_type *find_as(const LookupKeyT &Key) { + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket; + return nullptr; + } + template + const value_type *find_as(const LookupKeyT &Key) const { + const BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket; + return nullptr; + } + + /// lookup - Return the entry for the specified key, or a default + /// constructed value if no such entry exists. + ValueT lookup(const KeyT &Key) const { + const BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return TheBucket->getSecond(); + return ValueT(); + } + + // Inserts key,value pair into the map if the key isn't already in the map. + // If the key is already in the map, it returns false and doesn't update the + // value. + detail::DenseMapPair insert(const value_type &KV) { + return try_emplace(KV.first, KV.second); + } + + // Inserts key,value pair into the map if the key isn't already in the map. + // If the key is already in the map, it returns false and doesn't update the + // value. + detail::DenseMapPair insert(value_type &&KV) { + return try_emplace(__sanitizer::move(KV.first), + __sanitizer::move(KV.second)); + } + + // Inserts key,value pair into the map if the key isn't already in the map. + // The value is constructed in-place if the key is not in the map, otherwise + // it is not moved. + template + detail::DenseMapPair try_emplace(KeyT &&Key, + Ts &&...Args) { + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return {TheBucket, false}; // Already in map. + + // Otherwise, insert the new element. + TheBucket = InsertIntoBucket(TheBucket, __sanitizer::move(Key), + __sanitizer::forward(Args)...); + return {TheBucket, true}; + } + + // Inserts key,value pair into the map if the key isn't already in the map. + // The value is constructed in-place if the key is not in the map, otherwise + // it is not moved. + template + detail::DenseMapPair try_emplace(const KeyT &Key, + Ts &&...Args) { + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return {TheBucket, false}; // Already in map. + + // Otherwise, insert the new element. + TheBucket = + InsertIntoBucket(TheBucket, Key, __sanitizer::forward(Args)...); + return {TheBucket, true}; + } + + /// Alternate version of insert() which allows a different, and possibly + /// less expensive, key type. + /// The DenseMapInfo is responsible for supplying methods + /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key + /// type used. + template + detail::DenseMapPair insert_as(value_type &&KV, + const LookupKeyT &Val) { + BucketT *TheBucket; + if (LookupBucketFor(Val, TheBucket)) + return {TheBucket, false}; // Already in map. + + // Otherwise, insert the new element. 
+ TheBucket = + InsertIntoBucketWithLookup(TheBucket, __sanitizer::move(KV.first), + __sanitizer::move(KV.second), Val); + return {TheBucket, true}; + } + + bool erase(const KeyT &Val) { + BucketT *TheBucket; + if (!LookupBucketFor(Val, TheBucket)) + return false; // not in map. + + TheBucket->getSecond().~ValueT(); + TheBucket->getFirst() = getTombstoneKey(); + decrementNumEntries(); + incrementNumTombstones(); + return true; + } + + void erase(value_type *I) { + CHECK_NE(I, nullptr); + BucketT *TheBucket = &*I; + TheBucket->getSecond().~ValueT(); + TheBucket->getFirst() = getTombstoneKey(); + decrementNumEntries(); + incrementNumTombstones(); + } + + value_type &FindAndConstruct(const KeyT &Key) { + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return *TheBucket; + + return *InsertIntoBucket(TheBucket, Key); + } + + ValueT &operator[](const KeyT &Key) { return FindAndConstruct(Key).second; } + + value_type &FindAndConstruct(KeyT &&Key) { + BucketT *TheBucket; + if (LookupBucketFor(Key, TheBucket)) + return *TheBucket; + + return *InsertIntoBucket(TheBucket, __sanitizer::move(Key)); + } + + ValueT &operator[](KeyT &&Key) { + return FindAndConstruct(__sanitizer::move(Key)).second; + } + + /// Iterate over active entries of the container. + /// + /// Function can return fast to stop the process. + template + void forEach(Fn fn) { + const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey(); + for (auto *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) { + const KeyT K = P->getFirst(); + if (!KeyInfoT::isEqual(K, EmptyKey) && + !KeyInfoT::isEqual(K, TombstoneKey)) { + if (!fn(*P)) + return; + } + } + } + + template + void forEach(Fn fn) const { + const_cast(this)->forEach( + [&](const value_type &KV) { return fn(KV); }); + } + + protected: + DenseMapBase() = default; + + void destroyAll() { + if (getNumBuckets() == 0) // Nothing to do. + return; + + const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey(); + for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) { + if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey) && + !KeyInfoT::isEqual(P->getFirst(), TombstoneKey)) + P->getSecond().~ValueT(); + P->getFirst().~KeyT(); + } + } + + void initEmpty() { + setNumEntries(0); + setNumTombstones(0); + + CHECK_EQ((getNumBuckets() & (getNumBuckets() - 1)), 0); + const KeyT EmptyKey = getEmptyKey(); + for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B) + ::new (&B->getFirst()) KeyT(EmptyKey); + } + + /// Returns the number of buckets to allocate to ensure that the DenseMap can + /// accommodate \p NumEntries without need to grow(). + unsigned getMinBucketToReserveForEntries(unsigned NumEntries) { + // Ensure that "NumEntries * 4 < NumBuckets * 3" + if (NumEntries == 0) + return 0; + // +1 is required because of the strict equality. + // For example if NumEntries is 48, we need to return 401. + return RoundUpToPowerOfTwo((NumEntries * 4 / 3 + 1) + /* NextPowerOf2 */ 1); + } + + void moveFromOldBuckets(BucketT *OldBucketsBegin, BucketT *OldBucketsEnd) { + initEmpty(); + + // Insert all the old elements. + const KeyT EmptyKey = getEmptyKey(); + const KeyT TombstoneKey = getTombstoneKey(); + for (BucketT *B = OldBucketsBegin, *E = OldBucketsEnd; B != E; ++B) { + if (!KeyInfoT::isEqual(B->getFirst(), EmptyKey) && + !KeyInfoT::isEqual(B->getFirst(), TombstoneKey)) { + // Insert the key/value into the new table. + BucketT *DestBucket; + bool FoundVal = LookupBucketFor(B->getFirst(), DestBucket); + (void)FoundVal; // silence warning. 
+ CHECK(!FoundVal); + DestBucket->getFirst() = __sanitizer::move(B->getFirst()); + ::new (&DestBucket->getSecond()) + ValueT(__sanitizer::move(B->getSecond())); + incrementNumEntries(); + + // Free the value. + B->getSecond().~ValueT(); + } + B->getFirst().~KeyT(); + } + } + + template + void copyFrom( + const DenseMapBase &other) { + CHECK_NE(&other, this); + CHECK_EQ(getNumBuckets(), other.getNumBuckets()); + + setNumEntries(other.getNumEntries()); + setNumTombstones(other.getNumTombstones()); + + if (__sanitizer::is_trivially_copyable::value && + __sanitizer::is_trivially_copyable::value) + internal_memcpy(reinterpret_cast(getBuckets()), + other.getBuckets(), getNumBuckets() * sizeof(BucketT)); + else + for (uptr i = 0; i < getNumBuckets(); ++i) { + ::new (&getBuckets()[i].getFirst()) + KeyT(other.getBuckets()[i].getFirst()); + if (!KeyInfoT::isEqual(getBuckets()[i].getFirst(), getEmptyKey()) && + !KeyInfoT::isEqual(getBuckets()[i].getFirst(), getTombstoneKey())) + ::new (&getBuckets()[i].getSecond()) + ValueT(other.getBuckets()[i].getSecond()); + } + } + + static unsigned getHashValue(const KeyT &Val) { + return KeyInfoT::getHashValue(Val); + } + + template + static unsigned getHashValue(const LookupKeyT &Val) { + return KeyInfoT::getHashValue(Val); + } + + static const KeyT getEmptyKey() { return KeyInfoT::getEmptyKey(); } + + static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); } + + private: + unsigned getNumEntries() const { + return static_cast(this)->getNumEntries(); + } + + void setNumEntries(unsigned Num) { + static_cast(this)->setNumEntries(Num); + } + + void incrementNumEntries() { setNumEntries(getNumEntries() + 1); } + + void decrementNumEntries() { setNumEntries(getNumEntries() - 1); } + + unsigned getNumTombstones() const { + return static_cast(this)->getNumTombstones(); + } + + void setNumTombstones(unsigned Num) { + static_cast(this)->setNumTombstones(Num); + } + + void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); } + + void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); } + + const BucketT *getBuckets() const { + return static_cast(this)->getBuckets(); + } + + BucketT *getBuckets() { return static_cast(this)->getBuckets(); } + + unsigned getNumBuckets() const { + return static_cast(this)->getNumBuckets(); + } + + BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); } + + const BucketT *getBucketsEnd() const { + return getBuckets() + getNumBuckets(); + } + + void grow(unsigned AtLeast) { static_cast(this)->grow(AtLeast); } + + template + BucketT *InsertIntoBucket(BucketT *TheBucket, KeyArg &&Key, + ValueArgs &&...Values) { + TheBucket = InsertIntoBucketImpl(Key, Key, TheBucket); + + TheBucket->getFirst() = __sanitizer::forward(Key); + ::new (&TheBucket->getSecond()) + ValueT(__sanitizer::forward(Values)...); + return TheBucket; + } + + template + BucketT *InsertIntoBucketWithLookup(BucketT *TheBucket, KeyT &&Key, + ValueT &&Value, LookupKeyT &Lookup) { + TheBucket = InsertIntoBucketImpl(Key, Lookup, TheBucket); + + TheBucket->getFirst() = __sanitizer::move(Key); + ::new (&TheBucket->getSecond()) ValueT(__sanitizer::move(Value)); + return TheBucket; + } + + template + BucketT *InsertIntoBucketImpl(const KeyT &Key, const LookupKeyT &Lookup, + BucketT *TheBucket) { + // If the load of the hash table is more than 3/4, or if fewer than 1/8 of + // the buckets are empty (meaning that many are filled with tombstones), + // grow the table. + // + // The later case is tricky. 
For example, if we had one empty bucket with + // tons of tombstones, failing lookups (e.g. for insertion) would have to + // probe almost the entire table until it found the empty bucket. If the + // table completely filled with tombstones, no lookup would ever succeed, + // causing infinite loops in lookup. + unsigned NewNumEntries = getNumEntries() + 1; + unsigned NumBuckets = getNumBuckets(); + if (UNLIKELY(NewNumEntries * 4 >= NumBuckets * 3)) { + this->grow(NumBuckets * 2); + LookupBucketFor(Lookup, TheBucket); + NumBuckets = getNumBuckets(); + } else if (UNLIKELY(NumBuckets - (NewNumEntries + getNumTombstones()) <= + NumBuckets / 8)) { + this->grow(NumBuckets); + LookupBucketFor(Lookup, TheBucket); + } + CHECK(TheBucket); + + // Only update the state after we've grown our bucket space appropriately + // so that when growing buckets we have self-consistent entry count. + incrementNumEntries(); + + // If we are writing over a tombstone, remember this. + const KeyT EmptyKey = getEmptyKey(); + if (!KeyInfoT::isEqual(TheBucket->getFirst(), EmptyKey)) + decrementNumTombstones(); + + return TheBucket; + } + + /// LookupBucketFor - Lookup the appropriate bucket for Val, returning it in + /// FoundBucket. If the bucket contains the key and a value, this returns + /// true, otherwise it returns a bucket with an empty marker or tombstone and + /// returns false. + template + bool LookupBucketFor(const LookupKeyT &Val, + const BucketT *&FoundBucket) const { + const BucketT *BucketsPtr = getBuckets(); + const unsigned NumBuckets = getNumBuckets(); + + if (NumBuckets == 0) { + FoundBucket = nullptr; + return false; + } + + // FoundTombstone - Keep track of whether we find a tombstone while probing. + const BucketT *FoundTombstone = nullptr; + const KeyT EmptyKey = getEmptyKey(); + const KeyT TombstoneKey = getTombstoneKey(); + CHECK(!KeyInfoT::isEqual(Val, EmptyKey)); + CHECK(!KeyInfoT::isEqual(Val, TombstoneKey)); + + unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1); + unsigned ProbeAmt = 1; + while (true) { + const BucketT *ThisBucket = BucketsPtr + BucketNo; + // Found Val's bucket? If so, return it. + if (LIKELY(KeyInfoT::isEqual(Val, ThisBucket->getFirst()))) { + FoundBucket = ThisBucket; + return true; + } + + // If we found an empty bucket, the key doesn't exist in the set. + // Insert it and return the default value. + if (LIKELY(KeyInfoT::isEqual(ThisBucket->getFirst(), EmptyKey))) { + // If we've already seen a tombstone while probing, fill it in instead + // of the empty bucket we eventually probed to. + FoundBucket = FoundTombstone ? FoundTombstone : ThisBucket; + return false; + } + + // If this is a tombstone, remember it. If Val ends up not in the map, we + // prefer to return it than something that would require more probing. + if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) && + !FoundTombstone) + FoundTombstone = ThisBucket; // Remember the first tombstone found. + + // Otherwise, it's a hash collision or a tombstone, continue quadratic + // probing. + BucketNo += ProbeAmt++; + BucketNo &= (NumBuckets - 1); + } + } + + template + bool LookupBucketFor(const LookupKeyT &Val, BucketT *&FoundBucket) { + const BucketT *ConstFoundBucket; + bool Result = const_cast(this)->LookupBucketFor( + Val, ConstFoundBucket); + FoundBucket = const_cast(ConstFoundBucket); + return Result; + } + + public: + /// Return the approximate size (in bytes) of the actual map. + /// This is just the raw memory used by DenseMap. 
+ /// If entries are pointers to objects, the size of the referenced objects + /// are not included. + uptr getMemorySize() const { + return RoundUpTo(getNumBuckets() * sizeof(BucketT), GetPageSizeCached()); + } +}; + +/// Equality comparison for DenseMap. +/// +/// Iterates over elements of LHS confirming that each (key, value) pair in LHS +/// is also in RHS, and that no additional pairs are in RHS. +/// Equivalent to N calls to RHS.find and N value comparisons. Amortized +/// complexity is linear, worst case is O(N^2) (if every hash collides). +template +bool operator==( + const DenseMapBase &LHS, + const DenseMapBase &RHS) { + if (LHS.size() != RHS.size()) + return false; + + bool R = true; + LHS.forEach( + [&](const typename DenseMapBase::value_type &KV) -> bool { + const auto *I = RHS.find(KV.first); + if (!I || I->second != KV.second) { + R = false; + return false; + } + return true; + }); + + return R; +} + +/// Inequality comparison for DenseMap. +/// +/// Equivalent to !(LHS == RHS). See operator== for performance notes. +template +bool operator!=( + const DenseMapBase &LHS, + const DenseMapBase &RHS) { + return !(LHS == RHS); +} + +template , + typename BucketT = detail::DenseMapPair> +class DenseMap : public DenseMapBase, + KeyT, ValueT, KeyInfoT, BucketT> { + friend class DenseMapBase; + + // Lift some types from the dependent base class into this class for + // simplicity of referring to them. + using BaseT = DenseMapBase; + + BucketT *Buckets = nullptr; + unsigned NumEntries = 0; + unsigned NumTombstones = 0; + unsigned NumBuckets = 0; + + public: + /// Create a DenseMap with an optional \p InitialReserve that guarantee that + /// this number of elements can be inserted in the map without grow() + explicit DenseMap(unsigned InitialReserve) { init(InitialReserve); } + constexpr DenseMap() = default; + + DenseMap(const DenseMap &other) : BaseT() { + init(0); + copyFrom(other); + } + + DenseMap(DenseMap &&other) : BaseT() { + init(0); + swap(other); + } + + ~DenseMap() { + this->destroyAll(); + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets); + } + + void swap(DenseMap &RHS) { + Swap(Buckets, RHS.Buckets); + Swap(NumEntries, RHS.NumEntries); + Swap(NumTombstones, RHS.NumTombstones); + Swap(NumBuckets, RHS.NumBuckets); + } + + DenseMap &operator=(const DenseMap &other) { + if (&other != this) + copyFrom(other); + return *this; + } + + DenseMap &operator=(DenseMap &&other) { + this->destroyAll(); + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT)); + init(0); + swap(other); + return *this; + } + + void copyFrom(const DenseMap &other) { + this->destroyAll(); + deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets); + if (allocateBuckets(other.NumBuckets)) { + this->BaseT::copyFrom(other); + } else { + NumEntries = 0; + NumTombstones = 0; + } + } + + void init(unsigned InitNumEntries) { + auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries); + if (allocateBuckets(InitBuckets)) { + this->BaseT::initEmpty(); + } else { + NumEntries = 0; + NumTombstones = 0; + } + } + + void grow(unsigned AtLeast) { + unsigned OldNumBuckets = NumBuckets; + BucketT *OldBuckets = Buckets; + + allocateBuckets(RoundUpToPowerOfTwo(Max(64, AtLeast))); + CHECK(Buckets); + if (!OldBuckets) { + this->BaseT::initEmpty(); + return; + } + + this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets); + + // Free the old table. 
+ deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets); + } + + private: + unsigned getNumEntries() const { return NumEntries; } + + void setNumEntries(unsigned Num) { NumEntries = Num; } + + unsigned getNumTombstones() const { return NumTombstones; } + + void setNumTombstones(unsigned Num) { NumTombstones = Num; } + + BucketT *getBuckets() const { return Buckets; } + + unsigned getNumBuckets() const { return NumBuckets; } + + bool allocateBuckets(unsigned Num) { + NumBuckets = Num; + if (NumBuckets == 0) { + Buckets = nullptr; + return false; + } + + uptr Size = sizeof(BucketT) * NumBuckets; + if (Size * 2 <= GetPageSizeCached()) { + // We always allocate at least a page, so use entire space. + unsigned Log2 = MostSignificantSetBitIndex(GetPageSizeCached() / Size); + Size <<= Log2; + NumBuckets <<= Log2; + CHECK_EQ(Size, sizeof(BucketT) * NumBuckets); + CHECK_GT(Size * 2, GetPageSizeCached()); + } + Buckets = static_cast(allocate_buffer(Size)); + return true; + } + + static void *allocate_buffer(uptr Size) { + return MmapOrDie(RoundUpTo(Size, GetPageSizeCached()), "DenseMap"); + } + + static void deallocate_buffer(void *Ptr, uptr Size) { + UnmapOrDie(Ptr, RoundUpTo(Size, GetPageSizeCached())); + } +}; + +} // namespace __sanitizer + +#endif // SANITIZER_DENSE_MAP_H diff --git a/libsanitizer/sanitizer_common/sanitizer_dense_map_info.h b/libsanitizer/sanitizer_common/sanitizer_dense_map_info.h new file mode 100644 index 0000000..f464036 --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_dense_map_info.h @@ -0,0 +1,282 @@ +//===- sanitizer_dense_map_info.h - Type traits for DenseMap ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SANITIZER_DENSE_MAP_INFO_H +#define SANITIZER_DENSE_MAP_INFO_H + +#include "sanitizer_common.h" +#include "sanitizer_internal_defs.h" +#include "sanitizer_type_traits.h" + +namespace __sanitizer { + +namespace detail { + +/// Simplistic combination of 32-bit hash values into 32-bit hash values. +static constexpr unsigned combineHashValue(unsigned a, unsigned b) { + u64 key = (u64)a << 32 | (u64)b; + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return (unsigned)key; +} + +// We extend a pair to allow users to override the bucket type with their own +// implementation without requiring two members. 
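Before the key-info specializations below, a minimal usage sketch of the DenseMap/DenseMapInfo pair introduced by these two new headers may help. This is illustrative only and not part of the upstream patch; the function name is invented, the include path assumes the usual sanitizer_common layout, and the sketch relies on the DenseMapInfo<int> specialization defined later in this file:

  #include "sanitizer_common/sanitizer_dense_map.h"

  void DenseMapSketch() {
    __sanitizer::DenseMap<int, int> counts;  // keyed via DenseMapInfo<int>
    counts.try_emplace(42, 1);    // first insert: returns {bucket, true}
    counts.try_emplace(42, 7);    // key present: {bucket, false}, value stays 1
    counts[7] += 2;               // operator[] default-constructs (0), then updates
    if (auto *kv = counts.find(42))  // find() returns a bucket-pair pointer,
      kv->second++;                  // not an iterator
    int missing = counts.lookup(1000);  // 0: default value, nothing inserted
    (void)missing;
    counts.forEach([](auto &kv) {  // returning false stops the walk early
      return kv.first != 7;
    });
    counts.erase(42);              // marks the slot with the tombstone key
  }

Keys equal to getEmptyKey() or getTombstoneKey() of the key's DenseMapInfo must never be stored; LookupBucketFor CHECKs against both sentinels. The bucket pair type referred to in the comment above is defined next.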
+template +struct DenseMapPair { + KeyT first = {}; + ValueT second = {}; + constexpr DenseMapPair() = default; + constexpr DenseMapPair(const KeyT &f, const ValueT &s) + : first(f), second(s) {} + + template + constexpr DenseMapPair(KeyT2 &&f, ValueT2 &&s) + : first(__sanitizer::forward(f)), + second(__sanitizer::forward(s)) {} + + constexpr DenseMapPair(const DenseMapPair &other) = default; + constexpr DenseMapPair &operator=(const DenseMapPair &other) = default; + constexpr DenseMapPair(DenseMapPair &&other) = default; + constexpr DenseMapPair &operator=(DenseMapPair &&other) = default; + + KeyT &getFirst() { return first; } + const KeyT &getFirst() const { return first; } + ValueT &getSecond() { return second; } + const ValueT &getSecond() const { return second; } +}; + +} // end namespace detail + +template +struct DenseMapInfo { + // static T getEmptyKey(); + // static T getTombstoneKey(); + // static unsigned getHashValue(const T &Val); + // static bool isEqual(const T &LHS, const T &RHS); +}; + +// Provide DenseMapInfo for all pointers. Come up with sentinel pointer values +// that are aligned to alignof(T) bytes, but try to avoid requiring T to be +// complete. This allows clients to instantiate DenseMap with forward +// declared key types. Assume that no pointer key type requires more than 4096 +// bytes of alignment. +template +struct DenseMapInfo { + // The following should hold, but it would require T to be complete: + // static_assert(alignof(T) <= (1 << Log2MaxAlign), + // "DenseMap does not support pointer keys requiring more than " + // "Log2MaxAlign bits of alignment"); + static constexpr uptr Log2MaxAlign = 12; + + static constexpr T *getEmptyKey() { + uptr Val = static_cast(-1); + Val <<= Log2MaxAlign; + return reinterpret_cast(Val); + } + + static constexpr T *getTombstoneKey() { + uptr Val = static_cast(-2); + Val <<= Log2MaxAlign; + return reinterpret_cast(Val); + } + + static constexpr unsigned getHashValue(const T *PtrVal) { + return (unsigned((uptr)PtrVal) >> 4) ^ (unsigned((uptr)PtrVal) >> 9); + } + + static constexpr bool isEqual(const T *LHS, const T *RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for chars. +template <> +struct DenseMapInfo { + static constexpr char getEmptyKey() { return ~0; } + static constexpr char getTombstoneKey() { return ~0 - 1; } + static constexpr unsigned getHashValue(const char &Val) { return Val * 37U; } + + static constexpr bool isEqual(const char &LHS, const char &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for unsigned chars. +template <> +struct DenseMapInfo { + static constexpr unsigned char getEmptyKey() { return ~0; } + static constexpr unsigned char getTombstoneKey() { return ~0 - 1; } + static constexpr unsigned getHashValue(const unsigned char &Val) { + return Val * 37U; + } + + static constexpr bool isEqual(const unsigned char &LHS, + const unsigned char &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for unsigned shorts. +template <> +struct DenseMapInfo { + static constexpr unsigned short getEmptyKey() { return 0xFFFF; } + static constexpr unsigned short getTombstoneKey() { return 0xFFFF - 1; } + static constexpr unsigned getHashValue(const unsigned short &Val) { + return Val * 37U; + } + + static constexpr bool isEqual(const unsigned short &LHS, + const unsigned short &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for unsigned ints. 
+template <> +struct DenseMapInfo { + static constexpr unsigned getEmptyKey() { return ~0U; } + static constexpr unsigned getTombstoneKey() { return ~0U - 1; } + static constexpr unsigned getHashValue(const unsigned &Val) { + return Val * 37U; + } + + static constexpr bool isEqual(const unsigned &LHS, const unsigned &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for unsigned longs. +template <> +struct DenseMapInfo { + static constexpr unsigned long getEmptyKey() { return ~0UL; } + static constexpr unsigned long getTombstoneKey() { return ~0UL - 1L; } + + static constexpr unsigned getHashValue(const unsigned long &Val) { + return (unsigned)(Val * 37UL); + } + + static constexpr bool isEqual(const unsigned long &LHS, + const unsigned long &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for unsigned long longs. +template <> +struct DenseMapInfo { + static constexpr unsigned long long getEmptyKey() { return ~0ULL; } + static constexpr unsigned long long getTombstoneKey() { return ~0ULL - 1ULL; } + + static constexpr unsigned getHashValue(const unsigned long long &Val) { + return (unsigned)(Val * 37ULL); + } + + static constexpr bool isEqual(const unsigned long long &LHS, + const unsigned long long &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for shorts. +template <> +struct DenseMapInfo { + static constexpr short getEmptyKey() { return 0x7FFF; } + static constexpr short getTombstoneKey() { return -0x7FFF - 1; } + static constexpr unsigned getHashValue(const short &Val) { return Val * 37U; } + static constexpr bool isEqual(const short &LHS, const short &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for ints. +template <> +struct DenseMapInfo { + static constexpr int getEmptyKey() { return 0x7fffffff; } + static constexpr int getTombstoneKey() { return -0x7fffffff - 1; } + static constexpr unsigned getHashValue(const int &Val) { + return (unsigned)(Val * 37U); + } + + static constexpr bool isEqual(const int &LHS, const int &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for longs. +template <> +struct DenseMapInfo { + static constexpr long getEmptyKey() { + return (1UL << (sizeof(long) * 8 - 1)) - 1UL; + } + + static constexpr long getTombstoneKey() { return getEmptyKey() - 1L; } + + static constexpr unsigned getHashValue(const long &Val) { + return (unsigned)(Val * 37UL); + } + + static constexpr bool isEqual(const long &LHS, const long &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for long longs. +template <> +struct DenseMapInfo { + static constexpr long long getEmptyKey() { return 0x7fffffffffffffffLL; } + static constexpr long long getTombstoneKey() { + return -0x7fffffffffffffffLL - 1; + } + + static constexpr unsigned getHashValue(const long long &Val) { + return (unsigned)(Val * 37ULL); + } + + static constexpr bool isEqual(const long long &LHS, const long long &RHS) { + return LHS == RHS; + } +}; + +// Provide DenseMapInfo for all pairs whose members have info. 
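The pair specialization below composes the two member infos with combineHashValue. A key type defined elsewhere in the runtime would hook in the same way by supplying the four static members documented in the primary template. As a hedged sketch only (TagT and its field are invented for illustration and do not appear in the patch):

  namespace __sanitizer {

  struct TagT {  // hypothetical key type, for illustration only
    u32 id;
  };

  template <>
  struct DenseMapInfo<TagT> {
    // The sentinels must never be used as real keys; LookupBucketFor CHECKs this.
    static TagT getEmptyKey() { return {~0u}; }
    static TagT getTombstoneKey() { return {~0u - 1}; }
    static unsigned getHashValue(const TagT &v) { return v.id * 37U; }
    static bool isEqual(const TagT &a, const TagT &b) { return a.id == b.id; }
  };

  }  // namespace __sanitizer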
+template +struct DenseMapInfo> { + using Pair = detail::DenseMapPair; + using FirstInfo = DenseMapInfo; + using SecondInfo = DenseMapInfo; + + static constexpr Pair getEmptyKey() { + return detail::DenseMapPair(FirstInfo::getEmptyKey(), + SecondInfo::getEmptyKey()); + } + + static constexpr Pair getTombstoneKey() { + return detail::DenseMapPair(FirstInfo::getTombstoneKey(), + SecondInfo::getTombstoneKey()); + } + + static constexpr unsigned getHashValue(const Pair &PairVal) { + return detail::combineHashValue(FirstInfo::getHashValue(PairVal.first), + SecondInfo::getHashValue(PairVal.second)); + } + + static constexpr bool isEqual(const Pair &LHS, const Pair &RHS) { + return FirstInfo::isEqual(LHS.first, RHS.first) && + SecondInfo::isEqual(LHS.second, RHS.second); + } +}; + +} // namespace __sanitizer + +#endif // SANITIZER_DENSE_MAP_INFO_H diff --git a/libsanitizer/sanitizer_common/sanitizer_file.cpp b/libsanitizer/sanitizer_common/sanitizer_file.cpp index 5492560..7ef499c 100644 --- a/libsanitizer/sanitizer_common/sanitizer_file.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_file.cpp @@ -19,6 +19,7 @@ #include "sanitizer_common.h" #include "sanitizer_file.h" +# include "sanitizer_interface_internal.h" namespace __sanitizer { @@ -83,8 +84,12 @@ static void RecursiveCreateParentDirs(char *path) { if (!IsPathSeparator(path[i])) continue; path[i] = '\0'; - /* Some of these will fail, because the directory exists, ignore it. */ - CreateDir(path); + if (!DirExists(path) && !CreateDir(path)) { + const char *ErrorMsgPrefix = "ERROR: Can't create directory: "; + WriteToFile(kStderrFd, ErrorMsgPrefix, internal_strlen(ErrorMsgPrefix)); + WriteToFile(kStderrFd, path, internal_strlen(path)); + Die(); + } path[i] = save; } } diff --git a/libsanitizer/sanitizer_common/sanitizer_file.h b/libsanitizer/sanitizer_common/sanitizer_file.h index 3d79161..810c1e4 100644 --- a/libsanitizer/sanitizer_common/sanitizer_file.h +++ b/libsanitizer/sanitizer_common/sanitizer_file.h @@ -15,7 +15,6 @@ #ifndef SANITIZER_FILE_H #define SANITIZER_FILE_H -#include "sanitizer_interface_internal.h" #include "sanitizer_internal_defs.h" #include "sanitizer_libc.h" #include "sanitizer_mutex.h" @@ -78,6 +77,7 @@ bool SupportsColoredOutput(fd_t fd); // OS const char *GetPwd(); bool FileExists(const char *filename); +bool DirExists(const char *path); char *FindPathToBinary(const char *name); bool IsPathSeparator(const char c); bool IsAbsolutePath(const char *path); diff --git a/libsanitizer/sanitizer_common/sanitizer_flags.inc b/libsanitizer/sanitizer_common/sanitizer_flags.inc index 95da82b..0ca91af 100644 --- a/libsanitizer/sanitizer_common/sanitizer_flags.inc +++ b/libsanitizer/sanitizer_common/sanitizer_flags.inc @@ -179,6 +179,7 @@ COMMON_FLAG(bool, use_madv_dontdump, true, "in core file.") COMMON_FLAG(bool, symbolize_inline_frames, true, "Print inlined frames in stacktraces. Defaults to true.") +COMMON_FLAG(bool, demangle, true, "Print demangled symbols.") COMMON_FLAG(bool, symbolize_vs_style, false, "Print file locations in Visual Studio style (e.g: " " file(10,42): ...") @@ -191,6 +192,8 @@ COMMON_FLAG(const char *, stack_trace_format, "DEFAULT", "Format string used to render stack frames. " "See sanitizer_stacktrace_printer.h for the format description. " "Use DEFAULT to get default format.") +COMMON_FLAG(int, compress_stack_depot, 0, + "Compress stack depot to save memory.") COMMON_FLAG(bool, no_huge_pages_for_shadow, true, "If true, the shadow is not allowed to use huge pages. 
") COMMON_FLAG(bool, strict_string_checks, false, diff --git a/libsanitizer/sanitizer_common/sanitizer_fuchsia.cpp b/libsanitizer/sanitizer_common/sanitizer_fuchsia.cpp index c7b30d9..848953a 100644 --- a/libsanitizer/sanitizer_common/sanitizer_fuchsia.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_fuchsia.cpp @@ -14,17 +14,18 @@ #include "sanitizer_fuchsia.h" #if SANITIZER_FUCHSIA -#include -#include -#include -#include -#include -#include -#include - -#include "sanitizer_common.h" -#include "sanitizer_libc.h" -#include "sanitizer_mutex.h" +# include +# include +# include +# include +# include +# include +# include + +# include "sanitizer_common.h" +# include "sanitizer_interface_internal.h" +# include "sanitizer_libc.h" +# include "sanitizer_mutex.h" namespace __sanitizer { @@ -89,7 +90,7 @@ void InitializePlatformEarly() {} void MaybeReexec() {} void CheckASLR() {} void CheckMPROTECT() {} -void PlatformPrepareForSandboxing(__sanitizer_sandbox_arguments *args) {} +void PlatformPrepareForSandboxing(void *args) {} void DisableCoreDumperIfNecessary() {} void InstallDeadlySignalHandlers(SignalHandlerType handler) {} void SetAlternateSignalStack() {} @@ -274,6 +275,15 @@ void *MmapFixedNoAccess(uptr fixed_addr, uptr size, const char *name) { UNIMPLEMENTED(); } +bool MprotectNoAccess(uptr addr, uptr size) { + return _zx_vmar_protect(_zx_vmar_root_self(), 0, addr, size) == ZX_OK; +} + +bool MprotectReadOnly(uptr addr, uptr size) { + return _zx_vmar_protect(_zx_vmar_root_self(), ZX_VM_PERM_READ, addr, size) == + ZX_OK; +} + void *MmapAlignedOrDieOnFatalError(uptr size, uptr alignment, const char *mem_type) { CHECK_GE(size, GetPageSize()); @@ -376,29 +386,8 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) {} bool ReadFileToBuffer(const char *file_name, char **buff, uptr *buff_size, uptr *read_len, uptr max_len, error_t *errno_p) { - zx_handle_t vmo; - zx_status_t status = __sanitizer_get_configuration(file_name, &vmo); - if (status == ZX_OK) { - uint64_t vmo_size; - status = _zx_vmo_get_size(vmo, &vmo_size); - if (status == ZX_OK) { - if (vmo_size < max_len) - max_len = vmo_size; - size_t map_size = RoundUpTo(max_len, GetPageSize()); - uintptr_t addr; - status = _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ, 0, vmo, 0, - map_size, &addr); - if (status == ZX_OK) { - *buff = reinterpret_cast(addr); - *buff_size = map_size; - *read_len = max_len; - } - } - _zx_handle_close(vmo); - } - if (status != ZX_OK && errno_p) - *errno_p = status; - return status == ZX_OK; + *errno_p = ZX_ERR_NOT_SUPPORTED; + return false; } void RawWrite(const char *buffer) { @@ -475,6 +464,9 @@ u32 GetNumberOfCPUs() { return zx_system_get_num_cpus(); } uptr GetRSS() { UNIMPLEMENTED(); } +void *internal_start_thread(void *(*func)(void *arg), void *arg) { return 0; } +void internal_join_thread(void *th) {} + void InitializePlatformCommonFlags(CommonFlags *cf) {} } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_interface_internal.h b/libsanitizer/sanitizer_common/sanitizer_interface_internal.h index 1600d31..e9dc78c 100644 --- a/libsanitizer/sanitizer_common/sanitizer_interface_internal.h +++ b/libsanitizer/sanitizer_common/sanitizer_interface_internal.h @@ -20,104 +20,102 @@ #include "sanitizer_internal_defs.h" extern "C" { - // Tell the tools to write their reports to "path." instead of stderr. - // The special values are "stdout" and "stderr". 
- SANITIZER_INTERFACE_ATTRIBUTE - void __sanitizer_set_report_path(const char *path); - // Tell the tools to write their reports to the provided file descriptor - // (casted to void *). - SANITIZER_INTERFACE_ATTRIBUTE - void __sanitizer_set_report_fd(void *fd); - // Get the current full report file path, if a path was specified by - // an earlier call to __sanitizer_set_report_path. Returns null otherwise. - SANITIZER_INTERFACE_ATTRIBUTE - const char *__sanitizer_get_report_path(); +// Tell the tools to write their reports to "path." instead of stderr. +// The special values are "stdout" and "stderr". +SANITIZER_INTERFACE_ATTRIBUTE +void __sanitizer_set_report_path(const char *path); +// Tell the tools to write their reports to the provided file descriptor +// (casted to void *). +SANITIZER_INTERFACE_ATTRIBUTE +void __sanitizer_set_report_fd(void *fd); +// Get the current full report file path, if a path was specified by +// an earlier call to __sanitizer_set_report_path. Returns null otherwise. +SANITIZER_INTERFACE_ATTRIBUTE +const char *__sanitizer_get_report_path(); - typedef struct { - int coverage_sandboxed; - __sanitizer::sptr coverage_fd; - unsigned int coverage_max_block_size; - } __sanitizer_sandbox_arguments; +typedef struct { + int coverage_sandboxed; + __sanitizer::sptr coverage_fd; + unsigned int coverage_max_block_size; +} __sanitizer_sandbox_arguments; - // Notify the tools that the sandbox is going to be turned on. - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void - __sanitizer_sandbox_on_notify(__sanitizer_sandbox_arguments *args); +// Notify the tools that the sandbox is going to be turned on. +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_sandbox_on_notify(__sanitizer_sandbox_arguments *args); - // This function is called by the tool when it has just finished reporting - // an error. 'error_summary' is a one-line string that summarizes - // the error message. This function can be overridden by the client. - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_report_error_summary(const char *error_summary); +// This function is called by the tool when it has just finished reporting +// an error. 'error_summary' is a one-line string that summarizes +// the error message. This function can be overridden by the client. +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_report_error_summary(const char *error_summary); - SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_cov_dump(); - SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_dump_coverage( - const __sanitizer::uptr *pcs, const __sanitizer::uptr len); - SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_dump_trace_pc_guard_coverage(); +SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_cov_dump(); +SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_dump_coverage( + const __sanitizer::uptr *pcs, const __sanitizer::uptr len); +SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_dump_trace_pc_guard_coverage(); - SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_cov(__sanitizer::u32 *guard); +SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_cov(__sanitizer::u32 *guard); - // Returns 1 on the first call, then returns 0 thereafter. Called by the tool - // to ensure only one report is printed when multiple errors occur - // simultaneously. - SANITIZER_INTERFACE_ATTRIBUTE int __sanitizer_acquire_crash_state(); +// Returns 1 on the first call, then returns 0 thereafter. 
Called by the tool +// to ensure only one report is printed when multiple errors occur +// simultaneously. +SANITIZER_INTERFACE_ATTRIBUTE int __sanitizer_acquire_crash_state(); - SANITIZER_INTERFACE_ATTRIBUTE - void __sanitizer_annotate_contiguous_container(const void *beg, - const void *end, - const void *old_mid, - const void *new_mid); - SANITIZER_INTERFACE_ATTRIBUTE - int __sanitizer_verify_contiguous_container(const void *beg, const void *mid, - const void *end); - SANITIZER_INTERFACE_ATTRIBUTE - const void *__sanitizer_contiguous_container_find_bad_address( - const void *beg, const void *mid, const void *end); +SANITIZER_INTERFACE_ATTRIBUTE +void __sanitizer_annotate_contiguous_container(const void *beg, const void *end, + const void *old_mid, + const void *new_mid); +SANITIZER_INTERFACE_ATTRIBUTE +int __sanitizer_verify_contiguous_container(const void *beg, const void *mid, + const void *end); +SANITIZER_INTERFACE_ATTRIBUTE +const void *__sanitizer_contiguous_container_find_bad_address(const void *beg, + const void *mid, + const void *end); - SANITIZER_INTERFACE_ATTRIBUTE - int __sanitizer_get_module_and_offset_for_pc( - __sanitizer::uptr pc, char *module_path, - __sanitizer::uptr module_path_len, __sanitizer::uptr *pc_offset); +SANITIZER_INTERFACE_ATTRIBUTE +int __sanitizer_get_module_and_offset_for_pc(void *pc, char *module_path, + __sanitizer::uptr module_path_len, + void **pc_offset); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_cmp(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_cmp1(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_cmp2(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_cmp4(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_cmp8(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_const_cmp1(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_const_cmp2(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_const_cmp4(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_const_cmp8(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_switch(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_div4(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_div8(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_gep(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_pc_indir(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_pc_guard(__sanitizer::u32*); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE - void __sanitizer_cov_trace_pc_guard_init(__sanitizer::u32*, - __sanitizer::u32*); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void - __sanitizer_cov_8bit_counters_init(char *, char *); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void - __sanitizer_cov_bool_flag_init(); - SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void - __sanitizer_cov_pcs_init(const __sanitizer::uptr *, - const __sanitizer::uptr *); -} // extern "C" +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_cmp(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void 
+__sanitizer_cov_trace_cmp1(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_cmp2(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_cmp4(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_cmp8(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_const_cmp1(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_const_cmp2(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_const_cmp4(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_const_cmp8(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_switch(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_div4(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_div8(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_gep(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_pc_indir(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_pc_guard(__sanitizer::u32 *); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_trace_pc_guard_init(__sanitizer::u32 *, __sanitizer::u32 *); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_8bit_counters_init(char *, char *); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_bool_flag_init(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_cov_pcs_init(const __sanitizer::uptr *, const __sanitizer::uptr *); +} // extern "C" #endif // SANITIZER_INTERFACE_INTERNAL_H diff --git a/libsanitizer/sanitizer_common/sanitizer_internal_defs.h b/libsanitizer/sanitizer_common/sanitizer_internal_defs.h index e97cc9a..95a80b4 100644 --- a/libsanitizer/sanitizer_common/sanitizer_internal_defs.h +++ b/libsanitizer/sanitizer_common/sanitizer_internal_defs.h @@ -179,6 +179,7 @@ typedef int pid_t; #if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_MAC || \ (SANITIZER_SOLARIS && (defined(_LP64) || _FILE_OFFSET_BITS == 64)) || \ + (SANITIZER_LINUX && !SANITIZER_GLIBC && !SANITIZER_ANDROID) || \ (SANITIZER_LINUX && (defined(__x86_64__) || defined(__hexagon__))) typedef u64 OFF_T; #else @@ -258,7 +259,9 @@ typedef u64 tid_t; # define NOEXCEPT throw() #endif -#if __has_cpp_attribute(clang::fallthrough) +#if __has_cpp_attribute(fallthrough) +# define FALLTHROUGH [[fallthrough]] +#elif __has_cpp_attribute(clang::fallthrough) # define FALLTHROUGH [[clang::fallthrough]] #else # define FALLTHROUGH @@ -300,7 +303,8 @@ void NORETURN CheckFailed(const char *file, int line, const char *cond, } \ } while (0) -#define RAW_CHECK(expr, ...) RAW_CHECK_MSG(expr, #expr "\n", __VA_ARGS__) +#define RAW_CHECK(expr) RAW_CHECK_MSG(expr, #expr "\n", ) +#define RAW_CHECK_VA(expr, ...) RAW_CHECK_MSG(expr, #expr "\n", __VA_ARGS__) #define CHECK_IMPL(c1, op, c2) \ do { \ diff --git a/libsanitizer/sanitizer_common/sanitizer_leb128.h b/libsanitizer/sanitizer_common/sanitizer_leb128.h new file mode 100644 index 0000000..553550d --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_leb128.h @@ -0,0 +1,87 @@ +//===-- sanitizer_leb128.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SANITIZER_LEB128_H +#define SANITIZER_LEB128_H + +#include "sanitizer_common.h" +#include "sanitizer_internal_defs.h" + +namespace __sanitizer { + +template +It EncodeSLEB128(T value, It begin, It end) { + bool more; + do { + u8 byte = value & 0x7f; + // NOTE: this assumes that this signed shift is an arithmetic right shift. + value >>= 7; + more = !((((value == 0) && ((byte & 0x40) == 0)) || + ((value == -1) && ((byte & 0x40) != 0)))); + if (more) + byte |= 0x80; + if (UNLIKELY(begin == end)) + break; + *(begin++) = byte; + } while (more); + return begin; +} + +template +It DecodeSLEB128(It begin, It end, T* v) { + T value = 0; + unsigned shift = 0; + u8 byte; + do { + if (UNLIKELY(begin == end)) + return begin; + byte = *(begin++); + T slice = byte & 0x7f; + value |= slice << shift; + shift += 7; + } while (byte >= 128); + if (shift < 64 && (byte & 0x40)) + value |= (-1ULL) << shift; + *v = value; + return begin; +} + +template +It EncodeULEB128(T value, It begin, It end) { + do { + u8 byte = value & 0x7f; + value >>= 7; + if (value) + byte |= 0x80; + if (UNLIKELY(begin == end)) + break; + *(begin++) = byte; + } while (value); + return begin; +} + +template +It DecodeULEB128(It begin, It end, T* v) { + T value = 0; + unsigned shift = 0; + u8 byte; + do { + if (UNLIKELY(begin == end)) + return begin; + byte = *(begin++); + T slice = byte & 0x7f; + value += slice << shift; + shift += 7; + } while (byte >= 128); + *v = value; + return begin; +} + +} // namespace __sanitizer + +#endif // SANITIZER_LEB128_H diff --git a/libsanitizer/sanitizer_common/sanitizer_linux.cpp b/libsanitizer/sanitizer_common/sanitizer_linux.cpp index aa59d97..8e144a4 100644 --- a/libsanitizer/sanitizer_common/sanitizer_linux.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_linux.cpp @@ -80,6 +80,7 @@ #if SANITIZER_FREEBSD #include +#include #include #include extern "C" { @@ -163,6 +164,12 @@ ScopedBlockSignals::ScopedBlockSignals(__sanitizer_sigset_t *copy) { // See test/sanitizer_common/TestCases/Linux/setuid.c. internal_sigdelset(&set, 33); # endif +# if SANITIZER_LINUX + // Seccomp-BPF-sandboxed processes rely on SIGSYS to handle trapped syscalls. + // If this signal is blocked, such calls cannot be handled and the process may + // hang. 
+ internal_sigdelset(&set, 31); +# endif SetSigProcMask(&set, &saved_); if (copy) internal_memcpy(copy, &saved_, sizeof(saved_)); @@ -226,7 +233,7 @@ uptr internal_close(fd_t fd) { } uptr internal_open(const char *filename, int flags) { -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# if SANITIZER_LINUX return internal_syscall(SYSCALL(openat), AT_FDCWD, (uptr)filename, flags); #else return internal_syscall(SYSCALL(open), (uptr)filename, flags); @@ -234,7 +241,7 @@ uptr internal_open(const char *filename, int flags) { } uptr internal_open(const char *filename, int flags, u32 mode) { -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# if SANITIZER_LINUX return internal_syscall(SYSCALL(openat), AT_FDCWD, (uptr)filename, flags, mode); #else @@ -335,50 +342,46 @@ static void kernel_stat_to_stat(struct kernel_stat *in, struct stat *out) { uptr internal_stat(const char *path, void *buf) { #if SANITIZER_FREEBSD return internal_syscall(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf, 0); -#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# elif SANITIZER_LINUX +# if SANITIZER_WORDSIZE == 64 return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf, 0); -#elif SANITIZER_LINUX_USES_64BIT_SYSCALLS -# if defined(__mips64) - // For mips64, stat syscall fills buffer in the format of kernel_stat - struct kernel_stat kbuf; - int res = internal_syscall(SYSCALL(stat), path, &kbuf); - kernel_stat_to_stat(&kbuf, (struct stat *)buf); +# else + struct stat64 buf64; + int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path, + (uptr)&buf64, 0); + stat64_to_stat(&buf64, (struct stat *)buf); return res; -# else - return internal_syscall(SYSCALL(stat), (uptr)path, (uptr)buf); -# endif -#else +# endif +# else struct stat64 buf64; int res = internal_syscall(SYSCALL(stat64), path, &buf64); stat64_to_stat(&buf64, (struct stat *)buf); return res; -#endif +# endif } uptr internal_lstat(const char *path, void *buf) { #if SANITIZER_FREEBSD return internal_syscall(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf, AT_SYMLINK_NOFOLLOW); -#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# elif SANITIZER_LINUX +# if defined(_LP64) return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf, AT_SYMLINK_NOFOLLOW); -#elif SANITIZER_LINUX_USES_64BIT_SYSCALLS -# if SANITIZER_MIPS64 - // For mips64, lstat syscall fills buffer in the format of kernel_stat - struct kernel_stat kbuf; - int res = internal_syscall(SYSCALL(lstat), path, &kbuf); - kernel_stat_to_stat(&kbuf, (struct stat *)buf); +# else + struct stat64 buf64; + int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path, + (uptr)&buf64, AT_SYMLINK_NOFOLLOW); + stat64_to_stat(&buf64, (struct stat *)buf); return res; -# else - return internal_syscall(SYSCALL(lstat), (uptr)path, (uptr)buf); -# endif -#else +# endif +# else struct stat64 buf64; int res = internal_syscall(SYSCALL(lstat64), path, &buf64); stat64_to_stat(&buf64, (struct stat *)buf); return res; -#endif +# endif } uptr internal_fstat(fd_t fd, void *buf) { @@ -412,7 +415,7 @@ uptr internal_dup(int oldfd) { } uptr internal_dup2(int oldfd, int newfd) { -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# if SANITIZER_LINUX return internal_syscall(SYSCALL(dup3), oldfd, newfd, 0); #else return internal_syscall(SYSCALL(dup2), oldfd, newfd); @@ -420,7 +423,7 @@ uptr internal_dup2(int oldfd, int newfd) { } uptr internal_readlink(const char *path, char *buf, uptr bufsize) { -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# if SANITIZER_LINUX return internal_syscall(SYSCALL(readlinkat), 
AT_FDCWD, (uptr)path, (uptr)buf, bufsize); #else @@ -429,7 +432,7 @@ uptr internal_readlink(const char *path, char *buf, uptr bufsize) { } uptr internal_unlink(const char *path) { -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# if SANITIZER_LINUX return internal_syscall(SYSCALL(unlinkat), AT_FDCWD, (uptr)path, 0); #else return internal_syscall(SYSCALL(unlink), (uptr)path); @@ -440,12 +443,12 @@ uptr internal_rename(const char *oldpath, const char *newpath) { #if defined(__riscv) && defined(__linux__) return internal_syscall(SYSCALL(renameat2), AT_FDCWD, (uptr)oldpath, AT_FDCWD, (uptr)newpath, 0); -#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# elif SANITIZER_LINUX return internal_syscall(SYSCALL(renameat), AT_FDCWD, (uptr)oldpath, AT_FDCWD, (uptr)newpath); -#else +# else return internal_syscall(SYSCALL(rename), (uptr)oldpath, (uptr)newpath); -#endif +# endif } uptr internal_sched_yield() { @@ -482,17 +485,20 @@ bool FileExists(const char *filename) { if (ShouldMockFailureToOpen(filename)) return false; struct stat st; -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS - if (internal_syscall(SYSCALL(newfstatat), AT_FDCWD, filename, &st, 0)) -#else if (internal_stat(filename, &st)) -#endif return false; // Sanity check: filename is a regular file. return S_ISREG(st.st_mode); } -#if !SANITIZER_NETBSD +bool DirExists(const char *path) { + struct stat st; + if (internal_stat(path, &st)) + return false; + return S_ISDIR(st.st_mode); +} + +# if !SANITIZER_NETBSD tid_t GetTid() { #if SANITIZER_FREEBSD long Tid; @@ -691,17 +697,17 @@ void FutexWake(atomic_uint32_t *p, u32 count) { // Not used #else struct linux_dirent { -#if SANITIZER_X32 || defined(__aarch64__) || SANITIZER_RISCV64 +# if SANITIZER_X32 || SANITIZER_LINUX u64 d_ino; u64 d_off; -#else +# else unsigned long d_ino; unsigned long d_off; -#endif +# endif unsigned short d_reclen; -#if defined(__aarch64__) || SANITIZER_RISCV64 +# if SANITIZER_LINUX unsigned char d_type; -#endif +# endif char d_name[256]; }; #endif @@ -737,11 +743,11 @@ int internal_dlinfo(void *handle, int request, void *p) { uptr internal_getdents(fd_t fd, struct linux_dirent *dirp, unsigned int count) { #if SANITIZER_FREEBSD return internal_syscall(SYSCALL(getdirentries), fd, (uptr)dirp, count, NULL); -#elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# elif SANITIZER_LINUX return internal_syscall(SYSCALL(getdents64), fd, (uptr)dirp, count); -#else +# else return internal_syscall(SYSCALL(getdents), fd, (uptr)dirp, count); -#endif +# endif } uptr internal_lseek(fd_t fd, OFF_T offset, int whence) { @@ -759,11 +765,15 @@ uptr internal_sigaltstack(const void *ss, void *oss) { } int internal_fork() { -#if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS +# if SANITIZER_LINUX +# if SANITIZER_S390 + return internal_syscall(SYSCALL(clone), 0, SIGCHLD); +# else return internal_syscall(SYSCALL(clone), SIGCHLD, 0); -#else +# endif +# else return internal_syscall(SYSCALL(fork)); -#endif +# endif } #if SANITIZER_FREEBSD @@ -1380,7 +1390,7 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg, #elif defined(__aarch64__) uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg, int *parent_tidptr, void *newtls, int *child_tidptr) { - long long res; + register long long res __asm__("x0"); if (!fn || !child_stack) return -EINVAL; CHECK_EQ(0, (uptr)child_stack % 16); @@ -1760,6 +1770,8 @@ HandleSignalMode GetHandleSignalMode(int signum) { #if !SANITIZER_GO void *internal_start_thread(void *(*func)(void *arg), void *arg) { + if (&real_pthread_create == 0) 
+ return nullptr; // Start the thread with signals blocked, otherwise it can steal user signals. ScopedBlockSignals block(nullptr); void *th; @@ -1768,7 +1780,8 @@ void *internal_start_thread(void *(*func)(void *arg), void *arg) { } void internal_join_thread(void *th) { - real_pthread_join(th, nullptr); + if (&real_pthread_join) + real_pthread_join(th, nullptr); } #else void *internal_start_thread(void *(*func)(void *), void *arg) { return 0; } @@ -1815,7 +1828,7 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #else uptr err = ucontext->uc_mcontext.gregs[REG_ERR]; #endif // SANITIZER_FREEBSD - return err & PF_WRITE ? WRITE : READ; + return err & PF_WRITE ? Write : Read; #elif defined(__mips__) uint32_t *exception_source; uint32_t faulty_instruction; @@ -1838,7 +1851,7 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { case 0x2a: // swl case 0x2e: // swr #endif - return SignalContext::WRITE; + return SignalContext::Write; case 0x20: // lb case 0x24: // lbu @@ -1853,27 +1866,27 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { case 0x22: // lwl case 0x26: // lwr #endif - return SignalContext::READ; + return SignalContext::Read; #if __mips_isa_rev == 6 case 0x3b: // pcrel op_code = (faulty_instruction >> 19) & 0x3; switch (op_code) { case 0x1: // lwpc case 0x2: // lwupc - return SignalContext::READ; + return SignalContext::Read; } #endif } - return SignalContext::UNKNOWN; + return SignalContext::Unknown; #elif defined(__arm__) static const uptr FSR_WRITE = 1U << 11; uptr fsr = ucontext->uc_mcontext.error_code; - return fsr & FSR_WRITE ? WRITE : READ; + return fsr & FSR_WRITE ? Write : Read; #elif defined(__aarch64__) static const u64 ESR_ELx_WNR = 1U << 6; u64 esr; - if (!Aarch64GetESR(ucontext, &esr)) return UNKNOWN; - return esr & ESR_ELx_WNR ? WRITE : READ; + if (!Aarch64GetESR(ucontext, &esr)) return Unknown; + return esr & ESR_ELx_WNR ? Write : Read; #elif defined(__sparc__) // Decode the instruction to determine the access type. // From OpenSolaris $SRC/uts/sun4/os/trap.c (get_accesstype). @@ -1889,7 +1902,7 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #endif #endif u32 instr = *(u32 *)pc; - return (instr >> 21) & 1 ? WRITE: READ; + return (instr >> 21) & 1 ? Write: Read; #elif defined(__riscv) #if SANITIZER_FREEBSD unsigned long pc = ucontext->uc_mcontext.mc_gpregs.gp_sepc; @@ -1909,7 +1922,7 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #if __riscv_xlen == 64 case 0b10'011: // c.ldsp (rd != x0) #endif - return rd ? SignalContext::READ : SignalContext::UNKNOWN; + return rd ? 
SignalContext::Read : SignalContext::Unknown; case 0b00'010: // c.lw #if __riscv_flen >= 32 && __riscv_xlen == 32 case 0b10'011: // c.flwsp @@ -1921,7 +1934,7 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { case 0b00'001: // c.fld case 0b10'001: // c.fldsp #endif - return SignalContext::READ; + return SignalContext::Read; case 0b00'110: // c.sw case 0b10'110: // c.swsp #if __riscv_flen >= 32 || __riscv_xlen == 64 @@ -1932,9 +1945,9 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { case 0b00'101: // c.fsd case 0b10'101: // c.fsdsp #endif - return SignalContext::WRITE; + return SignalContext::Write; default: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } } #endif @@ -1952,9 +1965,9 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #endif case 0b100: // lbu case 0b101: // lhu - return SignalContext::READ; + return SignalContext::Read; default: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } case 0b0100011: // stores switch (funct3) { @@ -1964,9 +1977,9 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #if __riscv_xlen == 64 case 0b011: // sd #endif - return SignalContext::WRITE; + return SignalContext::Write; default: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } #if __riscv_flen >= 32 case 0b0000111: // floating-point loads @@ -1975,9 +1988,9 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #if __riscv_flen == 64 case 0b011: // fld #endif - return SignalContext::READ; + return SignalContext::Read; default: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } case 0b0100111: // floating-point stores switch (funct3) { @@ -1985,17 +1998,17 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #if __riscv_flen == 64 case 0b011: // fsd #endif - return SignalContext::WRITE; + return SignalContext::Write; default: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } #endif default: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } #else (void)ucontext; - return UNKNOWN; // FIXME: Implement. + return Unknown; // FIXME: Implement. #endif } @@ -2070,12 +2083,19 @@ static void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) { *sp = ucontext->uc_mcontext.gregs[REG_UESP]; # endif #elif defined(__powerpc__) || defined(__powerpc64__) +# if SANITIZER_FREEBSD + ucontext_t *ucontext = (ucontext_t *)context; + *pc = ucontext->uc_mcontext.mc_srr0; + *sp = ucontext->uc_mcontext.mc_frame[1]; + *bp = ucontext->uc_mcontext.mc_frame[31]; +# else ucontext_t *ucontext = (ucontext_t*)context; *pc = ucontext->uc_mcontext.regs->nip; *sp = ucontext->uc_mcontext.regs->gpr[PT_R1]; // The powerpc{,64}-linux ABIs do not specify r31 as the frame // pointer, but GCC always uses r31 when we need a frame pointer. *bp = ucontext->uc_mcontext.regs->gpr[PT_R31]; +# endif #elif defined(__sparc__) #if defined(__arch64__) || defined(__sparcv9) #define STACK_BIAS 2047 @@ -2164,49 +2184,34 @@ void CheckASLR() { GetArgv()[0]); Die(); } -#elif SANITIZER_PPC64V2 - // Disable ASLR for Linux PPC64LE. 
- int old_personality = personality(0xffffffff); - if (old_personality != -1 && (old_personality & ADDR_NO_RANDOMIZE) == 0) { - VReport(1, "WARNING: Program is being run with address space layout " - "randomization (ASLR) enabled which prevents the thread and " - "memory sanitizers from working on powerpc64le.\n" - "ASLR will be disabled and the program re-executed.\n"); - CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); - ReExec(); - } #elif SANITIZER_FREEBSD - int aslr_pie; - uptr len = sizeof(aslr_pie); -#if SANITIZER_WORDSIZE == 64 - if (UNLIKELY(internal_sysctlbyname("kern.elf64.aslr.pie_enable", - &aslr_pie, &len, NULL, 0) == -1)) { + int aslr_status; + if (UNLIKELY(procctl(P_PID, 0, PROC_ASLR_STATUS, &aslr_status) == -1)) { // We're making things less 'dramatic' here since - // the OID is not necessarily guaranteed to be here + // the cmd is not necessarily guaranteed to be here // just yet regarding FreeBSD release return; } - - if (aslr_pie > 0) { + if ((aslr_status & PROC_ASLR_ACTIVE) != 0) { Printf("This sanitizer is not compatible with enabled ASLR " "and binaries compiled with PIE\n"); Die(); } -#endif - // there might be 32 bits compat for 64 bits - if (UNLIKELY(internal_sysctlbyname("kern.elf32.aslr.pie_enable", - &aslr_pie, &len, NULL, 0) == -1)) { - return; - } - - if (aslr_pie > 0) { - Printf("This sanitizer is not compatible with enabled ASLR " - "and binaries compiled with PIE\n"); - Die(); +# elif SANITIZER_PPC64V2 + // Disable ASLR for Linux PPC64LE. + int old_personality = personality(0xffffffff); + if (old_personality != -1 && (old_personality & ADDR_NO_RANDOMIZE) == 0) { + VReport(1, + "WARNING: Program is being run with address space layout " + "randomization (ASLR) enabled which prevents the thread and " + "memory sanitizers from working on powerpc64le.\n" + "ASLR will be disabled and the program re-executed.\n"); + CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); + ReExec(); } -#else +# else // Do nothing -#endif +# endif } void CheckMPROTECT() { diff --git a/libsanitizer/sanitizer_common/sanitizer_linux.h b/libsanitizer/sanitizer_common/sanitizer_linux.h index 6a235db..ebd60e0 100644 --- a/libsanitizer/sanitizer_common/sanitizer_linux.h +++ b/libsanitizer/sanitizer_common/sanitizer_linux.h @@ -55,6 +55,9 @@ struct ScopedBlockSignals { explicit ScopedBlockSignals(__sanitizer_sigset_t *copy); ~ScopedBlockSignals(); + ScopedBlockSignals &operator=(const ScopedBlockSignals &) = delete; + ScopedBlockSignals(const ScopedBlockSignals &) = delete; + private: __sanitizer_sigset_t saved_; }; diff --git a/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp b/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp index 0c260b6..25ad825 100644 --- a/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -216,7 +216,8 @@ void InitTlsSize() { } // On glibc x86_64, ThreadDescriptorSize() needs to be precise due to the usage // of g_tls_size. On other targets, ThreadDescriptorSize() is only used by lsan // to get the pointer to thread-specific data keys in the thread control block. -#if (SANITIZER_FREEBSD || SANITIZER_LINUX) && !SANITIZER_ANDROID +#if (SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_SOLARIS) && \ + !SANITIZER_ANDROID && !SANITIZER_GO // sizeof(struct pthread) from glibc. 
static atomic_uintptr_t thread_descriptor_size; @@ -319,7 +320,6 @@ static uptr TlsPreTcbSize() { } #endif -#if !SANITIZER_GO namespace { struct TlsBlock { uptr begin, end, align; @@ -407,9 +407,8 @@ __attribute__((unused)) static void GetStaticTlsBoundary(uptr *addr, uptr *size, *addr = ranges[l].begin; *size = ranges[r - 1].end - ranges[l].begin; } -#endif // !SANITIZER_GO #endif // (x86_64 || i386 || mips || ...) && (SANITIZER_FREEBSD || - // SANITIZER_LINUX) && !SANITIZER_ANDROID + // SANITIZER_LINUX) && !SANITIZER_ANDROID && !SANITIZER_GO #if SANITIZER_NETBSD static struct tls_tcb * ThreadSelfTlsTcb() { @@ -478,7 +477,7 @@ static void GetTls(uptr *addr, uptr *size) { const uptr pre_tcb_size = TlsPreTcbSize(); *addr = tp - pre_tcb_size; *size = g_tls_size + pre_tcb_size; -#elif SANITIZER_FREEBSD || SANITIZER_LINUX +#elif SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_SOLARIS uptr align; GetStaticTlsBoundary(addr, size, &align); #if defined(__x86_64__) || defined(__i386__) || defined(__s390__) || \ @@ -539,11 +538,6 @@ static void GetTls(uptr *addr, uptr *size) { *addr = (uptr)tcb->tcb_dtv[1]; } } -#elif SANITIZER_SOLARIS - // FIXME - *addr = 0; - *size = 0; -#else #error "Unknown OS" #endif } @@ -614,6 +608,34 @@ static int AddModuleSegments(const char *module_name, dl_phdr_info *info, bool writable = phdr->p_flags & PF_W; cur_module.addAddressRange(cur_beg, cur_end, executable, writable); + } else if (phdr->p_type == PT_NOTE) { +# ifdef NT_GNU_BUILD_ID + uptr off = 0; + while (off + sizeof(ElfW(Nhdr)) < phdr->p_memsz) { + auto *nhdr = reinterpret_cast(info->dlpi_addr + + phdr->p_vaddr + off); + constexpr auto kGnuNamesz = 4; // "GNU" with NUL-byte. + static_assert(kGnuNamesz % 4 == 0, "kGnuNameSize is aligned to 4."); + if (nhdr->n_type == NT_GNU_BUILD_ID && nhdr->n_namesz == kGnuNamesz) { + if (off + sizeof(ElfW(Nhdr)) + nhdr->n_namesz + nhdr->n_descsz > + phdr->p_memsz) { + // Something is very wrong, bail out instead of reading potentially + // arbitrary memory. + break; + } + const char *name = + reinterpret_cast(nhdr) + sizeof(*nhdr); + if (internal_memcmp(name, "GNU", 3) == 0) { + const char *value = reinterpret_cast(nhdr) + + sizeof(*nhdr) + kGnuNamesz; + cur_module.setUuid(value, nhdr->n_descsz); + break; + } + } + off += sizeof(*nhdr) + RoundUpTo(nhdr->n_namesz, 4) + + RoundUpTo(nhdr->n_descsz, 4); + } +# endif } } modules->push_back(cur_module); @@ -770,13 +792,9 @@ u32 GetNumberOfCPUs() { #elif SANITIZER_SOLARIS return sysconf(_SC_NPROCESSORS_ONLN); #else -#if defined(CPU_COUNT) cpu_set_t CPUs; CHECK_EQ(sched_getaffinity(0, sizeof(cpu_set_t), &CPUs), 0); return CPU_COUNT(&CPUs); -#else - return 1; -#endif #endif } diff --git a/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp b/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp index bb2f5b5..74db831 100644 --- a/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_linux_s390.cpp @@ -57,8 +57,10 @@ uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd, uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg, int *parent_tidptr, void *newtls, int *child_tidptr) { - if (!fn || !child_stack) - return -EINVAL; + if (!fn || !child_stack) { + errno = EINVAL; + return -1; + } CHECK_EQ(0, (uptr)child_stack % 16); // Minimum frame size. #ifdef __s390x__ @@ -71,9 +73,9 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg, // And pass parameters. 
((unsigned long *)child_stack)[1] = (uptr)fn; ((unsigned long *)child_stack)[2] = (uptr)arg; - register long res __asm__("r2"); + register uptr res __asm__("r2"); register void *__cstack __asm__("r2") = child_stack; - register int __flags __asm__("r3") = flags; + register long __flags __asm__("r3") = flags; register int * __ptidptr __asm__("r4") = parent_tidptr; register int * __ctidptr __asm__("r5") = child_tidptr; register void * __newtls __asm__("r6") = newtls; @@ -113,6 +115,10 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg, "r"(__ctidptr), "r"(__newtls) : "memory", "cc"); + if (res >= (uptr)-4095) { + errno = -res; + return -1; + } return res; } diff --git a/libsanitizer/sanitizer_common/sanitizer_lzw.h b/libsanitizer/sanitizer_common/sanitizer_lzw.h new file mode 100644 index 0000000..42acfbd --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_lzw.h @@ -0,0 +1,159 @@ +//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lempel–Ziv–Welch encoding/decoding +// +//===----------------------------------------------------------------------===// + +#ifndef SANITIZER_LZW_H +#define SANITIZER_LZW_H + +#include "sanitizer_dense_map.h" + +namespace __sanitizer { + +using LzwCodeType = u32; + +template +ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) { + using Substring = + detail::DenseMapPair; + + // Sentinel value for substrings of len 1. + static constexpr LzwCodeType kNoPrefix = + Min(DenseMapInfo::getEmptyKey().first, + DenseMapInfo::getTombstoneKey().first) - + 1; + DenseMap prefix_to_code; + { + // Add all substring of len 1 as initial dictionary. + InternalMmapVector dict_len1; + for (auto it = begin; it != end; ++it) + if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second) + dict_len1.push_back(*it); + + // Slightly helps with later delta encoding. + Sort(dict_len1.data(), dict_len1.size()); + + // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can + // just generate them. + *out = dict_len1.size(); + ++out; + + for (uptr i = 0; i != dict_len1.size(); ++i) { + // Remap after the Sort. + prefix_to_code[{kNoPrefix, dict_len1[i]}] = i; + *out = dict_len1[i]; + ++out; + } + CHECK_EQ(prefix_to_code.size(), dict_len1.size()); + } + + if (begin == end) + return out; + + // Main LZW encoding loop. + LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second; + ++begin; + for (auto it = begin; it != end; ++it) { + // Extend match with the new item. + auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size()); + if (ins.second) { + // This is a new substring, but emit the code for the current match + // (before extend). This allows LZW decoder to recover the dictionary. + *out = match; + ++out; + // Reset the match to a single item, which must be already in the map. + match = prefix_to_code.find({kNoPrefix, *it})->second; + } else { + // Already known, use as the current match. + match = ins.first->second; + } + } + + *out = match; + ++out; + + return out; +} + +template +ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) { + if (begin == end) + return out; + + // Load dictionary of len 1 substrings. Theses correspont to lowest codes. 
+ InternalMmapVector dict_len1(*begin); + ++begin; + + if (begin == end) + return out; + + for (auto& v : dict_len1) { + v = *begin; + ++begin; + } + + // Substrings of len 2 and up. Indexes are shifted because [0, + // dict_len1.size()) stored in dict_len1. Substings get here after being + // emitted to the output, so we can use output position. + InternalMmapVector> + code_to_substr; + + // Copies already emitted substrings into the output again. + auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) { + if (code < dict_len1.size()) { + *out = dict_len1[code]; + ++out; + return out; + } + const auto& s = code_to_substr[code - dict_len1.size()]; + + for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it; + return out; + }; + + // Returns lens of the substring with the given code. + auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr { + if (code < dict_len1.size()) + return 1; + const auto& s = code_to_substr[code - dict_len1.size()]; + return s.second - s.first; + }; + + // Main LZW decoding loop. + LzwCodeType prev_code = *begin; + ++begin; + out = copy(prev_code, out); + for (auto it = begin; it != end; ++it) { + LzwCodeType code = *it; + auto start = out; + if (code == dict_len1.size() + code_to_substr.size()) { + // Special LZW case. The code is not in the dictionary yet. This is + // possible only when the new substring is the same as previous one plus + // the first item of the previous substring. We can emit that in two + // steps. + out = copy(prev_code, out); + *out = *start; + ++out; + } else { + out = copy(code, out); + } + + // Every time encoded emits the code, it also creates substing of len + 1 + // including the first item of the just emmited substring. Do the same here. + uptr len = code_to_len(prev_code); + code_to_substr.push_back({start - len, start + 1}); + + prev_code = code; + } + return out; +} + +} // namespace __sanitizer +#endif diff --git a/libsanitizer/sanitizer_common/sanitizer_mac.cpp b/libsanitizer/sanitizer_common/sanitizer_mac.cpp index a61cde8..05512a5 100644 --- a/libsanitizer/sanitizer_common/sanitizer_mac.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_mac.cpp @@ -25,6 +25,7 @@ #include "sanitizer_common.h" #include "sanitizer_file.h" #include "sanitizer_flags.h" +#include "sanitizer_interface_internal.h" #include "sanitizer_internal_defs.h" #include "sanitizer_libc.h" #include "sanitizer_platform_limits_posix.h" @@ -37,7 +38,7 @@ extern char **environ; #endif -#if defined(__has_include) && __has_include() && defined(__BLOCKS__) +#if defined(__has_include) && __has_include() #define SANITIZER_OS_TRACE 1 #include #else @@ -70,15 +71,7 @@ extern "C" { #include #include #include -#if defined(__has_builtin) && __has_builtin(__builtin_os_log_format) -# include -#else - /* Without support for __builtin_os_log_format, fall back to the older - method. 
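The new sanitizer_lzw.h above implements LZW over arbitrary value types keyed through a DenseMap. As a hedged illustration of the same algorithm, including the decoder's "code not yet in the dictionary" special case that LzwDecode handles, here is the textbook byte-oriented form as a standalone round trip (this is not the in-tree variant, just the classic algorithm it is based on):

// Sketch: classic byte-oriented LZW encode/decode round trip.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

static std::vector<uint32_t> LzwEncodeBytes(const std::string &in) {
  std::map<std::string, uint32_t> dict;
  for (int i = 0; i < 256; i++) dict[std::string(1, (char)i)] = i;
  std::vector<uint32_t> out;
  std::string w;
  for (char c : in) {
    std::string wc = w + c;
    if (dict.count(wc)) {
      w = wc;  // keep extending the current match
    } else {
      out.push_back(dict[w]);  // emit the code for the current match
      uint32_t next_code = (uint32_t)dict.size();
      dict[wc] = next_code;    // the decoder will rebuild this entry
      w = std::string(1, c);
    }
  }
  if (!w.empty()) out.push_back(dict[w]);
  return out;
}

static std::string LzwDecodeBytes(const std::vector<uint32_t> &in) {
  std::vector<std::string> dict;
  for (int i = 0; i < 256; i++) dict.push_back(std::string(1, (char)i));
  std::string out, w;
  for (uint32_t code : in) {
    std::string entry;
    if (code < dict.size())
      entry = dict[code];
    else
      entry = w + w[0];  // code was emitted before it entered the dictionary
    out += entry;
    if (!w.empty()) dict.push_back(w + entry[0]);  // mirror the encoder
    w = entry;
  }
  return out;
}

int main() {
  std::string msg = "TOBEORNOTTOBEORTOBEORNOT";
  std::vector<uint32_t> codes = LzwEncodeBytes(msg);
  std::cout << (LzwDecodeBytes(codes) == msg ? "round-trip ok" : "MISMATCH")
            << ": " << codes.size() << " codes for " << msg.size()
            << " input bytes\n";
  return 0;
}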
*/ -# define OS_LOG_DEFAULT 0 -# define os_log_error(A,B,C) \ - asl_log(nullptr, nullptr, ASL_LEVEL_ERR, "%s", (C)); -#endif +#include #include #include #include @@ -273,30 +266,32 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp, static fd_t internal_spawn_impl(const char *argv[], const char *envp[], pid_t *pid) { - fd_t master_fd = kInvalidFd; - fd_t slave_fd = kInvalidFd; + fd_t primary_fd = kInvalidFd; + fd_t secondary_fd = kInvalidFd; auto fd_closer = at_scope_exit([&] { - internal_close(master_fd); - internal_close(slave_fd); + internal_close(primary_fd); + internal_close(secondary_fd); }); // We need a new pseudoterminal to avoid buffering problems. The 'atos' tool // in particular detects when it's talking to a pipe and forgets to flush the // output stream after sending a response. - master_fd = posix_openpt(O_RDWR); - if (master_fd == kInvalidFd) return kInvalidFd; + primary_fd = posix_openpt(O_RDWR); + if (primary_fd == kInvalidFd) + return kInvalidFd; - int res = grantpt(master_fd) || unlockpt(master_fd); + int res = grantpt(primary_fd) || unlockpt(primary_fd); if (res != 0) return kInvalidFd; // Use TIOCPTYGNAME instead of ptsname() to avoid threading problems. - char slave_pty_name[128]; - res = ioctl(master_fd, TIOCPTYGNAME, slave_pty_name); + char secondary_pty_name[128]; + res = ioctl(primary_fd, TIOCPTYGNAME, secondary_pty_name); if (res == -1) return kInvalidFd; - slave_fd = internal_open(slave_pty_name, O_RDWR); - if (slave_fd == kInvalidFd) return kInvalidFd; + secondary_fd = internal_open(secondary_pty_name, O_RDWR); + if (secondary_fd == kInvalidFd) + return kInvalidFd; // File descriptor actions posix_spawn_file_actions_t acts; @@ -307,9 +302,9 @@ static fd_t internal_spawn_impl(const char *argv[], const char *envp[], posix_spawn_file_actions_destroy(&acts); }); - res = posix_spawn_file_actions_adddup2(&acts, slave_fd, STDIN_FILENO) || - posix_spawn_file_actions_adddup2(&acts, slave_fd, STDOUT_FILENO) || - posix_spawn_file_actions_addclose(&acts, slave_fd); + res = posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDIN_FILENO) || + posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDOUT_FILENO) || + posix_spawn_file_actions_addclose(&acts, secondary_fd); if (res != 0) return kInvalidFd; // Spawn attributes @@ -334,14 +329,14 @@ static fd_t internal_spawn_impl(const char *argv[], const char *envp[], // Disable echo in the new terminal, disable CR. struct termios termflags; - tcgetattr(master_fd, &termflags); + tcgetattr(primary_fd, &termflags); termflags.c_oflag &= ~ONLCR; termflags.c_lflag &= ~ECHO; - tcsetattr(master_fd, TCSANOW, &termflags); + tcsetattr(primary_fd, TCSANOW, &termflags); - // On success, do not close master_fd on scope exit. - fd_t fd = master_fd; - master_fd = kInvalidFd; + // On success, do not close primary_fd on scope exit. + fd_t fd = primary_fd; + primary_fd = kInvalidFd; return fd; } @@ -398,6 +393,13 @@ bool FileExists(const char *filename) { return S_ISREG(st.st_mode); } +bool DirExists(const char *path) { + struct stat st; + if (stat(path, &st)) + return false; + return S_ISDIR(st.st_mode); +} + tid_t GetTid() { tid_t tid; pthread_threadid_np(nullptr, &tid); @@ -877,9 +879,9 @@ void LogFullErrorReport(const char *buffer) { SignalContext::WriteFlag SignalContext::GetWriteFlag() const { #if defined(__x86_64__) || defined(__i386__) ucontext_t *ucontext = static_cast(context); - return ucontext->uc_mcontext->__es.__err & 2 /*T_PF_WRITE*/ ? 
WRITE : READ; + return ucontext->uc_mcontext->__es.__err & 2 /*T_PF_WRITE*/ ? Write : Read; #else - return UNKNOWN; + return Unknown; #endif } @@ -894,18 +896,14 @@ bool SignalContext::IsTrueFaultingAddress() const { (uptr)ptrauth_strip( \ (void *)arm_thread_state64_get_##r(ucontext->uc_mcontext->__ss), 0) #else - #define AARCH64_GET_REG(r) ucontext->uc_mcontext->__ss.__##r + #define AARCH64_GET_REG(r) (uptr)ucontext->uc_mcontext->__ss.__##r #endif static void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) { ucontext_t *ucontext = (ucontext_t*)context; # if defined(__aarch64__) *pc = AARCH64_GET_REG(pc); -# if defined(__IPHONE_8_0) && __IPHONE_OS_VERSION_MAX_ALLOWED >= __IPHONE_8_0 *bp = AARCH64_GET_REG(fp); -# else - *bp = AARCH64_GET_REG(lr); -# endif *sp = AARCH64_GET_REG(sp); # elif defined(__x86_64__) *pc = ucontext->uc_mcontext->__ss.__rip; @@ -1057,12 +1055,12 @@ void MaybeReexec() { } // Verify that interceptors really work. We'll use dlsym to locate - // "pthread_create", if interceptors are working, it should really point to - // "wrap_pthread_create" within our own dylib. - Dl_info info_pthread_create; - void *dlopen_addr = dlsym(RTLD_DEFAULT, "pthread_create"); - RAW_CHECK(dladdr(dlopen_addr, &info_pthread_create)); - if (internal_strcmp(info.dli_fname, info_pthread_create.dli_fname) != 0) { + // "puts", if interceptors are working, it should really point to + // "wrap_puts" within our own dylib. + Dl_info info_puts; + void *dlopen_addr = dlsym(RTLD_DEFAULT, "puts"); + RAW_CHECK(dladdr(dlopen_addr, &info_puts)); + if (internal_strcmp(info.dli_fname, info_puts.dli_fname) != 0) { Report( "ERROR: Interceptors are not working. This may be because %s is " "loaded too late (e.g. via dlopen). Please launch the executable " @@ -1229,7 +1227,7 @@ uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale, uptr largest_gap_found = 0; uptr max_occupied_addr = 0; - VReport(2, "FindDynamicShadowStart, space_size = %p\n", space_size); + VReport(2, "FindDynamicShadowStart, space_size = %p\n", (void *)space_size); uptr shadow_start = FindAvailableMemoryRange(space_size, alignment, granularity, &largest_gap_found, &max_occupied_addr); @@ -1238,20 +1236,21 @@ uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale, VReport( 2, "Shadow doesn't fit, largest_gap_found = %p, max_occupied_addr = %p\n", - largest_gap_found, max_occupied_addr); + (void *)largest_gap_found, (void *)max_occupied_addr); uptr new_max_vm = RoundDownTo(largest_gap_found << shadow_scale, alignment); if (new_max_vm < max_occupied_addr) { Report("Unable to find a memory range for dynamic shadow.\n"); Report( "space_size = %p, largest_gap_found = %p, max_occupied_addr = %p, " "new_max_vm = %p\n", - space_size, largest_gap_found, max_occupied_addr, new_max_vm); + (void *)space_size, (void *)largest_gap_found, + (void *)max_occupied_addr, (void *)new_max_vm); CHECK(0 && "cannot place shadow"); } RestrictMemoryToMaxAddress(new_max_vm); high_mem_end = new_max_vm - 1; space_size = (high_mem_end >> shadow_scale) + left_padding; - VReport(2, "FindDynamicShadowStart, space_size = %p\n", space_size); + VReport(2, "FindDynamicShadowStart, space_size = %p\n", (void *)space_size); shadow_start = FindAvailableMemoryRange(space_size, alignment, granularity, nullptr, nullptr); if (shadow_start == 0) { @@ -1331,7 +1330,7 @@ void SignalContext::DumpAllRegisters(void *context) { # define DUMPREG64(r) \ Printf("%s = 0x%016llx ", #r, ucontext->uc_mcontext->__ss.__ ## r); # define DUMPREGA64(r) \ - Printf(" %s = 0x%016llx ", #r, 
AARCH64_GET_REG(r)); + Printf(" %s = 0x%016lx ", #r, AARCH64_GET_REG(r)); # define DUMPREG32(r) \ Printf("%s = 0x%08x ", #r, ucontext->uc_mcontext->__ss.__ ## r); # define DUMPREG_(r) Printf(" "); DUMPREG(r); @@ -1401,7 +1400,7 @@ void DumpProcessMap() { char uuid_str[128]; FormatUUID(uuid_str, sizeof(uuid_str), modules[i].uuid()); Printf("0x%zx-0x%zx %s (%s) %s\n", modules[i].base_address(), - modules[i].max_executable_address(), modules[i].full_name(), + modules[i].max_address(), modules[i].full_name(), ModuleArchToString(modules[i].arch()), uuid_str); } Printf("End of module map.\n"); diff --git a/libsanitizer/sanitizer_common/sanitizer_mac.h b/libsanitizer/sanitizer_common/sanitizer_mac.h index 96a5986..0b6af5a 100644 --- a/libsanitizer/sanitizer_common/sanitizer_mac.h +++ b/libsanitizer/sanitizer_common/sanitizer_mac.h @@ -14,26 +14,6 @@ #include "sanitizer_common.h" #include "sanitizer_platform.h" - -/* TARGET_OS_OSX is not present in SDKs before Darwin16 (macOS 10.12) use - TARGET_OS_MAC (we have no support for iOS in any form for these versions, - so there's no ambiguity). */ -#if !defined(TARGET_OS_OSX) && TARGET_OS_MAC -# define TARGET_OS_OSX 1 -#endif - -/* Other TARGET_OS_xxx are not present on earlier versions, define them to - 0 (we have no support for them; they are not valid targets anyway). */ -#ifndef TARGET_OS_IOS -#define TARGET_OS_IOS 0 -#endif -#ifndef TARGET_OS_TV -#define TARGET_OS_TV 0 -#endif -#ifndef TARGET_OS_WATCH -#define TARGET_OS_WATCH 0 -#endif - #if SANITIZER_MAC #include "sanitizer_posix.h" diff --git a/libsanitizer/sanitizer_common/sanitizer_mutex.h b/libsanitizer/sanitizer_common/sanitizer_mutex.h index 5ec6efa..d2188a9 100644 --- a/libsanitizer/sanitizer_common/sanitizer_mutex.h +++ b/libsanitizer/sanitizer_common/sanitizer_mutex.h @@ -20,25 +20,27 @@ namespace __sanitizer { -class MUTEX StaticSpinMutex { +class SANITIZER_MUTEX StaticSpinMutex { public: void Init() { atomic_store(&state_, 0, memory_order_relaxed); } - void Lock() ACQUIRE() { + void Lock() SANITIZER_ACQUIRE() { if (LIKELY(TryLock())) return; LockSlow(); } - bool TryLock() TRY_ACQUIRE(true) { + bool TryLock() SANITIZER_TRY_ACQUIRE(true) { return atomic_exchange(&state_, 1, memory_order_acquire) == 0; } - void Unlock() RELEASE() { atomic_store(&state_, 0, memory_order_release); } + void Unlock() SANITIZER_RELEASE() { + atomic_store(&state_, 0, memory_order_release); + } - void CheckLocked() const CHECK_LOCKED() { + void CheckLocked() const SANITIZER_CHECK_LOCKED() { CHECK_EQ(atomic_load(&state_, memory_order_relaxed), 1); } @@ -48,7 +50,7 @@ class MUTEX StaticSpinMutex { void LockSlow(); }; -class MUTEX SpinMutex : public StaticSpinMutex { +class SANITIZER_MUTEX SpinMutex : public StaticSpinMutex { public: SpinMutex() { Init(); @@ -156,12 +158,12 @@ class CheckedMutex { // Derive from CheckedMutex for the purposes of EBO. // We could make it a field marked with [[no_unique_address]], // but this attribute is not supported by some older compilers. 
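The hunk below adds a TryLock() to Mutex that loops on a compare-and-swap only while the state word is free of both the writer bit and the reader count. A reduced sketch of that pattern with std::atomic; the bit layout here is illustrative, not the runtime's actual state encoding:

// Sketch: try-acquire a writer lock only if neither the writer bit nor any
// reader bits are set; give up immediately instead of spinning.
#include <atomic>
#include <cstdint>

class TinyRwState {
  static constexpr uint64_t kWriterLock = 1ull << 0;
  static constexpr uint64_t kReaderMask = ~0ull << 1;  // remaining bits count readers
  std::atomic<uint64_t> state_{0};

 public:
  bool TryLock() {
    uint64_t state = state_.load(std::memory_order_relaxed);
    for (;;) {
      if (state & (kWriterLock | kReaderMask))
        return false;  // somebody holds it; do not wait
      // The mutex looks free: attempt to set the writer bit atomically.
      if (state_.compare_exchange_weak(state, state | kWriterLock,
                                       std::memory_order_acquire,
                                       std::memory_order_relaxed))
        return true;
      // CAS failure reloaded `state`; re-examine it and retry.
    }
  }
  // This sketch has no reader path, so a plain release store is enough.
  void Unlock() { state_.store(0, std::memory_order_release); }
};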
-class MUTEX Mutex : CheckedMutex { +class SANITIZER_MUTEX Mutex : CheckedMutex { public: explicit constexpr Mutex(MutexType type = MutexUnchecked) : CheckedMutex(type) {} - void Lock() ACQUIRE() { + void Lock() SANITIZER_ACQUIRE() { CheckedMutex::Lock(); u64 reset_mask = ~0ull; u64 state = atomic_load_relaxed(&state_); @@ -206,7 +208,21 @@ class MUTEX Mutex : CheckedMutex { } } - void Unlock() RELEASE() { + bool TryLock() SANITIZER_TRY_ACQUIRE(true) { + u64 state = atomic_load_relaxed(&state_); + for (;;) { + if (UNLIKELY(state & (kWriterLock | kReaderLockMask))) + return false; + // The mutex is not read-/write-locked, try to lock. + if (LIKELY(atomic_compare_exchange_weak( + &state_, &state, state | kWriterLock, memory_order_acquire))) { + CheckedMutex::Lock(); + return true; + } + } + } + + void Unlock() SANITIZER_RELEASE() { CheckedMutex::Unlock(); bool wake_writer; u64 wake_readers; @@ -234,7 +250,7 @@ class MUTEX Mutex : CheckedMutex { readers_.Post(wake_readers); } - void ReadLock() ACQUIRE_SHARED() { + void ReadLock() SANITIZER_ACQUIRE_SHARED() { CheckedMutex::Lock(); u64 reset_mask = ~0ull; u64 state = atomic_load_relaxed(&state_); @@ -271,7 +287,7 @@ class MUTEX Mutex : CheckedMutex { } } - void ReadUnlock() RELEASE_SHARED() { + void ReadUnlock() SANITIZER_RELEASE_SHARED() { CheckedMutex::Unlock(); bool wake; u64 new_state; @@ -297,13 +313,13 @@ class MUTEX Mutex : CheckedMutex { // owns the mutex but a child checks that it is locked. Rather than // maintaining complex state to work around those situations, the check only // checks that the mutex is owned. - void CheckWriteLocked() const CHECK_LOCKED() { + void CheckWriteLocked() const SANITIZER_CHECK_LOCKED() { CHECK(atomic_load(&state_, memory_order_relaxed) & kWriterLock); } - void CheckLocked() const CHECK_LOCKED() { CheckWriteLocked(); } + void CheckLocked() const SANITIZER_CHECK_LOCKED() { CheckWriteLocked(); } - void CheckReadLocked() const CHECK_LOCKED() { + void CheckReadLocked() const SANITIZER_CHECK_LOCKED() { CHECK(atomic_load(&state_, memory_order_relaxed) & kReaderLockMask); } @@ -361,13 +377,13 @@ void FutexWait(atomic_uint32_t *p, u32 cmp); void FutexWake(atomic_uint32_t *p, u32 count); template -class SCOPED_LOCK GenericScopedLock { +class SANITIZER_SCOPED_LOCK GenericScopedLock { public: - explicit GenericScopedLock(MutexType *mu) ACQUIRE(mu) : mu_(mu) { + explicit GenericScopedLock(MutexType *mu) SANITIZER_ACQUIRE(mu) : mu_(mu) { mu_->Lock(); } - ~GenericScopedLock() RELEASE() { mu_->Unlock(); } + ~GenericScopedLock() SANITIZER_RELEASE() { mu_->Unlock(); } private: MutexType *mu_; @@ -377,13 +393,14 @@ class SCOPED_LOCK GenericScopedLock { }; template -class SCOPED_LOCK GenericScopedReadLock { +class SANITIZER_SCOPED_LOCK GenericScopedReadLock { public: - explicit GenericScopedReadLock(MutexType *mu) ACQUIRE(mu) : mu_(mu) { + explicit GenericScopedReadLock(MutexType *mu) SANITIZER_ACQUIRE(mu) + : mu_(mu) { mu_->ReadLock(); } - ~GenericScopedReadLock() RELEASE() { mu_->ReadUnlock(); } + ~GenericScopedReadLock() SANITIZER_RELEASE() { mu_->ReadUnlock(); } private: MutexType *mu_; @@ -393,10 +410,10 @@ class SCOPED_LOCK GenericScopedReadLock { }; template -class SCOPED_LOCK GenericScopedRWLock { +class SANITIZER_SCOPED_LOCK GenericScopedRWLock { public: ALWAYS_INLINE explicit GenericScopedRWLock(MutexType *mu, bool write) - ACQUIRE(mu) + SANITIZER_ACQUIRE(mu) : mu_(mu), write_(write) { if (write_) mu_->Lock(); @@ -404,7 +421,7 @@ class SCOPED_LOCK GenericScopedRWLock { mu_->ReadLock(); } - ALWAYS_INLINE 
~GenericScopedRWLock() RELEASE() { + ALWAYS_INLINE ~GenericScopedRWLock() SANITIZER_RELEASE() { if (write_) mu_->Unlock(); else diff --git a/libsanitizer/sanitizer_common/sanitizer_persistent_allocator.h b/libsanitizer/sanitizer_common/sanitizer_persistent_allocator.h deleted file mode 100644 index e18b003..0000000 --- a/libsanitizer/sanitizer_common/sanitizer_persistent_allocator.h +++ /dev/null @@ -1,110 +0,0 @@ -//===-- sanitizer_persistent_allocator.h ------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A fast memory allocator that does not support free() nor realloc(). -// All allocations are forever. -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_PERSISTENT_ALLOCATOR_H -#define SANITIZER_PERSISTENT_ALLOCATOR_H - -#include "sanitizer_internal_defs.h" -#include "sanitizer_mutex.h" -#include "sanitizer_atomic.h" -#include "sanitizer_common.h" - -namespace __sanitizer { - -template -class PersistentAllocator { - public: - T *alloc(uptr count = 1); - uptr allocated() const { return atomic_load_relaxed(&mapped_size); } - - void TestOnlyUnmap(); - - private: - T *tryAlloc(uptr count); - T *refillAndAlloc(uptr count); - mutable StaticSpinMutex mtx; // Protects alloc of new blocks. - atomic_uintptr_t region_pos; // Region allocator for Node's. - atomic_uintptr_t region_end; - atomic_uintptr_t mapped_size; - - struct BlockInfo { - const BlockInfo *next; - uptr ptr; - uptr size; - }; - const BlockInfo *curr; -}; - -template -inline T *PersistentAllocator::tryAlloc(uptr count) { - // Optimisic lock-free allocation, essentially try to bump the region ptr. - for (;;) { - uptr cmp = atomic_load(®ion_pos, memory_order_acquire); - uptr end = atomic_load(®ion_end, memory_order_acquire); - uptr size = count * sizeof(T); - if (cmp == 0 || cmp + size > end) - return nullptr; - if (atomic_compare_exchange_weak(®ion_pos, &cmp, cmp + size, - memory_order_acquire)) - return reinterpret_cast(cmp); - } -} - -template -inline T *PersistentAllocator::alloc(uptr count) { - // First, try to allocate optimisitically. - T *s = tryAlloc(count); - if (LIKELY(s)) - return s; - return refillAndAlloc(count); -} - -template -inline T *PersistentAllocator::refillAndAlloc(uptr count) { - // If failed, lock, retry and alloc new superblock. 
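The tryAlloc() being removed above is an optimistic lock-free bump allocation: load the current position, check that the request fits, and CAS the cursor forward (the StackStore added later in this patch uses the same idea with an atomic fetch-add on a frame counter). A minimal sketch of the pattern with std::atomic, using a caller-supplied fixed buffer instead of mmap-ed blocks:

// Sketch: lock-free bump allocation over a fixed region via compare-exchange.
#include <atomic>
#include <cstddef>
#include <cstdint>

class BumpRegion {
  std::atomic<uintptr_t> pos_;
  uintptr_t end_;

 public:
  BumpRegion(void *begin, size_t size)
      : pos_(reinterpret_cast<uintptr_t>(begin)),
        end_(reinterpret_cast<uintptr_t>(begin) + size) {}

  void *Alloc(size_t size) {
    uintptr_t cur = pos_.load(std::memory_order_relaxed);
    for (;;) {
      if (cur + size > end_)
        return nullptr;  // out of space; a real allocator would map a new block
      // Try to claim [cur, cur + size) by bumping the shared cursor.
      if (pos_.compare_exchange_weak(cur, cur + size,
                                     std::memory_order_relaxed))
        return reinterpret_cast<void *>(cur);
      // On failure `cur` now holds the competing thread's value; retry.
    }
  }
};

int main() {
  static unsigned char buffer[1 << 16];
  BumpRegion region(buffer, sizeof(buffer));
  void *p = region.Alloc(64);
  return p ? 0 : 1;
}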
- SpinMutexLock l(&mtx); - for (;;) { - T *s = tryAlloc(count); - if (s) - return s; - atomic_store(®ion_pos, 0, memory_order_relaxed); - uptr size = count * sizeof(T) + sizeof(BlockInfo); - uptr allocsz = RoundUpTo(Max(size, 64u * 1024u), GetPageSizeCached()); - uptr mem = (uptr)MmapOrDie(allocsz, "stack depot"); - BlockInfo *new_block = (BlockInfo *)(mem + allocsz) - 1; - new_block->next = curr; - new_block->ptr = mem; - new_block->size = allocsz; - curr = new_block; - - atomic_fetch_add(&mapped_size, allocsz, memory_order_relaxed); - - allocsz -= sizeof(BlockInfo); - atomic_store(®ion_end, mem + allocsz, memory_order_release); - atomic_store(®ion_pos, mem, memory_order_release); - } -} - -template -void PersistentAllocator::TestOnlyUnmap() { - while (curr) { - uptr mem = curr->ptr; - uptr allocsz = curr->size; - curr = curr->next; - UnmapOrDie((void *)mem, allocsz); - } - internal_memset(this, 0, sizeof(*this)); -} - -} // namespace __sanitizer - -#endif // SANITIZER_PERSISTENT_ALLOCATOR_H diff --git a/libsanitizer/sanitizer_common/sanitizer_platform.h b/libsanitizer/sanitizer_common/sanitizer_platform.h index 3153de3..8fe0d83 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform.h +++ b/libsanitizer/sanitizer_common/sanitizer_platform.h @@ -22,103 +22,110 @@ // function declarations into a .S file which doesn't compile. // https://crbug.com/1162741 #if __has_include() && !defined(__ANDROID__) -#include +# include #endif #if defined(__linux__) -# define SANITIZER_LINUX 1 +# define SANITIZER_LINUX 1 #else -# define SANITIZER_LINUX 0 +# define SANITIZER_LINUX 0 #endif #if defined(__GLIBC__) -# define SANITIZER_GLIBC 1 +# define SANITIZER_GLIBC 1 #else -# define SANITIZER_GLIBC 0 +# define SANITIZER_GLIBC 0 #endif #if defined(__FreeBSD__) -# define SANITIZER_FREEBSD 1 +# define SANITIZER_FREEBSD 1 #else -# define SANITIZER_FREEBSD 0 +# define SANITIZER_FREEBSD 0 #endif #if defined(__NetBSD__) -# define SANITIZER_NETBSD 1 +# define SANITIZER_NETBSD 1 #else -# define SANITIZER_NETBSD 0 +# define SANITIZER_NETBSD 0 #endif #if defined(__sun__) && defined(__svr4__) -# define SANITIZER_SOLARIS 1 +# define SANITIZER_SOLARIS 1 #else -# define SANITIZER_SOLARIS 0 +# define SANITIZER_SOLARIS 0 #endif #if defined(__APPLE__) -# define SANITIZER_MAC 1 -# include -# if TARGET_OS_OSX -# define SANITIZER_OSX 1 -# else -# define SANITIZER_OSX 0 -# endif -# if TARGET_OS_IPHONE -# define SANITIZER_IOS 1 -# else -# define SANITIZER_IOS 0 -# endif -# if TARGET_OS_SIMULATOR -# define SANITIZER_IOSSIM 1 -# else -# define SANITIZER_IOSSIM 0 -# endif +# define SANITIZER_MAC 1 +# include +# if TARGET_OS_OSX +# define SANITIZER_OSX 1 +# else +# define SANITIZER_OSX 0 +# endif +# if TARGET_OS_IPHONE +# define SANITIZER_IOS 1 +# else +# define SANITIZER_IOS 0 +# endif +# if TARGET_OS_SIMULATOR +# define SANITIZER_IOSSIM 1 +# else +# define SANITIZER_IOSSIM 0 +# endif #else -# define SANITIZER_MAC 0 -# define SANITIZER_IOS 0 -# define SANITIZER_IOSSIM 0 -# define SANITIZER_OSX 0 +# define SANITIZER_MAC 0 +# define SANITIZER_IOS 0 +# define SANITIZER_IOSSIM 0 +# define SANITIZER_OSX 0 #endif #if defined(__APPLE__) && TARGET_OS_IPHONE && TARGET_OS_WATCH -# define SANITIZER_WATCHOS 1 +# define SANITIZER_WATCHOS 1 #else -# define SANITIZER_WATCHOS 0 +# define SANITIZER_WATCHOS 0 #endif #if defined(__APPLE__) && TARGET_OS_IPHONE && TARGET_OS_TV -# define SANITIZER_TVOS 1 +# define SANITIZER_TVOS 1 #else -# define SANITIZER_TVOS 0 +# define SANITIZER_TVOS 0 #endif #if defined(_WIN32) -# define SANITIZER_WINDOWS 1 +# 
define SANITIZER_WINDOWS 1 #else -# define SANITIZER_WINDOWS 0 +# define SANITIZER_WINDOWS 0 #endif #if defined(_WIN64) -# define SANITIZER_WINDOWS64 1 +# define SANITIZER_WINDOWS64 1 #else -# define SANITIZER_WINDOWS64 0 +# define SANITIZER_WINDOWS64 0 #endif #if defined(__ANDROID__) -# define SANITIZER_ANDROID 1 +# define SANITIZER_ANDROID 1 #else -# define SANITIZER_ANDROID 0 +# define SANITIZER_ANDROID 0 #endif #if defined(__Fuchsia__) -# define SANITIZER_FUCHSIA 1 +# define SANITIZER_FUCHSIA 1 +#else +# define SANITIZER_FUCHSIA 0 +#endif + +// Assume linux that is not glibc or android is musl libc. +#if SANITIZER_LINUX && !SANITIZER_GLIBC && !SANITIZER_ANDROID +# define SANITIZER_MUSL 1 #else -# define SANITIZER_FUCHSIA 0 +# define SANITIZER_MUSL 0 #endif -#define SANITIZER_POSIX \ +#define SANITIZER_POSIX \ (SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_MAC || \ - SANITIZER_NETBSD || SANITIZER_SOLARIS) + SANITIZER_NETBSD || SANITIZER_SOLARIS) #if __LP64__ || defined(_WIN64) # define SANITIZER_WORDSIZE 64 @@ -127,58 +134,64 @@ #endif #if SANITIZER_WORDSIZE == 64 -# define FIRST_32_SECOND_64(a, b) (b) +# define FIRST_32_SECOND_64(a, b) (b) #else -# define FIRST_32_SECOND_64(a, b) (a) +# define FIRST_32_SECOND_64(a, b) (a) #endif #if defined(__x86_64__) && !defined(_LP64) -# define SANITIZER_X32 1 +# define SANITIZER_X32 1 #else -# define SANITIZER_X32 0 +# define SANITIZER_X32 0 +#endif + +#if defined(__x86_64__) || defined(_M_X64) +# define SANITIZER_X64 1 +#else +# define SANITIZER_X64 0 #endif #if defined(__i386__) || defined(_M_IX86) -# define SANITIZER_I386 1 +# define SANITIZER_I386 1 #else -# define SANITIZER_I386 0 +# define SANITIZER_I386 0 #endif #if defined(__mips__) -# define SANITIZER_MIPS 1 -# if defined(__mips64) +# define SANITIZER_MIPS 1 +# if defined(__mips64) +# define SANITIZER_MIPS32 0 +# define SANITIZER_MIPS64 1 +# else +# define SANITIZER_MIPS32 1 +# define SANITIZER_MIPS64 0 +# endif +#else +# define SANITIZER_MIPS 0 # define SANITIZER_MIPS32 0 -# define SANITIZER_MIPS64 1 -# else -# define SANITIZER_MIPS32 1 # define SANITIZER_MIPS64 0 -# endif -#else -# define SANITIZER_MIPS 0 -# define SANITIZER_MIPS32 0 -# define SANITIZER_MIPS64 0 #endif #if defined(__s390__) -# define SANITIZER_S390 1 -# if defined(__s390x__) +# define SANITIZER_S390 1 +# if defined(__s390x__) +# define SANITIZER_S390_31 0 +# define SANITIZER_S390_64 1 +# else +# define SANITIZER_S390_31 1 +# define SANITIZER_S390_64 0 +# endif +#else +# define SANITIZER_S390 0 # define SANITIZER_S390_31 0 -# define SANITIZER_S390_64 1 -# else -# define SANITIZER_S390_31 1 # define SANITIZER_S390_64 0 -# endif -#else -# define SANITIZER_S390 0 -# define SANITIZER_S390_31 0 -# define SANITIZER_S390_64 0 #endif #if defined(__powerpc__) -# define SANITIZER_PPC 1 -# if defined(__powerpc64__) -# define SANITIZER_PPC32 0 -# define SANITIZER_PPC64 1 +# define SANITIZER_PPC 1 +# if defined(__powerpc64__) +# define SANITIZER_PPC32 0 +# define SANITIZER_PPC64 1 // 64-bit PPC has two ABIs (v1 and v2). The old powerpc64 target is // big-endian, and uses v1 ABI (known for its function descriptors), // while the new powerpc64le target is little-endian and uses v2. @@ -186,43 +199,49 @@ // (eg. big-endian v2), but you won't find such combinations in the wild // (it'd require bootstrapping a whole system, which would be quite painful // - there's no target triple for that). LLVM doesn't support them either. 
-# if _CALL_ELF == 2 -# define SANITIZER_PPC64V1 0 -# define SANITIZER_PPC64V2 1 +# if _CALL_ELF == 2 +# define SANITIZER_PPC64V1 0 +# define SANITIZER_PPC64V2 1 +# else +# define SANITIZER_PPC64V1 1 +# define SANITIZER_PPC64V2 0 +# endif # else -# define SANITIZER_PPC64V1 1 -# define SANITIZER_PPC64V2 0 +# define SANITIZER_PPC32 1 +# define SANITIZER_PPC64 0 +# define SANITIZER_PPC64V1 0 +# define SANITIZER_PPC64V2 0 # endif -# else -# define SANITIZER_PPC32 1 +#else +# define SANITIZER_PPC 0 +# define SANITIZER_PPC32 0 # define SANITIZER_PPC64 0 # define SANITIZER_PPC64V1 0 # define SANITIZER_PPC64V2 0 -# endif +#endif + +#if defined(__arm__) || defined(_M_ARM) +# define SANITIZER_ARM 1 #else -# define SANITIZER_PPC 0 -# define SANITIZER_PPC32 0 -# define SANITIZER_PPC64 0 -# define SANITIZER_PPC64V1 0 -# define SANITIZER_PPC64V2 0 +# define SANITIZER_ARM 0 #endif -#if defined(__arm__) -# define SANITIZER_ARM 1 +#if defined(__aarch64__) || defined(_M_ARM64) +# define SANITIZER_ARM64 1 #else -# define SANITIZER_ARM 0 +# define SANITIZER_ARM64 0 #endif #if SANITIZER_SOLARIS && SANITIZER_WORDSIZE == 32 -# define SANITIZER_SOLARIS32 1 +# define SANITIZER_SOLARIS32 1 #else -# define SANITIZER_SOLARIS32 0 +# define SANITIZER_SOLARIS32 0 #endif #if defined(__riscv) && (__riscv_xlen == 64) -#define SANITIZER_RISCV64 1 +# define SANITIZER_RISCV64 1 #else -#define SANITIZER_RISCV64 0 +# define SANITIZER_RISCV64 0 #endif // By default we allow to use SizeClassAllocator64 on 64-bit platform. @@ -231,62 +250,52 @@ // For such platforms build this code with -DSANITIZER_CAN_USE_ALLOCATOR64=0 or // change the definition of SANITIZER_CAN_USE_ALLOCATOR64 here. #ifndef SANITIZER_CAN_USE_ALLOCATOR64 -# if (SANITIZER_ANDROID && defined(__aarch64__)) || SANITIZER_FUCHSIA -# define SANITIZER_CAN_USE_ALLOCATOR64 1 -# elif defined(__mips64) || defined(__aarch64__) -# define SANITIZER_CAN_USE_ALLOCATOR64 0 -# else -# define SANITIZER_CAN_USE_ALLOCATOR64 (SANITIZER_WORDSIZE == 64) -# endif +# if (SANITIZER_ANDROID && defined(__aarch64__)) || SANITIZER_FUCHSIA +# define SANITIZER_CAN_USE_ALLOCATOR64 1 +# elif defined(__mips64) || defined(__aarch64__) +# define SANITIZER_CAN_USE_ALLOCATOR64 0 +# else +# define SANITIZER_CAN_USE_ALLOCATOR64 (SANITIZER_WORDSIZE == 64) +# endif #endif // The range of addresses which can be returned my mmap. // FIXME: this value should be different on different platforms. Larger values // will still work but will consume more memory for TwoLevelByteMap. 
#if defined(__mips__) -#if SANITIZER_GO && defined(__mips64) -#define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) -#else -# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 40) -#endif +# if SANITIZER_GO && defined(__mips64) +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) +# else +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 40) +# endif #elif SANITIZER_RISCV64 -#define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 38) +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 38) #elif defined(__aarch64__) -# if SANITIZER_MAC -# if SANITIZER_OSX || SANITIZER_IOSSIM -# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) +# if SANITIZER_MAC +# if SANITIZER_OSX || SANITIZER_IOSSIM +# define SANITIZER_MMAP_RANGE_SIZE \ + FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) +# else +// Darwin iOS/ARM64 has a 36-bit VMA, 64GiB VM +# define SANITIZER_MMAP_RANGE_SIZE \ + FIRST_32_SECOND_64(1ULL << 32, 1ULL << 36) +# endif # else - // Darwin iOS/ARM64 has a 36-bit VMA, 64GiB VM -# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 36) +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 48) # endif -# else -# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 48) -# endif #elif defined(__sparc__) -#define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 52) +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 52) #else -# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) +# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47) #endif // Whether the addresses are sign-extended from the VMA range to the word. // The SPARC64 Linux port implements this to split the VMA space into two // non-contiguous halves with a huge hole in the middle. #if defined(__sparc__) && SANITIZER_WORDSIZE == 64 -#define SANITIZER_SIGN_EXTENDED_ADDRESSES 1 +# define SANITIZER_SIGN_EXTENDED_ADDRESSES 1 #else -#define SANITIZER_SIGN_EXTENDED_ADDRESSES 0 -#endif - -// The AArch64 and RISC-V linux ports use the canonical syscall set as -// mandated by the upstream linux community for all new ports. Other ports -// may still use legacy syscalls. -#ifndef SANITIZER_USES_CANONICAL_LINUX_SYSCALLS -# if (defined(__aarch64__) || defined(__riscv) || defined(__hexagon__)) && \ - SANITIZER_LINUX -# define SANITIZER_USES_CANONICAL_LINUX_SYSCALLS 1 -# else -# define SANITIZER_USES_CANONICAL_LINUX_SYSCALLS 0 -# endif +# define SANITIZER_SIGN_EXTENDED_ADDRESSES 0 #endif // udi16 syscalls can only be used when the following conditions are @@ -297,15 +306,15 @@ // Since we don't want to include libc headers here, we check the // target only. 
#if defined(__arm__) || SANITIZER_X32 || defined(__sparc__) -#define SANITIZER_USES_UID16_SYSCALLS 1 +# define SANITIZER_USES_UID16_SYSCALLS 1 #else -#define SANITIZER_USES_UID16_SYSCALLS 0 +# define SANITIZER_USES_UID16_SYSCALLS 0 #endif #if defined(__mips__) -# define SANITIZER_POINTER_FORMAT_LENGTH FIRST_32_SECOND_64(8, 10) +# define SANITIZER_POINTER_FORMAT_LENGTH FIRST_32_SECOND_64(8, 10) #else -# define SANITIZER_POINTER_FORMAT_LENGTH FIRST_32_SECOND_64(8, 12) +# define SANITIZER_POINTER_FORMAT_LENGTH FIRST_32_SECOND_64(8, 12) #endif /// \macro MSC_PREREQ @@ -314,15 +323,15 @@ /// * 1800: Microsoft Visual Studio 2013 / 12.0 /// * 1900: Microsoft Visual Studio 2015 / 14.0 #ifdef _MSC_VER -# define MSC_PREREQ(version) (_MSC_VER >= (version)) +# define MSC_PREREQ(version) (_MSC_VER >= (version)) #else -# define MSC_PREREQ(version) 0 +# define MSC_PREREQ(version) 0 #endif -#if SANITIZER_MAC && !(defined(__arm64__) && SANITIZER_IOS) -# define SANITIZER_NON_UNIQUE_TYPEINFO 0 +#if SANITIZER_MAC && defined(__x86_64__) +# define SANITIZER_NON_UNIQUE_TYPEINFO 0 #else -# define SANITIZER_NON_UNIQUE_TYPEINFO 1 +# define SANITIZER_NON_UNIQUE_TYPEINFO 1 #endif // On linux, some architectures had an ABI transition from 64-bit long double @@ -330,11 +339,11 @@ // involving long doubles come in two versions, and we need to pass the // correct one to dlvsym when intercepting them. #if SANITIZER_LINUX && (SANITIZER_S390 || SANITIZER_PPC32 || SANITIZER_PPC64V1) -#define SANITIZER_NLDBL_VERSION "GLIBC_2.4" +# define SANITIZER_NLDBL_VERSION "GLIBC_2.4" #endif #if SANITIZER_GO == 0 -# define SANITIZER_GO 0 +# define SANITIZER_GO 0 #endif // On PowerPC and ARM Thumb, calling pthread_exit() causes LSan to detect leaks. @@ -342,40 +351,39 @@ // dlopen mallocs "libgcc_s.so" string which confuses LSan, it fails to realize // that this allocation happens in dynamic linker and should be ignored. #if SANITIZER_PPC || defined(__thumb__) -# define SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT 1 +# define SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT 1 #else -# define SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT 0 +# define SANITIZER_SUPPRESS_LEAK_ON_PTHREAD_EXIT 0 #endif -#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_NETBSD || \ - SANITIZER_SOLARIS -# define SANITIZER_MADVISE_DONTNEED MADV_FREE +#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_NETBSD || SANITIZER_SOLARIS +# define SANITIZER_MADVISE_DONTNEED MADV_FREE #else -# define SANITIZER_MADVISE_DONTNEED MADV_DONTNEED +# define SANITIZER_MADVISE_DONTNEED MADV_DONTNEED #endif // Older gcc have issues aligning to a constexpr, and require an integer. // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56859 among others. #if defined(__powerpc__) || defined(__powerpc64__) -# define SANITIZER_CACHE_LINE_SIZE 128 +# define SANITIZER_CACHE_LINE_SIZE 128 #else -# define SANITIZER_CACHE_LINE_SIZE 64 +# define SANITIZER_CACHE_LINE_SIZE 64 #endif // Enable offline markup symbolizer for Fuchsia. #if SANITIZER_FUCHSIA # define SANITIZER_SYMBOLIZER_MARKUP 1 #else -#define SANITIZER_SYMBOLIZER_MARKUP 0 +# define SANITIZER_SYMBOLIZER_MARKUP 0 #endif // Enable ability to support sanitizer initialization that is // compatible with the sanitizer library being loaded via // `dlopen()`. 
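SANITIZER_MADVISE_DONTNEED above picks MADV_FREE on the BSDs, Mac and Solaris, and MADV_DONTNEED on Linux; both tell the kernel the pages' contents may be discarded without unmapping the range. A small sketch of releasing a mapping's pages that way (error handling trimmed; the advice selection simply mirrors the macro above):

// Sketch: hand a region's physical pages back to the kernel without unmapping.
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main() {
  const size_t kSize = 1 << 20;  // 1 MiB
  void *p = mmap(nullptr, kSize, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED) { perror("mmap"); return 1; }
  memset(p, 0xab, kSize);  // fault the pages in

#if defined(__linux__)
  int advice = MADV_DONTNEED;  // Linux: contents are dropped immediately
#else
  int advice = MADV_FREE;      // BSD/Mac: pages may be reclaimed lazily
#endif
  if (madvise(p, kSize, advice) != 0) { perror("madvise"); return 1; }

  // The mapping stays valid; the next touch sees zero-filled pages on Linux
  // (MADV_DONTNEED) or possibly the old data until reclaim (MADV_FREE).
  printf("first byte after advice: 0x%02x\n", ((unsigned char *)p)[0]);
  munmap(p, kSize);
  return 0;
}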
#if SANITIZER_MAC -#define SANITIZER_SUPPORTS_INIT_FOR_DLOPEN 1 +# define SANITIZER_SUPPORTS_INIT_FOR_DLOPEN 1 #else -#define SANITIZER_SUPPORTS_INIT_FOR_DLOPEN 0 +# define SANITIZER_SUPPORTS_INIT_FOR_DLOPEN 0 #endif // SANITIZER_SUPPORTS_THREADLOCAL @@ -392,4 +400,15 @@ # endif #endif -#endif // SANITIZER_PLATFORM_H +#if defined(__thumb__) && defined(__linux__) +// Workaround for +// https://lab.llvm.org/buildbot/#/builders/clang-thumbv7-full-2stage +// or +// https://lab.llvm.org/staging/#/builders/clang-thumbv7-full-2stage +// It fails *rss_limit_mb_test* without meaningful errors. +# define SANITIZER_START_BACKGROUND_THREAD_IN_ASAN_INTERNAL 1 +#else +# define SANITIZER_START_BACKGROUND_THREAD_IN_ASAN_INTERNAL 0 +#endif + +#endif // SANITIZER_PLATFORM_H diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_interceptors.h b/libsanitizer/sanitizer_common/sanitizer_platform_interceptors.h index 14610f2..3cbbead 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_interceptors.h +++ b/libsanitizer/sanitizer_common/sanitizer_platform_interceptors.h @@ -235,6 +235,7 @@ #define SANITIZER_INTERCEPT_TIME SI_POSIX #define SANITIZER_INTERCEPT_GLOB (SI_GLIBC || SI_SOLARIS) #define SANITIZER_INTERCEPT_GLOB64 SI_GLIBC +#define SANITIZER_INTERCEPT___B64_TO SI_LINUX_NOT_ANDROID #define SANITIZER_INTERCEPT_POSIX_SPAWN SI_POSIX #define SANITIZER_INTERCEPT_WAIT SI_POSIX #define SANITIZER_INTERCEPT_INET SI_POSIX @@ -465,6 +466,7 @@ #define SANITIZER_INTERCEPT_STAT \ (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \ SI_STAT_LINUX) +#define SANITIZER_INTERCEPT_STAT64 SI_STAT_LINUX #define SANITIZER_INTERCEPT_LSTAT (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX) #define SANITIZER_INTERCEPT___XSTAT \ ((!SANITIZER_INTERCEPT_STAT && SI_POSIX) || SI_STAT_LINUX) diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.cpp b/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.cpp index 6453580..0d25fa8 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.cpp @@ -130,7 +130,7 @@ unsigned struct_sigevent_sz = sizeof(struct sigevent); unsigned struct_sched_param_sz = sizeof(struct sched_param); unsigned struct_statfs_sz = sizeof(struct statfs); unsigned struct_sockaddr_sz = sizeof(struct sockaddr); -unsigned ucontext_t_sz = sizeof(ucontext_t); +unsigned ucontext_t_sz(void *ctx) { return sizeof(ucontext_t); } unsigned struct_rlimit_sz = sizeof(struct rlimit); unsigned struct_timespec_sz = sizeof(struct timespec); unsigned struct_utimbuf_sz = sizeof(struct utimbuf); diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.h b/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.h index 649e64f..9859c52 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.h +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_freebsd.h @@ -57,7 +57,7 @@ extern unsigned struct_sched_param_sz; extern unsigned struct_statfs64_sz; extern unsigned struct_statfs_sz; extern unsigned struct_sockaddr_sz; -extern unsigned ucontext_t_sz; +unsigned ucontext_t_sz(void *ctx); extern unsigned struct_rlimit_sz; extern unsigned struct_utimbuf_sz; extern unsigned struct_timespec_sz; diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp b/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp index 2b1a2f7..9d57757 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp +++ 
b/libsanitizer/sanitizer_common/sanitizer_platform_limits_linux.cpp @@ -26,10 +26,7 @@ // With old kernels (and even new kernels on powerpc) asm/stat.h uses types that // are not defined anywhere in userspace headers. Fake them. This seems to work -// fine with newer headers, too. Beware that with , struct stat -// takes the form of struct stat64 on 32-bit platforms if _FILE_OFFSET_BITS=64. -// Also, for some platforms (e.g. mips) there are additional members in the -// struct stat:s. +// fine with newer headers, too. #include # if defined(__x86_64__) || defined(__mips__) || defined(__hexagon__) # include diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.cpp b/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.cpp index 531e07f..648e502 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.cpp @@ -554,7 +554,7 @@ unsigned struct_tms_sz = sizeof(struct tms); unsigned struct_sigevent_sz = sizeof(struct sigevent); unsigned struct_sched_param_sz = sizeof(struct sched_param); unsigned struct_sockaddr_sz = sizeof(struct sockaddr); -unsigned ucontext_t_sz = sizeof(ucontext_t); +unsigned ucontext_t_sz(void *ctx) { return sizeof(ucontext_t); } unsigned struct_rlimit_sz = sizeof(struct rlimit); unsigned struct_timespec_sz = sizeof(struct timespec); unsigned struct_sembuf_sz = sizeof(struct sembuf); diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.h b/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.h index 9407803..dc6eb59 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.h +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_netbsd.h @@ -45,7 +45,7 @@ extern unsigned struct_stack_t_sz; extern unsigned struct_sched_param_sz; extern unsigned struct_statfs_sz; extern unsigned struct_sockaddr_sz; -extern unsigned ucontext_t_sz; +unsigned ucontext_t_sz(void *ctx); extern unsigned struct_rlimit_sz; extern unsigned struct_utimbuf_sz; diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cpp b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cpp index a1c4528..e5cecaa 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cpp @@ -170,8 +170,9 @@ typedef struct user_fpregs elf_fpregset_t; #endif // Include these after system headers to avoid name clashes and ambiguities. -#include "sanitizer_internal_defs.h" -#include "sanitizer_platform_limits_posix.h" +# include "sanitizer_common.h" +# include "sanitizer_internal_defs.h" +# include "sanitizer_platform_limits_posix.h" namespace __sanitizer { unsigned struct_utsname_sz = sizeof(struct utsname); @@ -214,10 +215,24 @@ namespace __sanitizer { #if !SANITIZER_ANDROID unsigned struct_statfs_sz = sizeof(struct statfs); unsigned struct_sockaddr_sz = sizeof(struct sockaddr); - unsigned ucontext_t_sz = sizeof(ucontext_t); -#endif // !SANITIZER_ANDROID -#if SANITIZER_LINUX + unsigned ucontext_t_sz(void *ctx) { +# if SANITIZER_GLIBC && SANITIZER_X64 + // See kernel arch/x86/kernel/fpu/signal.c for details. + const auto *fpregs = static_cast(ctx)->uc_mcontext.fpregs; + // The member names differ across header versions, but the actual layout + // is always the same. So avoid using members, just use arithmetic. 
+ const uint32_t *after_xmm = + reinterpret_cast(fpregs + 1) - 24; + if (after_xmm[12] == FP_XSTATE_MAGIC1) + return reinterpret_cast(fpregs) + after_xmm[13] - + static_cast(ctx); +# endif + return sizeof(ucontext_t); + } +# endif // !SANITIZER_ANDROID + +# if SANITIZER_LINUX unsigned struct_epoll_event_sz = sizeof(struct epoll_event); unsigned struct_sysinfo_sz = sizeof(struct sysinfo); unsigned __user_cap_header_struct_sz = @@ -575,6 +590,14 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); unsigned IOCTL_BLKROGET = BLKROGET; unsigned IOCTL_BLKROSET = BLKROSET; unsigned IOCTL_BLKRRPART = BLKRRPART; + unsigned IOCTL_BLKFRASET = BLKFRASET; + unsigned IOCTL_BLKFRAGET = BLKFRAGET; + unsigned IOCTL_BLKSECTSET = BLKSECTSET; + unsigned IOCTL_BLKSECTGET = BLKSECTGET; + unsigned IOCTL_BLKSSZGET = BLKSSZGET; + unsigned IOCTL_BLKBSZGET = BLKBSZGET; + unsigned IOCTL_BLKBSZSET = BLKBSZSET; + unsigned IOCTL_BLKGETSIZE64 = BLKGETSIZE64; unsigned IOCTL_CDROMAUDIOBUFSIZ = CDROMAUDIOBUFSIZ; unsigned IOCTL_CDROMEJECT = CDROMEJECT; unsigned IOCTL_CDROMEJECT_SW = CDROMEJECT_SW; diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h index da53b5a..62a9903 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h @@ -57,12 +57,12 @@ extern unsigned struct_regmatch_sz; extern unsigned struct_fstab_sz; extern unsigned struct_statfs_sz; extern unsigned struct_sockaddr_sz; -extern unsigned ucontext_t_sz; -#endif // !SANITIZER_ANDROID +unsigned ucontext_t_sz(void *uctx); +# endif // !SANITIZER_ANDROID -#if SANITIZER_LINUX +# if SANITIZER_LINUX -#if defined(__x86_64__) +# if defined(__x86_64__) const unsigned struct_kernel_stat_sz = 144; const unsigned struct_kernel_stat64_sz = 0; #elif defined(__i386__) @@ -83,7 +83,7 @@ const unsigned struct_kernel_stat64_sz = 104; #elif defined(__mips__) const unsigned struct_kernel_stat_sz = SANITIZER_ANDROID ? 
FIRST_32_SECOND_64(104, 128) - : FIRST_32_SECOND_64(144, 216); + : FIRST_32_SECOND_64(160, 216); const unsigned struct_kernel_stat64_sz = 104; #elif defined(__s390__) && !defined(__s390x__) const unsigned struct_kernel_stat_sz = 64; @@ -370,7 +370,8 @@ struct __sanitizer_group { char **gr_mem; }; -# if (defined(__x86_64__) && !defined(_LP64)) || defined(__hexagon__) +# if (SANITIZER_LINUX && !SANITIZER_GLIBC && !SANITIZER_ANDROID) || \ + (defined(__x86_64__) && !defined(_LP64)) || defined(__hexagon__) typedef long long __sanitizer_time_t; #else typedef long __sanitizer_time_t; @@ -478,7 +479,8 @@ struct __sanitizer_dirent { unsigned short d_reclen; // more fields that we don't care about }; -# elif SANITIZER_ANDROID || defined(__x86_64__) || defined(__hexagon__) +# elif (SANITIZER_LINUX && !SANITIZER_GLIBC) || defined(__x86_64__) || \ + defined(__hexagon__) struct __sanitizer_dirent { unsigned long long d_ino; unsigned long long d_off; @@ -1108,6 +1110,14 @@ extern unsigned IOCTL_BLKRASET; extern unsigned IOCTL_BLKROGET; extern unsigned IOCTL_BLKROSET; extern unsigned IOCTL_BLKRRPART; +extern unsigned IOCTL_BLKFRASET; +extern unsigned IOCTL_BLKFRAGET; +extern unsigned IOCTL_BLKSECTSET; +extern unsigned IOCTL_BLKSECTGET; +extern unsigned IOCTL_BLKSSZGET; +extern unsigned IOCTL_BLKBSZGET; +extern unsigned IOCTL_BLKBSZSET; +extern unsigned IOCTL_BLKGETSIZE64; extern unsigned IOCTL_CDROMAUDIOBUFSIZ; extern unsigned IOCTL_CDROMEJECT; extern unsigned IOCTL_CDROMEJECT_SW; diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.cpp b/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.cpp index a113cb0..dad7bde 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.cpp @@ -89,7 +89,7 @@ namespace __sanitizer { unsigned struct_sched_param_sz = sizeof(struct sched_param); unsigned struct_statfs_sz = sizeof(struct statfs); unsigned struct_sockaddr_sz = sizeof(struct sockaddr); - unsigned ucontext_t_sz = sizeof(ucontext_t); + unsigned ucontext_t_sz(void *ctx) { return sizeof(ucontext_t); } unsigned struct_timespec_sz = sizeof(struct timespec); #if SANITIZER_SOLARIS32 unsigned struct_statvfs64_sz = sizeof(struct statvfs64); diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.h b/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.h index cbab577..84a8126 100644 --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.h +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_solaris.h @@ -43,7 +43,7 @@ extern unsigned struct_sched_param_sz; extern unsigned struct_statfs64_sz; extern unsigned struct_statfs_sz; extern unsigned struct_sockaddr_sz; -extern unsigned ucontext_t_sz; +unsigned ucontext_t_sz(void *ctx); extern unsigned struct_timespec_sz; extern unsigned struct_rlimit_sz; diff --git a/libsanitizer/sanitizer_common/sanitizer_posix.cpp b/libsanitizer/sanitizer_common/sanitizer_posix.cpp index f8457a6..3b330a3 100644 --- a/libsanitizer/sanitizer_common/sanitizer_posix.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_posix.cpp @@ -95,6 +95,7 @@ void *MmapAlignedOrDieOnFatalError(uptr size, uptr alignment, UnmapOrDie((void*)map_res, res - map_res); } uptr end = res + size; + end = RoundUpTo(end, GetPageSizeCached()); if (end != map_end) UnmapOrDie((void*)end, map_end - end); return (void*)res; diff --git a/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp 
b/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp index eed02ce..b6d8c72 100644 --- a/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -290,7 +290,7 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size) { return result; } -void PlatformPrepareForSandboxing(__sanitizer_sandbox_arguments *args) { +void PlatformPrepareForSandboxing(void *args) { // Some kinds of sandboxes may forbid filesystem access, so we won't be able // to read the file mappings from /proc/self/maps. Luckily, neither the // process will be able to load additional libraries, so it's fine to use the diff --git a/libsanitizer/sanitizer_common/sanitizer_printf.cpp b/libsanitizer/sanitizer_common/sanitizer_printf.cpp index 79aee8b..3a9e366 100644 --- a/libsanitizer/sanitizer_common/sanitizer_printf.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_printf.cpp @@ -191,12 +191,12 @@ int VSNPrintf(char *buff, int buff_length, break; } case 'p': { - RAW_CHECK(!have_flags, kPrintfFormatsHelp, format); + RAW_CHECK_VA(!have_flags, kPrintfFormatsHelp, format); result += AppendPointer(&buff, buff_end, va_arg(args, uptr)); break; } case 's': { - RAW_CHECK(!have_length, kPrintfFormatsHelp, format); + RAW_CHECK_VA(!have_length, kPrintfFormatsHelp, format); // Only left-justified width is supported. CHECK(!have_width || left_justified); result += AppendString(&buff, buff_end, left_justified ? -width : width, @@ -204,17 +204,17 @@ int VSNPrintf(char *buff, int buff_length, break; } case 'c': { - RAW_CHECK(!have_flags, kPrintfFormatsHelp, format); + RAW_CHECK_VA(!have_flags, kPrintfFormatsHelp, format); result += AppendChar(&buff, buff_end, va_arg(args, int)); break; } case '%' : { - RAW_CHECK(!have_flags, kPrintfFormatsHelp, format); + RAW_CHECK_VA(!have_flags, kPrintfFormatsHelp, format); result += AppendChar(&buff, buff_end, '%'); break; } default: { - RAW_CHECK(false, kPrintfFormatsHelp, format); + RAW_CHECK_VA(false, kPrintfFormatsHelp, format); } } } diff --git a/libsanitizer/sanitizer_common/sanitizer_procmaps_mac.cpp b/libsanitizer/sanitizer_common/sanitizer_procmaps_mac.cpp index 1f53e3e..62b2e5e 100644 --- a/libsanitizer/sanitizer_common/sanitizer_procmaps_mac.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_procmaps_mac.cpp @@ -143,16 +143,16 @@ void MemoryMappingLayout::LoadFromCache() { // early in the process, when dyld is one of the only images loaded, // so it will be hit after only a few iterations. 
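The one-line MmapAlignedOrDieOnFatalError fix above rounds the kept range's end up to a page before unmapping the tail, since munmap works on whole pages. The over-allocate-and-trim scheme that change belongs to looks roughly like this sketch (not the runtime's implementation; it assumes size and alignment are page multiples and alignment is a power of two):

// Sketch: obtain a `size`-byte mapping aligned to `alignment` by mapping
// size + alignment bytes and unmapping the unaligned head and tail.
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void *MmapAligned(size_t size, size_t alignment) {
  size_t page = (size_t)sysconf(_SC_PAGESIZE);
  size_t map_size = size + alignment;
  uintptr_t map_beg = (uintptr_t)mmap(nullptr, map_size, PROT_READ | PROT_WRITE,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if ((void *)map_beg == MAP_FAILED) return nullptr;
  uintptr_t map_end = map_beg + map_size;

  uintptr_t res = (map_beg + alignment - 1) & ~(alignment - 1);  // align up
  if (res != map_beg)
    munmap((void *)map_beg, res - map_beg);  // drop the unaligned head

  // Keep [res, res + size); round the end up to a page so the tail length
  // passed to munmap stays page-aligned (the detail the fix above restores).
  uintptr_t end = res + size;
  end = (end + page - 1) & ~(page - 1);
  if (end < map_end)
    munmap((void *)end, map_end - end);  // drop the tail
  return (void *)res;
}

int main() {
  void *p = MmapAligned(1 << 16, 1 << 18);  // 64 KiB block, 256 KiB aligned
  printf("%p\n", p);
  return p ? 0 : 1;
}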
static mach_header *get_dyld_image_header() { - unsigned depth = 1; - vm_size_t size = 0; vm_address_t address = 0; - kern_return_t err = KERN_SUCCESS; - mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; while (true) { + vm_size_t size = 0; + unsigned depth = 1; struct vm_region_submap_info_64 info; - err = vm_region_recurse_64(mach_task_self(), &address, &size, &depth, - (vm_region_info_t)&info, &count); + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; + kern_return_t err = + vm_region_recurse_64(mach_task_self(), &address, &size, &depth, + (vm_region_info_t)&info, &count); if (err != KERN_SUCCESS) return nullptr; if (size >= sizeof(mach_header) && info.protection & kProtectionRead) { diff --git a/libsanitizer/sanitizer_common/sanitizer_quarantine.h b/libsanitizer/sanitizer_common/sanitizer_quarantine.h index 1a074d2..4aa6054 100644 --- a/libsanitizer/sanitizer_common/sanitizer_quarantine.h +++ b/libsanitizer/sanitizer_common/sanitizer_quarantine.h @@ -149,8 +149,8 @@ class Quarantine { Cache cache_; char pad2_[kCacheLineSize]; - void NOINLINE Recycle(uptr min_size, Callback cb) REQUIRES(recycle_mutex_) - RELEASE(recycle_mutex_) { + void NOINLINE Recycle(uptr min_size, Callback cb) + SANITIZER_REQUIRES(recycle_mutex_) SANITIZER_RELEASE(recycle_mutex_) { Cache tmp; { SpinMutexLock l(&cache_mutex_); diff --git a/libsanitizer/sanitizer_common/sanitizer_ring_buffer.h b/libsanitizer/sanitizer_common/sanitizer_ring_buffer.h index 2a46e93..f22e40c 100644 --- a/libsanitizer/sanitizer_common/sanitizer_ring_buffer.h +++ b/libsanitizer/sanitizer_common/sanitizer_ring_buffer.h @@ -86,10 +86,13 @@ class CompactRingBuffer { // Lower bytes store the address of the next buffer element. static constexpr int kPageSizeBits = 12; static constexpr int kSizeShift = 56; + static constexpr int kSizeBits = 64 - kSizeShift; static constexpr uptr kNextMask = (1ULL << kSizeShift) - 1; uptr GetStorageSize() const { return (long_ >> kSizeShift) << kPageSizeBits; } + static uptr SignExtend(uptr x) { return ((sptr)x) << kSizeBits >> kSizeBits; } + void Init(void *storage, uptr size) { CHECK_EQ(sizeof(CompactRingBuffer), sizeof(void *)); CHECK(IsPowerOfTwo(size)); @@ -97,12 +100,14 @@ class CompactRingBuffer { CHECK_LE(size, 128 << kPageSizeBits); CHECK_EQ(size % 4096, 0); CHECK_EQ(size % sizeof(T), 0); - CHECK_EQ((uptr)storage % (size * 2), 0); - long_ = (uptr)storage | ((size >> kPageSizeBits) << kSizeShift); + uptr st = (uptr)storage; + CHECK_EQ(st % (size * 2), 0); + CHECK_EQ(st, SignExtend(st & kNextMask)); + long_ = (st & kNextMask) | ((size >> kPageSizeBits) << kSizeShift); } void SetNext(const T *next) { - long_ = (long_ & ~kNextMask) | (uptr)next; + long_ = (long_ & ~kNextMask) | ((uptr)next & kNextMask); } public: @@ -119,7 +124,7 @@ class CompactRingBuffer { SetNext((const T *)storage + Idx); } - T *Next() const { return (T *)(long_ & kNextMask); } + T *Next() const { return (T *)(SignExtend(long_ & kNextMask)); } void *StartOfStorage() const { return (void *)((uptr)Next() & ~(GetStorageSize() - 1)); diff --git a/libsanitizer/sanitizer_common/sanitizer_stack_store.cpp b/libsanitizer/sanitizer_common/sanitizer_stack_store.cpp new file mode 100644 index 0000000..1484709 --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_stack_store.cpp @@ -0,0 +1,379 @@ +//===-- sanitizer_stack_store.cpp -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sanitizer_stack_store.h" + +#include "sanitizer_atomic.h" +#include "sanitizer_common.h" +#include "sanitizer_internal_defs.h" +#include "sanitizer_leb128.h" +#include "sanitizer_lzw.h" +#include "sanitizer_placement_new.h" +#include "sanitizer_stacktrace.h" + +namespace __sanitizer { + +namespace { +struct StackTraceHeader { + static constexpr u32 kStackSizeBits = 8; + + u8 size; + u8 tag; + explicit StackTraceHeader(const StackTrace &trace) + : size(Min(trace.size, (1u << 8) - 1)), tag(trace.tag) { + CHECK_EQ(trace.tag, static_cast(tag)); + } + explicit StackTraceHeader(uptr h) + : size(h & ((1 << kStackSizeBits) - 1)), tag(h >> kStackSizeBits) {} + + uptr ToUptr() const { + return static_cast(size) | (static_cast(tag) << kStackSizeBits); + } +}; +} // namespace + +StackStore::Id StackStore::Store(const StackTrace &trace, uptr *pack) { + if (!trace.size && !trace.tag) + return 0; + StackTraceHeader h(trace); + uptr idx = 0; + *pack = 0; + uptr *stack_trace = Alloc(h.size + 1, &idx, pack); + *stack_trace = h.ToUptr(); + internal_memcpy(stack_trace + 1, trace.trace, h.size * sizeof(uptr)); + *pack += blocks_[GetBlockIdx(idx)].Stored(h.size + 1); + return OffsetToId(idx); +} + +StackTrace StackStore::Load(Id id) { + if (!id) + return {}; + uptr idx = IdToOffset(id); + uptr block_idx = GetBlockIdx(idx); + CHECK_LT(block_idx, ARRAY_SIZE(blocks_)); + const uptr *stack_trace = blocks_[block_idx].GetOrUnpack(this); + if (!stack_trace) + return {}; + stack_trace += GetInBlockIdx(idx); + StackTraceHeader h(*stack_trace); + return StackTrace(stack_trace + 1, h.size, h.tag); +} + +uptr StackStore::Allocated() const { + return atomic_load_relaxed(&allocated_) + sizeof(*this); +} + +uptr *StackStore::Alloc(uptr count, uptr *idx, uptr *pack) { + for (;;) { + // Optimisic lock-free allocation, essentially try to bump the + // total_frames_. + uptr start = atomic_fetch_add(&total_frames_, count, memory_order_relaxed); + uptr block_idx = GetBlockIdx(start); + uptr last_idx = GetBlockIdx(start + count - 1); + if (LIKELY(block_idx == last_idx)) { + // Fits into the a single block. + CHECK_LT(block_idx, ARRAY_SIZE(blocks_)); + *idx = start; + return blocks_[block_idx].GetOrCreate(this) + GetInBlockIdx(start); + } + + // Retry. We can't use range allocated in two different blocks. + CHECK_LE(count, kBlockSizeFrames); + uptr in_first = kBlockSizeFrames - GetInBlockIdx(start); + // Mark tail/head of these blocks as "stored".to avoid waiting before we can + // Pack(). 
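StackTraceHeader above packs the trace size and tag into a single machine word, and CompactRingBuffer's new SignExtend() in the earlier hunk restores a canonical pointer from its low 56 bits. Both are the same bit-packing trick; a compact sketch with illustrative field widths:

// Sketch: pack a pointer's low 56 bits plus an 8-bit size field into one
// 64-bit word, and recover the pointer by sign-extending the retained bits.
#include <cassert>
#include <cstdint>

constexpr int kSizeShift = 56;                        // top byte holds the size
constexpr uint64_t kPtrMask = (1ull << kSizeShift) - 1;

static uint64_t Pack(const void *ptr, uint64_t size_field) {
  // size_field must fit in 8 bits for this layout.
  return ((uint64_t)ptr & kPtrMask) | (size_field << kSizeShift);
}

static void *UnpackPtr(uint64_t word) {
  // Shift the 56 kept bits to the top and arithmetic-shift back down so that
  // addresses with high bits set come back canonical.
  return (void *)(((int64_t)(word << (64 - kSizeShift))) >> (64 - kSizeShift));
}

static uint64_t UnpackSize(uint64_t word) { return word >> kSizeShift; }

int main() {
  int dummy;
  uint64_t w = Pack(&dummy, 42);
  assert(UnpackPtr(w) == &dummy);  // holds while &dummy fits in 56 canonical bits
  assert(UnpackSize(w) == 42);
  return 0;
}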
+ *pack += blocks_[block_idx].Stored(in_first); + *pack += blocks_[last_idx].Stored(count - in_first); + } +} + +void *StackStore::Map(uptr size, const char *mem_type) { + atomic_fetch_add(&allocated_, size, memory_order_relaxed); + return MmapNoReserveOrDie(size, mem_type); +} + +void StackStore::Unmap(void *addr, uptr size) { + atomic_fetch_sub(&allocated_, size, memory_order_relaxed); + UnmapOrDie(addr, size); +} + +uptr StackStore::Pack(Compression type) { + uptr res = 0; + for (BlockInfo &b : blocks_) res += b.Pack(type, this); + return res; +} + +void StackStore::LockAll() { + for (BlockInfo &b : blocks_) b.Lock(); +} + +void StackStore::UnlockAll() { + for (BlockInfo &b : blocks_) b.Unlock(); +} + +void StackStore::TestOnlyUnmap() { + for (BlockInfo &b : blocks_) b.TestOnlyUnmap(this); + internal_memset(this, 0, sizeof(*this)); +} + +uptr *StackStore::BlockInfo::Get() const { + // Idiomatic double-checked locking uses memory_order_acquire here. But + // relaxed is fine for us, justification is similar to + // TwoLevelMap::GetOrCreate. + return reinterpret_cast(atomic_load_relaxed(&data_)); +} + +uptr *StackStore::BlockInfo::Create(StackStore *store) { + SpinMutexLock l(&mtx_); + uptr *ptr = Get(); + if (!ptr) { + ptr = reinterpret_cast(store->Map(kBlockSizeBytes, "StackStore")); + atomic_store(&data_, reinterpret_cast(ptr), memory_order_release); + } + return ptr; +} + +uptr *StackStore::BlockInfo::GetOrCreate(StackStore *store) { + uptr *ptr = Get(); + if (LIKELY(ptr)) + return ptr; + return Create(store); +} + +class SLeb128Encoder { + public: + SLeb128Encoder(u8 *begin, u8 *end) : begin(begin), end(end) {} + + bool operator==(const SLeb128Encoder &other) const { + return begin == other.begin; + } + + bool operator!=(const SLeb128Encoder &other) const { + return begin != other.begin; + } + + SLeb128Encoder &operator=(uptr v) { + sptr diff = v - previous; + begin = EncodeSLEB128(diff, begin, end); + previous = v; + return *this; + } + SLeb128Encoder &operator*() { return *this; } + SLeb128Encoder &operator++() { return *this; } + + u8 *base() const { return begin; } + + private: + u8 *begin; + u8 *end; + uptr previous = 0; +}; + +class SLeb128Decoder { + public: + SLeb128Decoder(const u8 *begin, const u8 *end) : begin(begin), end(end) {} + + bool operator==(const SLeb128Decoder &other) const { + return begin == other.begin; + } + + bool operator!=(const SLeb128Decoder &other) const { + return begin != other.begin; + } + + uptr operator*() { + sptr diff; + begin = DecodeSLEB128(begin, end, &diff); + previous += diff; + return previous; + } + SLeb128Decoder &operator++() { return *this; } + + SLeb128Decoder operator++(int) { return *this; } + + private: + const u8 *begin; + const u8 *end; + uptr previous = 0; +}; + +static u8 *CompressDelta(const uptr *from, const uptr *from_end, u8 *to, + u8 *to_end) { + SLeb128Encoder encoder(to, to_end); + for (; from != from_end; ++from, ++encoder) *encoder = *from; + return encoder.base(); +} + +static uptr *UncompressDelta(const u8 *from, const u8 *from_end, uptr *to, + uptr *to_end) { + SLeb128Decoder decoder(from, from_end); + SLeb128Decoder end(from_end, from_end); + for (; decoder != end; ++to, ++decoder) *to = *decoder; + CHECK_EQ(to, to_end); + return to; +} + +static u8 *CompressLzw(const uptr *from, const uptr *from_end, u8 *to, + u8 *to_end) { + SLeb128Encoder encoder(to, to_end); + encoder = LzwEncode(from, from_end, encoder); + return encoder.base(); +} + +static uptr *UncompressLzw(const u8 *from, const u8 *from_end, uptr *to, + 
uptr *to_end) { + SLeb128Decoder decoder(from, from_end); + SLeb128Decoder end(from_end, from_end); + to = LzwDecode(decoder, end, to); + CHECK_EQ(to, to_end); + return to; +} + +#if defined(_MSC_VER) && !defined(__clang__) +# pragma warning(push) +// Disable 'nonstandard extension used: zero-sized array in struct/union'. +# pragma warning(disable : 4200) +#endif +namespace { +struct PackedHeader { + uptr size; + StackStore::Compression type; + u8 data[]; +}; +} // namespace +#if defined(_MSC_VER) && !defined(__clang__) +# pragma warning(pop) +#endif + +uptr *StackStore::BlockInfo::GetOrUnpack(StackStore *store) { + SpinMutexLock l(&mtx_); + switch (state) { + case State::Storing: + state = State::Unpacked; + FALLTHROUGH; + case State::Unpacked: + return Get(); + case State::Packed: + break; + } + + u8 *ptr = reinterpret_cast(Get()); + CHECK_NE(nullptr, ptr); + const PackedHeader *header = reinterpret_cast(ptr); + CHECK_LE(header->size, kBlockSizeBytes); + CHECK_GE(header->size, sizeof(PackedHeader)); + + uptr packed_size_aligned = RoundUpTo(header->size, GetPageSizeCached()); + + uptr *unpacked = + reinterpret_cast(store->Map(kBlockSizeBytes, "StackStoreUnpack")); + + uptr *unpacked_end; + switch (header->type) { + case Compression::Delta: + unpacked_end = UncompressDelta(header->data, ptr + header->size, unpacked, + unpacked + kBlockSizeFrames); + break; + case Compression::LZW: + unpacked_end = UncompressLzw(header->data, ptr + header->size, unpacked, + unpacked + kBlockSizeFrames); + break; + default: + UNREACHABLE("Unexpected type"); + break; + } + + CHECK_EQ(kBlockSizeFrames, unpacked_end - unpacked); + + MprotectReadOnly(reinterpret_cast(unpacked), kBlockSizeBytes); + atomic_store(&data_, reinterpret_cast(unpacked), memory_order_release); + store->Unmap(ptr, packed_size_aligned); + + state = State::Unpacked; + return Get(); +} + +uptr StackStore::BlockInfo::Pack(Compression type, StackStore *store) { + if (type == Compression::None) + return 0; + + SpinMutexLock l(&mtx_); + switch (state) { + case State::Unpacked: + case State::Packed: + return 0; + case State::Storing: + break; + } + + uptr *ptr = Get(); + if (!ptr || !Stored(0)) + return 0; + + u8 *packed = + reinterpret_cast(store->Map(kBlockSizeBytes, "StackStorePack")); + PackedHeader *header = reinterpret_cast(packed); + u8 *alloc_end = packed + kBlockSizeBytes; + + u8 *packed_end = nullptr; + switch (type) { + case Compression::Delta: + packed_end = + CompressDelta(ptr, ptr + kBlockSizeFrames, header->data, alloc_end); + break; + case Compression::LZW: + packed_end = + CompressLzw(ptr, ptr + kBlockSizeFrames, header->data, alloc_end); + break; + default: + UNREACHABLE("Unexpected type"); + break; + } + + header->type = type; + header->size = packed_end - packed; + + VPrintf(1, "Packed block of %zu KiB to %zu KiB\n", kBlockSizeBytes >> 10, + header->size >> 10); + + if (kBlockSizeBytes - header->size < kBlockSizeBytes / 8) { + VPrintf(1, "Undo and keep block unpacked\n"); + MprotectReadOnly(reinterpret_cast(ptr), kBlockSizeBytes); + store->Unmap(packed, kBlockSizeBytes); + state = State::Unpacked; + return 0; + } + + uptr packed_size_aligned = RoundUpTo(header->size, GetPageSizeCached()); + store->Unmap(packed + packed_size_aligned, + kBlockSizeBytes - packed_size_aligned); + MprotectReadOnly(reinterpret_cast(packed), packed_size_aligned); + + atomic_store(&data_, reinterpret_cast(packed), memory_order_release); + store->Unmap(ptr, kBlockSizeBytes); + + state = State::Packed; + return kBlockSizeBytes - packed_size_aligned; +} 
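
A note on the indexing scheme used by Store()/Load()/Alloc() above: a 32-bit Id is just a one-based frame offset, and the block/in-block position is pure arithmetic on that offset using the kBlockSizeFrames and kBlockCount constants declared in the sanitizer_stack_store.h hunk added later in this patch. The one-based shift is what lets Id 0 double as the "empty trace" marker in Store() and Load(). The following standalone sketch only illustrates that round-trip; the function names mirror the ones in the patch, but the snippet is an illustration with copied constants, not part of the merge.

#include <cassert>
#include <cstdint>

// Constants as declared in sanitizer_stack_store.h in this merge.
constexpr uint64_t kBlockSizeFrames = 0x100000;  // 2^20 frames per block
constexpr uint64_t kBlockCount = 0x1000;         // 2^12 blocks -> 2^32 ids total

// Id 0 is reserved for the empty trace, so offsets are shifted by one.
constexpr uint64_t IdToOffset(uint32_t id) { return id - 1; }
constexpr uint32_t OffsetToId(uint64_t offset) {
  return static_cast<uint32_t>(offset + 1);
}

constexpr uint64_t GetBlockIdx(uint64_t frame_idx) {
  return frame_idx / kBlockSizeFrames;
}
constexpr uint64_t GetInBlockIdx(uint64_t frame_idx) {
  return frame_idx % kBlockSizeFrames;
}

int main() {
  // A trace stored at frame offset 5'000'000 lands in block 4,
  // at in-block index 5'000'000 - 4 * 2^20.
  uint64_t offset = 5'000'000;
  uint32_t id = OffsetToId(offset);
  assert(GetBlockIdx(IdToOffset(id)) == 4);
  assert(GetInBlockIdx(IdToOffset(id)) == offset - 4 * kBlockSizeFrames);
  return 0;
}
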
+ +void StackStore::BlockInfo::TestOnlyUnmap(StackStore *store) { + if (uptr *ptr = Get()) + store->Unmap(ptr, kBlockSizeBytes); +} + +bool StackStore::BlockInfo::Stored(uptr n) { + return n + atomic_fetch_add(&stored_, n, memory_order_release) == + kBlockSizeFrames; +} + +bool StackStore::BlockInfo::IsPacked() const { + SpinMutexLock l(&mtx_); + return state == State::Packed; +} + +} // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_stack_store.h b/libsanitizer/sanitizer_common/sanitizer_stack_store.h new file mode 100644 index 0000000..4f1a8ca --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_stack_store.h @@ -0,0 +1,121 @@ +//===-- sanitizer_stack_store.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SANITIZER_STACK_STORE_H +#define SANITIZER_STACK_STORE_H + +#include "sanitizer_atomic.h" +#include "sanitizer_common.h" +#include "sanitizer_internal_defs.h" +#include "sanitizer_mutex.h" +#include "sanitizer_stacktrace.h" + +namespace __sanitizer { + +class StackStore { + static constexpr uptr kBlockSizeFrames = 0x100000; + static constexpr uptr kBlockCount = 0x1000; + static constexpr uptr kBlockSizeBytes = kBlockSizeFrames * sizeof(uptr); + + public: + enum class Compression : u8 { + None = 0, + Delta, + LZW, + }; + + constexpr StackStore() = default; + + using Id = u32; // Enough for 2^32 * sizeof(uptr) bytes of traces. + static_assert(u64(kBlockCount) * kBlockSizeFrames == 1ull << (sizeof(Id) * 8), + ""); + + Id Store(const StackTrace &trace, + uptr *pack /* number of blocks completed by this call */); + StackTrace Load(Id id); + uptr Allocated() const; + + // Packs all blocks which don't expect any more writes. A block is going to be + // packed once. As soon trace from that block was requested, it will unpack + // and stay unpacked after that. + // Returns the number of released bytes. + uptr Pack(Compression type); + + void LockAll(); + void UnlockAll(); + + void TestOnlyUnmap(); + + private: + friend class StackStoreTest; + static constexpr uptr GetBlockIdx(uptr frame_idx) { + return frame_idx / kBlockSizeFrames; + } + + static constexpr uptr GetInBlockIdx(uptr frame_idx) { + return frame_idx % kBlockSizeFrames; + } + + static constexpr uptr IdToOffset(Id id) { + CHECK_NE(id, 0); + return id - 1; // Avoid zero as id. + } + + static constexpr uptr OffsetToId(Id id) { + // This makes UINT32_MAX to 0 and it will be retrived as and empty stack. + // But this is not a problem as we will not be able to store anything after + // that anyway. + return id + 1; // Avoid zero as id. + } + + uptr *Alloc(uptr count, uptr *idx, uptr *pack); + + void *Map(uptr size, const char *mem_type); + void Unmap(void *addr, uptr size); + + // Total number of allocated frames. + atomic_uintptr_t total_frames_ = {}; + + // Tracks total allocated memory in bytes. + atomic_uintptr_t allocated_ = {}; + + // Each block will hold pointer to exactly kBlockSizeFrames. + class BlockInfo { + atomic_uintptr_t data_; + // Counter to track store progress to know when we can Pack() the block. + atomic_uint32_t stored_; + // Protects alloc of new blocks. 
+ mutable StaticSpinMutex mtx_; + + enum class State : u8 { + Storing = 0, + Packed, + Unpacked, + }; + State state SANITIZER_GUARDED_BY(mtx_); + + uptr *Create(StackStore *store); + + public: + uptr *Get() const; + uptr *GetOrCreate(StackStore *store); + uptr *GetOrUnpack(StackStore *store); + uptr Pack(Compression type, StackStore *store); + void TestOnlyUnmap(StackStore *store); + bool Stored(uptr n); + bool IsPacked() const; + void Lock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { mtx_.Lock(); } + void Unlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { mtx_.Unlock(); } + }; + + BlockInfo blocks_[kBlockCount] = {}; +}; + +} // namespace __sanitizer + +#endif // SANITIZER_STACK_STORE_H diff --git a/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp b/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp index 0285545..a746d46 100644 --- a/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_stackdepot.cpp @@ -12,22 +12,22 @@ #include "sanitizer_stackdepot.h" +#include "sanitizer_atomic.h" #include "sanitizer_common.h" #include "sanitizer_hash.h" -#include "sanitizer_persistent_allocator.h" +#include "sanitizer_mutex.h" +#include "sanitizer_stack_store.h" #include "sanitizer_stackdepotbase.h" namespace __sanitizer { -static PersistentAllocator traceAllocator; - struct StackDepotNode { using hash_type = u64; hash_type stack_hash; u32 link; + StackStore::Id store_id; static const u32 kTabSizeLog = SANITIZER_ANDROID ? 16 : 20; - static const u32 kStackSizeBits = 16; typedef StackTrace args_type; bool eq(hash_type hash, const args_type &args) const { @@ -50,14 +50,12 @@ struct StackDepotNode { typedef StackDepotHandle handle_type; }; +static StackStore stackStore; + // FIXME(dvyukov): this single reserved bit is used in TSan. typedef StackDepotBase StackDepot; static StackDepot theDepot; -// Keep rarely accessed stack traces out of frequently access nodes to improve -// caching efficiency. -static TwoLevelMap - tracePtrs; // Keep mutable data out of frequently access nodes to improve caching // efficiency. static TwoLevelMap= 1 ? 
MonotonicNanoTime() : 0; + uptr diff = stackStore.Pack(static_cast( + Abs(common_flags()->compress_stack_depot))); + if (!diff) + return; + if (Verbosity() >= 1) { + u64 finish = MonotonicNanoTime(); + uptr total_before = theDepot.GetStats().allocated + diff; + VPrintf(1, "%s: StackDepot released %zu KiB out of %zu KiB in %llu ms\n", + SanitizerToolName, diff >> 10, total_before >> 10, + (finish - start) / 1000000); + } +} + +namespace { + +class CompressThread { + public: + constexpr CompressThread() = default; + void NewWorkNotify(); + void Stop(); + void LockAndStop() SANITIZER_NO_THREAD_SAFETY_ANALYSIS; + void Unlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS; + + private: + enum class State { + NotStarted = 0, + Started, + Failed, + Stopped, + }; + + void Run(); + + bool WaitForWork() { + semaphore_.Wait(); + return atomic_load(&run_, memory_order_acquire); + } + + Semaphore semaphore_ = {}; + StaticSpinMutex mutex_ = {}; + State state_ SANITIZER_GUARDED_BY(mutex_) = State::NotStarted; + void *thread_ SANITIZER_GUARDED_BY(mutex_) = nullptr; + atomic_uint8_t run_ = {}; +}; + +static CompressThread compress_thread; + +void CompressThread::NewWorkNotify() { + int compress = common_flags()->compress_stack_depot; + if (!compress) + return; + if (compress > 0 /* for testing or debugging */) { + SpinMutexLock l(&mutex_); + if (state_ == State::NotStarted) { + atomic_store(&run_, 1, memory_order_release); + CHECK_EQ(nullptr, thread_); + thread_ = internal_start_thread( + [](void *arg) -> void * { + reinterpret_cast(arg)->Run(); + return nullptr; + }, + this); + state_ = thread_ ? State::Started : State::Failed; + } + if (state_ == State::Started) { + semaphore_.Post(); + return; + } + } + CompressStackStore(); +} + +void CompressThread::Run() { + VPrintf(1, "%s: StackDepot compression thread started\n", SanitizerToolName); + while (WaitForWork()) CompressStackStore(); + VPrintf(1, "%s: StackDepot compression thread stopped\n", SanitizerToolName); +} + +void CompressThread::Stop() { + void *t = nullptr; + { + SpinMutexLock l(&mutex_); + if (state_ != State::Started) + return; + state_ = State::Stopped; + CHECK_NE(nullptr, thread_); + t = thread_; + thread_ = nullptr; + } + atomic_store(&run_, 0, memory_order_release); + semaphore_.Post(); + internal_join_thread(t); } +void CompressThread::LockAndStop() { + mutex_.Lock(); + if (state_ != State::Started) + return; + CHECK_NE(nullptr, thread_); + + atomic_store(&run_, 0, memory_order_release); + semaphore_.Post(); + internal_join_thread(thread_); + // Allow to restart after Unlock() if needed. 
+ state_ = State::NotStarted; + thread_ = nullptr; +} + +void CompressThread::Unlock() { mutex_.Unlock(); } + +} // namespace + void StackDepotNode::store(u32 id, const args_type &args, hash_type hash) { stack_hash = hash; - uptr *stack_trace = traceAllocator.alloc(args.size + 1); - CHECK_LT(args.size, 1 << kStackSizeBits); - *stack_trace = args.size + (args.tag << kStackSizeBits); - internal_memcpy(stack_trace + 1, args.trace, args.size * sizeof(uptr)); - tracePtrs[id] = stack_trace; + uptr pack = 0; + store_id = stackStore.Store(args, &pack); + if (LIKELY(!pack)) + return; + compress_thread.NewWorkNotify(); } StackDepotNode::args_type StackDepotNode::load(u32 id) const { - const uptr *stack_trace = tracePtrs[id]; - if (!stack_trace) + if (!store_id) return {}; - uptr size = *stack_trace & ((1 << kStackSizeBits) - 1); - uptr tag = *stack_trace >> kStackSizeBits; - return args_type(stack_trace + 1, size, tag); + return stackStore.Load(store_id); } StackDepotStats StackDepotGetStats() { return theDepot.GetStats(); } @@ -109,9 +217,13 @@ StackTrace StackDepotGet(u32 id) { void StackDepotLockAll() { theDepot.LockAll(); + compress_thread.LockAndStop(); + stackStore.LockAll(); } void StackDepotUnlockAll() { + stackStore.UnlockAll(); + compress_thread.Unlock(); theDepot.UnlockAll(); } @@ -121,14 +233,15 @@ void StackDepotPrintAll() { #endif } +void StackDepotStopBackgroundThread() { compress_thread.Stop(); } + StackDepotHandle StackDepotNode::get_handle(u32 id) { return StackDepotHandle(&theDepot.nodes[id], id); } void StackDepotTestOnlyUnmap() { theDepot.TestOnlyUnmap(); - tracePtrs.TestOnlyUnmap(); - traceAllocator.TestOnlyUnmap(); + stackStore.TestOnlyUnmap(); } } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_stackdepot.h b/libsanitizer/sanitizer_common/sanitizer_stackdepot.h index 56d655d..cca6fd5 100644 --- a/libsanitizer/sanitizer_common/sanitizer_stackdepot.h +++ b/libsanitizer/sanitizer_common/sanitizer_stackdepot.h @@ -42,6 +42,7 @@ StackTrace StackDepotGet(u32 id); void StackDepotLockAll(); void StackDepotUnlockAll(); void StackDepotPrintAll(); +void StackDepotStopBackgroundThread(); void StackDepotTestOnlyUnmap(); diff --git a/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp b/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp index 5a12422..3013a0c 100644 --- a/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_stacktrace.cpp @@ -20,11 +20,10 @@ namespace __sanitizer { uptr StackTrace::GetNextInstructionPc(uptr pc) { -#if defined(__sparc__) || defined(__mips__) - return pc + 8; -#elif defined(__powerpc__) || defined(__arm__) || defined(__aarch64__) || \ - defined(__hexagon__) +#if defined(__aarch64__) return STRIP_PAC_PC((void *)pc) + 4; +#elif defined(__sparc__) || defined(__mips__) + return pc + 8; #elif SANITIZER_RISCV64 // Current check order is 4 -> 2 -> 6 -> 8 u8 InsnByte = *(u8 *)(pc); @@ -47,8 +46,10 @@ uptr StackTrace::GetNextInstructionPc(uptr pc) { } // bail-out if could not figure out the instruction size return 0; -#else +#elif SANITIZER_S390 || SANITIZER_I386 || SANITIZER_X32 || SANITIZER_X64 return pc + 1; +#else + return pc + 4; #endif } @@ -86,8 +87,8 @@ static inline uhwptr *GetCanonicFrame(uptr bp, // Nope, this does not look right either. This means the frame after next does // not have a valid frame pointer, but we can still extract the caller PC. // Unfortunately, there is no way to decide between GCC and LLVM frame - // layouts. Assume GCC. 
- return bp_prev - 1; + // layouts. Assume LLVM. + return bp_prev; #else return (uhwptr*)bp; #endif @@ -110,21 +111,14 @@ void BufferedStackTrace::UnwindFast(uptr pc, uptr bp, uptr stack_top, IsAligned((uptr)frame, sizeof(*frame)) && size < max_depth) { #ifdef __powerpc__ - // PowerPC ABIs specify that the return address is saved on the - // *caller's* stack frame. Thus we must dereference the back chain - // to find the caller frame before extracting it. + // PowerPC ABIs specify that the return address is saved at offset + // 16 of the *caller's* stack frame. Thus we must dereference the + // back chain to find the caller frame before extracting it. uhwptr *caller_frame = (uhwptr*)frame[0]; if (!IsValidFrame((uptr)caller_frame, stack_top, bottom) || !IsAligned((uptr)caller_frame, sizeof(uhwptr))) break; - // For most ABIs the offset where the return address is saved is two - // register sizes. The exception is the SVR4 ABI, which uses an - // offset of only one register size. -#ifdef _CALL_SYSV - uhwptr pc1 = caller_frame[1]; -#else uhwptr pc1 = caller_frame[2]; -#endif #elif defined(__s390__) uhwptr pc1 = frame[14]; #elif defined(__riscv) diff --git a/libsanitizer/sanitizer_common/sanitizer_stacktrace.h b/libsanitizer/sanitizer_common/sanitizer_stacktrace.h index 11c6154..9a5f8fb 100644 --- a/libsanitizer/sanitizer_common/sanitizer_stacktrace.h +++ b/libsanitizer/sanitizer_common/sanitizer_stacktrace.h @@ -20,7 +20,7 @@ namespace __sanitizer { struct BufferedStackTrace; -static const u32 kStackTraceMax = 256; +static const u32 kStackTraceMax = 255; #if SANITIZER_LINUX && defined(__mips__) # define SANITIZER_CAN_FAST_UNWIND 0 @@ -88,9 +88,6 @@ uptr StackTrace::GetPreviousInstructionPc(uptr pc) { // so we return (pc-2) in that case in order to be safe. // For A32 mode we return (pc-4) because all instructions are 32 bit long. return (pc - 3) & (~1); -#elif defined(__powerpc__) || defined(__powerpc64__) || defined(__aarch64__) - // PCs are always 4 byte aligned. 
- return pc - 4; #elif defined(__sparc__) || defined(__mips__) return pc - 8; #elif SANITIZER_RISCV64 @@ -101,8 +98,10 @@ uptr StackTrace::GetPreviousInstructionPc(uptr pc) { // It seems difficult to figure out the exact instruction length - // pc - 2 seems like a safe option for the purposes of stack tracing return pc - 2; -#else +#elif SANITIZER_S390 || SANITIZER_I386 || SANITIZER_X32 || SANITIZER_X64 return pc - 1; +#else + return pc - 4; #endif } diff --git a/libsanitizer/sanitizer_common/sanitizer_stacktrace_libcdep.cpp b/libsanitizer/sanitizer_common/sanitizer_stacktrace_libcdep.cpp index 2d1c03f..47983ee 100644 --- a/libsanitizer/sanitizer_common/sanitizer_stacktrace_libcdep.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_stacktrace_libcdep.cpp @@ -166,8 +166,8 @@ void BufferedStackTrace::Unwind(u32 max_depth, uptr pc, uptr bp, void *context, UnwindFast(pc, bp, stack_top, stack_bottom, max_depth); } -static int GetModuleAndOffsetForPc(uptr pc, char *module_name, - uptr module_name_len, uptr *pc_offset) { +int GetModuleAndOffsetForPc(uptr pc, char *module_name, uptr module_name_len, + uptr *pc_offset) { const char *found_module_name = nullptr; bool ok = Symbolizer::GetOrInit()->GetModuleNameAndOffsetForPC( pc, &found_module_name, pc_offset); @@ -216,10 +216,11 @@ void __sanitizer_symbolize_global(uptr data_addr, const char *fmt, } SANITIZER_INTERFACE_ATTRIBUTE -int __sanitizer_get_module_and_offset_for_pc(uptr pc, char *module_name, +int __sanitizer_get_module_and_offset_for_pc(void *pc, char *module_name, uptr module_name_len, - uptr *pc_offset) { - return __sanitizer::GetModuleAndOffsetForPc(pc, module_name, module_name_len, - pc_offset); + void **pc_offset) { + return __sanitizer::GetModuleAndOffsetForPc( + reinterpret_cast(pc), module_name, module_name_len, + reinterpret_cast(pc_offset)); } } // extern "C" diff --git a/libsanitizer/sanitizer_common/sanitizer_stacktrace_printer.cpp b/libsanitizer/sanitizer_common/sanitizer_stacktrace_printer.cpp index c6356da..2d0eccc 100644 --- a/libsanitizer/sanitizer_common/sanitizer_stacktrace_printer.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_stacktrace_printer.cpp @@ -104,6 +104,19 @@ static const char *DemangleFunctionName(const char *function) { return function; } +static void MaybeBuildIdToBuffer(const AddressInfo &info, bool PrefixSpace, + InternalScopedString *buffer) { + if (info.uuid_size) { + if (PrefixSpace) + buffer->append(" "); + buffer->append("(BuildId: "); + for (uptr i = 0; i < info.uuid_size; ++i) { + buffer->append("%02x", info.uuid[i]); + } + buffer->append(")"); + } +} + static const char kDefaultFormat[] = " #%n %p %F %L"; void RenderFrame(InternalScopedString *buffer, const char *format, int frame_no, @@ -140,6 +153,9 @@ void RenderFrame(InternalScopedString *buffer, const char *format, int frame_no, case 'o': buffer->append("0x%zx", info->module_offset); break; + case 'b': + MaybeBuildIdToBuffer(*info, /*PrefixSpace=*/false, buffer); + break; case 'f': buffer->append("%s", DemangleFunctionName(StripFunctionName( info->function, strip_func_prefix))); @@ -181,6 +197,8 @@ void RenderFrame(InternalScopedString *buffer, const char *format, int frame_no, } else if (info->module) { RenderModuleLocation(buffer, info->module, info->module_offset, info->module_arch, strip_path_prefix); + + MaybeBuildIdToBuffer(*info, /*PrefixSpace=*/true, buffer); } else { buffer->append("()"); } @@ -193,6 +211,7 @@ void RenderFrame(InternalScopedString *buffer, const char *format, int frame_no, // Always strip the module name for %M. 
RenderModuleLocation(buffer, StripModuleName(info->module), info->module_offset, info->module_arch, ""); + MaybeBuildIdToBuffer(*info, /*PrefixSpace=*/true, buffer); } else { buffer->append("(%p)", (void *)address); } diff --git a/libsanitizer/sanitizer_common/sanitizer_stoptheworld_win.cpp b/libsanitizer/sanitizer_common/sanitizer_stoptheworld_win.cpp new file mode 100644 index 0000000..f114ace --- /dev/null +++ b/libsanitizer/sanitizer_common/sanitizer_stoptheworld_win.cpp @@ -0,0 +1,175 @@ +//===-- sanitizer_stoptheworld_win.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// See sanitizer_stoptheworld.h for details. +// +//===----------------------------------------------------------------------===// + +#include "sanitizer_platform.h" + +#if SANITIZER_WINDOWS + +# define WIN32_LEAN_AND_MEAN +# include +// windows.h needs to be included before tlhelp32.h +# include + +# include "sanitizer_stoptheworld.h" + +namespace __sanitizer { + +namespace { + +struct SuspendedThreadsListWindows final : public SuspendedThreadsList { + InternalMmapVector threadHandles; + InternalMmapVector threadIds; + + SuspendedThreadsListWindows() { + threadIds.reserve(1024); + threadHandles.reserve(1024); + } + + PtraceRegistersStatus GetRegistersAndSP(uptr index, + InternalMmapVector *buffer, + uptr *sp) const override; + + tid_t GetThreadID(uptr index) const override; + uptr ThreadCount() const override; +}; + +// Stack Pointer register names on different architectures +# if SANITIZER_X64 +# define SP_REG Rsp +# elif SANITIZER_I386 +# define SP_REG Esp +# elif SANITIZER_ARM | SANITIZER_ARM64 +# define SP_REG Sp +# else +# error Architecture not supported! 
+# endif + +PtraceRegistersStatus SuspendedThreadsListWindows::GetRegistersAndSP( + uptr index, InternalMmapVector *buffer, uptr *sp) const { + CHECK_LT(index, threadHandles.size()); + + buffer->resize(RoundUpTo(sizeof(CONTEXT), sizeof(uptr)) / sizeof(uptr)); + CONTEXT *thread_context = reinterpret_cast(buffer->data()); + thread_context->ContextFlags = CONTEXT_ALL; + CHECK(GetThreadContext(threadHandles[index], thread_context)); + *sp = thread_context->SP_REG; + + return REGISTERS_AVAILABLE; +} + +tid_t SuspendedThreadsListWindows::GetThreadID(uptr index) const { + CHECK_LT(index, threadIds.size()); + return threadIds[index]; +} + +uptr SuspendedThreadsListWindows::ThreadCount() const { + return threadIds.size(); +} + +struct RunThreadArgs { + StopTheWorldCallback callback; + void *argument; +}; + +DWORD WINAPI RunThread(void *argument) { + RunThreadArgs *run_args = (RunThreadArgs *)argument; + + const DWORD this_thread = GetCurrentThreadId(); + const DWORD this_process = GetCurrentProcessId(); + + SuspendedThreadsListWindows suspended_threads_list; + bool new_thread_found; + + do { + // Take a snapshot of all Threads + const HANDLE threads = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); + CHECK(threads != INVALID_HANDLE_VALUE); + + THREADENTRY32 thread_entry; + thread_entry.dwSize = sizeof(thread_entry); + new_thread_found = false; + + if (!Thread32First(threads, &thread_entry)) + break; + + do { + if (thread_entry.th32ThreadID == this_thread || + thread_entry.th32OwnerProcessID != this_process) + continue; + + bool suspended_thread = false; + for (const auto thread_id : suspended_threads_list.threadIds) { + if (thread_id == thread_entry.th32ThreadID) { + suspended_thread = true; + break; + } + } + + // Skip the Thread if it was already suspended + if (suspended_thread) + continue; + + const HANDLE thread = + OpenThread(THREAD_ALL_ACCESS, FALSE, thread_entry.th32ThreadID); + CHECK(thread); + + if (SuspendThread(thread) == (DWORD)-1) { + DWORD last_error = GetLastError(); + + VPrintf(1, "Could not suspend thread %lu (error %lu)", + thread_entry.th32ThreadID, last_error); + continue; + } + + suspended_threads_list.threadIds.push_back(thread_entry.th32ThreadID); + suspended_threads_list.threadHandles.push_back(thread); + new_thread_found = true; + } while (Thread32Next(threads, &thread_entry)); + + CloseHandle(threads); + + // Between the call to `CreateToolhelp32Snapshot` and suspending the + // relevant Threads, new Threads could have potentially been created. So + // continue to find and suspend new Threads until we don't find any. + } while (new_thread_found); + + // Now all Threads of this Process except of this Thread should be suspended. + // Execute the callback function. 
+ run_args->callback(suspended_threads_list, run_args->argument); + + // Resume all Threads + for (const auto suspended_thread_handle : + suspended_threads_list.threadHandles) { + CHECK_NE(ResumeThread(suspended_thread_handle), -1); + CloseHandle(suspended_thread_handle); + } + + return 0; +} + +} // namespace + +void StopTheWorld(StopTheWorldCallback callback, void *argument) { + struct RunThreadArgs arg = {callback, argument}; + DWORD trace_thread_id; + + auto trace_thread = + CreateThread(nullptr, 0, RunThread, &arg, 0, &trace_thread_id); + CHECK(trace_thread); + + WaitForSingleObject(trace_thread, INFINITE); + CloseHandle(trace_thread); +} + +} // namespace __sanitizer + +#endif // SANITIZER_WINDOWS diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer.cpp index 0c4b84c..d3cffaa 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer.cpp @@ -11,10 +11,11 @@ //===----------------------------------------------------------------------===// #include "sanitizer_allocator_internal.h" -#include "sanitizer_platform.h" +#include "sanitizer_common.h" #include "sanitizer_internal_defs.h" #include "sanitizer_libc.h" #include "sanitizer_placement_new.h" +#include "sanitizer_platform.h" #include "sanitizer_symbolizer_internal.h" namespace __sanitizer { @@ -30,6 +31,7 @@ void AddressInfo::Clear() { InternalFree(file); internal_memset(this, 0, sizeof(AddressInfo)); function_offset = kUnknown; + uuid_size = 0; } void AddressInfo::FillModuleInfo(const char *mod_name, uptr mod_offset, @@ -37,6 +39,16 @@ void AddressInfo::FillModuleInfo(const char *mod_name, uptr mod_offset, module = internal_strdup(mod_name); module_offset = mod_offset; module_arch = mod_arch; + uuid_size = 0; +} + +void AddressInfo::FillModuleInfo(const LoadedModule &mod) { + module = internal_strdup(mod.full_name()); + module_offset = address - mod.base_address(); + module_arch = mod.arch(); + if (mod.uuid_size()) + internal_memcpy(uuid, mod.uuid(), mod.uuid_size()); + uuid_size = mod.uuid_size(); } SymbolizedStack::SymbolizedStack() : next(nullptr), info() {} @@ -126,10 +138,4 @@ Symbolizer::SymbolizerScope::~SymbolizerScope() { sym_->end_hook_(); } -void Symbolizer::LateInitializeTools() { - for (auto &tool : tools_) { - tool.LateInitialize(); - } -} - } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer.h b/libsanitizer/sanitizer_common/sanitizer_symbolizer.h index 42bd157..bad4761 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer.h +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer.h @@ -32,6 +32,8 @@ struct AddressInfo { char *module; uptr module_offset; ModuleArch module_arch; + u8 uuid[kModuleUUIDSize]; + uptr uuid_size; static const uptr kUnknown = ~(uptr)0; char *function; @@ -45,6 +47,8 @@ struct AddressInfo { // Deletes all strings and resets all fields. void Clear(); void FillModuleInfo(const char *mod_name, uptr mod_offset, ModuleArch arch); + void FillModuleInfo(const LoadedModule &mod); + uptr module_base() const { return address - module_offset; } }; // Linked list of symbolized frames (each frame is described by AddressInfo). @@ -209,9 +213,6 @@ class Symbolizer final { private: const Symbolizer *sym_; }; - - // Calls `LateInitialize()` on all items in `tools_`. 
- void LateInitializeTools(); }; #ifdef SANITIZER_WINDOWS diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_internal.h b/libsanitizer/sanitizer_common/sanitizer_symbolizer_internal.h index b867094..df122ed 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_internal.h +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_internal.h @@ -70,11 +70,6 @@ class SymbolizerTool { return nullptr; } - // Called during the LateInitialize phase of Sanitizer initialization. - // Usually this is a safe place to call code that might need to use user - // memory allocators. - virtual void LateInitialize() {} - protected: ~SymbolizerTool() {} }; @@ -91,7 +86,7 @@ class SymbolizerProcess { ~SymbolizerProcess() {} /// The maximum number of arguments required to invoke a tool process. - static const unsigned kArgVMax = 6; + static const unsigned kArgVMax = 16; // Customizable by subclasses. virtual bool StartSymbolizerSubprocess(); diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_libcdep.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer_libcdep.cpp index 3fc994f..8bbd4af 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_libcdep.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_libcdep.cpp @@ -84,15 +84,12 @@ const char *ExtractTokenUpToDelimiter(const char *str, const char *delimiter, SymbolizedStack *Symbolizer::SymbolizePC(uptr addr) { Lock l(&mu_); - const char *module_name = nullptr; - uptr module_offset; - ModuleArch arch; SymbolizedStack *res = SymbolizedStack::New(addr); - if (!FindModuleNameAndOffsetForAddress(addr, &module_name, &module_offset, - &arch)) + auto *mod = FindModuleForAddress(addr); + if (!mod) return res; // Always fill data about module name and offset. - res->info.FillModuleInfo(module_name, module_offset, arch); + res->info.FillModuleInfo(*mod); for (auto &tool : tools_) { SymbolizerScope sym_scope(this); if (tool.SymbolizePC(addr, res)) { @@ -277,14 +274,17 @@ class LLVMSymbolizerProcess final : public SymbolizerProcess { const char* const kSymbolizerArch = "--default-arch=unknown"; #endif - const char *const inline_flag = common_flags()->symbolize_inline_frames - ? "--inlines" - : "--no-inlines"; + const char *const demangle_flag = + common_flags()->demangle ? "--demangle" : "--no-demangle"; + const char *const inline_flag = + common_flags()->symbolize_inline_frames ? "--inlines" : "--no-inlines"; int i = 0; argv[i++] = path_to_binary; + argv[i++] = demangle_flag; argv[i++] = inline_flag; argv[i++] = kSymbolizerArch; argv[i++] = nullptr; + CHECK_LE(i, kArgVMax); } }; diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.cpp index 5c25b28..ac811c8 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -58,13 +57,6 @@ bool DlAddrSymbolizer::SymbolizeData(uptr addr, DataInfo *datainfo) { return true; } -#define K_ATOS_ENV_VAR "__check_mach_ports_lookup" - -// This cannot live in `AtosSymbolizerProcess` because instances of that object -// are allocated by the internal allocator which under ASan is poisoned with -// kAsanInternalHeapMagic. 
-static char kAtosMachPortEnvEntry[] = K_ATOS_ENV_VAR "=000000000000000"; - class AtosSymbolizerProcess final : public SymbolizerProcess { public: explicit AtosSymbolizerProcess(const char *path) @@ -72,51 +64,13 @@ class AtosSymbolizerProcess final : public SymbolizerProcess { pid_str_[0] = '\0'; } - void LateInitialize() { - if (SANITIZER_IOSSIM) { - // `putenv()` may call malloc/realloc so it is only safe to do this - // during LateInitialize() or later (i.e. we can't do this in the - // constructor). We also can't do this in `StartSymbolizerSubprocess()` - // because in TSan we switch allocators when we're symbolizing. - // We use `putenv()` rather than `setenv()` so that we can later directly - // write into the storage without LibC getting involved to change what the - // variable is set to - int result = putenv(kAtosMachPortEnvEntry); - CHECK_EQ(result, 0); - } - } - private: bool StartSymbolizerSubprocess() override { - // Configure sandbox before starting atos process. - // Put the string command line argument in the object so that it outlives // the call to GetArgV. - internal_snprintf(pid_str_, sizeof(pid_str_), "%d", internal_getpid()); - - if (SANITIZER_IOSSIM) { - // `atos` in the simulator is restricted in its ability to retrieve the - // task port for the target process (us) so we need to do extra work - // to pass our task port to it. - mach_port_t ports[]{mach_task_self()}; - kern_return_t ret = - mach_ports_register(mach_task_self(), ports, /*count=*/1); - CHECK_EQ(ret, KERN_SUCCESS); - - // Set environment variable that signals to `atos` that it should look - // for our task port. We can't call `setenv()` here because it might call - // malloc/realloc. To avoid that we instead update the - // `mach_port_env_var_entry_` variable with our current PID. - uptr count = internal_snprintf(kAtosMachPortEnvEntry, - sizeof(kAtosMachPortEnvEntry), - K_ATOS_ENV_VAR "=%s", pid_str_); - CHECK_GE(count, sizeof(K_ATOS_ENV_VAR) + internal_strlen(pid_str_)); - // Document our assumption but without calling `getenv()` in normal - // builds. - DCHECK(getenv(K_ATOS_ENV_VAR)); - DCHECK_EQ(internal_strcmp(getenv(K_ATOS_ENV_VAR), pid_str_), 0); - } + internal_snprintf(pid_str_, sizeof(pid_str_), "%d", (int)internal_getpid()); + // Configure sandbox before starting atos process. return SymbolizerProcess::StartSymbolizerSubprocess(); } @@ -137,13 +91,10 @@ class AtosSymbolizerProcess final : public SymbolizerProcess { argv[i++] = "-d"; } argv[i++] = nullptr; + CHECK_LE(i, kArgVMax); } char pid_str_[16]; - // Space for `\0` in `K_ATOS_ENV_VAR` is reused for `=`. 
- static_assert(sizeof(kAtosMachPortEnvEntry) == - (sizeof(K_ATOS_ENV_VAR) + sizeof(pid_str_)), - "sizes should match"); }; #undef K_ATOS_ENV_VAR @@ -249,8 +200,6 @@ bool AtosSymbolizer::SymbolizeData(uptr addr, DataInfo *info) { return true; } -void AtosSymbolizer::LateInitialize() { process_->LateInitialize(); } - } // namespace __sanitizer #endif // SANITIZER_MAC diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.h b/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.h index 401d30f..d5abe9d 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.h +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_mac.h @@ -35,7 +35,6 @@ class AtosSymbolizer final : public SymbolizerTool { bool SymbolizePC(uptr addr, SymbolizedStack *stack) override; bool SymbolizeData(uptr addr, DataInfo *info) override; - void LateInitialize() override; private: AtosSymbolizerProcess *process_; diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_markup.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer_markup.cpp index 9a5b4a8..1ec0c5c 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_markup.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_markup.cpp @@ -100,9 +100,7 @@ Symbolizer *Symbolizer::PlatformInit() { return new (symbolizer_allocator_) Symbolizer({}); } -void Symbolizer::LateInitialize() { - Symbolizer::GetOrInit()->LateInitializeTools(); -} +void Symbolizer::LateInitialize() { Symbolizer::GetOrInit(); } void StartReportDeadlySignal() {} void ReportDeadlySignal(const SignalContext &sig, u32 tid, diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index 4cd4b46..5f6e4cc 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -213,9 +213,14 @@ class Addr2LineProcess final : public SymbolizerProcess { const char *(&argv)[kArgVMax]) const override { int i = 0; argv[i++] = path_to_binary; - argv[i++] = "-iCfe"; + if (common_flags()->demangle) + argv[i++] = "-C"; + if (common_flags()->symbolize_inline_frames) + argv[i++] = "-i"; + argv[i++] = "-fe"; argv[i++] = module_name_; argv[i++] = nullptr; + CHECK_LE(i, kArgVMax); } bool ReachedEndOfOutput(const char *buffer, uptr length) const override; @@ -312,37 +317,42 @@ class Addr2LinePool final : public SymbolizerTool { FIRST_32_SECOND_64(UINT32_MAX, UINT64_MAX); }; -#if SANITIZER_SUPPORTS_WEAK_HOOKS +# if SANITIZER_SUPPORTS_WEAK_HOOKS extern "C" { SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE bool __sanitizer_symbolize_code(const char *ModuleName, u64 ModuleOffset, - char *Buffer, int MaxLength, - bool SymbolizeInlineFrames); -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -bool __sanitizer_symbolize_data(const char *ModuleName, u64 ModuleOffset, - char *Buffer, int MaxLength); -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -void __sanitizer_symbolize_flush(); -SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE -int __sanitizer_symbolize_demangle(const char *Name, char *Buffer, - int MaxLength); + char *Buffer, int MaxLength); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE bool +__sanitizer_symbolize_data(const char *ModuleName, u64 ModuleOffset, + char *Buffer, int MaxLength); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE void +__sanitizer_symbolize_flush(); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE int 
+__sanitizer_symbolize_demangle(const char *Name, char *Buffer, int MaxLength); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE bool +__sanitizer_symbolize_set_demangle(bool Demangle); +SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE bool +__sanitizer_symbolize_set_inline_frames(bool InlineFrames); } // extern "C" class InternalSymbolizer final : public SymbolizerTool { public: static InternalSymbolizer *get(LowLevelAllocator *alloc) { - if (__sanitizer_symbolize_code != 0 && - __sanitizer_symbolize_data != 0) { - return new(*alloc) InternalSymbolizer(); - } + if (__sanitizer_symbolize_set_demangle) + CHECK(__sanitizer_symbolize_set_demangle(common_flags()->demangle)); + if (__sanitizer_symbolize_set_inline_frames) + CHECK(__sanitizer_symbolize_set_inline_frames( + common_flags()->symbolize_inline_frames)); + if (__sanitizer_symbolize_code && __sanitizer_symbolize_data) + return new (*alloc) InternalSymbolizer(); return 0; } bool SymbolizePC(uptr addr, SymbolizedStack *stack) override { bool result = __sanitizer_symbolize_code( - stack->info.module, stack->info.module_offset, buffer_, kBufferSize, - common_flags()->symbolize_inline_frames); - if (result) ParseSymbolizePCOutput(buffer_, stack); + stack->info.module, stack->info.module_offset, buffer_, kBufferSize); + if (result) + ParseSymbolizePCOutput(buffer_, stack); return result; } @@ -365,7 +375,7 @@ class InternalSymbolizer final : public SymbolizerTool { if (__sanitizer_symbolize_demangle) { for (uptr res_length = 1024; res_length <= InternalSizeClassMap::kMaxSize;) { - char *res_buff = static_cast(InternalAlloc(res_length)); + char *res_buff = static_cast(InternalAlloc(res_length)); uptr req_length = __sanitizer_symbolize_demangle(name, res_buff, res_length); if (req_length > res_length) { @@ -380,19 +390,19 @@ class InternalSymbolizer final : public SymbolizerTool { } private: - InternalSymbolizer() { } + InternalSymbolizer() {} static const int kBufferSize = 16 * 1024; char buffer_[kBufferSize]; }; -#else // SANITIZER_SUPPORTS_WEAK_HOOKS +# else // SANITIZER_SUPPORTS_WEAK_HOOKS class InternalSymbolizer final : public SymbolizerTool { public: static InternalSymbolizer *get(LowLevelAllocator *alloc) { return 0; } }; -#endif // SANITIZER_SUPPORTS_WEAK_HOOKS +# endif // SANITIZER_SUPPORTS_WEAK_HOOKS const char *Symbolizer::PlatformDemangle(const char *name) { return DemangleSwiftAndCXX(name); @@ -492,7 +502,7 @@ Symbolizer *Symbolizer::PlatformInit() { } void Symbolizer::LateInitialize() { - Symbolizer::GetOrInit()->LateInitializeTools(); + Symbolizer::GetOrInit(); InitializeSwiftDemangler(); } diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_report.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer_report.cpp index 869c893..ac855c8 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_report.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_report.cpp @@ -211,9 +211,9 @@ static void ReportDeadlySignalImpl(const SignalContext &sig, u32 tid, Report("Hint: pc points to the zero page.\n"); if (sig.is_memory_access) { const char *access_type = - sig.write_flag == SignalContext::WRITE + sig.write_flag == SignalContext::Write ? "WRITE" - : (sig.write_flag == SignalContext::READ ? "READ" : "UNKNOWN"); + : (sig.write_flag == SignalContext::Read ? 
"READ" : "UNKNOWN"); Report("The signal is caused by a %s memory access.\n", access_type); if (!sig.is_true_faulting_addr) Report("Hint: this fault was caused by a dereference of a high value " diff --git a/libsanitizer/sanitizer_common/sanitizer_symbolizer_win.cpp b/libsanitizer/sanitizer_common/sanitizer_symbolizer_win.cpp index 702d901..c647ab1 100644 --- a/libsanitizer/sanitizer_common/sanitizer_symbolizer_win.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_symbolizer_win.cpp @@ -318,7 +318,7 @@ Symbolizer *Symbolizer::PlatformInit() { } void Symbolizer::LateInitialize() { - Symbolizer::GetOrInit()->LateInitializeTools(); + Symbolizer::GetOrInit(); } } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_syscalls_netbsd.inc b/libsanitizer/sanitizer_common/sanitizer_syscalls_netbsd.inc index c4a9d99..4ce5de0 100644 --- a/libsanitizer/sanitizer_common/sanitizer_syscalls_netbsd.inc +++ b/libsanitizer/sanitizer_common/sanitizer_syscalls_netbsd.inc @@ -2255,13 +2255,13 @@ PRE_SYSCALL(getcontext)(void *ucp_) { /* Nothing to do */ } POST_SYSCALL(getcontext)(long long res, void *ucp_) { /* Nothing to do */ } PRE_SYSCALL(setcontext)(void *ucp_) { if (ucp_) { - PRE_READ(ucp_, ucontext_t_sz); + PRE_READ(ucp_, ucontext_t_sz(ucp_)); } } POST_SYSCALL(setcontext)(long long res, void *ucp_) {} PRE_SYSCALL(_lwp_create)(void *ucp_, long long flags_, void *new_lwp_) { if (ucp_) { - PRE_READ(ucp_, ucontext_t_sz); + PRE_READ(ucp_, ucontext_t_sz(ucp_)); } } POST_SYSCALL(_lwp_create) diff --git a/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp b/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp index a34b8c1..278f6de 100644 --- a/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_thread_registry.cpp @@ -13,6 +13,8 @@ #include "sanitizer_thread_registry.h" +#include "sanitizer_placement_new.h" + namespace __sanitizer { ThreadContextBase::ThreadContextBase(u32 tid) @@ -108,7 +110,7 @@ ThreadRegistry::ThreadRegistry(ThreadContextFactory factory, u32 max_threads, max_threads_(max_threads), thread_quarantine_size_(thread_quarantine_size), max_reuse_(max_reuse), - mtx_(), + mtx_(MutexThreadRegistry), total_threads_(0), alive_threads_(0), max_alive_threads_(0), @@ -162,6 +164,12 @@ u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid, max_alive_threads_++; CHECK_EQ(alive_threads_, max_alive_threads_); } + if (user_id) { + // Ensure that user_id is unique. If it's not the case we are screwed. + // Ignoring this situation may lead to very hard to debug false + // positives later (e.g. if we join a wrong thread). 
+ CHECK(live_.try_emplace(user_id, tid).second); + } tctx->SetCreated(user_id, total_threads_++, detached, parent_tid, arg); return tid; @@ -221,14 +229,8 @@ void ThreadRegistry::SetThreadName(u32 tid, const char *name) { void ThreadRegistry::SetThreadNameByUserId(uptr user_id, const char *name) { ThreadRegistryLock l(this); - for (u32 tid = 0; tid < threads_.size(); tid++) { - ThreadContextBase *tctx = threads_[tid]; - if (tctx != 0 && tctx->user_id == user_id && - tctx->status != ThreadStatusInvalid) { - tctx->SetName(name); - return; - } - } + if (const auto *tid = live_.find(user_id)) + threads_[tid->second]->SetName(name); } void ThreadRegistry::DetachThread(u32 tid, void *arg) { @@ -241,6 +243,8 @@ void ThreadRegistry::DetachThread(u32 tid, void *arg) { } tctx->OnDetached(arg); if (tctx->status == ThreadStatusFinished) { + if (tctx->user_id) + live_.erase(tctx->user_id); tctx->SetDead(); QuarantinePush(tctx); } else { @@ -260,6 +264,8 @@ void ThreadRegistry::JoinThread(u32 tid, void *arg) { return; } if ((destroyed = tctx->GetDestroyed())) { + if (tctx->user_id) + live_.erase(tctx->user_id); tctx->SetJoined(arg); QuarantinePush(tctx); } @@ -292,6 +298,8 @@ ThreadStatus ThreadRegistry::FinishThread(u32 tid) { } tctx->SetFinished(); if (dead) { + if (tctx->user_id) + live_.erase(tctx->user_id); tctx->SetDead(); QuarantinePush(tctx); } @@ -333,6 +341,19 @@ ThreadContextBase *ThreadRegistry::QuarantinePop() { return tctx; } +u32 ThreadRegistry::ConsumeThreadUserId(uptr user_id) { + ThreadRegistryLock l(this); + u32 tid; + auto *t = live_.find(user_id); + CHECK(t); + tid = t->second; + live_.erase(t); + auto *tctx = threads_[tid]; + CHECK_EQ(tctx->user_id, user_id); + tctx->user_id = 0; + return tid; +} + void ThreadRegistry::SetThreadUserId(u32 tid, uptr user_id) { ThreadRegistryLock l(this); ThreadContextBase *tctx = threads_[tid]; @@ -341,6 +362,23 @@ void ThreadRegistry::SetThreadUserId(u32 tid, uptr user_id) { CHECK_NE(tctx->status, ThreadStatusDead); CHECK_EQ(tctx->user_id, 0); tctx->user_id = user_id; + CHECK(live_.try_emplace(user_id, tctx->tid).second); +} + +u32 ThreadRegistry::OnFork(u32 tid) { + ThreadRegistryLock l(this); + // We only purge user_id (pthread_t) of live threads because + // they cause CHECK failures if new threads with matching pthread_t + // created after fork. + // Potentially we could purge more info (ThreadContextBase themselves), + // but it's hard to test and easy to introduce new issues by doing this. 
+ for (auto *tctx : threads_) { + if (tctx->tid == tid || !tctx->user_id) + continue; + CHECK(live_.erase(tctx->user_id)); + tctx->user_id = 0; + } + return alive_threads_; } } // namespace __sanitizer diff --git a/libsanitizer/sanitizer_common/sanitizer_thread_registry.h b/libsanitizer/sanitizer_common/sanitizer_thread_registry.h index a8a4d4d..2c7e5c2 100644 --- a/libsanitizer/sanitizer_common/sanitizer_thread_registry.h +++ b/libsanitizer/sanitizer_common/sanitizer_thread_registry.h @@ -15,6 +15,7 @@ #define SANITIZER_THREAD_REGISTRY_H #include "sanitizer_common.h" +#include "sanitizer_dense_map.h" #include "sanitizer_list.h" #include "sanitizer_mutex.h" @@ -85,7 +86,7 @@ class ThreadContextBase { typedef ThreadContextBase* (*ThreadContextFactory)(u32 tid); -class MUTEX ThreadRegistry { +class SANITIZER_MUTEX ThreadRegistry { public: ThreadRegistry(ThreadContextFactory factory); ThreadRegistry(ThreadContextFactory factory, u32 max_threads, @@ -94,15 +95,17 @@ class MUTEX ThreadRegistry { uptr *alive = nullptr); uptr GetMaxAliveThreads(); - void Lock() ACQUIRE() { mtx_.Lock(); } - void CheckLocked() const CHECK_LOCKED() { mtx_.CheckLocked(); } - void Unlock() RELEASE() { mtx_.Unlock(); } + void Lock() SANITIZER_ACQUIRE() { mtx_.Lock(); } + void CheckLocked() const SANITIZER_CHECK_LOCKED() { mtx_.CheckLocked(); } + void Unlock() SANITIZER_RELEASE() { mtx_.Unlock(); } // Should be guarded by ThreadRegistryLock. ThreadContextBase *GetThreadLocked(u32 tid) { return threads_.empty() ? nullptr : threads_[tid]; } + u32 NumThreadsLocked() const { return threads_.size(); } + u32 CreateThread(uptr user_id, bool detached, u32 parent_tid, void *arg); typedef void (*ThreadCallback)(ThreadContextBase *tctx, void *arg); @@ -127,8 +130,14 @@ class MUTEX ThreadRegistry { // Finishes thread and returns previous status. ThreadStatus FinishThread(u32 tid); void StartThread(u32 tid, tid_t os_id, ThreadType thread_type, void *arg); + u32 ConsumeThreadUserId(uptr user_id); void SetThreadUserId(u32 tid, uptr user_id); + // OnFork must be called in the child process after fork to purge old + // threads that don't exist anymore (except for the current thread tid). + // Returns number of alive threads before fork. + u32 OnFork(u32 tid); + private: const ThreadContextFactory context_factory_; const u32 max_threads_; @@ -146,6 +155,7 @@ class MUTEX ThreadRegistry { InternalMmapVector threads_; IntrusiveList dead_threads_; IntrusiveList invalid_threads_; + DenseMap live_; void QuarantinePush(ThreadContextBase *tctx); ThreadContextBase *QuarantinePop(); diff --git a/libsanitizer/sanitizer_common/sanitizer_thread_safety.h b/libsanitizer/sanitizer_common/sanitizer_thread_safety.h index 52b25ed..c34ea80 100644 --- a/libsanitizer/sanitizer_common/sanitizer_thread_safety.h +++ b/libsanitizer/sanitizer_common/sanitizer_thread_safety.h @@ -16,27 +16,34 @@ #define SANITIZER_THREAD_SAFETY_H #if defined(__clang__) -# define THREAD_ANNOTATION(x) __attribute__((x)) +# define SANITIZER_THREAD_ANNOTATION(x) __attribute__((x)) #else -# define THREAD_ANNOTATION(x) +# define SANITIZER_THREAD_ANNOTATION(x) #endif -#define MUTEX THREAD_ANNOTATION(capability("mutex")) -#define SCOPED_LOCK THREAD_ANNOTATION(scoped_lockable) -#define GUARDED_BY(x) THREAD_ANNOTATION(guarded_by(x)) -#define PT_GUARDED_BY(x) THREAD_ANNOTATION(pt_guarded_by(x)) -#define REQUIRES(...) THREAD_ANNOTATION(requires_capability(__VA_ARGS__)) -#define REQUIRES_SHARED(...) \ - THREAD_ANNOTATION(requires_shared_capability(__VA_ARGS__)) -#define ACQUIRE(...) 
THREAD_ANNOTATION(acquire_capability(__VA_ARGS__)) -#define ACQUIRE_SHARED(...) \ - THREAD_ANNOTATION(acquire_shared_capability(__VA_ARGS__)) -#define TRY_ACQUIRE(...) THREAD_ANNOTATION(try_acquire_capability(__VA_ARGS__)) -#define RELEASE(...) THREAD_ANNOTATION(release_capability(__VA_ARGS__)) -#define RELEASE_SHARED(...) \ - THREAD_ANNOTATION(release_shared_capability(__VA_ARGS__)) -#define EXCLUDES(...) THREAD_ANNOTATION(locks_excluded(__VA_ARGS__)) -#define CHECK_LOCKED(...) THREAD_ANNOTATION(assert_capability(__VA_ARGS__)) -#define NO_THREAD_SAFETY_ANALYSIS THREAD_ANNOTATION(no_thread_safety_analysis) +#define SANITIZER_MUTEX SANITIZER_THREAD_ANNOTATION(capability("mutex")) +#define SANITIZER_SCOPED_LOCK SANITIZER_THREAD_ANNOTATION(scoped_lockable) +#define SANITIZER_GUARDED_BY(x) SANITIZER_THREAD_ANNOTATION(guarded_by(x)) +#define SANITIZER_PT_GUARDED_BY(x) SANITIZER_THREAD_ANNOTATION(pt_guarded_by(x)) +#define SANITIZER_REQUIRES(...) \ + SANITIZER_THREAD_ANNOTATION(requires_capability(__VA_ARGS__)) +#define SANITIZER_REQUIRES_SHARED(...) \ + SANITIZER_THREAD_ANNOTATION(requires_shared_capability(__VA_ARGS__)) +#define SANITIZER_ACQUIRE(...) \ + SANITIZER_THREAD_ANNOTATION(acquire_capability(__VA_ARGS__)) +#define SANITIZER_ACQUIRE_SHARED(...) \ + SANITIZER_THREAD_ANNOTATION(acquire_shared_capability(__VA_ARGS__)) +#define SANITIZER_TRY_ACQUIRE(...) \ + SANITIZER_THREAD_ANNOTATION(try_acquire_capability(__VA_ARGS__)) +#define SANITIZER_RELEASE(...) \ + SANITIZER_THREAD_ANNOTATION(release_capability(__VA_ARGS__)) +#define SANITIZER_RELEASE_SHARED(...) \ + SANITIZER_THREAD_ANNOTATION(release_shared_capability(__VA_ARGS__)) +#define SANITIZER_EXCLUDES(...) \ + SANITIZER_THREAD_ANNOTATION(locks_excluded(__VA_ARGS__)) +#define SANITIZER_CHECK_LOCKED(...) 
\ + SANITIZER_THREAD_ANNOTATION(assert_capability(__VA_ARGS__)) +#define SANITIZER_NO_THREAD_SAFETY_ANALYSIS \ + SANITIZER_THREAD_ANNOTATION(no_thread_safety_analysis) #endif diff --git a/libsanitizer/sanitizer_common/sanitizer_type_traits.h b/libsanitizer/sanitizer_common/sanitizer_type_traits.h index 2a58d98..06a44d1 100644 --- a/libsanitizer/sanitizer_common/sanitizer_type_traits.h +++ b/libsanitizer/sanitizer_common/sanitizer_type_traits.h @@ -13,6 +13,8 @@ #ifndef SANITIZER_TYPE_TRAITS_H #define SANITIZER_TYPE_TRAITS_H +#include "sanitizer_common/sanitizer_internal_defs.h" + namespace __sanitizer { struct true_type { @@ -57,6 +59,83 @@ struct conditional { using type = F; }; +template +struct remove_reference { + using type = T; +}; +template +struct remove_reference { + using type = T; +}; +template +struct remove_reference { + using type = T; +}; + +template +WARN_UNUSED_RESULT inline typename remove_reference::type&& move(T&& t) { + return static_cast::type&&>(t); +} + +template +WARN_UNUSED_RESULT inline constexpr T&& forward( + typename remove_reference::type& t) { + return static_cast(t); +} + +template +WARN_UNUSED_RESULT inline constexpr T&& forward( + typename remove_reference::type&& t) { + return static_cast(t); +} + +template +struct integral_constant { + static constexpr const T value = v; + typedef T value_type; + typedef integral_constant type; + constexpr operator value_type() const { return value; } + constexpr value_type operator()() const { return value; } +}; + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if __has_builtin(__is_trivially_destructible) + +template +struct is_trivially_destructible + : public integral_constant {}; + +#elif __has_builtin(__has_trivial_destructor) + +template +struct is_trivially_destructible + : public integral_constant {}; + +#else + +template +struct is_trivially_destructible + : public integral_constant {}; + +#endif + +#if __has_builtin(__is_trivially_copyable) + +template +struct is_trivially_copyable + : public integral_constant {}; + +#else + +template +struct is_trivially_copyable + : public integral_constant {}; + +#endif + } // namespace __sanitizer #endif diff --git a/libsanitizer/sanitizer_common/sanitizer_unwind_win.cpp b/libsanitizer/sanitizer_common/sanitizer_unwind_win.cpp index 7e01c81..afcd01d 100644 --- a/libsanitizer/sanitizer_common/sanitizer_unwind_win.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_unwind_win.cpp @@ -57,30 +57,37 @@ void BufferedStackTrace::UnwindSlow(uptr pc, void *context, u32 max_depth) { InitializeDbgHelpIfNeeded(); size = 0; -#if defined(_WIN64) +# if SANITIZER_WINDOWS64 +# if SANITIZER_ARM64 + int machine_type = IMAGE_FILE_MACHINE_ARM64; + stack_frame.AddrPC.Offset = ctx.Pc; + stack_frame.AddrFrame.Offset = ctx.Fp; + stack_frame.AddrStack.Offset = ctx.Sp; +# else int machine_type = IMAGE_FILE_MACHINE_AMD64; stack_frame.AddrPC.Offset = ctx.Rip; stack_frame.AddrFrame.Offset = ctx.Rbp; stack_frame.AddrStack.Offset = ctx.Rsp; -#else +# endif +# else int machine_type = IMAGE_FILE_MACHINE_I386; stack_frame.AddrPC.Offset = ctx.Eip; stack_frame.AddrFrame.Offset = ctx.Ebp; stack_frame.AddrStack.Offset = ctx.Esp; -#endif +# endif stack_frame.AddrPC.Mode = AddrModeFlat; stack_frame.AddrFrame.Mode = AddrModeFlat; stack_frame.AddrStack.Mode = AddrModeFlat; while (StackWalk64(machine_type, GetCurrentProcess(), GetCurrentThread(), - &stack_frame, &ctx, NULL, SymFunctionTableAccess64, - SymGetModuleBase64, NULL) && - size < Min(max_depth, kStackTraceMax)) { + &stack_frame, &ctx, NULL, 
SymFunctionTableAccess64, + SymGetModuleBase64, NULL) && + size < Min(max_depth, kStackTraceMax)) { trace_buffer[size++] = (uptr)stack_frame.AddrPC.Offset; } } -#ifdef __clang__ -#pragma clang diagnostic pop -#endif -#endif // #if !SANITIZER_GO +# ifdef __clang__ +# pragma clang diagnostic pop +# endif +# endif // #if !SANITIZER_GO #endif // SANITIZER_WINDOWS diff --git a/libsanitizer/sanitizer_common/sanitizer_win.cpp b/libsanitizer/sanitizer_common/sanitizer_win.cpp index c3607db..5377033 100644 --- a/libsanitizer/sanitizer_common/sanitizer_win.cpp +++ b/libsanitizer/sanitizer_common/sanitizer_win.cpp @@ -16,7 +16,6 @@ #define WIN32_LEAN_AND_MEAN #define NOGDI -#include #include #include #include @@ -94,6 +93,11 @@ bool FileExists(const char *filename) { return ::GetFileAttributesA(filename) != INVALID_FILE_ATTRIBUTES; } +bool DirExists(const char *path) { + auto attr = ::GetFileAttributesA(path); + return (attr != INVALID_FILE_ATTRIBUTES) && (attr & FILE_ATTRIBUTE_DIRECTORY); +} + uptr internal_getpid() { return GetProcessId(GetCurrentProcess()); } @@ -337,6 +341,11 @@ bool MprotectNoAccess(uptr addr, uptr size) { return VirtualProtect((LPVOID)addr, size, PAGE_NOACCESS, &old_protection); } +bool MprotectReadOnly(uptr addr, uptr size) { + DWORD old_protection; + return VirtualProtect((LPVOID)addr, size, PAGE_READONLY, &old_protection); +} + void ReleaseMemoryPagesToOS(uptr beg, uptr end) { uptr beg_aligned = RoundDownTo(beg, GetPageSizeCached()), end_aligned = RoundDownTo(end, GetPageSizeCached()); @@ -513,7 +522,7 @@ void ReExec() { UNIMPLEMENTED(); } -void PlatformPrepareForSandboxing(__sanitizer_sandbox_arguments *args) {} +void PlatformPrepareForSandboxing(void *args) {} bool StackSizeIsUnlimited() { UNIMPLEMENTED(); @@ -566,7 +575,9 @@ void Abort() { internal__exit(3); } -bool CreateDir(const char *pathname) { return _mkdir(pathname) == 0; } +bool CreateDir(const char *pathname) { + return CreateDirectoryA(pathname, nullptr) != 0; +} #if !SANITIZER_GO // Read the file to extract the ImageBase field from the PE header. If ASLR is @@ -944,13 +955,18 @@ void SignalContext::InitPcSpBp() { CONTEXT *context_record = (CONTEXT *)context; pc = (uptr)exception_record->ExceptionAddress; -#ifdef _WIN64 +# if SANITIZER_WINDOWS64 +# if SANITIZER_ARM64 + bp = (uptr)context_record->Fp; + sp = (uptr)context_record->Sp; +# else bp = (uptr)context_record->Rbp; sp = (uptr)context_record->Rsp; -#else +# endif +# else bp = (uptr)context_record->Ebp; sp = (uptr)context_record->Esp; -#endif +# endif } uptr SignalContext::GetAddress() const { @@ -972,7 +988,7 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { // The write flag is only available for access violation exceptions. if (exception_record->ExceptionCode != EXCEPTION_ACCESS_VIOLATION) - return SignalContext::UNKNOWN; + return SignalContext::Unknown; // The contents of this array are documented at // https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-exception_record @@ -980,13 +996,13 @@ SignalContext::WriteFlag SignalContext::GetWriteFlag() const { // second element is the faulting address. 
switch (exception_record->ExceptionInformation[0]) { case 0: - return SignalContext::READ; + return SignalContext::Read; case 1: - return SignalContext::WRITE; + return SignalContext::Write; case 8: - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } - return SignalContext::UNKNOWN; + return SignalContext::Unknown; } void SignalContext::DumpAllRegisters(void *context) { diff --git a/libsanitizer/tsan/tsan_clock.cpp b/libsanitizer/tsan/tsan_clock.cpp deleted file mode 100644 index d122b67..0000000 --- a/libsanitizer/tsan/tsan_clock.cpp +++ /dev/null @@ -1,625 +0,0 @@ -//===-- tsan_clock.cpp ----------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer (TSan), a race detector. -// -//===----------------------------------------------------------------------===// -#include "tsan_clock.h" -#include "tsan_rtl.h" -#include "sanitizer_common/sanitizer_placement_new.h" - -// SyncClock and ThreadClock implement vector clocks for sync variables -// (mutexes, atomic variables, file descriptors, etc) and threads, respectively. -// ThreadClock contains fixed-size vector clock for maximum number of threads. -// SyncClock contains growable vector clock for currently necessary number of -// threads. -// Together they implement very simple model of operations, namely: -// -// void ThreadClock::acquire(const SyncClock *src) { -// for (int i = 0; i < kMaxThreads; i++) -// clock[i] = max(clock[i], src->clock[i]); -// } -// -// void ThreadClock::release(SyncClock *dst) const { -// for (int i = 0; i < kMaxThreads; i++) -// dst->clock[i] = max(dst->clock[i], clock[i]); -// } -// -// void ThreadClock::releaseStoreAcquire(SyncClock *sc) const { -// for (int i = 0; i < kMaxThreads; i++) { -// tmp = clock[i]; -// clock[i] = max(clock[i], sc->clock[i]); -// sc->clock[i] = tmp; -// } -// } -// -// void ThreadClock::ReleaseStore(SyncClock *dst) const { -// for (int i = 0; i < kMaxThreads; i++) -// dst->clock[i] = clock[i]; -// } -// -// void ThreadClock::acq_rel(SyncClock *dst) { -// acquire(dst); -// release(dst); -// } -// -// Conformance to this model is extensively verified in tsan_clock_test.cpp. -// However, the implementation is significantly more complex. The complexity -// allows to implement important classes of use cases in O(1) instead of O(N). -// -// The use cases are: -// 1. Singleton/once atomic that has a single release-store operation followed -// by zillions of acquire-loads (the acquire-load is O(1)). -// 2. Thread-local mutex (both lock and unlock can be O(1)). -// 3. Leaf mutex (unlock is O(1)). -// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)). -// 5. An atomic with a single writer (writes can be O(1)). -// The implementation dynamically adopts to workload. So if an atomic is in -// read-only phase, these reads will be O(1); if it later switches to read/write -// phase, the implementation will correctly handle that by switching to O(N). -// -// Thread-safety note: all const operations on SyncClock's are conducted under -// a shared lock; all non-const operations on SyncClock's are conducted under -// an exclusive lock; ThreadClock's are private to respective threads and so -// do not need any protection. 
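The model described in the comment above can be written out directly for reference. Below is a minimal stand-alone C++ sketch of that plain O(N) model; kNumThreads and the u64 alias are placeholders chosen for illustration and are not part of the tree, and the removed ThreadClock/SyncClock code implements the same semantics with the O(1) fast paths listed above.

#include <algorithm>
#include <cstdint>

namespace vc_sketch {

using u64 = uint64_t;
constexpr int kNumThreads = 4;  // placeholder; the real code sizes clocks dynamically

struct VectorClock {
  u64 clock[kNumThreads] = {};

  // acquire: pull every component up to the source clock.
  void Acquire(const VectorClock &src) {
    for (int i = 0; i < kNumThreads; i++)
      clock[i] = std::max(clock[i], src.clock[i]);
  }

  // release: push every component of this clock into the destination.
  void Release(VectorClock *dst) const {
    for (int i = 0; i < kNumThreads; i++)
      dst->clock[i] = std::max(dst->clock[i], clock[i]);
  }

  // release-store: overwrite the destination with this clock.
  void ReleaseStore(VectorClock *dst) const {
    for (int i = 0; i < kNumThreads; i++)
      dst->clock[i] = clock[i];
  }

  // acq_rel: acquire followed by release.
  void AcquireRelease(VectorClock *dst) {
    Acquire(*dst);
    Release(dst);
  }
};

}  // namespace vc_sketch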
-// -// Description of SyncClock state: -// clk_ - variable size vector clock, low kClkBits hold timestamp, -// the remaining bits hold "acquired" flag (the actual value is thread's -// reused counter); -// if acquired == thr->reused_, then the respective thread has already -// acquired this clock (except possibly for dirty elements). -// dirty_ - holds up to two indices in the vector clock that other threads -// need to acquire regardless of "acquired" flag value; -// release_store_tid_ - denotes that the clock state is a result of -// release-store operation by the thread with release_store_tid_ index. -// release_store_reused_ - reuse count of release_store_tid_. - -namespace __tsan { - -static atomic_uint32_t *ref_ptr(ClockBlock *cb) { - return reinterpret_cast(&cb->table[ClockBlock::kRefIdx]); -} - -// Drop reference to the first level block idx. -static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) { - ClockBlock *cb = ctx->clock_alloc.Map(idx); - atomic_uint32_t *ref = ref_ptr(cb); - u32 v = atomic_load(ref, memory_order_acquire); - for (;;) { - CHECK_GT(v, 0); - if (v == 1) - break; - if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel)) - return; - } - // First level block owns second level blocks, so them as well. - for (uptr i = 0; i < blocks; i++) - ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]); - ctx->clock_alloc.Free(c, idx); -} - -ThreadClock::ThreadClock(unsigned tid, unsigned reused) - : tid_(tid) - , reused_(reused + 1) // 0 has special meaning - , last_acquire_() - , global_acquire_() - , cached_idx_() - , cached_size_() - , cached_blocks_() { - CHECK_LT(tid, kMaxTidInClock); - CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits); - nclk_ = tid_ + 1; - internal_memset(clk_, 0, sizeof(clk_)); -} - -void ThreadClock::ResetCached(ClockCache *c) { - if (cached_idx_) { - UnrefClockBlock(c, cached_idx_, cached_blocks_); - cached_idx_ = 0; - cached_size_ = 0; - cached_blocks_ = 0; - } -} - -void ThreadClock::acquire(ClockCache *c, SyncClock *src) { - DCHECK_LE(nclk_, kMaxTid); - DCHECK_LE(src->size_, kMaxTid); - - // Check if it's empty -> no need to do anything. - const uptr nclk = src->size_; - if (nclk == 0) - return; - - bool acquired = false; - for (unsigned i = 0; i < kDirtyTids; i++) { - SyncClock::Dirty dirty = src->dirty_[i]; - unsigned tid = dirty.tid(); - if (tid != kInvalidTid) { - if (clk_[tid] < dirty.epoch) { - clk_[tid] = dirty.epoch; - acquired = true; - } - } - } - - // Check if we've already acquired src after the last release operation on src - if (tid_ >= nclk || src->elem(tid_).reused != reused_) { - // O(N) acquire. - nclk_ = max(nclk_, nclk); - u64 *dst_pos = &clk_[0]; - for (ClockElem &src_elem : *src) { - u64 epoch = src_elem.epoch; - if (*dst_pos < epoch) { - *dst_pos = epoch; - acquired = true; - } - dst_pos++; - } - - // Remember that this thread has acquired this clock. - if (nclk > tid_) - src->elem(tid_).reused = reused_; - } - - if (acquired) { - last_acquire_ = clk_[tid_]; - ResetCached(c); - } -} - -void ThreadClock::releaseStoreAcquire(ClockCache *c, SyncClock *sc) { - DCHECK_LE(nclk_, kMaxTid); - DCHECK_LE(sc->size_, kMaxTid); - - if (sc->size_ == 0) { - // ReleaseStore will correctly set release_store_tid_, - // which can be important for future operations. - ReleaseStore(c, sc); - return; - } - - nclk_ = max(nclk_, (uptr) sc->size_); - - // Check if we need to resize sc. - if (sc->size_ < nclk_) - sc->Resize(c, nclk_); - - bool acquired = false; - - sc->Unshare(c); - // Update sc->clk_. 
- sc->FlushDirty(); - uptr i = 0; - for (ClockElem &ce : *sc) { - u64 tmp = clk_[i]; - if (clk_[i] < ce.epoch) { - clk_[i] = ce.epoch; - acquired = true; - } - ce.epoch = tmp; - ce.reused = 0; - i++; - } - sc->release_store_tid_ = kInvalidTid; - sc->release_store_reused_ = 0; - - if (acquired) { - last_acquire_ = clk_[tid_]; - ResetCached(c); - } -} - -void ThreadClock::release(ClockCache *c, SyncClock *dst) { - DCHECK_LE(nclk_, kMaxTid); - DCHECK_LE(dst->size_, kMaxTid); - - if (dst->size_ == 0) { - // ReleaseStore will correctly set release_store_tid_, - // which can be important for future operations. - ReleaseStore(c, dst); - return; - } - - // Check if we need to resize dst. - if (dst->size_ < nclk_) - dst->Resize(c, nclk_); - - // Check if we had not acquired anything from other threads - // since the last release on dst. If so, we need to update - // only dst->elem(tid_). - if (!HasAcquiredAfterRelease(dst)) { - UpdateCurrentThread(c, dst); - if (dst->release_store_tid_ != tid_ || - dst->release_store_reused_ != reused_) - dst->release_store_tid_ = kInvalidTid; - return; - } - - // O(N) release. - dst->Unshare(c); - // First, remember whether we've acquired dst. - bool acquired = IsAlreadyAcquired(dst); - // Update dst->clk_. - dst->FlushDirty(); - uptr i = 0; - for (ClockElem &ce : *dst) { - ce.epoch = max(ce.epoch, clk_[i]); - ce.reused = 0; - i++; - } - // Clear 'acquired' flag in the remaining elements. - dst->release_store_tid_ = kInvalidTid; - dst->release_store_reused_ = 0; - // If we've acquired dst, remember this fact, - // so that we don't need to acquire it on next acquire. - if (acquired) - dst->elem(tid_).reused = reused_; -} - -void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) { - DCHECK_LE(nclk_, kMaxTid); - DCHECK_LE(dst->size_, kMaxTid); - - if (dst->size_ == 0 && cached_idx_ != 0) { - // Reuse the cached clock. - // Note: we could reuse/cache the cached clock in more cases: - // we could update the existing clock and cache it, or replace it with the - // currently cached clock and release the old one. And for a shared - // existing clock, we could replace it with the currently cached; - // or unshare, update and cache. But, for simplicity, we currently reuse - // cached clock only when the target clock is empty. - dst->tab_ = ctx->clock_alloc.Map(cached_idx_); - dst->tab_idx_ = cached_idx_; - dst->size_ = cached_size_; - dst->blocks_ = cached_blocks_; - CHECK_EQ(dst->dirty_[0].tid(), kInvalidTid); - // The cached clock is shared (immutable), - // so this is where we store the current clock. - dst->dirty_[0].set_tid(tid_); - dst->dirty_[0].epoch = clk_[tid_]; - dst->release_store_tid_ = tid_; - dst->release_store_reused_ = reused_; - // Remember that we don't need to acquire it in future. - dst->elem(tid_).reused = reused_; - // Grab a reference. - atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); - return; - } - - // Check if we need to resize dst. - if (dst->size_ < nclk_) - dst->Resize(c, nclk_); - - if (dst->release_store_tid_ == tid_ && - dst->release_store_reused_ == reused_ && - !HasAcquiredAfterRelease(dst)) { - UpdateCurrentThread(c, dst); - return; - } - - // O(N) release-store. - dst->Unshare(c); - // Note: dst can be larger than this ThreadClock. - // This is fine since clk_ beyond size is all zeros. 
- uptr i = 0; - for (ClockElem &ce : *dst) { - ce.epoch = clk_[i]; - ce.reused = 0; - i++; - } - for (uptr i = 0; i < kDirtyTids; i++) dst->dirty_[i].set_tid(kInvalidTid); - dst->release_store_tid_ = tid_; - dst->release_store_reused_ = reused_; - // Remember that we don't need to acquire it in future. - dst->elem(tid_).reused = reused_; - - // If the resulting clock is cachable, cache it for future release operations. - // The clock is always cachable if we released to an empty sync object. - if (cached_idx_ == 0 && dst->Cachable()) { - // Grab a reference to the ClockBlock. - atomic_uint32_t *ref = ref_ptr(dst->tab_); - if (atomic_load(ref, memory_order_acquire) == 1) - atomic_store_relaxed(ref, 2); - else - atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed); - cached_idx_ = dst->tab_idx_; - cached_size_ = dst->size_; - cached_blocks_ = dst->blocks_; - } -} - -void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) { - acquire(c, dst); - ReleaseStore(c, dst); -} - -// Updates only single element related to the current thread in dst->clk_. -void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const { - // Update the threads time, but preserve 'acquired' flag. - for (unsigned i = 0; i < kDirtyTids; i++) { - SyncClock::Dirty *dirty = &dst->dirty_[i]; - const unsigned tid = dirty->tid(); - if (tid == tid_ || tid == kInvalidTid) { - dirty->set_tid(tid_); - dirty->epoch = clk_[tid_]; - return; - } - } - // Reset all 'acquired' flags, O(N). - // We are going to touch dst elements, so we need to unshare it. - dst->Unshare(c); - dst->elem(tid_).epoch = clk_[tid_]; - for (uptr i = 0; i < dst->size_; i++) - dst->elem(i).reused = 0; - dst->FlushDirty(); -} - -// Checks whether the current thread has already acquired src. -bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const { - if (src->elem(tid_).reused != reused_) - return false; - for (unsigned i = 0; i < kDirtyTids; i++) { - SyncClock::Dirty dirty = src->dirty_[i]; - if (dirty.tid() != kInvalidTid) { - if (clk_[dirty.tid()] < dirty.epoch) - return false; - } - } - return true; -} - -// Checks whether the current thread has acquired anything -// from other clocks after releasing to dst (directly or indirectly). -bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const { - const u64 my_epoch = dst->elem(tid_).epoch; - return my_epoch <= last_acquire_ || - my_epoch <= atomic_load_relaxed(&global_acquire_); -} - -// Sets a single element in the vector clock. -// This function is called only from weird places like AcquireGlobal. -void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) { - DCHECK_LT(tid, kMaxTid); - DCHECK_GE(v, clk_[tid]); - clk_[tid] = v; - if (nclk_ <= tid) - nclk_ = tid + 1; - last_acquire_ = clk_[tid_]; - ResetCached(c); -} - -void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) { - printf("clock=["); - for (uptr i = 0; i < nclk_; i++) - printf("%s%llu", i == 0 ? "" : ",", clk_[i]); - printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_); -} - -SyncClock::SyncClock() { - ResetImpl(); -} - -SyncClock::~SyncClock() { - // Reset must be called before dtor. 
- CHECK_EQ(size_, 0); - CHECK_EQ(blocks_, 0); - CHECK_EQ(tab_, 0); - CHECK_EQ(tab_idx_, 0); -} - -void SyncClock::Reset(ClockCache *c) { - if (size_) - UnrefClockBlock(c, tab_idx_, blocks_); - ResetImpl(); -} - -void SyncClock::ResetImpl() { - tab_ = 0; - tab_idx_ = 0; - size_ = 0; - blocks_ = 0; - release_store_tid_ = kInvalidTid; - release_store_reused_ = 0; - for (uptr i = 0; i < kDirtyTids; i++) dirty_[i].set_tid(kInvalidTid); -} - -void SyncClock::Resize(ClockCache *c, uptr nclk) { - Unshare(c); - if (nclk <= capacity()) { - // Memory is already allocated, just increase the size. - size_ = nclk; - return; - } - if (size_ == 0) { - // Grow from 0 to one-level table. - CHECK_EQ(size_, 0); - CHECK_EQ(blocks_, 0); - CHECK_EQ(tab_, 0); - CHECK_EQ(tab_idx_, 0); - tab_idx_ = ctx->clock_alloc.Alloc(c); - tab_ = ctx->clock_alloc.Map(tab_idx_); - internal_memset(tab_, 0, sizeof(*tab_)); - atomic_store_relaxed(ref_ptr(tab_), 1); - size_ = 1; - } else if (size_ > blocks_ * ClockBlock::kClockCount) { - u32 idx = ctx->clock_alloc.Alloc(c); - ClockBlock *new_cb = ctx->clock_alloc.Map(idx); - uptr top = size_ - blocks_ * ClockBlock::kClockCount; - CHECK_LT(top, ClockBlock::kClockCount); - const uptr move = top * sizeof(tab_->clock[0]); - internal_memcpy(&new_cb->clock[0], tab_->clock, move); - internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move); - internal_memset(tab_->clock, 0, move); - append_block(idx); - } - // At this point we have first level table allocated and all clock elements - // are evacuated from it to a second level block. - // Add second level tables as necessary. - while (nclk > capacity()) { - u32 idx = ctx->clock_alloc.Alloc(c); - ClockBlock *cb = ctx->clock_alloc.Map(idx); - internal_memset(cb, 0, sizeof(*cb)); - append_block(idx); - } - size_ = nclk; -} - -// Flushes all dirty elements into the main clock array. -void SyncClock::FlushDirty() { - for (unsigned i = 0; i < kDirtyTids; i++) { - Dirty *dirty = &dirty_[i]; - if (dirty->tid() != kInvalidTid) { - CHECK_LT(dirty->tid(), size_); - elem(dirty->tid()).epoch = dirty->epoch; - dirty->set_tid(kInvalidTid); - } - } -} - -bool SyncClock::IsShared() const { - if (size_ == 0) - return false; - atomic_uint32_t *ref = ref_ptr(tab_); - u32 v = atomic_load(ref, memory_order_acquire); - CHECK_GT(v, 0); - return v > 1; -} - -// Unshares the current clock if it's shared. -// Shared clocks are immutable, so they need to be unshared before any updates. -// Note: this does not apply to dirty entries as they are not shared. -void SyncClock::Unshare(ClockCache *c) { - if (!IsShared()) - return; - // First, copy current state into old. - SyncClock old; - old.tab_ = tab_; - old.tab_idx_ = tab_idx_; - old.size_ = size_; - old.blocks_ = blocks_; - old.release_store_tid_ = release_store_tid_; - old.release_store_reused_ = release_store_reused_; - for (unsigned i = 0; i < kDirtyTids; i++) - old.dirty_[i] = dirty_[i]; - // Then, clear current object. - ResetImpl(); - // Allocate brand new clock in the current object. - Resize(c, old.size_); - // Now copy state back into this object. - Iter old_iter(&old); - for (ClockElem &ce : *this) { - ce = *old_iter; - ++old_iter; - } - release_store_tid_ = old.release_store_tid_; - release_store_reused_ = old.release_store_reused_; - for (unsigned i = 0; i < kDirtyTids; i++) - dirty_[i] = old.dirty_[i]; - // Drop reference to old and delete if necessary. - old.Reset(c); -} - -// Can we cache this clock for future release operations? 
-ALWAYS_INLINE bool SyncClock::Cachable() const { - if (size_ == 0) - return false; - for (unsigned i = 0; i < kDirtyTids; i++) { - if (dirty_[i].tid() != kInvalidTid) - return false; - } - return atomic_load_relaxed(ref_ptr(tab_)) == 1; -} - -// elem linearizes the two-level structure into linear array. -// Note: this is used only for one time accesses, vector operations use -// the iterator as it is much faster. -ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const { - DCHECK_LT(tid, size_); - const uptr block = tid / ClockBlock::kClockCount; - DCHECK_LE(block, blocks_); - tid %= ClockBlock::kClockCount; - if (block == blocks_) - return tab_->clock[tid]; - u32 idx = get_block(block); - ClockBlock *cb = ctx->clock_alloc.Map(idx); - return cb->clock[tid]; -} - -ALWAYS_INLINE uptr SyncClock::capacity() const { - if (size_ == 0) - return 0; - uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]); - // How many clock elements we can fit into the first level block. - // +1 for ref counter. - uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio; - return blocks_ * ClockBlock::kClockCount + top; -} - -ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const { - DCHECK(size_); - DCHECK_LT(bi, blocks_); - return tab_->table[ClockBlock::kBlockIdx - bi]; -} - -ALWAYS_INLINE void SyncClock::append_block(u32 idx) { - uptr bi = blocks_++; - CHECK_EQ(get_block(bi), 0); - tab_->table[ClockBlock::kBlockIdx - bi] = idx; -} - -// Used only by tests. -u64 SyncClock::get(unsigned tid) const { - for (unsigned i = 0; i < kDirtyTids; i++) { - Dirty dirty = dirty_[i]; - if (dirty.tid() == tid) - return dirty.epoch; - } - return elem(tid).epoch; -} - -// Used only by Iter test. -u64 SyncClock::get_clean(unsigned tid) const { - return elem(tid).epoch; -} - -void SyncClock::DebugDump(int(*printf)(const char *s, ...)) { - printf("clock=["); - for (uptr i = 0; i < size_; i++) - printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch); - printf("] reused=["); - for (uptr i = 0; i < size_; i++) - printf("%s%llu", i == 0 ? "" : ",", elem(i).reused); - printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]", - release_store_tid_, release_store_reused_, dirty_[0].tid(), - dirty_[0].epoch, dirty_[1].tid(), dirty_[1].epoch); -} - -void SyncClock::Iter::Next() { - // Finished with the current block, move on to the next one. - block_++; - if (block_ < parent_->blocks_) { - // Iterate over the next second level block. - u32 idx = parent_->get_block(block_); - ClockBlock *cb = ctx->clock_alloc.Map(idx); - pos_ = &cb->clock[0]; - end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount, - ClockBlock::kClockCount); - return; - } - if (block_ == parent_->blocks_ && - parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) { - // Iterate over elements in the first level block. - pos_ = &parent_->tab_->clock[0]; - end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount, - ClockBlock::kClockCount); - return; - } - parent_ = nullptr; // denotes end -} -} // namespace __tsan diff --git a/libsanitizer/tsan/tsan_clock.h b/libsanitizer/tsan/tsan_clock.h deleted file mode 100644 index 11cbc0c..0000000 --- a/libsanitizer/tsan/tsan_clock.h +++ /dev/null @@ -1,293 +0,0 @@ -//===-- tsan_clock.h --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer (TSan), a race detector. -// -//===----------------------------------------------------------------------===// -#ifndef TSAN_CLOCK_H -#define TSAN_CLOCK_H - -#include "tsan_defs.h" -#include "tsan_dense_alloc.h" - -namespace __tsan { - -typedef DenseSlabAlloc ClockAlloc; -typedef DenseSlabAllocCache ClockCache; - -// The clock that lives in sync variables (mutexes, atomics, etc). -class SyncClock { - public: - SyncClock(); - ~SyncClock(); - - uptr size() const; - - // These are used only in tests. - u64 get(unsigned tid) const; - u64 get_clean(unsigned tid) const; - - void Resize(ClockCache *c, uptr nclk); - void Reset(ClockCache *c); - - void DebugDump(int(*printf)(const char *s, ...)); - - // Clock element iterator. - // Note: it iterates only over the table without regard to dirty entries. - class Iter { - public: - explicit Iter(SyncClock* parent); - Iter& operator++(); - bool operator!=(const Iter& other); - ClockElem &operator*(); - - private: - SyncClock *parent_; - // [pos_, end_) is the current continuous range of clock elements. - ClockElem *pos_; - ClockElem *end_; - int block_; // Current number of second level block. - - NOINLINE void Next(); - }; - - Iter begin(); - Iter end(); - - private: - friend class ThreadClock; - friend class Iter; - static const uptr kDirtyTids = 2; - - struct Dirty { - u32 tid() const { return tid_ == kShortInvalidTid ? kInvalidTid : tid_; } - void set_tid(u32 tid) { - tid_ = tid == kInvalidTid ? kShortInvalidTid : tid; - } - u64 epoch : kClkBits; - - private: - // Full kInvalidTid won't fit into Dirty::tid. - static const u64 kShortInvalidTid = (1ull << (64 - kClkBits)) - 1; - u64 tid_ : 64 - kClkBits; // kInvalidId if not active - }; - - static_assert(sizeof(Dirty) == 8, "Dirty is not 64bit"); - - unsigned release_store_tid_; - unsigned release_store_reused_; - Dirty dirty_[kDirtyTids]; - // If size_ is 0, tab_ is nullptr. - // If size <= 64 (kClockCount), tab_ contains pointer to an array with - // 64 ClockElem's (ClockBlock::clock). - // Otherwise, tab_ points to an array with up to 127 u32 elements, - // each pointing to the second-level 512b block with 64 ClockElem's. - // Unused space in the first level ClockBlock is used to store additional - // clock elements. - // The last u32 element in the first level ClockBlock is always used as - // reference counter. - // - // See the following scheme for details. - // All memory blocks are 512 bytes (allocated from ClockAlloc). - // Clock (clk) elements are 64 bits. - // Idx and ref are 32 bits. - // - // tab_ - // | - // \/ - // +----------------------------------------------------+ - // | clk128 | clk129 | ...unused... | idx1 | idx0 | ref | - // +----------------------------------------------------+ - // | | - // | \/ - // | +----------------+ - // | | clk0 ... clk63 | - // | +----------------+ - // \/ - // +------------------+ - // | clk64 ... clk127 | - // +------------------+ - // - // Note: dirty entries, if active, always override what's stored in the clock. - ClockBlock *tab_; - u32 tab_idx_; - u16 size_; - u16 blocks_; // Number of second level blocks. 
- - void Unshare(ClockCache *c); - bool IsShared() const; - bool Cachable() const; - void ResetImpl(); - void FlushDirty(); - uptr capacity() const; - u32 get_block(uptr bi) const; - void append_block(u32 idx); - ClockElem &elem(unsigned tid) const; -}; - -// The clock that lives in threads. -class ThreadClock { - public: - typedef DenseSlabAllocCache Cache; - - explicit ThreadClock(unsigned tid, unsigned reused = 0); - - u64 get(unsigned tid) const; - void set(ClockCache *c, unsigned tid, u64 v); - void set(u64 v); - void tick(); - uptr size() const; - - void acquire(ClockCache *c, SyncClock *src); - void releaseStoreAcquire(ClockCache *c, SyncClock *src); - void release(ClockCache *c, SyncClock *dst); - void acq_rel(ClockCache *c, SyncClock *dst); - void ReleaseStore(ClockCache *c, SyncClock *dst); - void ResetCached(ClockCache *c); - void NoteGlobalAcquire(u64 v); - - void DebugReset(); - void DebugDump(int(*printf)(const char *s, ...)); - - private: - static const uptr kDirtyTids = SyncClock::kDirtyTids; - // Index of the thread associated with he clock ("current thread"). - const unsigned tid_; - const unsigned reused_; // tid_ reuse count. - // Current thread time when it acquired something from other threads. - u64 last_acquire_; - - // Last time another thread has done a global acquire of this thread's clock. - // It helps to avoid problem described in: - // https://github.com/golang/go/issues/39186 - // See test/tsan/java_finalizer2.cpp for a regression test. - // Note the failuire is _extremely_ hard to hit, so if you are trying - // to reproduce it, you may want to run something like: - // $ go get golang.org/x/tools/cmd/stress - // $ stress -p=64 ./a.out - // - // The crux of the problem is roughly as follows. - // A number of O(1) optimizations in the clocks algorithm assume proper - // transitive cumulative propagation of clock values. The AcquireGlobal - // operation may produce an inconsistent non-linearazable view of - // thread clocks. Namely, it may acquire a later value from a thread - // with a higher ID, but fail to acquire an earlier value from a thread - // with a lower ID. If a thread that executed AcquireGlobal then releases - // to a sync clock, it will spoil the sync clock with the inconsistent - // values. If another thread later releases to the sync clock, the optimized - // algorithm may break. - // - // The exact sequence of events that leads to the failure. - // - thread 1 executes AcquireGlobal - // - thread 1 acquires value 1 for thread 2 - // - thread 2 increments clock to 2 - // - thread 2 releases to sync object 1 - // - thread 3 at time 1 - // - thread 3 acquires from sync object 1 - // - thread 3 increments clock to 2 - // - thread 1 acquires value 2 for thread 3 - // - thread 1 releases to sync object 2 - // - sync object 2 clock has 1 for thread 2 and 2 for thread 3 - // - thread 3 releases to sync object 2 - // - thread 3 sees value 2 in the clock for itself - // and decides that it has already released to the clock - // and did not acquire anything from other threads after that - // (the last_acquire_ check in release operation) - // - thread 3 does not update the value for thread 2 in the clock from 1 to 2 - // - thread 4 acquires from sync object 2 - // - thread 4 detects a false race with thread 2 - // as it should have been synchronized with thread 2 up to time 2, - // but because of the broken clock it is now synchronized only up to time 1 - // - // The global_acquire_ value helps to prevent this scenario. 
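In code form, the guard boils down to comparing the thread's own epoch in the destination clock against both watermarks before the O(1) release path is taken. A simplified restatement follows; u64 and the parameter names mirror last_acquire_ and global_acquire_ from the surrounding code, but the helper itself is illustrative and not part of the tree.

static bool CanTakeFastReleasePath(u64 my_epoch_in_dst, u64 last_acquire,
                                   u64 global_acquire) {
  // The cached view of the destination clock may be reused only if the thread
  // has acquired nothing since its last release to it (last_acquire) and no
  // other thread has performed AcquireGlobal past that release
  // (global_acquire); otherwise the full O(N) release must be performed.
  return my_epoch_in_dst > last_acquire && my_epoch_in_dst > global_acquire;
}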
- // Namely, thread 3 will not trust any own clock values up to global_acquire_ - // for the purposes of the last_acquire_ optimization. - atomic_uint64_t global_acquire_; - - // Cached SyncClock (without dirty entries and release_store_tid_). - // We reuse it for subsequent store-release operations without intervening - // acquire operations. Since it is shared (and thus constant), clock value - // for the current thread is then stored in dirty entries in the SyncClock. - // We host a reference to the table while it is cached here. - u32 cached_idx_; - u16 cached_size_; - u16 cached_blocks_; - - // Number of active elements in the clk_ table (the rest is zeros). - uptr nclk_; - u64 clk_[kMaxTidInClock]; // Fixed size vector clock. - - bool IsAlreadyAcquired(const SyncClock *src) const; - bool HasAcquiredAfterRelease(const SyncClock *dst) const; - void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const; -}; - -ALWAYS_INLINE u64 ThreadClock::get(unsigned tid) const { - DCHECK_LT(tid, kMaxTidInClock); - return clk_[tid]; -} - -ALWAYS_INLINE void ThreadClock::set(u64 v) { - DCHECK_GE(v, clk_[tid_]); - clk_[tid_] = v; -} - -ALWAYS_INLINE void ThreadClock::tick() { - clk_[tid_]++; -} - -ALWAYS_INLINE uptr ThreadClock::size() const { - return nclk_; -} - -ALWAYS_INLINE void ThreadClock::NoteGlobalAcquire(u64 v) { - // Here we rely on the fact that AcquireGlobal is protected by - // ThreadRegistryLock, thus only one thread at a time executes it - // and values passed to this function should not go backwards. - CHECK_LE(atomic_load_relaxed(&global_acquire_), v); - atomic_store_relaxed(&global_acquire_, v); -} - -ALWAYS_INLINE SyncClock::Iter SyncClock::begin() { - return Iter(this); -} - -ALWAYS_INLINE SyncClock::Iter SyncClock::end() { - return Iter(nullptr); -} - -ALWAYS_INLINE uptr SyncClock::size() const { - return size_; -} - -ALWAYS_INLINE SyncClock::Iter::Iter(SyncClock* parent) - : parent_(parent) - , pos_(nullptr) - , end_(nullptr) - , block_(-1) { - if (parent) - Next(); -} - -ALWAYS_INLINE SyncClock::Iter& SyncClock::Iter::operator++() { - pos_++; - if (UNLIKELY(pos_ >= end_)) - Next(); - return *this; -} - -ALWAYS_INLINE bool SyncClock::Iter::operator!=(const SyncClock::Iter& other) { - return parent_ != other.parent_; -} - -ALWAYS_INLINE ClockElem &SyncClock::Iter::operator*() { - return *pos_; -} -} // namespace __tsan - -#endif // TSAN_CLOCK_H diff --git a/libsanitizer/tsan/tsan_debugging.cpp b/libsanitizer/tsan/tsan_debugging.cpp index 1d3c384..1e61c31 100644 --- a/libsanitizer/tsan/tsan_debugging.cpp +++ b/libsanitizer/tsan/tsan_debugging.cpp @@ -157,7 +157,7 @@ int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr, ReportMutex *mutex = rep->mutexes[idx]; *mutex_id = mutex->id; *addr = (void *)mutex->addr; - *destroyed = mutex->destroyed; + *destroyed = false; if (mutex->stack) CopyTrace(mutex->stack->frames, trace, trace_size); return 1; } diff --git a/libsanitizer/tsan/tsan_defs.h b/libsanitizer/tsan/tsan_defs.h index fe0c1da..1ffa3d6 100644 --- a/libsanitizer/tsan/tsan_defs.h +++ b/libsanitizer/tsan/tsan_defs.h @@ -63,41 +63,14 @@ enum class Epoch : u16 {}; constexpr uptr kEpochBits = 14; constexpr Epoch kEpochZero = static_cast(0); constexpr Epoch kEpochOver = static_cast(1 << kEpochBits); +constexpr Epoch kEpochLast = static_cast((1 << kEpochBits) - 1); -const int kClkBits = 42; -const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1; - -struct ClockElem { - u64 epoch : kClkBits; - u64 reused : 64 - kClkBits; // tid reuse count -}; - -struct 
ClockBlock { - static const uptr kSize = 512; - static const uptr kTableSize = kSize / sizeof(u32); - static const uptr kClockCount = kSize / sizeof(ClockElem); - static const uptr kRefIdx = kTableSize - 1; - static const uptr kBlockIdx = kTableSize - 2; - - union { - u32 table[kTableSize]; - ClockElem clock[kClockCount]; - }; +inline Epoch EpochInc(Epoch epoch) { + return static_cast(static_cast(epoch) + 1); +} - ClockBlock() { - } -}; +inline bool EpochOverflow(Epoch epoch) { return epoch == kEpochOver; } -const int kTidBits = 13; -// Reduce kMaxTid by kClockCount because one slot in ClockBlock table is -// occupied by reference counter, so total number of elements we can store -// in SyncClock is kClockCount * (kTableSize - 1). -const unsigned kMaxTid = (1 << kTidBits) - ClockBlock::kClockCount; -#if !SANITIZER_GO -const unsigned kMaxTidInClock = kMaxTid * 2; // This includes msb 'freed' bit. -#else -const unsigned kMaxTidInClock = kMaxTid; // Go does not track freed memory. -#endif const uptr kShadowStackSize = 64 * 1024; // Count of shadow values in a shadow cell. @@ -107,7 +80,7 @@ const uptr kShadowCnt = 4; const uptr kShadowCell = 8; // Single shadow value. -typedef u64 RawShadow; +enum class RawShadow : u32 {}; const uptr kShadowSize = sizeof(RawShadow); // Shadow memory is kShadowMultiplier times larger than user memory. @@ -184,10 +157,13 @@ MD5Hash md5_hash(const void *data, uptr size); struct Processor; struct ThreadState; class ThreadContext; +struct TidSlot; struct Context; struct ReportStack; class ReportDesc; class RegionAlloc; +struct Trace; +struct TracePart; typedef uptr AccessType; @@ -198,6 +174,9 @@ enum : AccessType { kAccessVptr = 1 << 2, // read or write of an object virtual table pointer kAccessFree = 1 << 3, // synthetic memory access during memory freeing kAccessExternalPC = 1 << 4, // access PC can have kExternalPCBit set + kAccessCheckOnly = 1 << 5, // check for races, but don't store + kAccessNoRodata = 1 << 6, // don't check for .rodata marker + kAccessSlotLocked = 1 << 7, // memory access with TidSlot locked }; // Descriptor of user's memory block. @@ -219,15 +198,18 @@ enum ExternalTag : uptr { // as 16-bit values, see tsan_defs.h. }; -enum MutexType { - MutexTypeTrace = MutexLastCommon, - MutexTypeReport, +enum { + MutexTypeReport = MutexLastCommon, MutexTypeSyncVar, MutexTypeAnnotations, MutexTypeAtExit, MutexTypeFired, MutexTypeRacy, MutexTypeGlobalProc, + MutexTypeInternalAlloc, + MutexTypeTrace, + MutexTypeSlot, + MutexTypeSlots, }; } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_dense_alloc.h b/libsanitizer/tsan/tsan_dense_alloc.h index 9e15f74..7a39a39 100644 --- a/libsanitizer/tsan/tsan_dense_alloc.h +++ b/libsanitizer/tsan/tsan_dense_alloc.h @@ -104,6 +104,15 @@ class DenseSlabAlloc { return atomic_load_relaxed(&fillpos_) * kL2Size * sizeof(T); } + template + void ForEach(Func func) { + SpinMutexLock lock(&mtx_); + uptr fillpos = atomic_load_relaxed(&fillpos_); + for (uptr l1 = 0; l1 < fillpos; l1++) { + for (IndexT l2 = l1 == 0 ? 
1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]); + } + } + private: T *map_[kL1Size]; SpinMutex mtx_; diff --git a/libsanitizer/tsan/tsan_fd.cpp b/libsanitizer/tsan/tsan_fd.cpp index 255ffa8..cf8f491 100644 --- a/libsanitizer/tsan/tsan_fd.cpp +++ b/libsanitizer/tsan/tsan_fd.cpp @@ -11,9 +11,12 @@ //===----------------------------------------------------------------------===// #include "tsan_fd.h" -#include "tsan_rtl.h" + #include +#include "tsan_interceptors.h" +#include "tsan_rtl.h" + namespace __tsan { const int kTableSizeL1 = 1024; @@ -26,6 +29,9 @@ struct FdSync { struct FdDesc { FdSync *sync; + // This is used to establish write -> epoll_wait synchronization + // where epoll_wait receives notification about the write. + atomic_uintptr_t aux_sync; // FdSync* Tid creation_tid; StackID creation_stack; }; @@ -100,6 +106,10 @@ static void init(ThreadState *thr, uptr pc, int fd, FdSync *s, unref(thr, pc, d->sync); d->sync = 0; } + unref(thr, pc, + reinterpret_cast( + atomic_load(&d->aux_sync, memory_order_relaxed))); + atomic_store(&d->aux_sync, 0, memory_order_relaxed); if (flags()->io_sync == 0) { unref(thr, pc, s); } else if (flags()->io_sync == 1) { @@ -110,12 +120,17 @@ static void init(ThreadState *thr, uptr pc, int fd, FdSync *s, } d->creation_tid = thr->tid; d->creation_stack = CurrentStackId(thr, pc); + // This prevents false positives on fd_close_norace3.cpp test. + // The mechanics of the false positive are not completely clear, + // but it happens only if global reset is enabled (flush_memory_ms=1) + // and may be related to lost writes during asynchronous MADV_DONTNEED. + SlotLocker locker(thr); if (write) { // To catch races between fd usage and open. MemoryRangeImitateWrite(thr, pc, (uptr)d, 8); } else { // See the dup-related comment in FdClose. - MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); + MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead | kAccessSlotLocked); } } @@ -177,6 +192,8 @@ void FdRelease(ThreadState *thr, uptr pc, int fd) { MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); if (s) Release(thr, pc, (uptr)s); + if (uptr aux_sync = atomic_load(&d->aux_sync, memory_order_acquire)) + Release(thr, pc, aux_sync); } void FdAccess(ThreadState *thr, uptr pc, int fd) { @@ -192,25 +209,39 @@ void FdClose(ThreadState *thr, uptr pc, int fd, bool write) { if (bogusfd(fd)) return; FdDesc *d = fddesc(thr, pc, fd); - if (write) { - // To catch races between fd usage and close. - MemoryAccess(thr, pc, (uptr)d, 8, kAccessWrite); - } else { - // This path is used only by dup2/dup3 calls. - // We do read instead of write because there is a number of legitimate - // cases where write would lead to false positives: - // 1. Some software dups a closed pipe in place of a socket before closing - // the socket (to prevent races actually). - // 2. Some daemons dup /dev/null in place of stdin/stdout. - // On the other hand we have not seen cases when write here catches real - // bugs. - MemoryAccess(thr, pc, (uptr)d, 8, kAccessRead); + { + // Need to lock the slot to make MemoryAccess and MemoryResetRange atomic + // with respect to global reset. See the comment in MemoryRangeFreed. + SlotLocker locker(thr); + if (!MustIgnoreInterceptor(thr)) { + if (write) { + // To catch races between fd usage and close. + MemoryAccess(thr, pc, (uptr)d, 8, + kAccessWrite | kAccessCheckOnly | kAccessSlotLocked); + } else { + // This path is used only by dup2/dup3 calls. + // We do read instead of write because there is a number of legitimate + // cases where write would lead to false positives: + // 1. 
Some software dups a closed pipe in place of a socket before + // closing + // the socket (to prevent races actually). + // 2. Some daemons dup /dev/null in place of stdin/stdout. + // On the other hand we have not seen cases when write here catches real + // bugs. + MemoryAccess(thr, pc, (uptr)d, 8, + kAccessRead | kAccessCheckOnly | kAccessSlotLocked); + } + } + // We need to clear it, because if we do not intercept any call out there + // that creates fd, we will hit false postives. + MemoryResetRange(thr, pc, (uptr)d, 8); } - // We need to clear it, because if we do not intercept any call out there - // that creates fd, we will hit false postives. - MemoryResetRange(thr, pc, (uptr)d, 8); unref(thr, pc, d->sync); d->sync = 0; + unref(thr, pc, + reinterpret_cast( + atomic_load(&d->aux_sync, memory_order_relaxed))); + atomic_store(&d->aux_sync, 0, memory_order_relaxed); d->creation_tid = kInvalidTid; d->creation_stack = kInvalidStackID; } @@ -269,6 +300,30 @@ void FdPollCreate(ThreadState *thr, uptr pc, int fd) { init(thr, pc, fd, allocsync(thr, pc)); } +void FdPollAdd(ThreadState *thr, uptr pc, int epfd, int fd) { + DPrintf("#%d: FdPollAdd(%d, %d)\n", thr->tid, epfd, fd); + if (bogusfd(epfd) || bogusfd(fd)) + return; + FdDesc *d = fddesc(thr, pc, fd); + // Associate fd with epoll fd only once. + // While an fd can be associated with multiple epolls at the same time, + // or with different epolls during different phases of lifetime, + // synchronization semantics (and examples) of this are unclear. + // So we don't support this for now. + // If we change the association, it will also create lifetime management + // problem for FdRelease which accesses the aux_sync. + if (atomic_load(&d->aux_sync, memory_order_relaxed)) + return; + FdDesc *epd = fddesc(thr, pc, epfd); + FdSync *s = epd->sync; + if (!s) + return; + uptr cmp = 0; + if (atomic_compare_exchange_strong( + &d->aux_sync, &cmp, reinterpret_cast(s), memory_order_release)) + ref(s); +} + void FdSocketCreate(ThreadState *thr, uptr pc, int fd) { DPrintf("#%d: FdSocketCreate(%d)\n", thr->tid, fd); if (bogusfd(fd)) diff --git a/libsanitizer/tsan/tsan_fd.h b/libsanitizer/tsan/tsan_fd.h index d964817..92625dc 100644 --- a/libsanitizer/tsan/tsan_fd.h +++ b/libsanitizer/tsan/tsan_fd.h @@ -49,6 +49,7 @@ void FdEventCreate(ThreadState *thr, uptr pc, int fd); void FdSignalCreate(ThreadState *thr, uptr pc, int fd); void FdInotifyCreate(ThreadState *thr, uptr pc, int fd); void FdPollCreate(ThreadState *thr, uptr pc, int fd); +void FdPollAdd(ThreadState *thr, uptr pc, int epfd, int fd); void FdSocketCreate(ThreadState *thr, uptr pc, int fd); void FdSocketAccept(ThreadState *thr, uptr pc, int fd, int newfd); void FdSocketConnecting(ThreadState *thr, uptr pc, int fd); diff --git a/libsanitizer/tsan/tsan_flags.cpp b/libsanitizer/tsan/tsan_flags.cpp index ee89862..ee78f25 100644 --- a/libsanitizer/tsan/tsan_flags.cpp +++ b/libsanitizer/tsan/tsan_flags.cpp @@ -97,7 +97,7 @@ void InitializeFlags(Flags *f, const char *env, const char *env_option_name) { ubsan_parser.ParseStringFromEnv("UBSAN_OPTIONS"); #endif - // Sanity check. + // Check flags. 
if (!f->report_bugs) { f->report_thread_leaks = false; f->report_destroy_locked = false; @@ -110,12 +110,6 @@ void InitializeFlags(Flags *f, const char *env, const char *env_option_name) { if (common_flags()->help) parser.PrintFlagDescriptions(); - if (f->history_size < 0 || f->history_size > 7) { - Printf("ThreadSanitizer: incorrect value for history_size" - " (must be [0..7])\n"); - Die(); - } - if (f->io_sync < 0 || f->io_sync > 2) { Printf("ThreadSanitizer: incorrect value for io_sync" " (must be [0..2])\n"); diff --git a/libsanitizer/tsan/tsan_flags.inc b/libsanitizer/tsan/tsan_flags.inc index 7954a430..32cf3bb 100644 --- a/libsanitizer/tsan/tsan_flags.inc +++ b/libsanitizer/tsan/tsan_flags.inc @@ -43,6 +43,9 @@ TSAN_FLAG( bool, force_seq_cst_atomics, false, "If set, all atomics are effectively sequentially consistent (seq_cst), " "regardless of what user actually specified.") +TSAN_FLAG(bool, force_background_thread, false, + "If set, eagerly launch a background thread for memory reclamation " + "instead of waiting for a user call to pthread_create.") TSAN_FLAG(bool, halt_on_error, false, "Exit after first reported error.") TSAN_FLAG(int, atexit_sleep_ms, 1000, "Sleep in main thread before exiting for that many ms " @@ -59,14 +62,10 @@ TSAN_FLAG(bool, stop_on_start, false, "Stops on start until __tsan_resume() is called (for debugging).") TSAN_FLAG(bool, running_on_valgrind, false, "Controls whether RunningOnValgrind() returns true or false.") -// There are a lot of goroutines in Go, so we use smaller history. TSAN_FLAG( - int, history_size, SANITIZER_GO ? 1 : 3, - "Per-thread history size, controls how many previous memory accesses " - "are remembered per thread. Possible values are [0..7]. " - "history_size=0 amounts to 32K memory accesses. Each next value doubles " - "the amount of memory accesses, up to history_size=7 that amounts to " - "4M memory accesses. The default value is 2 (128K memory accesses).") + uptr, history_size, 0, + "Per-thread history size," + " controls how many extra previous memory accesses are remembered per thread.") TSAN_FLAG(int, io_sync, 1, "Controls level of synchronization implied by IO operations. " "0 - no synchronization " @@ -82,3 +81,6 @@ TSAN_FLAG(bool, ignore_noninstrumented_modules, SANITIZER_MAC ? true : false, "modules.") TSAN_FLAG(bool, shared_ptr_interceptor, true, "Track atomic reference counting in libc++ shared_ptr and weak_ptr.") +TSAN_FLAG(bool, print_full_thread_history, false, + "If set, prints thread creation stacks for the threads involved in " + "the report and their ancestors up to the main thread.") diff --git a/libsanitizer/tsan/tsan_interceptors.h b/libsanitizer/tsan/tsan_interceptors.h index 61dbb81..3091ad8 100644 --- a/libsanitizer/tsan/tsan_interceptors.h +++ b/libsanitizer/tsan/tsan_interceptors.h @@ -36,6 +36,10 @@ inline bool in_symbolizer() { } #endif +inline bool MustIgnoreInterceptor(ThreadState *thr) { + return !thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib; +} + } // namespace __tsan #define SCOPED_INTERCEPTOR_RAW(func, ...) \ @@ -60,10 +64,10 @@ inline bool in_symbolizer() { # define CHECK_REAL_FUNC(func) DCHECK(REAL(func)) #endif -#define SCOPED_TSAN_INTERCEPTOR(func, ...) \ - SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ - CHECK_REAL_FUNC(func); \ - if (!thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib) \ +#define SCOPED_TSAN_INTERCEPTOR(func, ...) 
\ + SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ + CHECK_REAL_FUNC(func); \ + if (MustIgnoreInterceptor(thr)) \ return REAL(func)(__VA_ARGS__); #define SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START() \ @@ -74,6 +78,14 @@ inline bool in_symbolizer() { #define TSAN_INTERCEPTOR(ret, func, ...) INTERCEPTOR(ret, func, __VA_ARGS__) +#if SANITIZER_FREEBSD +# define TSAN_INTERCEPTOR_FREEBSD_ALIAS(ret, func, ...) \ + TSAN_INTERCEPTOR(ret, _pthread_##func, __VA_ARGS__) \ + ALIAS(WRAPPER_NAME(pthread_##func)); +#else +# define TSAN_INTERCEPTOR_FREEBSD_ALIAS(ret, func, ...) +#endif + #if SANITIZER_NETBSD # define TSAN_INTERCEPTOR_NETBSD_ALIAS(ret, func, ...) \ TSAN_INTERCEPTOR(ret, __libc_##func, __VA_ARGS__) \ diff --git a/libsanitizer/tsan/tsan_interceptors_posix.cpp b/libsanitizer/tsan/tsan_interceptors_posix.cpp index 9a85ee0..60ca963 100644 --- a/libsanitizer/tsan/tsan_interceptors_posix.cpp +++ b/libsanitizer/tsan/tsan_interceptors_posix.cpp @@ -90,6 +90,7 @@ DECLARE_REAL(int, pthread_mutexattr_gettype, void *, void *) DECLARE_REAL(int, fflush, __sanitizer_FILE *fp) DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr size) DECLARE_REAL_AND_INTERCEPTOR(void, free, void *ptr) +extern "C" int pthread_equal(void *t1, void *t2); extern "C" void *pthread_self(); extern "C" void _exit(int status); #if !SANITIZER_NETBSD @@ -176,6 +177,7 @@ struct ThreadSignalContext { struct AtExitCtx { void (*f)(); void *arg; + uptr pc; }; // InterceptorContext holds all global data required for interceptors. @@ -287,20 +289,25 @@ void ScopedInterceptor::DisableIgnoresImpl() { } #define TSAN_INTERCEPT(func) INTERCEPT_FUNCTION(func) +#if SANITIZER_FREEBSD || SANITIZER_NETBSD +# define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION(func) +#else +# define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION_VER(func, ver) +#endif #if SANITIZER_FREEBSD -# define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION(func) -# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(func) -# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS_THR(func) -#elif SANITIZER_NETBSD -# define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION(func) -# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(func) \ - INTERCEPT_FUNCTION(__libc_##func) -# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS_THR(func) \ - INTERCEPT_FUNCTION(__libc_thr_##func) +# define TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(func) \ + INTERCEPT_FUNCTION(_pthread_##func) #else -# define TSAN_INTERCEPT_VER(func, ver) INTERCEPT_FUNCTION_VER(func, ver) -# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(func) -# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS_THR(func) +# define TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(func) +#endif +#if SANITIZER_NETBSD +# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(func) \ + INTERCEPT_FUNCTION(__libc_##func) +# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS_THR(func) \ + INTERCEPT_FUNCTION(__libc_thr_##func) +#else +# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(func) +# define TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS_THR(func) #endif #define READ_STRING_OF_LEN(thr, pc, s, len, n) \ @@ -366,7 +373,10 @@ TSAN_INTERCEPTOR(int, pause, int fake) { return BLOCK_REAL(pause)(fake); } -static void at_exit_wrapper() { +// Note: we specifically call the function in such strange way +// with "installed_at" because in reports it will appear between +// callback frames and the frame that installed the callback. +static void at_exit_callback_installed_at() { AtExitCtx *ctx; { // Ensure thread-safety. 
@@ -378,15 +388,21 @@ static void at_exit_wrapper() { interceptor_ctx()->AtExitStack.PopBack(); } - Acquire(cur_thread(), (uptr)0, (uptr)ctx); + ThreadState *thr = cur_thread(); + Acquire(thr, ctx->pc, (uptr)ctx); + FuncEntry(thr, ctx->pc); ((void(*)())ctx->f)(); + FuncExit(thr); Free(ctx); } -static void cxa_at_exit_wrapper(void *arg) { - Acquire(cur_thread(), 0, (uptr)arg); +static void cxa_at_exit_callback_installed_at(void *arg) { + ThreadState *thr = cur_thread(); AtExitCtx *ctx = (AtExitCtx*)arg; + Acquire(thr, ctx->pc, (uptr)arg); + FuncEntry(thr, ctx->pc); ((void(*)(void *arg))ctx->f)(ctx->arg); + FuncExit(thr); Free(ctx); } @@ -400,7 +416,7 @@ TSAN_INTERCEPTOR(int, atexit, void (*f)()) { // We want to setup the atexit callback even if we are in ignored lib // or after fork. SCOPED_INTERCEPTOR_RAW(atexit, f); - return setup_at_exit_wrapper(thr, pc, (void(*)())f, 0, 0); + return setup_at_exit_wrapper(thr, GET_CALLER_PC(), (void (*)())f, 0, 0); } #endif @@ -408,7 +424,7 @@ TSAN_INTERCEPTOR(int, __cxa_atexit, void (*f)(void *a), void *arg, void *dso) { if (in_symbolizer()) return 0; SCOPED_TSAN_INTERCEPTOR(__cxa_atexit, f, arg, dso); - return setup_at_exit_wrapper(thr, pc, (void(*)())f, arg, dso); + return setup_at_exit_wrapper(thr, GET_CALLER_PC(), (void (*)())f, arg, dso); } static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), @@ -416,6 +432,7 @@ static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), auto *ctx = New(); ctx->f = f; ctx->arg = arg; + ctx->pc = pc; Release(thr, pc, (uptr)ctx); // Memory allocation in __cxa_atexit will race with free during exit, // because we do not see synchronization around atexit callback list. @@ -431,25 +448,27 @@ static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(), // due to atexit_mu held on exit from the calloc interceptor. ScopedIgnoreInterceptors ignore; - res = REAL(__cxa_atexit)((void (*)(void *a))at_exit_wrapper, 0, 0); + res = REAL(__cxa_atexit)((void (*)(void *a))at_exit_callback_installed_at, + 0, 0); // Push AtExitCtx on the top of the stack of callback functions if (!res) { interceptor_ctx()->AtExitStack.PushBack(ctx); } } else { - res = REAL(__cxa_atexit)(cxa_at_exit_wrapper, ctx, dso); + res = REAL(__cxa_atexit)(cxa_at_exit_callback_installed_at, ctx, dso); } ThreadIgnoreEnd(thr); return res; } #if !SANITIZER_MAC && !SANITIZER_NETBSD -static void on_exit_wrapper(int status, void *arg) { +static void on_exit_callback_installed_at(int status, void *arg) { ThreadState *thr = cur_thread(); - uptr pc = 0; - Acquire(thr, pc, (uptr)arg); AtExitCtx *ctx = (AtExitCtx*)arg; + Acquire(thr, ctx->pc, (uptr)arg); + FuncEntry(thr, ctx->pc); ((void(*)(int status, void *arg))ctx->f)(status, ctx->arg); + FuncExit(thr); Free(ctx); } @@ -460,11 +479,12 @@ TSAN_INTERCEPTOR(int, on_exit, void(*f)(int, void*), void *arg) { auto *ctx = New(); ctx->f = (void(*)())f; ctx->arg = arg; + ctx->pc = GET_CALLER_PC(); Release(thr, pc, (uptr)ctx); // Memory allocation in __cxa_atexit will race with free during exit, // because we do not see synchronization around atexit callback list. 
ThreadIgnoreBegin(thr, pc); - int res = REAL(on_exit)(on_exit_wrapper, ctx); + int res = REAL(on_exit)(on_exit_callback_installed_at, ctx); ThreadIgnoreEnd(thr); return res; } @@ -880,10 +900,11 @@ static int guard_acquire(ThreadState *thr, uptr pc, atomic_uint32_t *g, } } -static void guard_release(ThreadState *thr, uptr pc, atomic_uint32_t *g) { +static void guard_release(ThreadState *thr, uptr pc, atomic_uint32_t *g, + u32 v) { if (!thr->in_ignored_lib) Release(thr, pc, (uptr)g); - u32 old = atomic_exchange(g, kGuardDone, memory_order_release); + u32 old = atomic_exchange(g, v, memory_order_release); if (old & kGuardWaiter) FutexWake(g, 1 << 30); } @@ -913,12 +934,12 @@ STDCXX_INTERCEPTOR(int, __cxa_guard_acquire, atomic_uint32_t *g) { STDCXX_INTERCEPTOR(void, __cxa_guard_release, atomic_uint32_t *g) { SCOPED_INTERCEPTOR_RAW(__cxa_guard_release, g); - guard_release(thr, pc, g); + guard_release(thr, pc, g, kGuardDone); } STDCXX_INTERCEPTOR(void, __cxa_guard_abort, atomic_uint32_t *g) { SCOPED_INTERCEPTOR_RAW(__cxa_guard_abort, g); - atomic_store(g, kGuardInit, memory_order_relaxed); + guard_release(thr, pc, g, kGuardInit); } namespace __tsan { @@ -1515,12 +1536,12 @@ TSAN_INTERCEPTOR(int, pthread_once, void *o, void (*f)()) { // result in crashes due to too little stack space. if (guard_acquire(thr, pc, a, !SANITIZER_MAC)) { (*f)(); - guard_release(thr, pc, a); + guard_release(thr, pc, a, kGuardDone); } return 0; } -#if SANITIZER_LINUX && !SANITIZER_ANDROID +#if SANITIZER_GLIBC TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) { SCOPED_TSAN_INTERCEPTOR(__fxstat, version, fd, buf); if (fd > 0) @@ -1533,20 +1554,20 @@ TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) { #endif TSAN_INTERCEPTOR(int, fstat, int fd, void *buf) { -#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_ANDROID || SANITIZER_NETBSD - SCOPED_TSAN_INTERCEPTOR(fstat, fd, buf); +#if SANITIZER_GLIBC + SCOPED_TSAN_INTERCEPTOR(__fxstat, 0, fd, buf); if (fd > 0) FdAccess(thr, pc, fd); - return REAL(fstat)(fd, buf); + return REAL(__fxstat)(0, fd, buf); #else - SCOPED_TSAN_INTERCEPTOR(__fxstat, 0, fd, buf); + SCOPED_TSAN_INTERCEPTOR(fstat, fd, buf); if (fd > 0) FdAccess(thr, pc, fd); - return REAL(__fxstat)(0, fd, buf); + return REAL(fstat)(fd, buf); #endif } -#if SANITIZER_LINUX && !SANITIZER_ANDROID +#if SANITIZER_GLIBC TSAN_INTERCEPTOR(int, __fxstat64, int version, int fd, void *buf) { SCOPED_TSAN_INTERCEPTOR(__fxstat64, version, fd, buf); if (fd > 0) @@ -1558,7 +1579,7 @@ TSAN_INTERCEPTOR(int, __fxstat64, int version, int fd, void *buf) { #define TSAN_MAYBE_INTERCEPT___FXSTAT64 #endif -#if SANITIZER_LINUX && !SANITIZER_ANDROID +#if SANITIZER_GLIBC TSAN_INTERCEPTOR(int, fstat64, int fd, void *buf) { SCOPED_TSAN_INTERCEPTOR(__fxstat64, 0, fd, buf); if (fd > 0) @@ -1665,11 +1686,10 @@ TSAN_INTERCEPTOR(int, eventfd, unsigned initval, int flags) { #if SANITIZER_LINUX TSAN_INTERCEPTOR(int, signalfd, int fd, void *mask, int flags) { - SCOPED_TSAN_INTERCEPTOR(signalfd, fd, mask, flags); - if (fd >= 0) - FdClose(thr, pc, fd); + SCOPED_INTERCEPTOR_RAW(signalfd, fd, mask, flags); + FdClose(thr, pc, fd); fd = REAL(signalfd)(fd, mask, flags); - if (fd >= 0) + if (!MustIgnoreInterceptor(thr)) FdSignalCreate(thr, pc, fd); return fd; } @@ -1746,17 +1766,16 @@ TSAN_INTERCEPTOR(int, listen, int fd, int backlog) { } TSAN_INTERCEPTOR(int, close, int fd) { - SCOPED_TSAN_INTERCEPTOR(close, fd); - if (fd >= 0) + SCOPED_INTERCEPTOR_RAW(close, fd); + if (!in_symbolizer()) FdClose(thr, pc, fd); return REAL(close)(fd); } 
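For context, the descriptor bookkeeping driving this interceptor is meant to flag unsynchronized use-versus-close of the same fd: FdClose performs a checked write on the per-descriptor shadow object while the use-side hooks read it. A minimal user-level sketch of that pattern, using a pipe purely as an example (this illustrates the kind of program the machinery reports on; it is not a test from the tree):

#include <pthread.h>
#include <unistd.h>

// Shared pipe descriptors; fds[1] is written to and closed below without any
// happens-before edge between the two threads.
static int fds[2];

static void *Closer(void *) {
  close(fds[1]);          // tracked via FdClose()
  return nullptr;
}

int main() {
  if (pipe(fds))
    return 1;
  pthread_t t;
  pthread_create(&t, nullptr, Closer, nullptr);
  write(fds[1], "x", 1);  // tracked via the use-side fd hooks
  pthread_join(t, nullptr);
  close(fds[0]);
  return 0;
}

The switch from SCOPED_TSAN_INTERCEPTOR to SCOPED_INTERCEPTOR_RAW above keeps the descriptor state up to date even for threads whose interceptors are otherwise ignored; the race check itself is still skipped inside FdClose via MustIgnoreInterceptor.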
#if SANITIZER_LINUX TSAN_INTERCEPTOR(int, __close, int fd) { - SCOPED_TSAN_INTERCEPTOR(__close, fd); - if (fd >= 0) - FdClose(thr, pc, fd); + SCOPED_INTERCEPTOR_RAW(__close, fd); + FdClose(thr, pc, fd); return REAL(__close)(fd); } #define TSAN_MAYBE_INTERCEPT___CLOSE TSAN_INTERCEPT(__close) @@ -1767,13 +1786,10 @@ TSAN_INTERCEPTOR(int, __close, int fd) { // glibc guts #if SANITIZER_LINUX && !SANITIZER_ANDROID TSAN_INTERCEPTOR(void, __res_iclose, void *state, bool free_addr) { - SCOPED_TSAN_INTERCEPTOR(__res_iclose, state, free_addr); + SCOPED_INTERCEPTOR_RAW(__res_iclose, state, free_addr); int fds[64]; int cnt = ExtractResolvFDs(state, fds, ARRAY_SIZE(fds)); - for (int i = 0; i < cnt; i++) { - if (fds[i] > 0) - FdClose(thr, pc, fds[i]); - } + for (int i = 0; i < cnt; i++) FdClose(thr, pc, fds[i]); REAL(__res_iclose)(state, free_addr); } #define TSAN_MAYBE_INTERCEPT___RES_ICLOSE TSAN_INTERCEPT(__res_iclose) @@ -1854,7 +1870,7 @@ TSAN_INTERCEPTOR(int, rmdir, char *path) { } TSAN_INTERCEPTOR(int, closedir, void *dirp) { - SCOPED_TSAN_INTERCEPTOR(closedir, dirp); + SCOPED_INTERCEPTOR_RAW(closedir, dirp); if (dirp) { int fd = dirfd(dirp); FdClose(thr, pc, fd); @@ -1885,8 +1901,10 @@ TSAN_INTERCEPTOR(int, epoll_ctl, int epfd, int op, int fd, void *ev) { FdAccess(thr, pc, epfd); if (epfd >= 0 && fd >= 0) FdAccess(thr, pc, fd); - if (op == EPOLL_CTL_ADD && epfd >= 0) + if (op == EPOLL_CTL_ADD && epfd >= 0) { + FdPollAdd(thr, pc, epfd, fd); FdRelease(thr, pc, epfd); + } int res = REAL(epoll_ctl)(epfd, op, fd, ev); return res; } @@ -1949,13 +1967,14 @@ TSAN_INTERCEPTOR(int, pthread_sigmask, int how, const __sanitizer_sigset_t *set, namespace __tsan { -static void ReportErrnoSpoiling(ThreadState *thr, uptr pc) { +static void ReportErrnoSpoiling(ThreadState *thr, uptr pc, int sig) { VarSizeStackTrace stack; // StackTrace::GetNestInstructionPc(pc) is used because return address is // expected, OutputReport() will undo this. ObtainCurrentStack(thr, StackTrace::GetNextInstructionPc(pc), &stack); ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(ReportTypeErrnoInSignal); + rep.SetSigNum(sig); if (!IsFiredSuppression(ctx, ReportTypeErrnoInSignal, stack)) { rep.AddStack(stack, true); OutputReport(thr, rep); @@ -1965,6 +1984,7 @@ static void ReportErrnoSpoiling(ThreadState *thr, uptr pc) { static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, int sig, __sanitizer_siginfo *info, void *uctx) { + CHECK(thr->slot); __sanitizer_sigaction *sigactions = interceptor_ctx()->sigactions; if (acquire) Acquire(thr, 0, (uptr)&sigactions[sig]); @@ -2021,7 +2041,7 @@ static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire, // signal; and it looks too fragile to intercept all ways to reraise a signal. 
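
The ReportErrnoSpoiling() change above threads the signal number into the report so the diagnostic can name the offending handler. For readers unfamiliar with the bug class, a stand-alone illustration (example code only, not part of the runtime) of how a handler clobbers errno and the conventional save/restore fix:

#include <cerrno>
#include <csignal>
#include <fcntl.h>
#include <unistd.h>

// Problematic: open() fails with ENOENT and overwrites whatever errno value
// the interrupted code was about to inspect after its own failed syscall.
static void spoiling_handler(int) {
  int fd = ::open("/nonexistent", O_RDONLY);
  if (fd >= 0)
    ::close(fd);
}

// Conventional fix: save and restore errno around errno-setting calls.
static void careful_handler(int) {
  int saved_errno = errno;
  int fd = ::open("/nonexistent", O_RDONLY);
  if (fd >= 0)
    ::close(fd);
  errno = saved_errno;
}

int main() {
  std::signal(SIGUSR1, careful_handler);
  std::raise(SIGUSR1);
  (void)spoiling_handler;  // kept only to show the buggy shape above
  return 0;
}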
if (ShouldReport(thr, ReportTypeErrnoInSignal) && !sync && sig != SIGTERM && errno != 99) - ReportErrnoSpoiling(thr, pc); + ReportErrnoSpoiling(thr, pc, sig); errno = saved_errno; } @@ -2132,11 +2152,11 @@ TSAN_INTERCEPTOR(int, pthread_kill, void *tid, int sig) { ThreadSignalContext *sctx = SigCtx(thr); CHECK_NE(sctx, 0); int prev = sctx->int_signal_send; - if (tid == pthread_self()) { + bool self = pthread_equal(tid, pthread_self()); + if (self) sctx->int_signal_send = sig; - } int res = REAL(pthread_kill)(tid, sig); - if (tid == pthread_self()) { + if (self) { CHECK_EQ(sctx->int_signal_send, sig); sctx->int_signal_send = prev; } @@ -2193,6 +2213,7 @@ void atfork_child() { FdOnFork(thr, pc); } +#if !SANITIZER_IOS TSAN_INTERCEPTOR(int, vfork, int fake) { // Some programs (e.g. openjdk) call close for all file descriptors // in the child process. Under tsan it leads to false positives, because @@ -2209,6 +2230,7 @@ TSAN_INTERCEPTOR(int, vfork, int fake) { // Instead we simply turn vfork into fork. return WRAP(fork)(fake); } +#endif #if SANITIZER_LINUX TSAN_INTERCEPTOR(int, clone, int (*fn)(void *), void *stack, int flags, @@ -2252,7 +2274,7 @@ struct dl_iterate_phdr_data { }; static bool IsAppNotRodata(uptr addr) { - return IsAppMem(addr) && *MemToShadow(addr) != kShadowRodata; + return IsAppMem(addr) && *MemToShadow(addr) != Shadow::kRodata; } static int dl_iterate_phdr_cb(__sanitizer_dl_phdr_info *info, SIZE_T size, @@ -2358,9 +2380,18 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, #define COMMON_INTERCEPTOR_FILE_CLOSE(ctx, file) \ if (file) { \ int fd = fileno_unlocked(file); \ - if (fd >= 0) FdClose(thr, pc, fd); \ + FdClose(thr, pc, fd); \ } +#define COMMON_INTERCEPTOR_DLOPEN(filename, flag) \ + ({ \ + CheckNoDeepBind(filename, flag); \ + ThreadIgnoreBegin(thr, 0); \ + void *res = REAL(dlopen)(filename, flag); \ + ThreadIgnoreEnd(thr); \ + res; \ + }) + #define COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, handle) \ libignore()->OnLibraryLoaded(filename) @@ -2391,8 +2422,11 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, #define COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, name) \ ThreadSetName(((TsanInterceptorContext *) ctx)->thr, name) -#define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name) \ - __tsan::ctx->thread_registry.SetThreadNameByUserId(thread, name) +#define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name) \ + if (pthread_equal(pthread_self(), reinterpret_cast(thread))) \ + COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, name); \ + else \ + __tsan::ctx->thread_registry.SetThreadNameByUserId(thread, name) #define COMMON_INTERCEPTOR_BLOCK_REAL(name) BLOCK_REAL(name) @@ -2553,7 +2587,7 @@ static USED void syscall_release(uptr pc, uptr addr) { } static void syscall_fd_close(uptr pc, int fd) { - TSAN_SYSCALL(); + auto *thr = cur_thread(); FdClose(thr, pc, fd); } @@ -2688,6 +2722,26 @@ TSAN_INTERCEPTOR(void, thr_exit, tid_t *state) { #define TSAN_MAYBE_INTERCEPT_THR_EXIT #endif +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, cond_init, void *c, void *a) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, cond_destroy, void *c) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, cond_signal, void *c) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, cond_broadcast, void *c) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, cond_wait, void *c, void *m) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, mutex_init, void *m, void *a) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, mutex_destroy, void *m) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, mutex_lock, void *m) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, mutex_trylock, void *m) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, 
mutex_unlock, void *m) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_init, void *l, void *a) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_destroy, void *l) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_rdlock, void *l) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_tryrdlock, void *l) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_wrlock, void *l) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_trywrlock, void *l) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, rwlock_unlock, void *l) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, once, void *o, void (*i)()) +TSAN_INTERCEPTOR_FREEBSD_ALIAS(int, sigmask, int f, void *n, void *o) + TSAN_INTERCEPTOR_NETBSD_ALIAS(int, cond_init, void *c, void *a) TSAN_INTERCEPTOR_NETBSD_ALIAS(int, cond_signal, void *c) TSAN_INTERCEPTOR_NETBSD_ALIAS(int, cond_broadcast, void *c) @@ -2916,6 +2970,26 @@ void InitializeInterceptors() { } #endif + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(cond_init); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(cond_destroy); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(cond_signal); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(cond_broadcast); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(cond_wait); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(mutex_init); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(mutex_destroy); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(mutex_lock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(mutex_trylock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(mutex_unlock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_init); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_destroy); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_rdlock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_tryrdlock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_wrlock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_trywrlock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(rwlock_unlock); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(once); + TSAN_MAYBE_INTERCEPT_FREEBSD_ALIAS(sigmask); + TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(cond_init); TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(cond_signal); TSAN_MAYBE_INTERCEPT_NETBSD_ALIAS(cond_broadcast); diff --git a/libsanitizer/tsan/tsan_interface.cpp b/libsanitizer/tsan/tsan_interface.cpp index 0487151..e6c4bf2 100644 --- a/libsanitizer/tsan/tsan_interface.cpp +++ b/libsanitizer/tsan/tsan_interface.cpp @@ -26,20 +26,6 @@ void __tsan_flush_memory() { FlushShadowMemory(); } -void __tsan_read16(void *addr) { - uptr pc = CALLERPC; - ThreadState *thr = cur_thread(); - MemoryAccess(thr, pc, (uptr)addr, 8, kAccessRead); - MemoryAccess(thr, pc, (uptr)addr + 8, 8, kAccessRead); -} - -void __tsan_write16(void *addr) { - uptr pc = CALLERPC; - ThreadState *thr = cur_thread(); - MemoryAccess(thr, pc, (uptr)addr, 8, kAccessWrite); - MemoryAccess(thr, pc, (uptr)addr + 8, 8, kAccessWrite); -} - void __tsan_read16_pc(void *addr, void *pc) { uptr pc_no_pac = STRIP_PAC_PC(pc); ThreadState *thr = cur_thread(); diff --git a/libsanitizer/tsan/tsan_interface.inc b/libsanitizer/tsan/tsan_interface.inc index 0031800..b0a424f 100644 --- a/libsanitizer/tsan/tsan_interface.inc +++ b/libsanitizer/tsan/tsan_interface.inc @@ -34,6 +34,10 @@ void __tsan_read8(void *addr) { MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, kAccessRead); } +void __tsan_read16(void *addr) { + MemoryAccess16(cur_thread(), CALLERPC, (uptr)addr, kAccessRead); +} + void __tsan_write1(void *addr) { MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 1, kAccessWrite); } @@ -50,6 +54,10 @@ void __tsan_write8(void *addr) { MemoryAccess(cur_thread(), CALLERPC, (uptr)addr, 8, kAccessWrite); } +void __tsan_write16(void *addr) { + MemoryAccess16(cur_thread(), CALLERPC, (uptr)addr, kAccessWrite); +} + void 
__tsan_read1_pc(void *addr, void *pc) { MemoryAccess(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, 1, kAccessRead | kAccessExternalPC); } diff --git a/libsanitizer/tsan/tsan_interface_atomic.cpp b/libsanitizer/tsan/tsan_interface_atomic.cpp index 24ba3bb..f794a2f 100644 --- a/libsanitizer/tsan/tsan_interface_atomic.cpp +++ b/libsanitizer/tsan/tsan_interface_atomic.cpp @@ -235,8 +235,9 @@ static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, morder mo) { T v = NoTsanAtomicLoad(a, mo); SyncVar *s = ctx->metamap.GetSyncIfExists((uptr)a); if (s) { - ReadLock l(&s->mtx); - AcquireImpl(thr, pc, &s->clock); + SlotLocker locker(thr); + ReadLock lock(&s->mtx); + thr->clock.Acquire(s->clock); // Re-read under sync mutex because we need a consistent snapshot // of the value and the clock we acquire. v = NoTsanAtomicLoad(a, mo); @@ -270,14 +271,14 @@ static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v, NoTsanAtomicStore(a, v, mo); return; } - __sync_synchronize(); - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - ReleaseStoreImpl(thr, pc, &s->clock); - NoTsanAtomicStore(a, v, mo); + SlotLocker locker(thr); + { + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); + Lock lock(&s->mtx); + thr->clock.ReleaseStore(&s->clock); + NoTsanAtomicStore(a, v, mo); + } + IncrementEpoch(thr); } template @@ -285,18 +286,21 @@ static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) { MemoryAccess(thr, pc, (uptr)a, AccessSize(), kAccessWrite | kAccessAtomic); if (LIKELY(mo == mo_relaxed)) return F(a, v); - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. 
- TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - if (IsAcqRelOrder(mo)) - AcquireReleaseImpl(thr, pc, &s->clock); - else if (IsReleaseOrder(mo)) - ReleaseImpl(thr, pc, &s->clock); - else if (IsAcquireOrder(mo)) - AcquireImpl(thr, pc, &s->clock); - return F(a, v); + SlotLocker locker(thr); + { + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); + RWLock lock(&s->mtx, IsReleaseOrder(mo)); + if (IsAcqRelOrder(mo)) + thr->clock.ReleaseAcquire(&s->clock); + else if (IsReleaseOrder(mo)) + thr->clock.Release(&s->clock); + else if (IsAcquireOrder(mo)) + thr->clock.Acquire(s->clock); + v = F(a, v); + } + if (IsReleaseOrder(mo)) + IncrementEpoch(thr); + return v; } template @@ -416,27 +420,28 @@ static bool AtomicCAS(ThreadState *thr, uptr pc, volatile T *a, T *c, T v, *c = pr; return false; } - + SlotLocker locker(thr); bool release = IsReleaseOrder(mo); - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); - RWLock l(&s->mtx, release); - T cc = *c; - T pr = func_cas(a, cc, v); - bool success = pr == cc; - if (!success) { - *c = pr; - mo = fmo; + bool success; + { + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false); + RWLock lock(&s->mtx, release); + T cc = *c; + T pr = func_cas(a, cc, v); + success = pr == cc; + if (!success) { + *c = pr; + mo = fmo; + } + if (success && IsAcqRelOrder(mo)) + thr->clock.ReleaseAcquire(&s->clock); + else if (success && IsReleaseOrder(mo)) + thr->clock.Release(&s->clock); + else if (IsAcquireOrder(mo)) + thr->clock.Acquire(s->clock); } - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - - if (success && IsAcqRelOrder(mo)) - AcquireReleaseImpl(thr, pc, &s->clock); - else if (success && IsReleaseOrder(mo)) - ReleaseImpl(thr, pc, &s->clock); - else if (IsAcquireOrder(mo)) - AcquireImpl(thr, pc, &s->clock); + if (success && release) + IncrementEpoch(thr); return success; } diff --git a/libsanitizer/tsan/tsan_interface_java.cpp b/libsanitizer/tsan/tsan_interface_java.cpp index c090c1f..7c15a16 100644 --- a/libsanitizer/tsan/tsan_interface_java.cpp +++ b/libsanitizer/tsan/tsan_interface_java.cpp @@ -106,7 +106,7 @@ void __tsan_java_free(jptr ptr, jptr size) { DCHECK_GE(ptr, jctx->heap_begin); DCHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size); - ctx->metamap.FreeRange(thr->proc(), ptr, size); + ctx->metamap.FreeRange(thr->proc(), ptr, size, false); } void __tsan_java_move(jptr src, jptr dst, jptr size) { @@ -133,7 +133,7 @@ void __tsan_java_move(jptr src, jptr dst, jptr size) { // support that anymore as it contains addresses of accesses. RawShadow *d = MemToShadow(dst); RawShadow *dend = MemToShadow(dst + size); - internal_memset(d, 0, (dend - d) * sizeof(*d)); + ShadowSet(d, dend, Shadow::kEmpty); } jptr __tsan_java_find(jptr *from_ptr, jptr to) { diff --git a/libsanitizer/tsan/tsan_mman.cpp b/libsanitizer/tsan/tsan_mman.cpp index f1b6768..0937e52 100644 --- a/libsanitizer/tsan/tsan_mman.cpp +++ b/libsanitizer/tsan/tsan_mman.cpp @@ -20,18 +20,6 @@ #include "tsan_report.h" #include "tsan_flags.h" -// May be overriden by front-end. 
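
The rewritten fast paths in tsan_interface_atomic.cpp above drop the old AcquireImpl/ReleaseImpl helpers and call the thread clock directly on the SyncVar. As a rough mental model only (the runtime's clocks, slots and epoch handling are considerably more involved), release merges the thread's knowledge into the sync object and acquire merges the sync object back into the thread:

#include <algorithm>
#include <array>

constexpr int kToyThreads = 4;
using ToyClock = std::array<unsigned, kToyThreads>;

// Release: the sync object remembers everything this thread has observed.
inline void ToyRelease(const ToyClock &thread, ToyClock &sync) {
  for (int i = 0; i < kToyThreads; i++)
    sync[i] = std::max(sync[i], thread[i]);
}

// Acquire: the thread learns everything previously released to the object.
inline void ToyAcquire(ToyClock &thread, const ToyClock &sync) {
  for (int i = 0; i < kToyThreads; i++)
    thread[i] = std::max(thread[i], sync[i]);
}

// A successful RMW or CAS with mo_acq_rel performs both directions, which is
// why the hunks call ReleaseAcquire() only in that case.
inline void ToyReleaseAcquire(ToyClock &thread, ToyClock &sync) {
  ToyRelease(thread, sync);
  ToyAcquire(thread, sync);
}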
-SANITIZER_WEAK_DEFAULT_IMPL -void __sanitizer_malloc_hook(void *ptr, uptr size) { - (void)ptr; - (void)size; -} - -SANITIZER_WEAK_DEFAULT_IMPL -void __sanitizer_free_hook(void *ptr) { - (void)ptr; -} - namespace __tsan { struct MapUnmapCallback { @@ -69,8 +57,17 @@ Allocator *allocator() { struct GlobalProc { Mutex mtx; Processor *proc; - - GlobalProc() : mtx(MutexTypeGlobalProc), proc(ProcCreate()) {} + // This mutex represents the internal allocator combined for + // the purposes of deadlock detection. The internal allocator + // uses multiple mutexes, moreover they are locked only occasionally + // and they are spin mutexes which don't support deadlock detection. + // So we use this fake mutex to serve as a substitute for these mutexes. + CheckedMutex internal_alloc_mtx; + + GlobalProc() + : mtx(MutexTypeGlobalProc), + proc(ProcCreate()), + internal_alloc_mtx(MutexTypeInternalAlloc) {} }; static char global_proc_placeholder[sizeof(GlobalProc)] ALIGNED(64); @@ -78,6 +75,11 @@ GlobalProc *global_proc() { return reinterpret_cast(&global_proc_placeholder); } +static void InternalAllocAccess() { + global_proc()->internal_alloc_mtx.Lock(); + global_proc()->internal_alloc_mtx.Unlock(); +} + ScopedGlobalProcessor::ScopedGlobalProcessor() { GlobalProc *gp = global_proc(); ThreadState *thr = cur_thread(); @@ -110,6 +112,24 @@ ScopedGlobalProcessor::~ScopedGlobalProcessor() { gp->mtx.Unlock(); } +void AllocatorLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + global_proc()->internal_alloc_mtx.Lock(); + InternalAllocatorLock(); +} + +void AllocatorUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + InternalAllocatorUnlock(); + global_proc()->internal_alloc_mtx.Unlock(); +} + +void GlobalProcessorLock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + global_proc()->mtx.Lock(); +} + +void GlobalProcessorUnlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + global_proc()->mtx.Unlock(); +} + static constexpr uptr kMaxAllowedMallocSize = 1ull << 40; static uptr max_user_defined_malloc_size; @@ -166,6 +186,12 @@ void *user_alloc_internal(ThreadState *thr, uptr pc, uptr sz, uptr align, GET_STACK_TRACE_FATAL(thr, pc); ReportAllocationSizeTooBig(sz, malloc_limit, &stack); } + if (UNLIKELY(IsRssLimitExceeded())) { + if (AllocatorMayReturnNull()) + return nullptr; + GET_STACK_TRACE_FATAL(thr, pc); + ReportRssLimitExceeded(&stack); + } void *p = allocator()->Allocate(&thr->proc()->alloc_cache, sz, align); if (UNLIKELY(!p)) { SetAllocatorOutOfMemory(); @@ -219,8 +245,17 @@ void *user_reallocarray(ThreadState *thr, uptr pc, void *p, uptr size, uptr n) { void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) { DPrintf("#%d: alloc(%zu) = 0x%zx\n", thr->tid, sz, p); + // Note: this can run before thread initialization/after finalization. + // As a result this is not necessarily synchronized with DoReset, + // which iterates over and resets all sync objects, + // but it is fine to create new MBlocks in this context. ctx->metamap.AllocBlock(thr, pc, p, sz); - if (write && thr->ignore_reads_and_writes == 0) + // If this runs before thread initialization/after finalization + // and we don't have trace initialized, we can't imitate writes. + // In such case just reset the shadow range, it is fine since + // it affects only a small fraction of special objects. 
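
The internal_alloc_mtx added to GlobalProc above is never held across real work: InternalAllocAccess() locks and immediately unlocks it, purely so the deadlock detector sees one well-defined "internal allocator" node instead of the allocator's many undetectable spin mutexes. The shape of that trick in isolation (std::mutex stands in here for any lock-order-checking mutex; the names are illustrative):

#include <mutex>

// Stand-in for a mutex whose lock/unlock events feed a lock-order detector.
static std::mutex internal_alloc_sentinel;

// Touching the sentinel records an "allocator used while holding X" edge
// even though no real allocator mutex is visible to the detector.
inline void InternalAllocAccessSketch() {
  internal_alloc_sentinel.lock();
  internal_alloc_sentinel.unlock();
}

// The AllocatorLock()/AllocatorUnlock() pair in the hunk then brackets the
// real internal allocator locks with the sentinel on the outside.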
+ if (write && thr->ignore_reads_and_writes == 0 && + atomic_load_relaxed(&thr->trace_pos)) MemoryRangeImitateWrite(thr, pc, (uptr)p, sz); else MemoryResetRange(thr, pc, (uptr)p, sz); @@ -228,7 +263,14 @@ void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) { void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) { CHECK_NE(p, (void*)0); - uptr sz = ctx->metamap.FreeBlock(thr->proc(), p); + if (!thr->slot) { + // Very early/late in thread lifetime, or during fork. + UNUSED uptr sz = ctx->metamap.FreeBlock(thr->proc(), p, false); + DPrintf("#%d: free(0x%zx, %zu) (no slot)\n", thr->tid, p, sz); + return; + } + SlotLocker locker(thr); + uptr sz = ctx->metamap.FreeBlock(thr->proc(), p, true); DPrintf("#%d: free(0x%zx, %zu)\n", thr->tid, p, sz); if (write && thr->ignore_reads_and_writes == 0) MemoryRangeFreed(thr, pc, (uptr)p, sz); @@ -310,7 +352,7 @@ void *user_pvalloc(ThreadState *thr, uptr pc, uptr sz) { } uptr user_alloc_usable_size(const void *p) { - if (p == 0) + if (p == 0 || !IsAppMem((uptr)p)) return 0; MBlock *b = ctx->metamap.GetBlock((uptr)p); if (!b) @@ -324,7 +366,6 @@ void invoke_malloc_hook(void *ptr, uptr size) { ThreadState *thr = cur_thread(); if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors) return; - __sanitizer_malloc_hook(ptr, size); RunMallocHooks(ptr, size); } @@ -332,7 +373,6 @@ void invoke_free_hook(void *ptr) { ThreadState *thr = cur_thread(); if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors) return; - __sanitizer_free_hook(ptr); RunFreeHooks(ptr); } @@ -342,6 +382,7 @@ void *Alloc(uptr sz) { thr->nomalloc = 0; // CHECK calls internal_malloc(). CHECK(0); } + InternalAllocAccess(); return InternalAlloc(sz, &thr->proc()->internal_alloc_cache); } @@ -351,6 +392,7 @@ void FreeImpl(void *p) { thr->nomalloc = 0; // CHECK calls internal_malloc(). CHECK(0); } + InternalAllocAccess(); InternalFree(p, &thr->proc()->internal_alloc_cache); } @@ -393,8 +435,6 @@ uptr __sanitizer_get_allocated_size(const void *p) { void __tsan_on_thread_idle() { ThreadState *thr = cur_thread(); - thr->clock.ResetCached(&thr->proc()->clock_cache); - thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache); allocator()->SwallowCache(&thr->proc()->alloc_cache); internal_allocator()->SwallowCache(&thr->proc()->internal_alloc_cache); ctx->metamap.OnProcIdle(thr->proc()); diff --git a/libsanitizer/tsan/tsan_mman.h b/libsanitizer/tsan/tsan_mman.h index efea5e5..2095f28 100644 --- a/libsanitizer/tsan/tsan_mman.h +++ b/libsanitizer/tsan/tsan_mman.h @@ -24,6 +24,10 @@ void ReplaceSystemMalloc(); void AllocatorProcStart(Processor *proc); void AllocatorProcFinish(Processor *proc); void AllocatorPrintStats(); +void AllocatorLock(); +void AllocatorUnlock(); +void GlobalProcessorLock(); +void GlobalProcessorUnlock(); // For user allocations. void *user_alloc_internal(ThreadState *thr, uptr pc, uptr sz, diff --git a/libsanitizer/tsan/tsan_mutexset.cpp b/libsanitizer/tsan/tsan_mutexset.cpp index 7351796..3a75b80a 100644 --- a/libsanitizer/tsan/tsan_mutexset.cpp +++ b/libsanitizer/tsan/tsan_mutexset.cpp @@ -19,57 +19,7 @@ namespace __tsan { MutexSet::MutexSet() { } -void MutexSet::Add(u64 id, bool write, u64 epoch) { - // Look up existing mutex with the same id. - for (uptr i = 0; i < size_; i++) { - if (descs_[i].id == id) { - descs_[i].count++; - descs_[i].epoch = epoch; - return; - } - } - // On overflow, find the oldest mutex and drop it. 
- if (size_ == kMaxSize) { - u64 minepoch = (u64)-1; - u64 mini = (u64)-1; - for (uptr i = 0; i < size_; i++) { - if (descs_[i].epoch < minepoch) { - minepoch = descs_[i].epoch; - mini = i; - } - } - RemovePos(mini); - CHECK_EQ(size_, kMaxSize - 1); - } - // Add new mutex descriptor. - descs_[size_].addr = 0; - descs_[size_].stack_id = kInvalidStackID; - descs_[size_].id = id; - descs_[size_].write = write; - descs_[size_].epoch = epoch; - descs_[size_].seq = seq_++; - descs_[size_].count = 1; - size_++; -} - -void MutexSet::Del(u64 id, bool write) { - for (uptr i = 0; i < size_; i++) { - if (descs_[i].id == id) { - if (--descs_[i].count == 0) - RemovePos(i); - return; - } - } -} - -void MutexSet::Remove(u64 id) { - for (uptr i = 0; i < size_; i++) { - if (descs_[i].id == id) { - RemovePos(i); - return; - } - } -} +void MutexSet::Reset() { internal_memset(this, 0, sizeof(*this)); } void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) { // Look up existing mutex with the same id. @@ -93,9 +43,7 @@ void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) { // Add new mutex descriptor. descs_[size_].addr = addr; descs_[size_].stack_id = stack_id; - descs_[size_].id = 0; descs_[size_].write = write; - descs_[size_].epoch = 0; descs_[size_].seq = seq_++; descs_[size_].count = 1; size_++; diff --git a/libsanitizer/tsan/tsan_mutexset.h b/libsanitizer/tsan/tsan_mutexset.h index 93776a6..aabd361 100644 --- a/libsanitizer/tsan/tsan_mutexset.h +++ b/libsanitizer/tsan/tsan_mutexset.h @@ -25,8 +25,6 @@ class MutexSet { struct Desc { uptr addr; StackID stack_id; - u64 id; - u64 epoch; u32 seq; u32 count; bool write; @@ -40,10 +38,7 @@ class MutexSet { }; MutexSet(); - // The 'id' is obtained from SyncVar::GetId(). - void Add(u64 id, bool write, u64 epoch); - void Del(u64 id, bool write); - void Remove(u64 id); // Removes the mutex completely (if it's destroyed). + void Reset(); void AddAddr(uptr addr, StackID stack_id, bool write); void DelAddr(uptr addr, bool destroy = false); uptr Size() const; @@ -82,9 +77,7 @@ class DynamicMutexSet { // in different goroutine). 
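
With the id/epoch plumbing deleted, MutexSet keeps only the address-keyed descriptors (AddAddr/DelAddr) plus a whole-object Reset(). A compact stand-alone sketch of that data structure, reusing the same evict-the-oldest-by-seq idea (capacity and field names here are illustrative, not the runtime's):

#include <cstdint>
#include <cstring>

struct MiniMutexSet {
  static constexpr int kMaxSize = 16;
  struct Desc {
    uintptr_t addr;
    uint32_t stack_id;
    uint32_t seq;    // monotonically growing; smallest value == oldest entry
    uint32_t count;  // recursive lock count
    bool write;
  };

  void Reset() { std::memset(this, 0, sizeof(*this)); }

  void AddAddr(uintptr_t addr, uint32_t stack_id, bool write) {
    for (int i = 0; i < size_; i++) {
      if (descs_[i].addr == addr) {
        descs_[i].count++;
        return;
      }
    }
    if (size_ == kMaxSize) {  // on overflow, drop the oldest mutex
      int oldest = 0;
      for (int i = 1; i < size_; i++)
        if (descs_[i].seq < descs_[oldest].seq)
          oldest = i;
      descs_[oldest] = descs_[--size_];
    }
    descs_[size_++] = Desc{addr, stack_id, seq_++, 1, write};
  }

  void DelAddr(uintptr_t addr) {
    for (int i = 0; i < size_; i++) {
      if (descs_[i].addr == addr) {
        if (--descs_[i].count == 0)
          descs_[i] = descs_[--size_];
        return;
      }
    }
  }

 private:
  Desc descs_[kMaxSize];
  int size_ = 0;
  uint32_t seq_ = 0;
};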
#if SANITIZER_GO MutexSet::MutexSet() {} -void MutexSet::Add(u64 id, bool write, u64 epoch) {} -void MutexSet::Del(u64 id, bool write) {} -void MutexSet::Remove(u64 id) {} +void MutexSet::Reset() {} void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {} void MutexSet::DelAddr(uptr addr, bool destroy) {} uptr MutexSet::Size() const { return 0; } diff --git a/libsanitizer/tsan/tsan_platform.h b/libsanitizer/tsan/tsan_platform.h index 7ff0aca..233bf0a 100644 --- a/libsanitizer/tsan/tsan_platform.h +++ b/libsanitizer/tsan/tsan_platform.h @@ -18,8 +18,8 @@ # error "Only 64-bit is supported" #endif +#include "sanitizer_common/sanitizer_common.h" #include "tsan_defs.h" -#include "tsan_trace.h" namespace __tsan { @@ -40,14 +40,12 @@ enum { C/C++ on linux/x86_64 and freebsd/x86_64 0000 0000 1000 - 0080 0000 0000: main binary and/or MAP_32BIT mappings (512GB) 0040 0000 0000 - 0100 0000 0000: - -0100 0000 0000 - 2000 0000 0000: shadow -2000 0000 0000 - 3000 0000 0000: - +0100 0000 0000 - 1000 0000 0000: shadow +1000 0000 0000 - 3000 0000 0000: - 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) 4000 0000 0000 - 5500 0000 0000: - 5500 0000 0000 - 5680 0000 0000: pie binaries without ASLR or on 4.1+ kernels -5680 0000 0000 - 6000 0000 0000: - -6000 0000 0000 - 6200 0000 0000: traces -6200 0000 0000 - 7d00 0000 0000: - +5680 0000 0000 - 7d00 0000 0000: - 7b00 0000 0000 - 7c00 0000 0000: heap 7c00 0000 0000 - 7e80 0000 0000: - 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack @@ -67,10 +65,8 @@ C/C++ on netbsd/amd64 can reuse the same mapping: struct Mapping48AddressSpace { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x340000000000ull; - static const uptr kTraceMemBeg = 0x600000000000ull; - static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x010000000000ull; - static const uptr kShadowEnd = 0x200000000000ull; + static const uptr kShadowEnd = 0x100000000000ull; static const uptr kHeapMemBeg = 0x7b0000000000ull; static const uptr kHeapMemEnd = 0x7c0000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; @@ -89,14 +85,13 @@ struct Mapping48AddressSpace { C/C++ on linux/mips64 (40-bit VMA) 0000 0000 00 - 0100 0000 00: - (4 GB) 0100 0000 00 - 0200 0000 00: main binary (4 GB) -0200 0000 00 - 2000 0000 00: - (120 GB) -2000 0000 00 - 4000 0000 00: shadow (128 GB) +0200 0000 00 - 1200 0000 00: - (64 GB) +1200 0000 00 - 2200 0000 00: shadow (64 GB) +2200 0000 00 - 4000 0000 00: - (120 GB) 4000 0000 00 - 5000 0000 00: metainfo (memory blocks and sync objects) (64 GB) 5000 0000 00 - aa00 0000 00: - (360 GB) aa00 0000 00 - ab00 0000 00: main binary (PIE) (4 GB) -ab00 0000 00 - b000 0000 00: - (20 GB) -b000 0000 00 - b200 0000 00: traces (8 GB) -b200 0000 00 - fe00 0000 00: - (304 GB) +ab00 0000 00 - fe00 0000 00: - (332 GB) fe00 0000 00 - ff00 0000 00: heap (4 GB) ff00 0000 00 - ff80 0000 00: - (2 GB) ff80 0000 00 - ffff ffff ff: modules and main thread stack (<2 GB) @@ -104,10 +99,8 @@ ff80 0000 00 - ffff ffff ff: modules and main thread stack (<2 GB) struct MappingMips64_40 { static const uptr kMetaShadowBeg = 0x4000000000ull; static const uptr kMetaShadowEnd = 0x5000000000ull; - static const uptr kTraceMemBeg = 0xb000000000ull; - static const uptr kTraceMemEnd = 0xb200000000ull; - static const uptr kShadowBeg = 0x2000000000ull; - static const uptr kShadowEnd = 0x4000000000ull; + static const uptr kShadowBeg = 0x1200000000ull; + static const uptr kShadowEnd = 0x2200000000ull; 
static const uptr kHeapMemBeg = 0xfe00000000ull; static const uptr kHeapMemEnd = 0xff00000000ull; static const uptr kLoAppMemBeg = 0x0100000000ull; @@ -128,12 +121,10 @@ C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM) 0100 0000 00 - 0200 0000 00: main binary, modules, thread stacks (4 GB) 0200 0000 00 - 0300 0000 00: heap (4 GB) 0300 0000 00 - 0400 0000 00: - (4 GB) -0400 0000 00 - 0c00 0000 00: shadow memory (32 GB) -0c00 0000 00 - 0d00 0000 00: - (4 GB) +0400 0000 00 - 0800 0000 00: shadow memory (16 GB) +0800 0000 00 - 0d00 0000 00: - (20 GB) 0d00 0000 00 - 0e00 0000 00: metainfo (4 GB) -0e00 0000 00 - 0f00 0000 00: - (4 GB) -0f00 0000 00 - 0fc0 0000 00: traces (3 GB) -0fc0 0000 00 - 1000 0000 00: - +0e00 0000 00 - 1000 0000 00: - */ struct MappingAppleAarch64 { static const uptr kLoAppMemBeg = 0x0100000000ull; @@ -141,16 +132,14 @@ struct MappingAppleAarch64 { static const uptr kHeapMemBeg = 0x0200000000ull; static const uptr kHeapMemEnd = 0x0300000000ull; static const uptr kShadowBeg = 0x0400000000ull; - static const uptr kShadowEnd = 0x0c00000000ull; + static const uptr kShadowEnd = 0x0800000000ull; static const uptr kMetaShadowBeg = 0x0d00000000ull; static const uptr kMetaShadowEnd = 0x0e00000000ull; - static const uptr kTraceMemBeg = 0x0f00000000ull; - static const uptr kTraceMemEnd = 0x0fc0000000ull; static const uptr kHiAppMemBeg = 0x0fc0000000ull; static const uptr kHiAppMemEnd = 0x0fc0000000ull; static const uptr kShadowMsk = 0x0ull; static const uptr kShadowXor = 0x0ull; - static const uptr kShadowAdd = 0x0ull; + static const uptr kShadowAdd = 0x0200000000ull; static const uptr kVdsoBeg = 0x7000000000000000ull; static const uptr kMidAppMemBeg = 0; static const uptr kMidAppMemEnd = 0; @@ -159,29 +148,25 @@ struct MappingAppleAarch64 { /* C/C++ on linux/aarch64 (39-bit VMA) 0000 0010 00 - 0100 0000 00: main binary -0100 0000 00 - 0800 0000 00: - -0800 0000 00 - 2000 0000 00: shadow memory +0100 0000 00 - 0400 0000 00: - +0400 0000 00 - 1000 0000 00: shadow memory 2000 0000 00 - 3100 0000 00: - 3100 0000 00 - 3400 0000 00: metainfo 3400 0000 00 - 5500 0000 00: - 5500 0000 00 - 5600 0000 00: main binary (PIE) -5600 0000 00 - 6000 0000 00: - -6000 0000 00 - 6200 0000 00: traces -6200 0000 00 - 7d00 0000 00: - +5600 0000 00 - 7c00 0000 00: - 7c00 0000 00 - 7d00 0000 00: heap 7d00 0000 00 - 7fff ffff ff: modules and main thread stack */ struct MappingAarch64_39 { static const uptr kLoAppMemBeg = 0x0000001000ull; static const uptr kLoAppMemEnd = 0x0100000000ull; - static const uptr kShadowBeg = 0x0800000000ull; - static const uptr kShadowEnd = 0x2000000000ull; + static const uptr kShadowBeg = 0x0400000000ull; + static const uptr kShadowEnd = 0x1000000000ull; static const uptr kMetaShadowBeg = 0x3100000000ull; static const uptr kMetaShadowEnd = 0x3400000000ull; static const uptr kMidAppMemBeg = 0x5500000000ull; - static const uptr kMidAppMemEnd = 0x5600000000ull; - static const uptr kTraceMemBeg = 0x6000000000ull; - static const uptr kTraceMemEnd = 0x6200000000ull; + static const uptr kMidAppMemEnd = 0x5600000000ull; static const uptr kHeapMemBeg = 0x7c00000000ull; static const uptr kHeapMemEnd = 0x7d00000000ull; static const uptr kHiAppMemBeg = 0x7e00000000ull; @@ -195,15 +180,13 @@ struct MappingAarch64_39 { /* C/C++ on linux/aarch64 (42-bit VMA) 00000 0010 00 - 01000 0000 00: main binary -01000 0000 00 - 10000 0000 00: - -10000 0000 00 - 20000 0000 00: shadow memory -20000 0000 00 - 26000 0000 00: - +01000 0000 00 - 08000 0000 00: - +08000 0000 00 - 10000 0000 00: shadow memory 
+10000 0000 00 - 26000 0000 00: - 26000 0000 00 - 28000 0000 00: metainfo 28000 0000 00 - 2aa00 0000 00: - 2aa00 0000 00 - 2ab00 0000 00: main binary (PIE) -2ab00 0000 00 - 36200 0000 00: - -36200 0000 00 - 36240 0000 00: traces -36240 0000 00 - 3e000 0000 00: - +2ab00 0000 00 - 3e000 0000 00: - 3e000 0000 00 - 3f000 0000 00: heap 3f000 0000 00 - 3ffff ffff ff: modules and main thread stack */ @@ -211,14 +194,12 @@ struct MappingAarch64_42 { static const uptr kBroken = kBrokenReverseMapping; static const uptr kLoAppMemBeg = 0x00000001000ull; static const uptr kLoAppMemEnd = 0x01000000000ull; - static const uptr kShadowBeg = 0x10000000000ull; - static const uptr kShadowEnd = 0x20000000000ull; + static const uptr kShadowBeg = 0x08000000000ull; + static const uptr kShadowEnd = 0x10000000000ull; static const uptr kMetaShadowBeg = 0x26000000000ull; static const uptr kMetaShadowEnd = 0x28000000000ull; static const uptr kMidAppMemBeg = 0x2aa00000000ull; - static const uptr kMidAppMemEnd = 0x2ab00000000ull; - static const uptr kTraceMemBeg = 0x36200000000ull; - static const uptr kTraceMemEnd = 0x36400000000ull; + static const uptr kMidAppMemEnd = 0x2ab00000000ull; static const uptr kHeapMemBeg = 0x3e000000000ull; static const uptr kHeapMemEnd = 0x3f000000000ull; static const uptr kHiAppMemBeg = 0x3f000000000ull; @@ -232,14 +213,12 @@ struct MappingAarch64_42 { struct MappingAarch64_48 { static const uptr kLoAppMemBeg = 0x0000000001000ull; static const uptr kLoAppMemEnd = 0x0000200000000ull; - static const uptr kShadowBeg = 0x0002000000000ull; - static const uptr kShadowEnd = 0x0004000000000ull; + static const uptr kShadowBeg = 0x0001000000000ull; + static const uptr kShadowEnd = 0x0002000000000ull; static const uptr kMetaShadowBeg = 0x0005000000000ull; static const uptr kMetaShadowEnd = 0x0006000000000ull; static const uptr kMidAppMemBeg = 0x0aaaa00000000ull; - static const uptr kMidAppMemEnd = 0x0aaaf00000000ull; - static const uptr kTraceMemBeg = 0x0f06000000000ull; - static const uptr kTraceMemEnd = 0x0f06200000000ull; + static const uptr kMidAppMemEnd = 0x0aaaf00000000ull; static const uptr kHeapMemBeg = 0x0ffff00000000ull; static const uptr kHeapMemEnd = 0x0ffff00000000ull; static const uptr kHiAppMemBeg = 0x0ffff00000000ull; @@ -257,9 +236,7 @@ C/C++ on linux/powerpc64 (44-bit VMA) 0001 0000 0000 - 0b00 0000 0000: shadow 0b00 0000 0000 - 0b00 0000 0000: - 0b00 0000 0000 - 0d00 0000 0000: metainfo (memory blocks and sync objects) -0d00 0000 0000 - 0d00 0000 0000: - -0d00 0000 0000 - 0f00 0000 0000: traces -0f00 0000 0000 - 0f00 0000 0000: - +0d00 0000 0000 - 0f00 0000 0000: - 0f00 0000 0000 - 0f50 0000 0000: heap 0f50 0000 0000 - 0f60 0000 0000: - 0f60 0000 0000 - 1000 0000 0000: modules and main thread stack @@ -269,8 +246,6 @@ struct MappingPPC64_44 { kBrokenMapping | kBrokenReverseMapping | kBrokenLinearity; static const uptr kMetaShadowBeg = 0x0b0000000000ull; static const uptr kMetaShadowEnd = 0x0d0000000000ull; - static const uptr kTraceMemBeg = 0x0d0000000000ull; - static const uptr kTraceMemEnd = 0x0f0000000000ull; static const uptr kShadowBeg = 0x000100000000ull; static const uptr kShadowEnd = 0x0b0000000000ull; static const uptr kLoAppMemBeg = 0x000000000100ull; @@ -291,23 +266,19 @@ struct MappingPPC64_44 { C/C++ on linux/powerpc64 (46-bit VMA) 0000 0000 1000 - 0100 0000 0000: main binary 0100 0000 0000 - 0200 0000 0000: - -0100 0000 0000 - 1000 0000 0000: shadow -1000 0000 0000 - 1000 0000 0000: - -1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects) -2000 
0000 0000 - 2000 0000 0000: - -2000 0000 0000 - 2200 0000 0000: traces -2200 0000 0000 - 3d00 0000 0000: - +0100 0000 0000 - 0800 0000 0000: shadow +0800 0000 0000 - 1000 0000 0000: - +1000 0000 0000 - 1200 0000 0000: metainfo (memory blocks and sync objects) +1200 0000 0000 - 3d00 0000 0000: - 3d00 0000 0000 - 3e00 0000 0000: heap 3e00 0000 0000 - 3e80 0000 0000: - 3e80 0000 0000 - 4000 0000 0000: modules and main thread stack */ struct MappingPPC64_46 { static const uptr kMetaShadowBeg = 0x100000000000ull; - static const uptr kMetaShadowEnd = 0x200000000000ull; - static const uptr kTraceMemBeg = 0x200000000000ull; - static const uptr kTraceMemEnd = 0x220000000000ull; + static const uptr kMetaShadowEnd = 0x120000000000ull; static const uptr kShadowBeg = 0x010000000000ull; - static const uptr kShadowEnd = 0x100000000000ull; + static const uptr kShadowEnd = 0x080000000000ull; static const uptr kHeapMemBeg = 0x3d0000000000ull; static const uptr kHeapMemEnd = 0x3e0000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; @@ -326,23 +297,19 @@ struct MappingPPC64_46 { C/C++ on linux/powerpc64 (47-bit VMA) 0000 0000 1000 - 0100 0000 0000: main binary 0100 0000 0000 - 0200 0000 0000: - -0100 0000 0000 - 1000 0000 0000: shadow -1000 0000 0000 - 1000 0000 0000: - -1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects) -2000 0000 0000 - 2000 0000 0000: - -2000 0000 0000 - 2200 0000 0000: traces -2200 0000 0000 - 7d00 0000 0000: - +0100 0000 0000 - 0800 0000 0000: shadow +0800 0000 0000 - 1000 0000 0000: - +1000 0000 0000 - 1200 0000 0000: metainfo (memory blocks and sync objects) +1200 0000 0000 - 7d00 0000 0000: - 7d00 0000 0000 - 7e00 0000 0000: heap 7e00 0000 0000 - 7e80 0000 0000: - 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack */ struct MappingPPC64_47 { static const uptr kMetaShadowBeg = 0x100000000000ull; - static const uptr kMetaShadowEnd = 0x200000000000ull; - static const uptr kTraceMemBeg = 0x200000000000ull; - static const uptr kTraceMemEnd = 0x220000000000ull; + static const uptr kMetaShadowEnd = 0x120000000000ull; static const uptr kShadowBeg = 0x010000000000ull; - static const uptr kShadowEnd = 0x100000000000ull; + static const uptr kShadowEnd = 0x080000000000ull; static const uptr kHeapMemBeg = 0x7d0000000000ull; static const uptr kHeapMemEnd = 0x7e0000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; @@ -362,22 +329,18 @@ C/C++ on linux/s390x While the kernel provides a 64-bit address space, we have to restrict ourselves to 48 bits due to how e.g. SyncVar::GetId() works. 
0000 0000 1000 - 0e00 0000 0000: binary, modules, stacks - 14 TiB -0e00 0000 0000 - 4000 0000 0000: - -4000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app) -8000 0000 0000 - 9000 0000 0000: - +0e00 0000 0000 - 2000 0000 0000: - +2000 0000 0000 - 4000 0000 0000: shadow - 32TiB (2 * app) +4000 0000 0000 - 9000 0000 0000: - 9000 0000 0000 - 9800 0000 0000: metainfo - 8TiB (0.5 * app) -9800 0000 0000 - a000 0000 0000: - -a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads) -b000 0000 0000 - be00 0000 0000: - +9800 0000 0000 - be00 0000 0000: - be00 0000 0000 - c000 0000 0000: heap - 2TiB (max supported by the allocator) */ struct MappingS390x { static const uptr kMetaShadowBeg = 0x900000000000ull; static const uptr kMetaShadowEnd = 0x980000000000ull; - static const uptr kTraceMemBeg = 0xa00000000000ull; - static const uptr kTraceMemEnd = 0xb00000000000ull; - static const uptr kShadowBeg = 0x400000000000ull; - static const uptr kShadowEnd = 0x800000000000ull; + static const uptr kShadowBeg = 0x200000000000ull; + static const uptr kShadowEnd = 0x400000000000ull; static const uptr kHeapMemBeg = 0xbe0000000000ull; static const uptr kHeapMemEnd = 0xc00000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; @@ -397,21 +360,17 @@ struct MappingS390x { 0000 1000 0000 - 00c0 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 2000 0000 0000: - -2000 0000 0000 - 2380 0000 0000: shadow -2380 0000 0000 - 3000 0000 0000: - +2000 0000 0000 - 21c0 0000 0000: shadow +21c0 0000 0000 - 3000 0000 0000: - 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) -4000 0000 0000 - 6000 0000 0000: - -6000 0000 0000 - 6200 0000 0000: traces -6200 0000 0000 - 8000 0000 0000: - +4000 0000 0000 - 8000 0000 0000: - */ struct MappingGo48 { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x400000000000ull; - static const uptr kTraceMemBeg = 0x600000000000ull; - static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x200000000000ull; - static const uptr kShadowEnd = 0x238000000000ull; + static const uptr kShadowEnd = 0x21c000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x00e000000000ull; static const uptr kMidAppMemBeg = 0; @@ -431,8 +390,8 @@ struct MappingGo48 { 0000 1000 0000 - 00f8 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 0100 0000 0000: - -0100 0000 0000 - 0500 0000 0000: shadow -0500 0000 0000 - 0700 0000 0000: traces +0100 0000 0000 - 0300 0000 0000: shadow +0300 0000 0000 - 0700 0000 0000: - 0700 0000 0000 - 0770 0000 0000: metainfo (memory blocks and sync objects) 07d0 0000 0000 - 8000 0000 0000: - */ @@ -440,10 +399,8 @@ struct MappingGo48 { struct MappingGoWindows { static const uptr kMetaShadowBeg = 0x070000000000ull; static const uptr kMetaShadowEnd = 0x077000000000ull; - static const uptr kTraceMemBeg = 0x050000000000ull; - static const uptr kTraceMemEnd = 0x070000000000ull; static const uptr kShadowBeg = 0x010000000000ull; - static const uptr kShadowEnd = 0x050000000000ull; + static const uptr kShadowEnd = 0x030000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x00e000000000ull; static const uptr kMidAppMemBeg = 0; @@ -463,21 +420,17 @@ struct MappingGoWindows { 0000 1000 0000 - 00c0 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 2000 0000 0000: - -2000 0000 0000 - 2380 0000 0000: shadow -2380 0000 0000 - 2400 0000 0000: - 
-2400 0000 0000 - 3400 0000 0000: metainfo (memory blocks and sync objects) -3400 0000 0000 - 3600 0000 0000: - -3600 0000 0000 - 3800 0000 0000: traces -3800 0000 0000 - 4000 0000 0000: - +2000 0000 0000 - 21c0 0000 0000: shadow +21c0 0000 0000 - 2400 0000 0000: - +2400 0000 0000 - 2470 0000 0000: metainfo (memory blocks and sync objects) +2470 0000 0000 - 4000 0000 0000: - */ struct MappingGoPPC64_46 { static const uptr kMetaShadowBeg = 0x240000000000ull; - static const uptr kMetaShadowEnd = 0x340000000000ull; - static const uptr kTraceMemBeg = 0x360000000000ull; - static const uptr kTraceMemEnd = 0x380000000000ull; + static const uptr kMetaShadowEnd = 0x247000000000ull; static const uptr kShadowBeg = 0x200000000000ull; - static const uptr kShadowEnd = 0x238000000000ull; + static const uptr kShadowEnd = 0x21c000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x00e000000000ull; static const uptr kMidAppMemBeg = 0; @@ -497,21 +450,17 @@ struct MappingGoPPC64_46 { 0000 1000 0000 - 00c0 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 2000 0000 0000: - -2000 0000 0000 - 3000 0000 0000: shadow -3000 0000 0000 - 3000 0000 0000: - -3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) -4000 0000 0000 - 6000 0000 0000: - -6000 0000 0000 - 6200 0000 0000: traces -6200 0000 0000 - 8000 0000 0000: - +2000 0000 0000 - 2800 0000 0000: shadow +2800 0000 0000 - 3000 0000 0000: - +3000 0000 0000 - 3200 0000 0000: metainfo (memory blocks and sync objects) +3200 0000 0000 - 8000 0000 0000: - */ struct MappingGoPPC64_47 { static const uptr kMetaShadowBeg = 0x300000000000ull; - static const uptr kMetaShadowEnd = 0x400000000000ull; - static const uptr kTraceMemBeg = 0x600000000000ull; - static const uptr kTraceMemEnd = 0x620000000000ull; + static const uptr kMetaShadowEnd = 0x320000000000ull; static const uptr kShadowBeg = 0x200000000000ull; - static const uptr kShadowEnd = 0x300000000000ull; + static const uptr kShadowEnd = 0x280000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x00e000000000ull; static const uptr kMidAppMemBeg = 0; @@ -531,20 +480,16 @@ struct MappingGoPPC64_47 { 0000 1000 0000 - 00c0 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 2000 0000 0000: - -2000 0000 0000 - 3000 0000 0000: shadow -3000 0000 0000 - 3000 0000 0000: - -3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) -4000 0000 0000 - 6000 0000 0000: - -6000 0000 0000 - 6200 0000 0000: traces -6200 0000 0000 - 8000 0000 0000: - +2000 0000 0000 - 2800 0000 0000: shadow +2800 0000 0000 - 3000 0000 0000: - +3000 0000 0000 - 3200 0000 0000: metainfo (memory blocks and sync objects) +3200 0000 0000 - 8000 0000 0000: - */ struct MappingGoAarch64 { static const uptr kMetaShadowBeg = 0x300000000000ull; - static const uptr kMetaShadowEnd = 0x400000000000ull; - static const uptr kTraceMemBeg = 0x600000000000ull; - static const uptr kTraceMemEnd = 0x620000000000ull; + static const uptr kMetaShadowEnd = 0x320000000000ull; static const uptr kShadowBeg = 0x200000000000ull; - static const uptr kShadowEnd = 0x300000000000ull; + static const uptr kShadowEnd = 0x280000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x00e000000000ull; static const uptr kMidAppMemBeg = 0; @@ -565,20 +510,16 @@ Go on linux/mips64 (47-bit VMA) 0000 1000 0000 - 00c0 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 2000 0000 
0000: - -2000 0000 0000 - 3000 0000 0000: shadow -3000 0000 0000 - 3000 0000 0000: - -3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) -4000 0000 0000 - 6000 0000 0000: - -6000 0000 0000 - 6200 0000 0000: traces -6200 0000 0000 - 8000 0000 0000: - +2000 0000 0000 - 2800 0000 0000: shadow +2800 0000 0000 - 3000 0000 0000: - +3000 0000 0000 - 3200 0000 0000: metainfo (memory blocks and sync objects) +3200 0000 0000 - 8000 0000 0000: - */ struct MappingGoMips64_47 { static const uptr kMetaShadowBeg = 0x300000000000ull; - static const uptr kMetaShadowEnd = 0x400000000000ull; - static const uptr kTraceMemBeg = 0x600000000000ull; - static const uptr kTraceMemEnd = 0x620000000000ull; + static const uptr kMetaShadowEnd = 0x320000000000ull; static const uptr kShadowBeg = 0x200000000000ull; - static const uptr kShadowEnd = 0x300000000000ull; + static const uptr kShadowEnd = 0x280000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x00e000000000ull; static const uptr kMidAppMemBeg = 0; @@ -597,19 +538,15 @@ struct MappingGoMips64_47 { Go on linux/s390x 0000 0000 1000 - 1000 0000 0000: executable and heap - 16 TiB 1000 0000 0000 - 4000 0000 0000: - -4000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app) -8000 0000 0000 - 9000 0000 0000: - +4000 0000 0000 - 6000 0000 0000: shadow - 64TiB (4 * app) +6000 0000 0000 - 9000 0000 0000: - 9000 0000 0000 - 9800 0000 0000: metainfo - 8TiB (0.5 * app) -9800 0000 0000 - a000 0000 0000: - -a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads) */ struct MappingGoS390x { static const uptr kMetaShadowBeg = 0x900000000000ull; static const uptr kMetaShadowEnd = 0x980000000000ull; - static const uptr kTraceMemBeg = 0xa00000000000ull; - static const uptr kTraceMemEnd = 0xb00000000000ull; static const uptr kShadowBeg = 0x400000000000ull; - static const uptr kShadowEnd = 0x800000000000ull; + static const uptr kShadowEnd = 0x600000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x100000000000ull; static const uptr kMidAppMemBeg = 0; @@ -648,11 +585,11 @@ ALWAYS_INLINE auto SelectMapping(Arg arg) { return Func::template Apply(arg); # endif #else // SANITIZER_GO -# if defined(__x86_64__) || SANITIZER_IOSSIM || SANITIZER_MAC && !SANITIZER_IOS - return Func::template Apply(arg); -# elif defined(__aarch64__) && defined(__APPLE__) +# if SANITIZER_IOS && !SANITIZER_IOSSIM return Func::template Apply(arg); -# elif defined(__aarch64__) && !defined(__APPLE__) +# elif defined(__x86_64__) || SANITIZER_MAC + return Func::template Apply(arg); +# elif defined(__aarch64__) switch (vmaSize) { case 39: return Func::template Apply(arg); @@ -715,8 +652,6 @@ enum MappingType { kShadowEnd, kMetaShadowBeg, kMetaShadowEnd, - kTraceMemBeg, - kTraceMemEnd, kVdsoBeg, }; @@ -750,10 +685,6 @@ struct MappingField { return Mapping::kMetaShadowBeg; case kMetaShadowEnd: return Mapping::kMetaShadowEnd; - case kTraceMemBeg: - return Mapping::kTraceMemBeg; - case kTraceMemEnd: - return Mapping::kTraceMemEnd; } Die(); } @@ -792,11 +723,6 @@ uptr MetaShadowBeg(void) { return SelectMapping(kMetaShadowBeg); } ALWAYS_INLINE uptr MetaShadowEnd(void) { return SelectMapping(kMetaShadowEnd); } -ALWAYS_INLINE -uptr TraceMemBeg(void) { return SelectMapping(kTraceMemBeg); } -ALWAYS_INLINE -uptr TraceMemEnd(void) { return SelectMapping(kTraceMemEnd); } - struct IsAppMemImpl { template static bool Apply(uptr mem) { @@ -934,43 +860,10 @@ inline uptr RestoreAddr(uptr addr) { 
return SelectMapping(addr); } -// The additional page is to catch shadow stack overflow as paging fault. -// Windows wants 64K alignment for mmaps. -const uptr kTotalTraceSize = (kTraceSize * sizeof(Event) + sizeof(Trace) - + (64 << 10) + (64 << 10) - 1) & ~((64 << 10) - 1); - -struct GetThreadTraceImpl { - template - static uptr Apply(uptr tid) { - uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize; - DCHECK_LT(p, Mapping::kTraceMemEnd); - return p; - } -}; - -ALWAYS_INLINE -uptr GetThreadTrace(int tid) { return SelectMapping(tid); } - -struct GetThreadTraceHeaderImpl { - template - static uptr Apply(uptr tid) { - uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize + - kTraceSize * sizeof(Event); - DCHECK_LT(p, Mapping::kTraceMemEnd); - return p; - } -}; - -ALWAYS_INLINE -uptr GetThreadTraceHeader(int tid) { - return SelectMapping(tid); -} - void InitializePlatform(); void InitializePlatformEarly(); void CheckAndProtect(); void InitializeShadowMemoryPlatform(); -void FlushShadowMemory(); void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns); int ExtractResolvFDs(void *state, int *fds, int nfd); int ExtractRecvmsgFDs(void *msg, int *fds, int nfd); diff --git a/libsanitizer/tsan/tsan_platform_linux.cpp b/libsanitizer/tsan/tsan_platform_linux.cpp index 73ec148..17dbdff 100644 --- a/libsanitizer/tsan/tsan_platform_linux.cpp +++ b/libsanitizer/tsan/tsan_platform_linux.cpp @@ -94,7 +94,6 @@ enum { MemMeta, MemFile, MemMmap, - MemTrace, MemHeap, MemOther, MemCount, @@ -112,8 +111,6 @@ void FillProfileCallback(uptr p, uptr rss, bool file, uptr *mem) { mem[file ? MemFile : MemMmap] += rss; else if (p >= HeapMemBeg() && p < HeapMemEnd()) mem[MemHeap] += rss; - else if (p >= TraceMemBeg() && p < TraceMemEnd()) - mem[MemTrace] += rss; else mem[MemOther] += rss; } @@ -126,42 +123,33 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) { StackDepotStats stacks = StackDepotGetStats(); uptr nthread, nlive; ctx->thread_registry.GetNumberOfThreads(&nthread, &nlive); + uptr trace_mem; + { + Lock l(&ctx->slot_mtx); + trace_mem = ctx->trace_part_total_allocated * sizeof(TracePart); + } uptr internal_stats[AllocatorStatCount]; internal_allocator()->GetStats(internal_stats); // All these are allocated from the common mmap region. 
- mem[MemMmap] -= meta.mem_block + meta.sync_obj + stacks.allocated + - internal_stats[AllocatorStatMapped]; + mem[MemMmap] -= meta.mem_block + meta.sync_obj + trace_mem + + stacks.allocated + internal_stats[AllocatorStatMapped]; if (s64(mem[MemMmap]) < 0) mem[MemMmap] = 0; internal_snprintf( buf, buf_size, - "%llus: RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd" - " trace:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu" - " stacks=%zd[%zd] nthr=%zd/%zd\n", - uptime_ns / (1000 * 1000 * 1000), mem[MemTotal] >> 20, - mem[MemShadow] >> 20, mem[MemMeta] >> 20, mem[MemFile] >> 20, - mem[MemMmap] >> 20, mem[MemTrace] >> 20, mem[MemHeap] >> 20, + "==%zu== %llus [%zu]: RSS %zd MB: shadow:%zd meta:%zd file:%zd" + " mmap:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu" + " trace:%zu stacks=%zd threads=%zu/%zu\n", + internal_getpid(), uptime_ns / (1000 * 1000 * 1000), ctx->global_epoch, + mem[MemTotal] >> 20, mem[MemShadow] >> 20, mem[MemMeta] >> 20, + mem[MemFile] >> 20, mem[MemMmap] >> 20, mem[MemHeap] >> 20, mem[MemOther] >> 20, internal_stats[AllocatorStatMapped] >> 20, - meta.mem_block >> 20, meta.sync_obj >> 20, stacks.allocated >> 20, - stacks.n_uniq_ids, nlive, nthread); -} - -# if SANITIZER_LINUX -void FlushShadowMemoryCallback( - const SuspendedThreadsList &suspended_threads_list, - void *argument) { - ReleaseMemoryPagesToOS(ShadowBeg(), ShadowEnd()); -} -#endif - -void FlushShadowMemory() { -#if SANITIZER_LINUX - StopTheWorld(FlushShadowMemoryCallback, 0); -#endif + meta.mem_block >> 20, meta.sync_obj >> 20, trace_mem >> 20, + stacks.allocated >> 20, nlive, nthread); } #if !SANITIZER_GO -// Mark shadow for .rodata sections with the special kShadowRodata marker. +// Mark shadow for .rodata sections with the special Shadow::kRodata marker. // Accesses to .rodata can't race, so this saves time, memory and trace space. static void MapRodata() { // First create temp file. @@ -182,13 +170,13 @@ static void MapRodata() { return; internal_unlink(name); // Unlink it now, so that we can reuse the buffer. fd_t fd = openrv; - // Fill the file with kShadowRodata. + // Fill the file with Shadow::kRodata. const uptr kMarkerSize = 512 * 1024 / sizeof(RawShadow); InternalMmapVector marker(kMarkerSize); // volatile to prevent insertion of memset for (volatile RawShadow *p = marker.data(); p < marker.data() + kMarkerSize; p++) - *p = kShadowRodata; + *p = Shadow::kRodata; internal_write(fd, marker.data(), marker.size() * sizeof(RawShadow)); // Map the file into memory. uptr page = internal_mmap(0, GetPageSizeCached(), PROT_READ | PROT_WRITE, diff --git a/libsanitizer/tsan/tsan_platform_mac.cpp b/libsanitizer/tsan/tsan_platform_mac.cpp index 3faa2d0..44b98d4 100644 --- a/libsanitizer/tsan/tsan_platform_mac.cpp +++ b/libsanitizer/tsan/tsan_platform_mac.cpp @@ -25,6 +25,7 @@ #include "tsan_rtl.h" #include "tsan_flags.h" +#include #include #include #include @@ -45,76 +46,86 @@ namespace __tsan { #if !SANITIZER_GO -static void *SignalSafeGetOrAllocate(uptr *dst, uptr size) { - atomic_uintptr_t *a = (atomic_uintptr_t *)dst; - void *val = (void *)atomic_load_relaxed(a); - atomic_signal_fence(memory_order_acquire); // Turns the previous load into - // acquire wrt signals. 
- if (UNLIKELY(val == nullptr)) { - val = (void *)internal_mmap(nullptr, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); - CHECK(val); - void *cmp = nullptr; - if (!atomic_compare_exchange_strong(a, (uintptr_t *)&cmp, (uintptr_t)val, - memory_order_acq_rel)) { - internal_munmap(val, size); - val = cmp; - } - } - return val; +static char main_thread_state[sizeof(ThreadState)] ALIGNED( + SANITIZER_CACHE_LINE_SIZE); +static ThreadState *dead_thread_state; +static pthread_key_t thread_state_key; + +// We rely on the following documented, but Darwin-specific behavior to keep the +// reference to the ThreadState object alive in TLS: +// pthread_key_create man page: +// If, after all the destructors have been called for all non-NULL values with +// associated destructors, there are still some non-NULL values with +// associated destructors, then the process is repeated. If, after at least +// [PTHREAD_DESTRUCTOR_ITERATIONS] iterations of destructor calls for +// outstanding non-NULL values, there are still some non-NULL values with +// associated destructors, the implementation stops calling destructors. +static_assert(PTHREAD_DESTRUCTOR_ITERATIONS == 4, "Small number of iterations"); +static void ThreadStateDestructor(void *thr) { + int res = pthread_setspecific(thread_state_key, thr); + CHECK_EQ(res, 0); } -// On OS X, accessing TLVs via __thread or manually by using pthread_key_* is -// problematic, because there are several places where interceptors are called -// when TLVs are not accessible (early process startup, thread cleanup, ...). -// The following provides a "poor man's TLV" implementation, where we use the -// shadow memory of the pointer returned by pthread_self() to store a pointer to -// the ThreadState object. The main thread's ThreadState is stored separately -// in a static variable, because we need to access it even before the -// shadow memory is set up. -static uptr main_thread_identity = 0; -ALIGNED(64) static char main_thread_state[sizeof(ThreadState)]; -static ThreadState *main_thread_state_loc = (ThreadState *)main_thread_state; - -// We cannot use pthread_self() before libpthread has been initialized. Our -// current heuristic for guarding this is checking `main_thread_identity` which -// is only assigned in `__tsan::InitializePlatform`. -static ThreadState **cur_thread_location() { - if (main_thread_identity == 0) - return &main_thread_state_loc; - uptr thread_identity = (uptr)pthread_self(); - if (thread_identity == main_thread_identity) - return &main_thread_state_loc; - return (ThreadState **)MemToShadow(thread_identity); +static void InitializeThreadStateStorage() { + int res; + CHECK_EQ(thread_state_key, 0); + res = pthread_key_create(&thread_state_key, ThreadStateDestructor); + CHECK_EQ(res, 0); + res = pthread_setspecific(thread_state_key, main_thread_state); + CHECK_EQ(res, 0); + + auto dts = (ThreadState *)MmapOrDie(sizeof(ThreadState), "ThreadState"); + dts->fast_state.SetIgnoreBit(); + dts->ignore_interceptors = 1; + dts->is_dead = true; + const_cast(dts->tid) = kInvalidTid; + res = internal_mprotect(dts, sizeof(ThreadState), PROT_READ); // immutable + CHECK_EQ(res, 0); + dead_thread_state = dts; } ThreadState *cur_thread() { - return (ThreadState *)SignalSafeGetOrAllocate( - (uptr *)cur_thread_location(), sizeof(ThreadState)); + // Some interceptors get called before libpthread has been initialized and in + // these cases we must avoid calling any pthread APIs. 
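
The Darwin storage scheme introduced above replaces the old shadow-of-pthread_self trick with an ordinary pthread key whose destructor re-installs the value, relying on the documented destructor-iteration behavior to keep the slot readable for interceptors that run during thread teardown. A self-contained sketch of that pattern (PerThread and all names are illustrative):

#include <pthread.h>
#include <cstdlib>

struct PerThread { int anything; };

static pthread_key_t tls_key;

// Re-installing a non-NULL value makes pthread run this destructor again on
// its next iteration, so the slot stays usable while other destructors run.
static void KeepAliveDestructor(void *p) {
  pthread_setspecific(tls_key, p);
}

static void InitTls() {
  pthread_key_create(&tls_key, KeepAliveDestructor);
}

static PerThread *GetPerThread() {
  void *p = pthread_getspecific(tls_key);
  if (!p) {
    p = std::calloc(1, sizeof(PerThread));
    pthread_setspecific(tls_key, p);
  }
  return static_cast<PerThread *>(p);
}

int main() {
  InitTls();
  GetPerThread()->anything = 42;
  return 0;
}

In the patch the destructor swaps in a read-only dead_thread_state instead of keeping the real object alive forever, which bounds the lifetime while still giving late interceptors something valid to read.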
+ if (UNLIKELY(!thread_state_key)) { + return (ThreadState *)main_thread_state; + } + + // We only reach this line after InitializeThreadStateStorage() ran, i.e, + // after TSan (and therefore libpthread) have been initialized. + ThreadState *thr = (ThreadState *)pthread_getspecific(thread_state_key); + if (UNLIKELY(!thr)) { + thr = (ThreadState *)MmapOrDie(sizeof(ThreadState), "ThreadState"); + int res = pthread_setspecific(thread_state_key, thr); + CHECK_EQ(res, 0); + } + return thr; } void set_cur_thread(ThreadState *thr) { - *cur_thread_location() = thr; + int res = pthread_setspecific(thread_state_key, thr); + CHECK_EQ(res, 0); } -// TODO(kuba.brecka): This is not async-signal-safe. In particular, we call -// munmap first and then clear `fake_tls`; if we receive a signal in between, -// handler will try to access the unmapped ThreadState. void cur_thread_finalize() { - ThreadState **thr_state_loc = cur_thread_location(); - if (thr_state_loc == &main_thread_state_loc) { + ThreadState *thr = (ThreadState *)pthread_getspecific(thread_state_key); + CHECK(thr); + if (thr == (ThreadState *)main_thread_state) { // Calling dispatch_main() or xpc_main() actually invokes pthread_exit to // exit the main thread. Let's keep the main thread's ThreadState. return; } - internal_munmap(*thr_state_loc, sizeof(ThreadState)); - *thr_state_loc = nullptr; + // Intercepted functions can still get called after cur_thread_finalize() + // (called from DestroyThreadState()), so put a fake thread state for "dead" + // threads. An alternative solution would be to release the ThreadState + // object from THREAD_DESTROY (which is delivered later and on the parent + // thread) instead of THREAD_TERMINATE. + int res = pthread_setspecific(thread_state_key, dead_thread_state); + CHECK_EQ(res, 0); + UnmapOrDie(thr, sizeof(ThreadState)); } #endif -void FlushShadowMemory() { -} - static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) { vm_address_t address = start; vm_address_t end_address = end; @@ -142,12 +153,10 @@ static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) { void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) { uptr shadow_res, shadow_dirty; uptr meta_res, meta_dirty; - uptr trace_res, trace_dirty; RegionMemUsage(ShadowBeg(), ShadowEnd(), &shadow_res, &shadow_dirty); RegionMemUsage(MetaShadowBeg(), MetaShadowEnd(), &meta_res, &meta_dirty); - RegionMemUsage(TraceMemBeg(), TraceMemEnd(), &trace_res, &trace_dirty); -#if !SANITIZER_GO +# if !SANITIZER_GO uptr low_res, low_dirty; uptr high_res, high_dirty; uptr heap_res, heap_dirty; @@ -166,7 +175,6 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) { buf, buf_size, "shadow (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" "meta (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" - "traces (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" # if !SANITIZER_GO "low app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" "high app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n" @@ -179,7 +187,6 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) { "------------------------------\n", ShadowBeg(), ShadowEnd(), shadow_res / 1024, shadow_dirty / 1024, MetaShadowBeg(), MetaShadowEnd(), meta_res / 1024, meta_dirty / 1024, - TraceMemBeg(), TraceMemEnd(), trace_res / 1024, trace_dirty / 1024, # if !SANITIZER_GO LoAppMemBeg(), LoAppMemEnd(), low_res / 1024, low_dirty / 1024, HiAppMemBeg(), HiAppMemEnd(), high_res / 1024, high_dirty / 1024, @@ -222,11 +229,10 @@ static void 
my_pthread_introspection_hook(unsigned int event, pthread_t thread, ThreadStart(thr, tid, GetTid(), ThreadType::Worker); } } else if (event == PTHREAD_INTROSPECTION_THREAD_TERMINATE) { - if (thread == pthread_self()) { - ThreadState *thr = cur_thread(); - if (thr->tctx) { - DestroyThreadState(); - } + CHECK_EQ(thread, pthread_self()); + ThreadState *thr = cur_thread(); + if (thr->tctx) { + DestroyThreadState(); } } @@ -253,8 +259,7 @@ void InitializePlatform() { #if !SANITIZER_GO CheckAndProtect(); - CHECK_EQ(main_thread_identity, 0); - main_thread_identity = (uptr)pthread_self(); + InitializeThreadStateStorage(); prev_pthread_introspection_hook = pthread_introspection_hook_install(&my_pthread_introspection_hook); @@ -286,24 +291,11 @@ uptr ExtractLongJmpSp(uptr *env) { extern "C" void __tsan_tls_initialization() {} void ImitateTlsWrite(ThreadState *thr, uptr tls_addr, uptr tls_size) { - // The pointer to the ThreadState object is stored in the shadow memory - // of the tls. - uptr tls_end = tls_addr + tls_size; - uptr thread_identity = (uptr)pthread_self(); const uptr pc = StackTrace::GetNextInstructionPc( reinterpret_cast(__tsan_tls_initialization)); - if (thread_identity == main_thread_identity) { - MemoryRangeImitateWrite(thr, pc, tls_addr, tls_size); - } else { - uptr thr_state_start = thread_identity; - uptr thr_state_end = thr_state_start + sizeof(uptr); - CHECK_GE(thr_state_start, tls_addr); - CHECK_LE(thr_state_start, tls_addr + tls_size); - CHECK_GE(thr_state_end, tls_addr); - CHECK_LE(thr_state_end, tls_addr + tls_size); - MemoryRangeImitateWrite(thr, pc, tls_addr, thr_state_start - tls_addr); - MemoryRangeImitateWrite(thr, pc, thr_state_end, tls_end - thr_state_end); - } + // Unlike Linux, we only store a pointer to the ThreadState object in TLS; + // just mark the entire range as written to. + MemoryRangeImitateWrite(thr, pc, tls_addr, tls_size); } #endif diff --git a/libsanitizer/tsan/tsan_platform_posix.cpp b/libsanitizer/tsan/tsan_platform_posix.cpp index 763ac44..71874aa 100644 --- a/libsanitizer/tsan/tsan_platform_posix.cpp +++ b/libsanitizer/tsan/tsan_platform_posix.cpp @@ -110,27 +110,23 @@ void CheckAndProtect() { Die(); } -# if defined(__aarch64__) && defined(__APPLE__) && SANITIZER_IOS +# if SANITIZER_IOS && !SANITIZER_IOSSIM ProtectRange(HeapMemEnd(), ShadowBeg()); ProtectRange(ShadowEnd(), MetaShadowBeg()); - ProtectRange(MetaShadowEnd(), TraceMemBeg()); -#else + ProtectRange(MetaShadowEnd(), HiAppMemBeg()); +# else ProtectRange(LoAppMemEnd(), ShadowBeg()); ProtectRange(ShadowEnd(), MetaShadowBeg()); if (MidAppMemBeg()) { ProtectRange(MetaShadowEnd(), MidAppMemBeg()); - ProtectRange(MidAppMemEnd(), TraceMemBeg()); + ProtectRange(MidAppMemEnd(), HeapMemBeg()); } else { - ProtectRange(MetaShadowEnd(), TraceMemBeg()); + ProtectRange(MetaShadowEnd(), HeapMemBeg()); } - // Memory for traces is mapped lazily in MapThreadTrace. - // Protect the whole range for now, so that user does not map something here. - ProtectRange(TraceMemBeg(), TraceMemEnd()); - ProtectRange(TraceMemEnd(), HeapMemBeg()); ProtectRange(HeapEnd(), HiAppMemBeg()); -#endif +# endif -#if defined(__s390x__) +# if defined(__s390x__) // Protect the rest of the address space. 
const uptr user_addr_max_l4 = 0x0020000000000000ull; const uptr user_addr_max_l5 = 0xfffffffffffff000ull; diff --git a/libsanitizer/tsan/tsan_platform_windows.cpp b/libsanitizer/tsan/tsan_platform_windows.cpp index fea8937..eb8f354 100644 --- a/libsanitizer/tsan/tsan_platform_windows.cpp +++ b/libsanitizer/tsan/tsan_platform_windows.cpp @@ -20,9 +20,6 @@ namespace __tsan { -void FlushShadowMemory() { -} - void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {} void InitializePlatformEarly() { diff --git a/libsanitizer/tsan/tsan_report.cpp b/libsanitizer/tsan/tsan_report.cpp index a926c37..9f15127 100644 --- a/libsanitizer/tsan/tsan_report.cpp +++ b/libsanitizer/tsan/tsan_report.cpp @@ -126,7 +126,7 @@ static void PrintMutexSet(Vector const& mset) { if (i == 0) Printf(" (mutexes:"); const ReportMopMutex m = mset[i]; - Printf(" %s M%llu", m.write ? "write" : "read", m.id); + Printf(" %s M%u", m.write ? "write" : "read", m.id); Printf(i == mset.Size() - 1 ? ")" : ","); } } @@ -211,29 +211,23 @@ static void PrintLocation(const ReportLocation *loc) { static void PrintMutexShort(const ReportMutex *rm, const char *after) { Decorator d; - Printf("%sM%lld%s%s", d.Mutex(), rm->id, d.Default(), after); + Printf("%sM%d%s%s", d.Mutex(), rm->id, d.Default(), after); } static void PrintMutexShortWithAddress(const ReportMutex *rm, const char *after) { Decorator d; - Printf("%sM%lld (%p)%s%s", d.Mutex(), rm->id, + Printf("%sM%d (%p)%s%s", d.Mutex(), rm->id, reinterpret_cast(rm->addr), d.Default(), after); } static void PrintMutex(const ReportMutex *rm) { Decorator d; - if (rm->destroyed) { - Printf("%s", d.Mutex()); - Printf(" Mutex M%llu is already destroyed.\n\n", rm->id); - Printf("%s", d.Default()); - } else { - Printf("%s", d.Mutex()); - Printf(" Mutex M%llu (%p) created at:\n", rm->id, - reinterpret_cast(rm->addr)); - Printf("%s", d.Default()); - PrintStack(rm->stack); - } + Printf("%s", d.Mutex()); + Printf(" Mutex M%u (%p) created at:\n", rm->id, + reinterpret_cast(rm->addr)); + Printf("%s", d.Default()); + PrintStack(rm->stack); } static void PrintThread(const ReportThread *rt) { @@ -312,6 +306,9 @@ void PrintReport(const ReportDesc *rep) { (int)internal_getpid()); Printf("%s", d.Default()); + if (rep->typ == ReportTypeErrnoInSignal) + Printf(" Signal %u handler invoked at:\n", rep->signum); + if (rep->typ == ReportTypeDeadlock) { char thrbuf[kThreadBufSize]; Printf(" Cycle in lock order graph: "); @@ -460,12 +457,12 @@ void PrintReport(const ReportDesc *rep) { } else if (rep->typ == ReportTypeDeadlock) { Printf("WARNING: DEADLOCK\n"); for (uptr i = 0; i < rep->mutexes.Size(); i++) { - Printf("Goroutine %d lock mutex %llu while holding mutex %llu:\n", 999, + Printf("Goroutine %d lock mutex %u while holding mutex %u:\n", 999, rep->mutexes[i]->id, rep->mutexes[(i + 1) % rep->mutexes.Size()]->id); PrintStack(rep->stacks[2*i]); Printf("\n"); - Printf("Mutex %llu was previously locked here:\n", + Printf("Mutex %u was previously locked here:\n", rep->mutexes[(i + 1) % rep->mutexes.Size()]->id); PrintStack(rep->stacks[2*i + 1]); Printf("\n"); diff --git a/libsanitizer/tsan/tsan_report.h b/libsanitizer/tsan/tsan_report.h index d68c2db..718eacd 100644 --- a/libsanitizer/tsan/tsan_report.h +++ b/libsanitizer/tsan/tsan_report.h @@ -43,7 +43,7 @@ struct ReportStack { }; struct ReportMopMutex { - u64 id; + int id; bool write; }; @@ -91,9 +91,8 @@ struct ReportThread { }; struct ReportMutex { - u64 id; + int id; uptr addr; - bool destroyed; ReportStack *stack; }; @@ -109,6 +108,7 @@ class ReportDesc 
{ Vector unique_tids; ReportStack *sleep; int count; + int signum = 0; ReportDesc(); ~ReportDesc(); diff --git a/libsanitizer/tsan/tsan_rtl.cpp b/libsanitizer/tsan/tsan_rtl.cpp index 46dec04..1d6fc72 100644 --- a/libsanitizer/tsan/tsan_rtl.cpp +++ b/libsanitizer/tsan/tsan_rtl.cpp @@ -16,6 +16,7 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_file.h" +#include "sanitizer_common/sanitizer_interface_internal.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_placement_new.h" #include "sanitizer_common/sanitizer_stackdepot.h" @@ -34,6 +35,9 @@ extern "C" void __tsan_resume() { __tsan_resumed = 1; } +SANITIZER_WEAK_DEFAULT_IMPL +void __tsan_test_only_on_fork() {} + namespace __tsan { #if !SANITIZER_GO @@ -54,109 +58,355 @@ Context *ctx; bool OnFinalize(bool failed); void OnInitialize(); #else -#include SANITIZER_WEAK_CXX_DEFAULT_IMPL bool OnFinalize(bool failed) { -#if !SANITIZER_GO +# if !SANITIZER_GO if (on_finalize) return on_finalize(failed); -#endif +# endif return failed; } + SANITIZER_WEAK_CXX_DEFAULT_IMPL void OnInitialize() { -#if !SANITIZER_GO +# if !SANITIZER_GO if (on_initialize) on_initialize(); -#endif +# endif } #endif -static ThreadContextBase *CreateThreadContext(Tid tid) { - // Map thread trace when context is created. - char name[50]; - internal_snprintf(name, sizeof(name), "trace %u", tid); - MapThreadTrace(GetThreadTrace(tid), TraceSize() * sizeof(Event), name); - const uptr hdr = GetThreadTraceHeader(tid); - internal_snprintf(name, sizeof(name), "trace header %u", tid); - MapThreadTrace(hdr, sizeof(Trace), name); - new((void*)hdr) Trace(); - // We are going to use only a small part of the trace with the default - // value of history_size. However, the constructor writes to the whole trace. - // Release the unused part. 
- uptr hdr_end = hdr + sizeof(Trace); - hdr_end -= sizeof(TraceHeader) * (kTraceParts - TraceParts()); - hdr_end = RoundUp(hdr_end, GetPageSizeCached()); - if (hdr_end < hdr + sizeof(Trace)) { - ReleaseMemoryPagesToOS(hdr_end, hdr + sizeof(Trace)); - uptr unused = hdr + sizeof(Trace) - hdr_end; - if (hdr_end != (uptr)MmapFixedNoAccess(hdr_end, unused)) { - Report("ThreadSanitizer: failed to mprotect [0x%zx-0x%zx) \n", hdr_end, - unused); - CHECK("unable to mprotect" && 0); +static TracePart* TracePartAlloc(ThreadState* thr) { + TracePart* part = nullptr; + { + Lock lock(&ctx->slot_mtx); + uptr max_parts = Trace::kMinParts + flags()->history_size; + Trace* trace = &thr->tctx->trace; + if (trace->parts_allocated == max_parts || + ctx->trace_part_finished_excess) { + part = ctx->trace_part_recycle.PopFront(); + DPrintf("#%d: TracePartAlloc: part=%p\n", thr->tid, part); + if (part && part->trace) { + Trace* trace1 = part->trace; + Lock trace_lock(&trace1->mtx); + part->trace = nullptr; + TracePart* part1 = trace1->parts.PopFront(); + CHECK_EQ(part, part1); + if (trace1->parts_allocated > trace1->parts.Size()) { + ctx->trace_part_finished_excess += + trace1->parts_allocated - trace1->parts.Size(); + trace1->parts_allocated = trace1->parts.Size(); + } + } + } + if (trace->parts_allocated < max_parts) { + trace->parts_allocated++; + if (ctx->trace_part_finished_excess) + ctx->trace_part_finished_excess--; } + if (!part) + ctx->trace_part_total_allocated++; + else if (ctx->trace_part_recycle_finished) + ctx->trace_part_recycle_finished--; } - return New(tid); + if (!part) + part = new (MmapOrDie(sizeof(*part), "TracePart")) TracePart(); + return part; } +static void TracePartFree(TracePart* part) SANITIZER_REQUIRES(ctx->slot_mtx) { + DCHECK(part->trace); + part->trace = nullptr; + ctx->trace_part_recycle.PushFront(part); +} + +void TraceResetForTesting() { + Lock lock(&ctx->slot_mtx); + while (auto* part = ctx->trace_part_recycle.PopFront()) { + if (auto trace = part->trace) + CHECK_EQ(trace->parts.PopFront(), part); + UnmapOrDie(part, sizeof(*part)); + } + ctx->trace_part_total_allocated = 0; + ctx->trace_part_recycle_finished = 0; + ctx->trace_part_finished_excess = 0; +} + +static void DoResetImpl(uptr epoch) { + ThreadRegistryLock lock0(&ctx->thread_registry); + Lock lock1(&ctx->slot_mtx); + CHECK_EQ(ctx->global_epoch, epoch); + ctx->global_epoch++; + CHECK(!ctx->resetting); + ctx->resetting = true; + for (u32 i = ctx->thread_registry.NumThreadsLocked(); i--;) { + ThreadContext* tctx = (ThreadContext*)ctx->thread_registry.GetThreadLocked( + static_cast(i)); + // Potentially we could purge all ThreadStatusDead threads from the + // registry. Since we reset all shadow, they can't race with anything + // anymore. However, their tid's can still be stored in some aux places + // (e.g. tid of thread that created something). + auto trace = &tctx->trace; + Lock lock(&trace->mtx); + bool attached = tctx->thr && tctx->thr->slot; + auto parts = &trace->parts; + bool local = false; + while (!parts->Empty()) { + auto part = parts->Front(); + local = local || part == trace->local_head; + if (local) + CHECK(!ctx->trace_part_recycle.Queued(part)); + else + ctx->trace_part_recycle.Remove(part); + if (attached && parts->Size() == 1) { + // The thread is running and this is the last/current part. + // Set the trace position to the end of the current part + // to force the thread to call SwitchTracePart and re-attach + // to a new slot and allocate a new trace part. 
+ // Note: the thread is concurrently modifying the position as well, + // so this is only best-effort. The thread can only modify position + // within this part, because switching parts is protected by + // slot/trace mutexes that we hold here. + atomic_store_relaxed( + &tctx->thr->trace_pos, + reinterpret_cast(&part->events[TracePart::kSize])); + break; + } + parts->Remove(part); + TracePartFree(part); + } + CHECK_LE(parts->Size(), 1); + trace->local_head = parts->Front(); + if (tctx->thr && !tctx->thr->slot) { + atomic_store_relaxed(&tctx->thr->trace_pos, 0); + tctx->thr->trace_prev_pc = 0; + } + if (trace->parts_allocated > trace->parts.Size()) { + ctx->trace_part_finished_excess += + trace->parts_allocated - trace->parts.Size(); + trace->parts_allocated = trace->parts.Size(); + } + } + while (ctx->slot_queue.PopFront()) { + } + for (auto& slot : ctx->slots) { + slot.SetEpoch(kEpochZero); + slot.journal.Reset(); + slot.thr = nullptr; + ctx->slot_queue.PushBack(&slot); + } + + DPrintf("Resetting shadow...\n"); + if (!MmapFixedSuperNoReserve(ShadowBeg(), ShadowEnd() - ShadowBeg(), + "shadow")) { + Printf("failed to reset shadow memory\n"); + Die(); + } + DPrintf("Resetting meta shadow...\n"); + ctx->metamap.ResetClocks(); + ctx->resetting = false; +} + +// Clang does not understand locking all slots in the loop: +// error: expecting mutex 'slot.mtx' to be held at start of each loop +void DoReset(ThreadState* thr, uptr epoch) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + { + for (auto& slot : ctx->slots) { + slot.mtx.Lock(); + if (UNLIKELY(epoch == 0)) + epoch = ctx->global_epoch; + if (UNLIKELY(epoch != ctx->global_epoch)) { + // Epoch can't change once we've locked the first slot. + CHECK_EQ(slot.sid, 0); + slot.mtx.Unlock(); + return; + } + } + } + DPrintf("#%d: DoReset epoch=%lu\n", thr ? thr->tid : -1, epoch); + DoResetImpl(epoch); + for (auto& slot : ctx->slots) slot.mtx.Unlock(); +} + +void FlushShadowMemory() { DoReset(nullptr, 0); } + +static TidSlot* FindSlotAndLock(ThreadState* thr) + SANITIZER_ACQUIRE(thr->slot->mtx) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + CHECK(!thr->slot); + TidSlot* slot = nullptr; + for (;;) { + uptr epoch; + { + Lock lock(&ctx->slot_mtx); + epoch = ctx->global_epoch; + if (slot) { + // This is an exhausted slot from the previous iteration. 
+ if (ctx->slot_queue.Queued(slot)) + ctx->slot_queue.Remove(slot); + thr->slot_locked = false; + slot->mtx.Unlock(); + } + for (;;) { + slot = ctx->slot_queue.PopFront(); + if (!slot) + break; + if (slot->epoch() != kEpochLast) { + ctx->slot_queue.PushBack(slot); + break; + } + } + } + if (!slot) { + DoReset(thr, epoch); + continue; + } + slot->mtx.Lock(); + CHECK(!thr->slot_locked); + thr->slot_locked = true; + if (slot->thr) { + DPrintf("#%d: preempting sid=%d tid=%d\n", thr->tid, (u32)slot->sid, + slot->thr->tid); + slot->SetEpoch(slot->thr->fast_state.epoch()); + slot->thr = nullptr; + } + if (slot->epoch() != kEpochLast) + return slot; + } +} + +void SlotAttachAndLock(ThreadState* thr) { + TidSlot* slot = FindSlotAndLock(thr); + DPrintf("#%d: SlotAttach: slot=%u\n", thr->tid, static_cast(slot->sid)); + CHECK(!slot->thr); + CHECK(!thr->slot); + slot->thr = thr; + thr->slot = slot; + Epoch epoch = EpochInc(slot->epoch()); + CHECK(!EpochOverflow(epoch)); + slot->SetEpoch(epoch); + thr->fast_state.SetSid(slot->sid); + thr->fast_state.SetEpoch(epoch); + if (thr->slot_epoch != ctx->global_epoch) { + thr->slot_epoch = ctx->global_epoch; + thr->clock.Reset(); #if !SANITIZER_GO -static const u32 kThreadQuarantineSize = 16; -#else -static const u32 kThreadQuarantineSize = 64; + thr->last_sleep_stack_id = kInvalidStackID; + thr->last_sleep_clock.Reset(); +#endif + } + thr->clock.Set(slot->sid, epoch); + slot->journal.PushBack({thr->tid, epoch}); +} + +static void SlotDetachImpl(ThreadState* thr, bool exiting) { + TidSlot* slot = thr->slot; + thr->slot = nullptr; + if (thr != slot->thr) { + slot = nullptr; // we don't own the slot anymore + if (thr->slot_epoch != ctx->global_epoch) { + TracePart* part = nullptr; + auto* trace = &thr->tctx->trace; + { + Lock l(&trace->mtx); + auto* parts = &trace->parts; + // The trace can be completely empty in an unlikely event + // the thread is preempted right after it acquired the slot + // in ThreadStart and did not trace any events yet. + CHECK_LE(parts->Size(), 1); + part = parts->PopFront(); + thr->tctx->trace.local_head = nullptr; + atomic_store_relaxed(&thr->trace_pos, 0); + thr->trace_prev_pc = 0; + } + if (part) { + Lock l(&ctx->slot_mtx); + TracePartFree(part); + } + } + return; + } + CHECK(exiting || thr->fast_state.epoch() == kEpochLast); + slot->SetEpoch(thr->fast_state.epoch()); + slot->thr = nullptr; +} + +void SlotDetach(ThreadState* thr) { + Lock lock(&thr->slot->mtx); + SlotDetachImpl(thr, true); +} + +void SlotLock(ThreadState* thr) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + DCHECK(!thr->slot_locked); +#if SANITIZER_DEBUG + // Check these mutexes are not locked. + // We can call DoReset from SlotAttachAndLock, which will lock + // these mutexes, but it happens only every once in a while. 
+ { ThreadRegistryLock lock(&ctx->thread_registry); } + { Lock lock(&ctx->slot_mtx); } #endif + TidSlot* slot = thr->slot; + slot->mtx.Lock(); + thr->slot_locked = true; + if (LIKELY(thr == slot->thr && thr->fast_state.epoch() != kEpochLast)) + return; + SlotDetachImpl(thr, false); + thr->slot_locked = false; + slot->mtx.Unlock(); + SlotAttachAndLock(thr); +} + +void SlotUnlock(ThreadState* thr) { + DCHECK(thr->slot_locked); + thr->slot_locked = false; + thr->slot->mtx.Unlock(); +} Context::Context() : initialized(), report_mtx(MutexTypeReport), nreported(), - thread_registry(CreateThreadContext, kMaxTid, kThreadQuarantineSize, - kMaxTidReuse), + thread_registry([](Tid tid) -> ThreadContextBase* { + return new (Alloc(sizeof(ThreadContext))) ThreadContext(tid); + }), racy_mtx(MutexTypeRacy), racy_stacks(), racy_addresses(), fired_suppressions_mtx(MutexTypeFired), - clock_alloc(LINKER_INITIALIZED, "clock allocator") { + slot_mtx(MutexTypeSlots), + resetting() { fired_suppressions.reserve(8); + for (uptr i = 0; i < ARRAY_SIZE(slots); i++) { + TidSlot* slot = &slots[i]; + slot->sid = static_cast(i); + slot_queue.PushBack(slot); + } + global_epoch = 1; } +TidSlot::TidSlot() : mtx(MutexTypeSlot) {} + // The objects are allocated in TLS, so one may rely on zero-initialization. -ThreadState::ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch, - unsigned reuse_count, uptr stk_addr, uptr stk_size, - uptr tls_addr, uptr tls_size) - : fast_state(tid, epoch) - // Do not touch these, rely on zero initialization, - // they may be accessed before the ctor. - // , ignore_reads_and_writes() - // , ignore_interceptors() - , - clock(tid, reuse_count) -#if !SANITIZER_GO - , - jmp_bufs() -#endif - , - tid(tid), - unique_id(unique_id), - stk_addr(stk_addr), - stk_size(stk_size), - tls_addr(tls_addr), - tls_size(tls_size) -#if !SANITIZER_GO - , - last_sleep_clock(tid) -#endif -{ +ThreadState::ThreadState(Tid tid) + // Do not touch these, rely on zero initialization, + // they may be accessed before the ctor. + // ignore_reads_and_writes() + // ignore_interceptors() + : tid(tid) { CHECK_EQ(reinterpret_cast(this) % SANITIZER_CACHE_LINE_SIZE, 0); #if !SANITIZER_GO - shadow_stack_pos = shadow_stack; - shadow_stack_end = shadow_stack + kShadowStackSize; + // C/C++ uses fixed size shadow stack. + const int kInitStackSize = kShadowStackSize; + shadow_stack = static_cast( + MmapNoReserveOrDie(kInitStackSize * sizeof(uptr), "shadow stack")); + SetShadowRegionHugePageMode(reinterpret_cast(shadow_stack), + kInitStackSize * sizeof(uptr)); #else - // Setup dynamic shadow stack. + // Go uses malloc-allocated shadow stack with dynamic size. 
const int kInitStackSize = 8; - shadow_stack = (uptr *)Alloc(kInitStackSize * sizeof(uptr)); + shadow_stack = static_cast(Alloc(kInitStackSize * sizeof(uptr))); +#endif shadow_stack_pos = shadow_stack; shadow_stack_end = shadow_stack + kInitStackSize; -#endif } #if !SANITIZER_GO @@ -168,11 +418,11 @@ void MemoryProfiler(u64 uptime) { WriteToFile(ctx->memprof_fd, buf.data(), internal_strlen(buf.data())); } -void InitializeMemoryProfiler() { +static bool InitializeMemoryProfiler() { ctx->memprof_fd = kInvalidFd; const char *fname = flags()->profile_memory; if (!fname || !fname[0]) - return; + return false; if (internal_strcmp(fname, "stdout") == 0) { ctx->memprof_fd = 1; } else if (internal_strcmp(fname, "stderr") == 0) { @@ -184,11 +434,11 @@ void InitializeMemoryProfiler() { if (ctx->memprof_fd == kInvalidFd) { Printf("ThreadSanitizer: failed to open memory profile file '%s'\n", filename.data()); - return; + return false; } } MemoryProfiler(0); - MaybeSpawnBackgroundThread(); + return true; } static void *BackgroundThread(void *arg) { @@ -200,33 +450,34 @@ static void *BackgroundThread(void *arg) { const u64 kMs2Ns = 1000 * 1000; const u64 start = NanoTime(); - u64 last_flush = NanoTime(); + u64 last_flush = start; uptr last_rss = 0; - for (int i = 0; - atomic_load(&ctx->stop_background_thread, memory_order_relaxed) == 0; - i++) { + while (!atomic_load_relaxed(&ctx->stop_background_thread)) { SleepForMillis(100); u64 now = NanoTime(); // Flush memory if requested. if (flags()->flush_memory_ms > 0) { if (last_flush + flags()->flush_memory_ms * kMs2Ns < now) { - VPrintf(1, "ThreadSanitizer: periodic memory flush\n"); + VReport(1, "ThreadSanitizer: periodic memory flush\n"); FlushShadowMemory(); - last_flush = NanoTime(); + now = last_flush = NanoTime(); } } if (flags()->memory_limit_mb > 0) { uptr rss = GetRSS(); uptr limit = uptr(flags()->memory_limit_mb) << 20; - VPrintf(1, "ThreadSanitizer: memory flush check" - " RSS=%llu LAST=%llu LIMIT=%llu\n", + VReport(1, + "ThreadSanitizer: memory flush check" + " RSS=%llu LAST=%llu LIMIT=%llu\n", (u64)rss >> 20, (u64)last_rss >> 20, (u64)limit >> 20); if (2 * rss > limit + last_rss) { - VPrintf(1, "ThreadSanitizer: flushing memory due to RSS\n"); + VReport(1, "ThreadSanitizer: flushing memory due to RSS\n"); FlushShadowMemory(); rss = GetRSS(); - VPrintf(1, "ThreadSanitizer: memory flushed RSS=%llu\n", (u64)rss>>20); + now = NanoTime(); + VReport(1, "ThreadSanitizer: memory flushed RSS=%llu\n", + (u64)rss >> 20); } last_rss = rss; } @@ -267,11 +518,43 @@ void DontNeedShadowFor(uptr addr, uptr size) { } #if !SANITIZER_GO +// We call UnmapShadow before the actual munmap, at that point we don't yet +// know if the provided address/size are sane. We can't call UnmapShadow +// after the actual munmap becuase at that point the memory range can +// already be reused for something else, so we can't rely on the munmap +// return value to understand is the values are sane. +// While calling munmap with insane values (non-canonical address, negative +// size, etc) is an error, the kernel won't crash. We must also try to not +// crash as the failure mode is very confusing (paging fault inside of the +// runtime on some derived shadow address). 
+static bool IsValidMmapRange(uptr addr, uptr size) { + if (size == 0) + return true; + if (static_cast(size) < 0) + return false; + if (!IsAppMem(addr) || !IsAppMem(addr + size - 1)) + return false; + // Check that if the start of the region belongs to one of app ranges, + // end of the region belongs to the same region. + const uptr ranges[][2] = { + {LoAppMemBeg(), LoAppMemEnd()}, + {MidAppMemBeg(), MidAppMemEnd()}, + {HiAppMemBeg(), HiAppMemEnd()}, + }; + for (auto range : ranges) { + if (addr >= range[0] && addr < range[1]) + return addr + size <= range[1]; + } + return false; +} + void UnmapShadow(ThreadState *thr, uptr addr, uptr size) { - if (size == 0) return; + if (size == 0 || !IsValidMmapRange(addr, size)) + return; DontNeedShadowFor(addr, size); ScopedGlobalProcessor sgp; - ctx->metamap.ResetRange(thr->proc(), addr, size); + SlotLocker locker(thr, true); + ctx->metamap.ResetRange(thr->proc(), addr, size, true); } #endif @@ -317,18 +600,6 @@ void MapShadow(uptr addr, uptr size) { addr + size, meta_begin, meta_end); } -void MapThreadTrace(uptr addr, uptr size, const char *name) { - DPrintf("#0: Mapping trace at 0x%zx-0x%zx(0x%zx)\n", addr, addr + size, size); - CHECK_GE(addr, TraceMemBeg()); - CHECK_LE(addr + size, TraceMemEnd()); - CHECK_EQ(addr, addr & ~((64 << 10) - 1)); // windows wants 64K alignment - if (!MmapFixedSuperNoReserve(addr, size, name)) { - Printf("FATAL: ThreadSanitizer can not mmap thread trace (0x%zx/0x%zx)\n", - addr, size); - Die(); - } -} - #if !SANITIZER_GO static void OnStackUnwind(const SignalContext &sig, const void *, BufferedStackTrace *stack) { @@ -347,8 +618,11 @@ void CheckUnwind() { // since we are going to die soon. ScopedIgnoreInterceptors ignore; #if !SANITIZER_GO - cur_thread()->ignore_sync++; - cur_thread()->ignore_reads_and_writes++; + ThreadState* thr = cur_thread(); + thr->nomalloc = false; + thr->ignore_sync++; + thr->ignore_reads_and_writes++; + atomic_store_relaxed(&thr->in_signal_handler, 0); #endif PrintCurrentStackSlow(StackTrace::GetCurrentPc()); } @@ -403,22 +677,23 @@ void Initialize(ThreadState *thr) { Symbolizer::GetOrInit()->AddHooks(EnterSymbolizer, ExitSymbolizer); #endif - VPrintf(1, "***** Running under ThreadSanitizer v2 (pid %d) *****\n", + VPrintf(1, "***** Running under ThreadSanitizer v3 (pid %d) *****\n", (int)internal_getpid()); // Initialize thread 0. - Tid tid = ThreadCreate(thr, 0, 0, true); + Tid tid = ThreadCreate(nullptr, 0, 0, true); CHECK_EQ(tid, kMainTid); ThreadStart(thr, tid, GetTid(), ThreadType::Regular); #if TSAN_CONTAINS_UBSAN __ubsan::InitAsPlugin(); #endif - ctx->initialized = true; #if !SANITIZER_GO Symbolizer::LateInitialize(); - InitializeMemoryProfiler(); + if (InitializeMemoryProfiler() || flags()->force_background_thread) + MaybeSpawnBackgroundThread(); #endif + ctx->initialized = true; if (flags()->stop_on_start) { Printf("ThreadSanitizer is suspended at startup (pid %d)." @@ -444,7 +719,6 @@ void MaybeSpawnBackgroundThread() { #endif } - int Finalize(ThreadState *thr) { bool failed = false; @@ -452,12 +726,12 @@ int Finalize(ThreadState *thr) { DumpProcessMap(); if (flags()->atexit_sleep_ms > 0 && ThreadCount(thr) > 1) - SleepForMillis(flags()->atexit_sleep_ms); + internal_usleep(u64(flags()->atexit_sleep_ms) * 1000); - // Wait for pending reports. - ctx->report_mtx.Lock(); - { ScopedErrorReportLock l; } - ctx->report_mtx.Unlock(); + { + // Wait for pending reports. 
+ ScopedErrorReportLock lock; + } #if !SANITIZER_GO if (Verbosity()) AllocatorPrintStats(); @@ -483,10 +757,16 @@ int Finalize(ThreadState *thr) { } #if !SANITIZER_GO -void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { +void ForkBefore(ThreadState* thr, uptr pc) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + GlobalProcessorLock(); + // Detaching from the slot makes OnUserFree skip writing to the shadow. + // The slot will be locked so any attempts to use it will deadlock anyway. + SlotDetach(thr); + for (auto& slot : ctx->slots) slot.mtx.Lock(); ctx->thread_registry.Lock(); - ctx->report_mtx.Lock(); + ctx->slot_mtx.Lock(); ScopedErrorReportLock::Lock(); + AllocatorLock(); // Suppress all reports in the pthread_atfork callbacks. // Reports will deadlock on the report_mtx. // We could ignore sync operations as well, @@ -495,29 +775,38 @@ void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { thr->suppress_reports++; // On OS X, REAL(fork) can call intercepted functions (OSSpinLockLock), and // we'll assert in CheckNoLocks() unless we ignore interceptors. + // On OS X libSystem_atfork_prepare/parent/child callbacks are called + // after/before our callbacks and they call free. thr->ignore_interceptors++; + // Disables memory write in OnUserAlloc/Free. + thr->ignore_reads_and_writes++; + + __tsan_test_only_on_fork(); } -void ForkParentAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS { +static void ForkAfter(ThreadState* thr) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { thr->suppress_reports--; // Enabled in ForkBefore. thr->ignore_interceptors--; + thr->ignore_reads_and_writes--; + AllocatorUnlock(); ScopedErrorReportLock::Unlock(); - ctx->report_mtx.Unlock(); + ctx->slot_mtx.Unlock(); ctx->thread_registry.Unlock(); + for (auto& slot : ctx->slots) slot.mtx.Unlock(); + SlotAttachAndLock(thr); + SlotUnlock(thr); + GlobalProcessorUnlock(); } -void ForkChildAfter(ThreadState *thr, uptr pc, - bool start_thread) NO_THREAD_SAFETY_ANALYSIS { - thr->suppress_reports--; // Enabled in ForkBefore. - thr->ignore_interceptors--; - ScopedErrorReportLock::Unlock(); - ctx->report_mtx.Unlock(); - ctx->thread_registry.Unlock(); +void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr); } - uptr nthread = 0; - ctx->thread_registry.GetNumberOfThreads(0, 0, &nthread /* alive threads */); - VPrintf(1, "ThreadSanitizer: forked new process with pid %d," - " parent had %d threads\n", (int)internal_getpid(), (int)nthread); +void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) { + ForkAfter(thr); + u32 nthread = ctx->thread_registry.OnFork(thr->tid); + VPrintf(1, + "ThreadSanitizer: forked new process with pid %d," + " parent had %d threads\n", + (int)internal_getpid(), (int)nthread); if (nthread == 1) { if (start_thread) StartBackgroundThread(); @@ -527,6 +816,7 @@ void ForkChildAfter(ThreadState *thr, uptr pc, // ignores for everything in the hope that we will exec soon. ctx->after_multithreaded_fork = true; thr->ignore_interceptors++; + thr->suppress_reports++; ThreadIgnoreBegin(thr, pc); ThreadIgnoreSyncBegin(thr, pc); } @@ -548,8 +838,10 @@ void GrowShadowStack(ThreadState *thr) { #endif StackID CurrentStackId(ThreadState *thr, uptr pc) { +#if !SANITIZER_GO if (!thr->is_inited) // May happen during bootstrap. 
return kInvalidStackID; +#endif if (pc != 0) { #if !SANITIZER_GO DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end); @@ -567,53 +859,72 @@ StackID CurrentStackId(ThreadState *thr, uptr pc) { return id; } -namespace v3 { - -NOINLINE -void TraceSwitchPart(ThreadState *thr) { +static bool TraceSkipGap(ThreadState* thr) { Trace *trace = &thr->tctx->trace; Event *pos = reinterpret_cast(atomic_load_relaxed(&thr->trace_pos)); DCHECK_EQ(reinterpret_cast(pos + 1) & TracePart::kAlignment, 0); auto *part = trace->parts.Back(); - DPrintf("TraceSwitchPart part=%p pos=%p\n", part, pos); - if (part) { - // We can get here when we still have space in the current trace part. - // The fast-path check in TraceAcquire has false positives in the middle of - // the part. Check if we are indeed at the end of the current part or not, - // and fill any gaps with NopEvent's. - Event *end = &part->events[TracePart::kSize]; - DCHECK_GE(pos, &part->events[0]); - DCHECK_LE(pos, end); - if (pos + 1 < end) { - if ((reinterpret_cast(pos) & TracePart::kAlignment) == - TracePart::kAlignment) - *pos++ = NopEvent; + DPrintf("#%d: TraceSwitchPart enter trace=%p parts=%p-%p pos=%p\n", thr->tid, + trace, trace->parts.Front(), part, pos); + if (!part) + return false; + // We can get here when we still have space in the current trace part. + // The fast-path check in TraceAcquire has false positives in the middle of + // the part. Check if we are indeed at the end of the current part or not, + // and fill any gaps with NopEvent's. + Event* end = &part->events[TracePart::kSize]; + DCHECK_GE(pos, &part->events[0]); + DCHECK_LE(pos, end); + if (pos + 1 < end) { + if ((reinterpret_cast(pos) & TracePart::kAlignment) == + TracePart::kAlignment) *pos++ = NopEvent; - DCHECK_LE(pos + 2, end); - atomic_store_relaxed(&thr->trace_pos, reinterpret_cast(pos)); - // Ensure we setup trace so that the next TraceAcquire - // won't detect trace part end. - Event *ev; - CHECK(TraceAcquire(thr, &ev)); - return; - } - // We are indeed at the end. - for (; pos < end; pos++) *pos = NopEvent; + *pos++ = NopEvent; + DCHECK_LE(pos + 2, end); + atomic_store_relaxed(&thr->trace_pos, reinterpret_cast(pos)); + return true; } + // We are indeed at the end. + for (; pos < end; pos++) *pos = NopEvent; + return false; +} + +NOINLINE +void TraceSwitchPart(ThreadState* thr) { + if (TraceSkipGap(thr)) + return; #if !SANITIZER_GO if (ctx->after_multithreaded_fork) { // We just need to survive till exec. - CHECK(part); - atomic_store_relaxed(&thr->trace_pos, - reinterpret_cast(&part->events[0])); - return; + TracePart* part = thr->tctx->trace.parts.Back(); + if (part) { + atomic_store_relaxed(&thr->trace_pos, + reinterpret_cast(&part->events[0])); + return; + } } #endif - part = new (MmapOrDie(sizeof(TracePart), "TracePart")) TracePart(); + TraceSwitchPartImpl(thr); +} + +void TraceSwitchPartImpl(ThreadState* thr) { + SlotLocker locker(thr, true); + Trace* trace = &thr->tctx->trace; + TracePart* part = TracePartAlloc(thr); part->trace = trace; thr->trace_prev_pc = 0; + TracePart* recycle = nullptr; + // Keep roughly half of parts local to the thread + // (not queued into the recycle queue). 
+ uptr local_parts = (Trace::kMinParts + flags()->history_size + 1) / 2; { Lock lock(&trace->mtx); + if (trace->parts.Empty()) + trace->local_head = part; + if (trace->parts.Size() >= local_parts) { + recycle = trace->local_head; + trace->local_head = trace->parts.Next(recycle); + } trace->parts.PushBack(part); atomic_store_relaxed(&thr->trace_pos, reinterpret_cast(&part->events[0])); @@ -621,60 +932,49 @@ void TraceSwitchPart(ThreadState *thr) { // Make this part self-sufficient by restoring the current stack // and mutex set in the beginning of the trace. TraceTime(thr); - for (uptr *pos = &thr->shadow_stack[0]; pos < thr->shadow_stack_pos; pos++) - CHECK(TryTraceFunc(thr, *pos)); + { + // Pathologically large stacks may not fit into the part. + // In these cases we log only fixed number of top frames. + const uptr kMaxFrames = 1000; + // Check that kMaxFrames won't consume the whole part. + static_assert(kMaxFrames < TracePart::kSize / 2, "kMaxFrames is too big"); + uptr* pos = Max(&thr->shadow_stack[0], thr->shadow_stack_pos - kMaxFrames); + for (; pos < thr->shadow_stack_pos; pos++) { + if (TryTraceFunc(thr, *pos)) + continue; + CHECK(TraceSkipGap(thr)); + CHECK(TryTraceFunc(thr, *pos)); + } + } for (uptr i = 0; i < thr->mset.Size(); i++) { MutexSet::Desc d = thr->mset.Get(i); - TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0, - d.addr, d.stack_id); + for (uptr i = 0; i < d.count; i++) + TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0, + d.addr, d.stack_id); } + { + Lock lock(&ctx->slot_mtx); + // There is a small chance that the slot may be not queued at this point. + // This can happen if the slot has kEpochLast epoch and another thread + // in FindSlotAndLock discovered that it's exhausted and removed it from + // the slot queue. kEpochLast can happen in 2 cases: (1) if TraceSwitchPart + // was called with the slot locked and epoch already at kEpochLast, + // or (2) if we've acquired a new slot in SlotLock in the beginning + // of the function and the slot was at kEpochLast - 1, so after increment + // in SlotAttachAndLock it become kEpochLast. 
+ if (ctx->slot_queue.Queued(thr->slot)) { + ctx->slot_queue.Remove(thr->slot); + ctx->slot_queue.PushBack(thr->slot); + } + if (recycle) + ctx->trace_part_recycle.PushBack(recycle); + } + DPrintf("#%d: TraceSwitchPart exit parts=%p-%p pos=0x%zx\n", thr->tid, + trace->parts.Front(), trace->parts.Back(), + atomic_load_relaxed(&thr->trace_pos)); } -} // namespace v3 - -void TraceSwitch(ThreadState *thr) { -#if !SANITIZER_GO - if (ctx->after_multithreaded_fork) - return; -#endif - thr->nomalloc++; - Trace *thr_trace = ThreadTrace(thr->tid); - Lock l(&thr_trace->mtx); - unsigned trace = (thr->fast_state.epoch() / kTracePartSize) % TraceParts(); - TraceHeader *hdr = &thr_trace->headers[trace]; - hdr->epoch0 = thr->fast_state.epoch(); - ObtainCurrentStack(thr, 0, &hdr->stack0); - hdr->mset0 = thr->mset; - thr->nomalloc--; -} - -Trace *ThreadTrace(Tid tid) { return (Trace *)GetThreadTraceHeader(tid); } - -uptr TraceTopPC(ThreadState *thr) { - Event *events = (Event*)GetThreadTrace(thr->tid); - uptr pc = events[thr->fast_state.GetTracePos()]; - return pc; -} - -uptr TraceSize() { - return (uptr)(1ull << (kTracePartSizeBits + flags()->history_size + 1)); -} - -uptr TraceParts() { - return TraceSize() / kTracePartSize; -} - -#if !SANITIZER_GO -extern "C" void __tsan_trace_switch() { - TraceSwitch(cur_thread()); -} - -extern "C" void __tsan_report_race() { - ReportRace(cur_thread()); -} -#endif - -void ThreadIgnoreBegin(ThreadState *thr, uptr pc) { +void ThreadIgnoreBegin(ThreadState* thr, uptr pc) { DPrintf("#%d: ThreadIgnoreBegin\n", thr->tid); thr->ignore_reads_and_writes++; CHECK_GT(thr->ignore_reads_and_writes, 0); @@ -734,7 +1034,6 @@ void build_consistency_debug() {} #else void build_consistency_release() {} #endif - } // namespace __tsan #if SANITIZER_CHECK_DEADLOCKS @@ -742,18 +1041,27 @@ namespace __sanitizer { using namespace __tsan; MutexMeta mutex_meta[] = { {MutexInvalid, "Invalid", {}}, - {MutexThreadRegistry, "ThreadRegistry", {}}, - {MutexTypeTrace, "Trace", {MutexLeaf}}, - {MutexTypeReport, "Report", {MutexTypeSyncVar}}, - {MutexTypeSyncVar, "SyncVar", {}}, + {MutexThreadRegistry, + "ThreadRegistry", + {MutexTypeSlots, MutexTypeTrace, MutexTypeReport}}, + {MutexTypeReport, "Report", {MutexTypeTrace}}, + {MutexTypeSyncVar, "SyncVar", {MutexTypeReport, MutexTypeTrace}}, {MutexTypeAnnotations, "Annotations", {}}, - {MutexTypeAtExit, "AtExit", {MutexTypeSyncVar}}, + {MutexTypeAtExit, "AtExit", {}}, {MutexTypeFired, "Fired", {MutexLeaf}}, {MutexTypeRacy, "Racy", {MutexLeaf}}, - {MutexTypeGlobalProc, "GlobalProc", {}}, + {MutexTypeGlobalProc, "GlobalProc", {MutexTypeSlot, MutexTypeSlots}}, + {MutexTypeInternalAlloc, "InternalAlloc", {MutexLeaf}}, + {MutexTypeTrace, "Trace", {}}, + {MutexTypeSlot, + "Slot", + {MutexMulti, MutexTypeTrace, MutexTypeSyncVar, MutexThreadRegistry, + MutexTypeSlots}}, + {MutexTypeSlots, "Slots", {MutexTypeTrace, MutexTypeReport}}, {}, }; void PrintMutexPC(uptr pc) { StackTrace(&pc, 1).Print(); } + } // namespace __sanitizer #endif diff --git a/libsanitizer/tsan/tsan_rtl.h b/libsanitizer/tsan/tsan_rtl.h index eab8370..b472c0f 100644 --- a/libsanitizer/tsan/tsan_rtl.h +++ b/libsanitizer/tsan/tsan_rtl.h @@ -34,10 +34,10 @@ #include "sanitizer_common/sanitizer_suppressions.h" #include "sanitizer_common/sanitizer_thread_registry.h" #include "sanitizer_common/sanitizer_vector.h" -#include "tsan_clock.h" #include "tsan_defs.h" #include "tsan_flags.h" #include "tsan_ignoreset.h" +#include "tsan_ilist.h" #include "tsan_mman.h" #include "tsan_mutexset.h" #include 
"tsan_platform.h" @@ -46,6 +46,7 @@ #include "tsan_stack_trace.h" #include "tsan_sync.h" #include "tsan_trace.h" +#include "tsan_vector_clock.h" #if SANITIZER_WORDSIZE != 64 # error "ThreadSanitizer is supported only on 64-bit platforms" @@ -116,7 +117,6 @@ struct Processor { #endif DenseSlabAllocCache block_cache; DenseSlabAllocCache sync_cache; - DenseSlabAllocCache clock_cache; DDPhysicalThread *dd_pt; }; @@ -130,67 +130,85 @@ struct ScopedGlobalProcessor { }; #endif +struct TidEpoch { + Tid tid; + Epoch epoch; +}; + +struct TidSlot { + Mutex mtx; + Sid sid; + atomic_uint32_t raw_epoch; + ThreadState *thr; + Vector journal; + INode node; + + Epoch epoch() const { + return static_cast(atomic_load(&raw_epoch, memory_order_relaxed)); + } + + void SetEpoch(Epoch v) { + atomic_store(&raw_epoch, static_cast(v), memory_order_relaxed); + } + + TidSlot(); +} ALIGNED(SANITIZER_CACHE_LINE_SIZE); + // This struct is stored in TLS. struct ThreadState { FastState fast_state; - // Synch epoch represents the threads's epoch before the last synchronization - // action. It allows to reduce number of shadow state updates. - // For example, fast_synch_epoch=100, last write to addr X was at epoch=150, - // if we are processing write to X from the same thread at epoch=200, - // we do nothing, because both writes happen in the same 'synch epoch'. - // That is, if another memory access does not race with the former write, - // it does not race with the latter as well. - // QUESTION: can we can squeeze this into ThreadState::Fast? - // E.g. ThreadState::Fast is a 44-bit, 32 are taken by synch_epoch and 12 are - // taken by epoch between synchs. - // This way we can save one load from tls. - u64 fast_synch_epoch; + int ignore_sync; +#if !SANITIZER_GO + int ignore_interceptors; +#endif + uptr *shadow_stack_pos; + + // Current position in tctx->trace.Back()->events (Event*). + atomic_uintptr_t trace_pos; + // PC of the last memory access, used to compute PC deltas in the trace. + uptr trace_prev_pc; + // Technically `current` should be a separate THREADLOCAL variable; // but it is placed here in order to share cache line with previous fields. ThreadState* current; + + atomic_sint32_t pending_signals; + + VectorClock clock; + // This is a slow path flag. On fast path, fast_state.GetIgnoreBit() is read. // We do not distinguish beteween ignoring reads and writes // for better performance. int ignore_reads_and_writes; - atomic_sint32_t pending_signals; - int ignore_sync; int suppress_reports; // Go does not support ignores. #if !SANITIZER_GO IgnoreSet mop_ignore_set; IgnoreSet sync_ignore_set; - // C/C++ uses fixed size shadow stack. - uptr shadow_stack[kShadowStackSize]; -#else - // Go uses malloc-allocated shadow stack with dynamic size. - uptr *shadow_stack; #endif + uptr *shadow_stack; uptr *shadow_stack_end; - uptr *shadow_stack_pos; - RawShadow *racy_shadow_addr; - RawShadow racy_state[2]; - MutexSet mset; - ThreadClock clock; #if !SANITIZER_GO Vector jmp_bufs; - int ignore_interceptors; -#endif - const Tid tid; - const int unique_id; - bool in_symbolizer; + int in_symbolizer; bool in_ignored_lib; bool is_inited; +#endif + MutexSet mset; bool is_dead; - bool is_freeing; - bool is_vptr_access; - const uptr stk_addr; - const uptr stk_size; - const uptr tls_addr; - const uptr tls_size; + const Tid tid; + uptr stk_addr; + uptr stk_size; + uptr tls_addr; + uptr tls_size; ThreadContext *tctx; DDLogicalThread *dd_lt; + TidSlot *slot; + uptr slot_epoch; + bool slot_locked; + // Current wired Processor, or nullptr. 
Required to handle any events. Processor *proc1; #if !SANITIZER_GO @@ -204,7 +222,7 @@ struct ThreadState { #if !SANITIZER_GO StackID last_sleep_stack_id; - ThreadClock last_sleep_clock; + VectorClock last_sleep_clock; #endif // Set in regions of runtime that must be signal-safe and fork-safe. @@ -213,16 +231,7 @@ struct ThreadState { const ReportDesc *current_report; - // Current position in tctx->trace.Back()->events (Event*). - atomic_uintptr_t trace_pos; - // PC of the last memory access, used to compute PC deltas in the trace. - uptr trace_prev_pc; - Sid sid; - Epoch epoch; - - explicit ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch, - unsigned reuse_count, uptr stk_addr, uptr stk_size, - uptr tls_addr, uptr tls_size); + explicit ThreadState(Tid tid); } ALIGNED(SANITIZER_CACHE_LINE_SIZE); #if !SANITIZER_GO @@ -256,14 +265,9 @@ class ThreadContext final : public ThreadContextBase { ~ThreadContext(); ThreadState *thr; StackID creation_stack_id; - SyncClock sync; - // Epoch at which the thread had started. - // If we see an event from the thread stamped by an older epoch, - // the event is from a dead thread that shared tid with this thread. - u64 epoch0; - u64 epoch1; - - v3::Trace trace; + VectorClock *sync; + uptr sync_epoch; + Trace trace; // Override superclass callbacks. void OnDead() override; @@ -318,12 +322,22 @@ struct Context { InternalMmapVector fired_suppressions; DDetector *dd; - ClockAlloc clock_alloc; - Flags flags; fd_t memprof_fd; + // The last slot index (kFreeSid) is used to denote freed memory. + TidSlot slots[kThreadSlotCount - 1]; + + // Protects global_epoch, slot_queue, trace_part_recycle. Mutex slot_mtx; + uptr global_epoch; // guarded by slot_mtx and by all slot mutexes + bool resetting; // global reset is in progress + IList slot_queue SANITIZER_GUARDED_BY(slot_mtx); + IList trace_part_recycle + SANITIZER_GUARDED_BY(slot_mtx); + uptr trace_part_total_allocated SANITIZER_GUARDED_BY(slot_mtx); + uptr trace_part_recycle_finished SANITIZER_GUARDED_BY(slot_mtx); + uptr trace_part_finished_excess SANITIZER_GUARDED_BY(slot_mtx); }; extern Context *ctx; // The one and the only global runtime context. @@ -352,17 +366,17 @@ uptr TagFromShadowStackFrame(uptr pc); class ScopedReportBase { public: - void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, StackTrace stack, - const MutexSet *mset); + void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, Tid tid, + StackTrace stack, const MutexSet *mset); void AddStack(StackTrace stack, bool suppressable = false); void AddThread(const ThreadContext *tctx, bool suppressable = false); - void AddThread(Tid unique_tid, bool suppressable = false); + void AddThread(Tid tid, bool suppressable = false); void AddUniqueTid(Tid unique_tid); - void AddMutex(const SyncVar *s); - u64 AddMutex(u64 id); + int AddMutex(uptr addr, StackID creation_stack_id); void AddLocation(uptr addr, uptr size); void AddSleep(StackID stack_id); void SetCount(int count); + void SetSigNum(int sig); const ReportDesc *GetReport() const; @@ -376,8 +390,6 @@ class ScopedReportBase { // at best it will cause deadlocks on internal mutexes. 
ScopedIgnoreInterceptors ignore_interceptors_; - void AddDeadMutex(u64 id); - ScopedReportBase(const ScopedReportBase &) = delete; void operator=(const ScopedReportBase &) = delete; }; @@ -393,8 +405,6 @@ class ScopedReport : public ScopedReportBase { bool ShouldReport(ThreadState *thr, ReportType typ); ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack); -void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk, - MutexSet *mset, uptr *tag = nullptr); // The stack could look like: // |
| | tag | @@ -442,7 +452,8 @@ void ForkBefore(ThreadState *thr, uptr pc); void ForkParentAfter(ThreadState *thr, uptr pc); void ForkChildAfter(ThreadState *thr, uptr pc, bool start_thread); -void ReportRace(ThreadState *thr); +void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old, + AccessType typ); bool OutputReport(ThreadState *thr, const ScopedReport &srep); bool IsFiredSuppression(Context *ctx, ReportType type, StackTrace trace); bool IsExpectedReport(uptr addr, uptr size); @@ -472,55 +483,28 @@ int Finalize(ThreadState *thr); void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write); void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write); -void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, - int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic); -void MemoryAccessImpl(ThreadState *thr, uptr addr, - int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic, - u64 *shadow_mem, Shadow cur); -void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, - uptr size, bool is_write); +void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, AccessType typ); - -const int kSizeLog1 = 0; -const int kSizeLog2 = 1; -const int kSizeLog4 = 2; -const int kSizeLog8 = 3; +// This creates 2 non-inlined specialized versions of MemoryAccessRange. +template +void MemoryAccessRangeT(ThreadState *thr, uptr pc, uptr addr, uptr size); ALWAYS_INLINE -void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, - AccessType typ) { - int size_log; - switch (size) { - case 1: - size_log = kSizeLog1; - break; - case 2: - size_log = kSizeLog2; - break; - case 4: - size_log = kSizeLog4; - break; - default: - DCHECK_EQ(size, 8); - size_log = kSizeLog8; - break; - } - bool is_write = !(typ & kAccessRead); - bool is_atomic = typ & kAccessAtomic; - if (typ & kAccessVptr) - thr->is_vptr_access = true; - if (typ & kAccessFree) - thr->is_freeing = true; - MemoryAccess(thr, pc, addr, size_log, is_write, is_atomic); - if (typ & kAccessVptr) - thr->is_vptr_access = false; - if (typ & kAccessFree) - thr->is_freeing = false; +void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + bool is_write) { + if (size == 0) + return; + if (is_write) + MemoryAccessRangeT(thr, pc, addr, size); + else + MemoryAccessRangeT(thr, pc, addr, size); } -void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size); +void ShadowSet(RawShadow *p, RawShadow *end, RawShadow v); void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size); +void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size); void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size); void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size); @@ -530,9 +514,6 @@ void ThreadIgnoreEnd(ThreadState *thr); void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc); void ThreadIgnoreSyncEnd(ThreadState *thr); -void FuncEntry(ThreadState *thr, uptr pc); -void FuncExit(ThreadState *thr); - Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached); void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, ThreadType thread_type); @@ -578,60 +559,7 @@ void Release(ThreadState *thr, uptr pc, uptr addr); void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr); void ReleaseStore(ThreadState *thr, uptr pc, uptr addr); void AfterSleep(ThreadState *thr, uptr pc); -void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c); 
-void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c); -void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c); -void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c); -void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c); - -// The hacky call uses custom calling convention and an assembly thunk. -// It is considerably faster that a normal call for the caller -// if it is not executed (it is intended for slow paths from hot functions). -// The trick is that the call preserves all registers and the compiler -// does not treat it as a call. -// If it does not work for you, use normal call. -#if !SANITIZER_DEBUG && defined(__x86_64__) && !SANITIZER_MAC -// The caller may not create the stack frame for itself at all, -// so we create a reserve stack frame for it (1024b must be enough). -#define HACKY_CALL(f) \ - __asm__ __volatile__("sub $1024, %%rsp;" \ - CFI_INL_ADJUST_CFA_OFFSET(1024) \ - ".hidden " #f "_thunk;" \ - "call " #f "_thunk;" \ - "add $1024, %%rsp;" \ - CFI_INL_ADJUST_CFA_OFFSET(-1024) \ - ::: "memory", "cc"); -#else -#define HACKY_CALL(f) f() -#endif - -void TraceSwitch(ThreadState *thr); -uptr TraceTopPC(ThreadState *thr); -uptr TraceSize(); -uptr TraceParts(); -Trace *ThreadTrace(Tid tid); - -extern "C" void __tsan_trace_switch(); -void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs, - EventType typ, u64 addr) { - if (!kCollectHistory) - return; - DCHECK_GE((int)typ, 0); - DCHECK_LE((int)typ, 7); - DCHECK_EQ(GetLsb(addr, kEventPCBits), addr); - u64 pos = fs.GetTracePos(); - if (UNLIKELY((pos % kTracePartSize) == 0)) { -#if !SANITIZER_GO - HACKY_CALL(__tsan_trace_switch); -#else - TraceSwitch(thr); -#endif - } - Event *trace = (Event*)GetThreadTrace(fs.tid()); - Event *evp = &trace[pos]; - Event ev = (u64)addr | ((u64)typ << kEventPCBits); - *evp = ev; -} +void IncrementEpoch(ThreadState *thr); #if !SANITIZER_GO uptr ALWAYS_INLINE HeapEnd() { @@ -639,6 +567,13 @@ uptr ALWAYS_INLINE HeapEnd() { } #endif +void SlotAttachAndLock(ThreadState *thr) SANITIZER_ACQUIRE(thr->slot->mtx); +void SlotDetach(ThreadState *thr); +void SlotLock(ThreadState *thr) SANITIZER_ACQUIRE(thr->slot->mtx); +void SlotUnlock(ThreadState *thr) SANITIZER_RELEASE(thr->slot->mtx); +void DoReset(ThreadState *thr, uptr epoch); +void FlushShadowMemory(); + ThreadState *FiberCreate(ThreadState *thr, uptr pc, unsigned flags); void FiberDestroy(ThreadState *thr, uptr pc, ThreadState *fiber); void FiberSwitch(ThreadState *thr, uptr pc, ThreadState *fiber, unsigned flags); @@ -649,6 +584,43 @@ enum FiberSwitchFlags { FiberSwitchFlagNoSync = 1 << 0, // __tsan_switch_to_fiber_no_sync }; +class SlotLocker { + public: + ALWAYS_INLINE + SlotLocker(ThreadState *thr, bool recursive = false) + : thr_(thr), locked_(recursive ? 
thr->slot_locked : false) { + if (!locked_) + SlotLock(thr_); + } + + ALWAYS_INLINE + ~SlotLocker() { + if (!locked_) + SlotUnlock(thr_); + } + + private: + ThreadState *thr_; + bool locked_; +}; + +class SlotUnlocker { + public: + SlotUnlocker(ThreadState *thr) : thr_(thr), locked_(thr->slot_locked) { + if (locked_) + SlotUnlock(thr_); + } + + ~SlotUnlocker() { + if (locked_) + SlotLock(thr_); + } + + private: + ThreadState *thr_; + bool locked_; +}; + ALWAYS_INLINE void ProcessPendingSignals(ThreadState *thr) { if (UNLIKELY(atomic_load_relaxed(&thr->pending_signals))) ProcessPendingSignalsImpl(thr); @@ -667,16 +639,19 @@ void LazyInitialize(ThreadState *thr) { #endif } -namespace v3 { - +void TraceResetForTesting(); void TraceSwitchPart(ThreadState *thr); -bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, - uptr size, AccessType typ, VarSizeStackTrace *pstk, +void TraceSwitchPartImpl(ThreadState *thr); +bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size, + AccessType typ, Tid *ptid, VarSizeStackTrace *pstk, MutexSet *pmset, uptr *ptag); template ALWAYS_INLINE WARN_UNUSED_RESULT bool TraceAcquire(ThreadState *thr, EventT **ev) { + // TraceSwitchPart accesses shadow_stack, but it's called infrequently, + // so we check it here proactively. + DCHECK(thr->shadow_stack); Event *pos = reinterpret_cast(atomic_load_relaxed(&thr->trace_pos)); #if SANITIZER_DEBUG // TraceSwitch acquires these mutexes, @@ -747,20 +722,16 @@ void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, void TraceMutexUnlock(ThreadState *thr, uptr addr); void TraceTime(ThreadState *thr); -} // namespace v3 +void TraceRestartFuncExit(ThreadState *thr); +void TraceRestartFuncEntry(ThreadState *thr, uptr pc); void GrowShadowStack(ThreadState *thr); ALWAYS_INLINE void FuncEntry(ThreadState *thr, uptr pc) { - DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.tid(), (void *)pc); - if (kCollectHistory) { - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeFuncEnter, pc); - } - - // Shadow stack maintenance can be replaced with - // stack unwinding during trace switch (which presumably must be faster). 
+ DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.sid(), (void *)pc); + if (UNLIKELY(!TryTraceFunc(thr, pc))) + return TraceRestartFuncEntry(thr, pc); DCHECK_GE(thr->shadow_stack_pos, thr->shadow_stack); #if !SANITIZER_GO DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end); @@ -774,12 +745,9 @@ void FuncEntry(ThreadState *thr, uptr pc) { ALWAYS_INLINE void FuncExit(ThreadState *thr) { - DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.tid()); - if (kCollectHistory) { - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeFuncExit, 0); - } - + DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.sid()); + if (UNLIKELY(!TryTraceFunc(thr, 0))) + return TraceRestartFuncExit(thr); DCHECK_GT(thr->shadow_stack_pos, thr->shadow_stack); #if !SANITIZER_GO DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end); @@ -791,7 +759,6 @@ void FuncExit(ThreadState *thr) { extern void (*on_initialize)(void); extern int (*on_finalize)(int); #endif - } // namespace __tsan #endif // TSAN_RTL_H diff --git a/libsanitizer/tsan/tsan_rtl_access.cpp b/libsanitizer/tsan/tsan_rtl_access.cpp index 7365fda..7d771bfa 100644 --- a/libsanitizer/tsan/tsan_rtl_access.cpp +++ b/libsanitizer/tsan/tsan_rtl_access.cpp @@ -15,15 +15,13 @@ namespace __tsan { -namespace v3 { - -ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc, +ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState* thr, uptr pc, uptr addr, uptr size, AccessType typ) { DCHECK(size == 1 || size == 2 || size == 4 || size == 8); if (!kCollectHistory) return true; - EventAccess *ev; + EventAccess* ev; if (UNLIKELY(!TraceAcquire(thr, &ev))) return false; u64 size_log = size == 1 ? 0 : size == 2 ? 1 : size == 4 ? 2 : 3; @@ -40,25 +38,27 @@ ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc, TraceRelease(thr, ev); return true; } - auto *evex = reinterpret_cast(ev); + auto* evex = reinterpret_cast(ev); evex->is_access = 0; evex->is_func = 0; evex->type = EventType::kAccessExt; evex->is_read = !!(typ & kAccessRead); evex->is_atomic = !!(typ & kAccessAtomic); evex->size_log = size_log; + // Note: this is important, see comment in EventAccessExt. 
+ evex->_ = 0; evex->addr = CompressAddr(addr); evex->pc = pc; TraceRelease(thr, evex); return true; } -ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc, - uptr addr, uptr size, - AccessType typ) { +ALWAYS_INLINE +bool TryTraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size, + AccessType typ) { if (!kCollectHistory) return true; - EventAccessRange *ev; + EventAccessRange* ev; if (UNLIKELY(!TraceAcquire(thr, &ev))) return false; thr->trace_prev_pc = pc; @@ -75,7 +75,7 @@ ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc, return true; } -void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, +void TraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size, AccessType typ) { if (LIKELY(TryTraceMemoryAccessRange(thr, pc, addr, size, typ))) return; @@ -84,7 +84,7 @@ void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, DCHECK(res); } -void TraceFunc(ThreadState *thr, uptr pc) { +void TraceFunc(ThreadState* thr, uptr pc) { if (LIKELY(TryTraceFunc(thr, pc))) return; TraceSwitchPart(thr); @@ -92,7 +92,17 @@ void TraceFunc(ThreadState *thr, uptr pc) { DCHECK(res); } -void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, +NOINLINE void TraceRestartFuncEntry(ThreadState* thr, uptr pc) { + TraceSwitchPart(thr); + FuncEntry(thr, pc); +} + +NOINLINE void TraceRestartFuncExit(ThreadState* thr) { + TraceSwitchPart(thr); + FuncExit(thr); +} + +void TraceMutexLock(ThreadState* thr, EventType type, uptr pc, uptr addr, StackID stk) { DCHECK(type == EventType::kLock || type == EventType::kRLock); if (!kCollectHistory) @@ -109,7 +119,7 @@ void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, TraceEvent(thr, ev); } -void TraceMutexUnlock(ThreadState *thr, uptr addr) { +void TraceMutexUnlock(ThreadState* thr, uptr addr) { if (!kCollectHistory) return; EventUnlock ev; @@ -121,396 +131,523 @@ void TraceMutexUnlock(ThreadState *thr, uptr addr) { TraceEvent(thr, ev); } -void TraceTime(ThreadState *thr) { +void TraceTime(ThreadState* thr) { if (!kCollectHistory) return; + FastState fast_state = thr->fast_state; EventTime ev; ev.is_access = 0; ev.is_func = 0; ev.type = EventType::kTime; - ev.sid = static_cast(thr->sid); - ev.epoch = static_cast(thr->epoch); + ev.sid = static_cast(fast_state.sid()); + ev.epoch = static_cast(fast_state.epoch()); ev._ = 0; TraceEvent(thr, ev); } -} // namespace v3 +ALWAYS_INLINE RawShadow LoadShadow(RawShadow* p) { + return static_cast( + atomic_load((atomic_uint32_t*)p, memory_order_relaxed)); +} -ALWAYS_INLINE -Shadow LoadShadow(u64 *p) { - u64 raw = atomic_load((atomic_uint64_t *)p, memory_order_relaxed); - return Shadow(raw); +ALWAYS_INLINE void StoreShadow(RawShadow* sp, RawShadow s) { + atomic_store((atomic_uint32_t*)sp, static_cast(s), memory_order_relaxed); } -ALWAYS_INLINE -void StoreShadow(u64 *sp, u64 s) { - atomic_store((atomic_uint64_t *)sp, s, memory_order_relaxed); +NOINLINE void DoReportRace(ThreadState* thr, RawShadow* shadow_mem, Shadow cur, + Shadow old, + AccessType typ) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + // For the free shadow markers the first element (that contains kFreeSid) + // triggers the race, but the second element contains info about the freeing + // thread, take it. + if (old.sid() == kFreeSid) + old = Shadow(LoadShadow(&shadow_mem[1])); + // This prevents trapping on this address in future. + for (uptr i = 0; i < kShadowCnt; i++) + StoreShadow(&shadow_mem[i], i == 0 ? 
Shadow::kRodata : Shadow::kEmpty); + // See the comment in MemoryRangeFreed as to why the slot is locked + // for free memory accesses. ReportRace must not be called with + // the slot locked because of the fork. But MemoryRangeFreed is not + // called during fork because fork sets ignore_reads_and_writes, + // so simply unlocking the slot should be fine. + if (typ & kAccessSlotLocked) + SlotUnlock(thr); + ReportRace(thr, shadow_mem, cur, Shadow(old), typ); + if (typ & kAccessSlotLocked) + SlotLock(thr); } +#if !TSAN_VECTORIZE ALWAYS_INLINE -void StoreIfNotYetStored(u64 *sp, u64 *s) { - StoreShadow(sp, *s); - *s = 0; +bool ContainsSameAccess(RawShadow* s, Shadow cur, int unused0, int unused1, + AccessType typ) { + for (uptr i = 0; i < kShadowCnt; i++) { + auto old = LoadShadow(&s[i]); + if (!(typ & kAccessRead)) { + if (old == cur.raw()) + return true; + continue; + } + auto masked = static_cast(static_cast(old) | + static_cast(Shadow::kRodata)); + if (masked == cur.raw()) + return true; + if (!(typ & kAccessNoRodata) && !SANITIZER_GO) { + if (old == Shadow::kRodata) + return true; + } + } + return false; } -extern "C" void __tsan_report_race(); - ALWAYS_INLINE -void HandleRace(ThreadState *thr, u64 *shadow_mem, Shadow cur, Shadow old) { - thr->racy_state[0] = cur.raw(); - thr->racy_state[1] = old.raw(); - thr->racy_shadow_addr = shadow_mem; -#if !SANITIZER_GO - HACKY_CALL(__tsan_report_race); -#else - ReportRace(thr); -#endif +bool CheckRaces(ThreadState* thr, RawShadow* shadow_mem, Shadow cur, + int unused0, int unused1, AccessType typ) { + bool stored = false; + for (uptr idx = 0; idx < kShadowCnt; idx++) { + RawShadow* sp = &shadow_mem[idx]; + Shadow old(LoadShadow(sp)); + if (LIKELY(old.raw() == Shadow::kEmpty)) { + if (!(typ & kAccessCheckOnly) && !stored) + StoreShadow(sp, cur.raw()); + return false; + } + if (LIKELY(!(cur.access() & old.access()))) + continue; + if (LIKELY(cur.sid() == old.sid())) { + if (!(typ & kAccessCheckOnly) && + LIKELY(cur.access() == old.access() && old.IsRWWeakerOrEqual(typ))) { + StoreShadow(sp, cur.raw()); + stored = true; + } + continue; + } + if (LIKELY(old.IsBothReadsOrAtomic(typ))) + continue; + if (LIKELY(thr->clock.Get(old.sid()) >= old.epoch())) + continue; + DoReportRace(thr, shadow_mem, cur, old, typ); + return true; + } + // We did not find any races and had already stored + // the current access info, so we are done. + if (LIKELY(stored)) + return false; + // Choose a random candidate slot and replace it. + uptr index = + atomic_load_relaxed(&thr->trace_pos) / sizeof(Event) % kShadowCnt; + StoreShadow(&shadow_mem[index], cur.raw()); + return false; } -static inline bool HappensBefore(Shadow old, ThreadState *thr) { - return thr->clock.get(old.TidWithIgnore()) >= old.epoch(); -} +# define LOAD_CURRENT_SHADOW(cur, shadow_mem) UNUSED int access = 0, shadow = 0 -ALWAYS_INLINE -void MemoryAccessImpl1(ThreadState *thr, uptr addr, int kAccessSizeLog, - bool kAccessIsWrite, bool kIsAtomic, u64 *shadow_mem, - Shadow cur) { - // This potentially can live in an MMX/SSE scratch register. - // The required intrinsics are: - // __m128i _mm_move_epi64(__m128i*); - // _mm_storel_epi64(u64*, __m128i); - u64 store_word = cur.raw(); - bool stored = false; +#else /* !TSAN_VECTORIZE */ - // scan all the shadow values and dispatch to 4 categories: - // same, replace, candidate and race (see comments below). - // we consider only 3 cases regarding access sizes: - // equal, intersect and not intersect. 
initially I considered - // larger and smaller as well, it allowed to replace some - // 'candidates' with 'same' or 'replace', but I think - // it's just not worth it (performance- and complexity-wise). - - Shadow old(0); - - // It release mode we manually unroll the loop, - // because empirically gcc generates better code this way. - // However, we can't afford unrolling in debug mode, because the function - // consumes almost 4K of stack. Gtest gives only 4K of stack to death test - // threads, which is not enough for the unrolled loop. -#if SANITIZER_DEBUG - for (int idx = 0; idx < 4; idx++) { -# include "tsan_update_shadow_word.inc" - } -#else - int idx = 0; -# include "tsan_update_shadow_word.inc" - idx = 1; - if (stored) { -# include "tsan_update_shadow_word.inc" - } else { -# include "tsan_update_shadow_word.inc" - } - idx = 2; - if (stored) { -# include "tsan_update_shadow_word.inc" - } else { -# include "tsan_update_shadow_word.inc" +ALWAYS_INLINE +bool ContainsSameAccess(RawShadow* unused0, Shadow unused1, m128 shadow, + m128 access, AccessType typ) { + // Note: we could check if there is a larger access of the same type, + // e.g. we just allocated/memset-ed a block (so it contains 8 byte writes) + // and now do smaller reads/writes, these can also be considered as "same + // access". However, it will make the check more expensive, so it's unclear + // if it's worth it. But this would conserve trace space, so it's useful + // besides potential speed up. + if (!(typ & kAccessRead)) { + const m128 same = _mm_cmpeq_epi32(shadow, access); + return _mm_movemask_epi8(same); } - idx = 3; - if (stored) { -# include "tsan_update_shadow_word.inc" - } else { -# include "tsan_update_shadow_word.inc" + // For reads we need to reset read bit in the shadow, + // because we need to match read with both reads and writes. + // Shadow::kRodata has only read bit set, so it does what we want. + // We also abuse it for rodata check to save few cycles + // since we already loaded Shadow::kRodata into a register. + // Reads from rodata can't race. + // Measurements show that they can be 10-20% of all memory accesses. + // Shadow::kRodata has epoch 0 which cannot appear in shadow normally + // (thread epochs start from 1). So the same read bit mask + // serves as rodata indicator. + const m128 read_mask = _mm_set1_epi32(static_cast(Shadow::kRodata)); + const m128 masked_shadow = _mm_or_si128(shadow, read_mask); + m128 same = _mm_cmpeq_epi32(masked_shadow, access); + // Range memory accesses check Shadow::kRodata before calling this, + // Shadow::kRodatas is not possible for free memory access + // and Go does not use Shadow::kRodata. 
+ if (!(typ & kAccessNoRodata) && !SANITIZER_GO) { + const m128 ro = _mm_cmpeq_epi32(shadow, read_mask); + same = _mm_or_si128(ro, same); } -#endif - - // we did not find any races and had already stored - // the current access info, so we are done - if (LIKELY(stored)) - return; - // choose a random candidate slot and replace it - StoreShadow(shadow_mem + (cur.epoch() % kShadowCnt), store_word); - return; -RACE: - HandleRace(thr, shadow_mem, cur, old); - return; + return _mm_movemask_epi8(same); } -void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, - AccessType typ) { - DCHECK(!(typ & kAccessAtomic)); - const bool kAccessIsWrite = !(typ & kAccessRead); - const bool kIsAtomic = false; - while (size) { - int size1 = 1; - int kAccessSizeLog = kSizeLog1; - if (size >= 8 && (addr & ~7) == ((addr + 7) & ~7)) { - size1 = 8; - kAccessSizeLog = kSizeLog8; - } else if (size >= 4 && (addr & ~7) == ((addr + 3) & ~7)) { - size1 = 4; - kAccessSizeLog = kSizeLog4; - } else if (size >= 2 && (addr & ~7) == ((addr + 1) & ~7)) { - size1 = 2; - kAccessSizeLog = kSizeLog2; - } - MemoryAccess(thr, pc, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic); - addr += size1; - size -= size1; +NOINLINE void DoReportRaceV(ThreadState* thr, RawShadow* shadow_mem, Shadow cur, + u32 race_mask, m128 shadow, AccessType typ) { + // race_mask points which of the shadow elements raced with the current + // access. Extract that element. + CHECK_NE(race_mask, 0); + u32 old; + // Note: _mm_extract_epi32 index must be a constant value. + switch (__builtin_ffs(race_mask) / 4) { + case 0: + old = _mm_extract_epi32(shadow, 0); + break; + case 1: + old = _mm_extract_epi32(shadow, 1); + break; + case 2: + old = _mm_extract_epi32(shadow, 2); + break; + case 3: + old = _mm_extract_epi32(shadow, 3); + break; } + Shadow prev(static_cast(old)); + // For the free shadow markers the first element (that contains kFreeSid) + // triggers the race, but the second element contains info about the freeing + // thread, take it. + if (prev.sid() == kFreeSid) + prev = Shadow(static_cast(_mm_extract_epi32(shadow, 1))); + DoReportRace(thr, shadow_mem, cur, prev, typ); } ALWAYS_INLINE -bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) { - Shadow cur(a); - for (uptr i = 0; i < kShadowCnt; i++) { - Shadow old(LoadShadow(&s[i])); - if (Shadow::Addr0AndSizeAreEqual(cur, old) && - old.TidWithIgnore() == cur.TidWithIgnore() && - old.epoch() > sync_epoch && old.IsAtomic() == cur.IsAtomic() && - old.IsRead() <= cur.IsRead()) - return true; +bool CheckRaces(ThreadState* thr, RawShadow* shadow_mem, Shadow cur, + m128 shadow, m128 access, AccessType typ) { + // Note: empty/zero slots don't intersect with any access. 
+ const m128 zero = _mm_setzero_si128(); + const m128 mask_access = _mm_set1_epi32(0x000000ff); + const m128 mask_sid = _mm_set1_epi32(0x0000ff00); + const m128 mask_read_atomic = _mm_set1_epi32(0xc0000000); + const m128 access_and = _mm_and_si128(access, shadow); + const m128 access_xor = _mm_xor_si128(access, shadow); + const m128 intersect = _mm_and_si128(access_and, mask_access); + const m128 not_intersect = _mm_cmpeq_epi32(intersect, zero); + const m128 not_same_sid = _mm_and_si128(access_xor, mask_sid); + const m128 same_sid = _mm_cmpeq_epi32(not_same_sid, zero); + const m128 both_read_or_atomic = _mm_and_si128(access_and, mask_read_atomic); + const m128 no_race = + _mm_or_si128(_mm_or_si128(not_intersect, same_sid), both_read_or_atomic); + const int race_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(no_race, zero)); + if (UNLIKELY(race_mask)) + goto SHARED; + +STORE : { + if (typ & kAccessCheckOnly) + return false; + // We could also replace different sid's if access is the same, + // rw weaker and happens before. However, just checking access below + // is not enough because we also need to check that !both_read_or_atomic + // (reads from different sids can be concurrent). + // Theoretically we could replace smaller accesses with larger accesses, + // but it's unclear if it's worth doing. + const m128 mask_access_sid = _mm_set1_epi32(0x0000ffff); + const m128 not_same_sid_access = _mm_and_si128(access_xor, mask_access_sid); + const m128 same_sid_access = _mm_cmpeq_epi32(not_same_sid_access, zero); + const m128 access_read_atomic = + _mm_set1_epi32((typ & (kAccessRead | kAccessAtomic)) << 30); + const m128 rw_weaker = + _mm_cmpeq_epi32(_mm_max_epu32(shadow, access_read_atomic), shadow); + const m128 rewrite = _mm_and_si128(same_sid_access, rw_weaker); + const int rewrite_mask = _mm_movemask_epi8(rewrite); + int index = __builtin_ffs(rewrite_mask); + if (UNLIKELY(index == 0)) { + const m128 empty = _mm_cmpeq_epi32(shadow, zero); + const int empty_mask = _mm_movemask_epi8(empty); + index = __builtin_ffs(empty_mask); + if (UNLIKELY(index == 0)) + index = (atomic_load_relaxed(&thr->trace_pos) / 2) % 16; } + StoreShadow(&shadow_mem[index / 4], cur.raw()); + // We could zero other slots determined by rewrite_mask. + // That would help other threads to evict better slots, + // but it's unclear if it's worth it. return false; } -#if TSAN_VECTORIZE -# define SHUF(v0, v1, i0, i1, i2, i3) \ - _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v0), \ - _mm_castsi128_ps(v1), \ - (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64)) -ALWAYS_INLINE -bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) { - // This is an optimized version of ContainsSameAccessSlow. 
- // load current access into access[0:63] - const m128 access = _mm_cvtsi64_si128(a); - // duplicate high part of access in addr0: - // addr0[0:31] = access[32:63] - // addr0[32:63] = access[32:63] - // addr0[64:95] = access[32:63] - // addr0[96:127] = access[32:63] - const m128 addr0 = SHUF(access, access, 1, 1, 1, 1); - // load 4 shadow slots - const m128 shadow0 = _mm_load_si128((__m128i *)s); - const m128 shadow1 = _mm_load_si128((__m128i *)s + 1); - // load high parts of 4 shadow slots into addr_vect: - // addr_vect[0:31] = shadow0[32:63] - // addr_vect[32:63] = shadow0[96:127] - // addr_vect[64:95] = shadow1[32:63] - // addr_vect[96:127] = shadow1[96:127] - m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3); - if (!is_write) { - // set IsRead bit in addr_vect - const m128 rw_mask1 = _mm_cvtsi64_si128(1 << 15); - const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0); - addr_vect = _mm_or_si128(addr_vect, rw_mask); - } - // addr0 == addr_vect? - const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect); - // epoch1[0:63] = sync_epoch - const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch); - // epoch[0:31] = sync_epoch[0:31] - // epoch[32:63] = sync_epoch[0:31] - // epoch[64:95] = sync_epoch[0:31] - // epoch[96:127] = sync_epoch[0:31] - const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0); - // load low parts of shadow cell epochs into epoch_vect: - // epoch_vect[0:31] = shadow0[0:31] - // epoch_vect[32:63] = shadow0[64:95] - // epoch_vect[64:95] = shadow1[0:31] - // epoch_vect[96:127] = shadow1[64:95] - const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2); - // epoch_vect >= sync_epoch? - const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch); - // addr_res & epoch_res - const m128 res = _mm_and_si128(addr_res, epoch_res); - // mask[0] = res[7] - // mask[1] = res[15] - // ... - // mask[15] = res[127] - const int mask = _mm_movemask_epi8(res); - return mask != 0; +SHARED: + m128 thread_epochs = _mm_set1_epi32(0x7fffffff); + // Need to unwind this because _mm_extract_epi8/_mm_insert_epi32 + // indexes must be constants. +# define LOAD_EPOCH(idx) \ + if (LIKELY(race_mask & (1 << (idx * 4)))) { \ + u8 sid = _mm_extract_epi8(shadow, idx * 4 + 1); \ + u16 epoch = static_cast(thr->clock.Get(static_cast(sid))); \ + thread_epochs = _mm_insert_epi32(thread_epochs, u32(epoch) << 16, idx); \ + } + LOAD_EPOCH(0); + LOAD_EPOCH(1); + LOAD_EPOCH(2); + LOAD_EPOCH(3); +# undef LOAD_EPOCH + const m128 mask_epoch = _mm_set1_epi32(0x3fff0000); + const m128 shadow_epochs = _mm_and_si128(shadow, mask_epoch); + const m128 concurrent = _mm_cmplt_epi32(thread_epochs, shadow_epochs); + const int concurrent_mask = _mm_movemask_epi8(concurrent); + if (LIKELY(concurrent_mask == 0)) + goto STORE; + + DoReportRaceV(thr, shadow_mem, cur, concurrent_mask, shadow, typ); + return true; } -#endif -ALWAYS_INLINE -bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) { -#if TSAN_VECTORIZE - bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write); - // NOTE: this check can fail if the shadow is concurrently mutated - // by other threads. But it still can be useful if you modify - // ContainsSameAccessFast and want to ensure that it's not completely broken. 
- // DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write)); - return res; -#else - return ContainsSameAccessSlow(s, a, sync_epoch, is_write); +# define LOAD_CURRENT_SHADOW(cur, shadow_mem) \ + const m128 access = _mm_set1_epi32(static_cast((cur).raw())); \ + const m128 shadow = _mm_load_si128(reinterpret_cast(shadow_mem)) #endif -} -ALWAYS_INLINE USED void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, - int kAccessSizeLog, bool kAccessIsWrite, - bool kIsAtomic) { - RawShadow *shadow_mem = MemToShadow(addr); - DPrintf2( - "#%d: MemoryAccess: @%p %p size=%d" - " is_write=%d shadow_mem=%p {%zx, %zx, %zx, %zx}\n", - (int)thr->fast_state.tid(), (void *)pc, (void *)addr, - (int)(1 << kAccessSizeLog), kAccessIsWrite, shadow_mem, - (uptr)shadow_mem[0], (uptr)shadow_mem[1], (uptr)shadow_mem[2], - (uptr)shadow_mem[3]); -#if SANITIZER_DEBUG - if (!IsAppMem(addr)) { - Printf("Access to non app mem %zx\n", addr); - DCHECK(IsAppMem(addr)); +char* DumpShadow(char* buf, RawShadow raw) { + if (raw == Shadow::kEmpty) { + internal_snprintf(buf, 64, "0"); + return buf; } - if (!IsShadowMem(shadow_mem)) { - Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr); - DCHECK(IsShadowMem(shadow_mem)); - } -#endif + Shadow s(raw); + AccessType typ; + s.GetAccess(nullptr, nullptr, &typ); + internal_snprintf(buf, 64, "{tid=%u@%u access=0x%x typ=%x}", + static_cast(s.sid()), static_cast(s.epoch()), + s.access(), static_cast(typ)); + return buf; +} - if (!SANITIZER_GO && !kAccessIsWrite && *shadow_mem == kShadowRodata) { - // Access to .rodata section, no races here. - // Measurements show that it can be 10-20% of all memory accesses. - return; - } +// TryTrace* and TraceRestart* functions allow to turn memory access and func +// entry/exit callbacks into leaf functions with all associated performance +// benefits. These hottest callbacks do only 2 slow path calls: report a race +// and trace part switching. Race reporting is easy to turn into a tail call, we +// just always return from the runtime after reporting a race. But trace part +// switching is harder because it needs to be in the middle of callbacks. To +// turn it into a tail call we immidiately return after TraceRestart* functions, +// but TraceRestart* functions themselves recurse into the callback after +// switching trace part. As the result the hottest callbacks contain only tail +// calls, which effectively makes them leaf functions (can use all registers, +// no frame setup, etc). 
+NOINLINE void TraceRestartMemoryAccess(ThreadState* thr, uptr pc, uptr addr, + uptr size, AccessType typ) { + TraceSwitchPart(thr); + MemoryAccess(thr, pc, addr, size, typ); +} + +ALWAYS_INLINE USED void MemoryAccess(ThreadState* thr, uptr pc, uptr addr, + uptr size, AccessType typ) { + RawShadow* shadow_mem = MemToShadow(addr); + UNUSED char memBuf[4][64]; + DPrintf2("#%d: Access: %d@%d %p/%zd typ=0x%x {%s, %s, %s, %s}\n", thr->tid, + static_cast(thr->fast_state.sid()), + static_cast(thr->fast_state.epoch()), (void*)addr, size, + static_cast(typ), DumpShadow(memBuf[0], shadow_mem[0]), + DumpShadow(memBuf[1], shadow_mem[1]), + DumpShadow(memBuf[2], shadow_mem[2]), + DumpShadow(memBuf[3], shadow_mem[3])); FastState fast_state = thr->fast_state; - if (UNLIKELY(fast_state.GetIgnoreBit())) { + Shadow cur(fast_state, addr, size, typ); + + LOAD_CURRENT_SHADOW(cur, shadow_mem); + if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ))) return; - } + if (UNLIKELY(fast_state.GetIgnoreBit())) + return; + if (!TryTraceMemoryAccess(thr, pc, addr, size, typ)) + return TraceRestartMemoryAccess(thr, pc, addr, size, typ); + CheckRaces(thr, shadow_mem, cur, shadow, access, typ); +} - Shadow cur(fast_state); - cur.SetAddr0AndSizeLog(addr & 7, kAccessSizeLog); - cur.SetWrite(kAccessIsWrite); - cur.SetAtomic(kIsAtomic); +void MemoryAccess16(ThreadState* thr, uptr pc, uptr addr, AccessType typ); - if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), thr->fast_synch_epoch, - kAccessIsWrite))) { - return; - } +NOINLINE +void RestartMemoryAccess16(ThreadState* thr, uptr pc, uptr addr, + AccessType typ) { + TraceSwitchPart(thr); + MemoryAccess16(thr, pc, addr, typ); +} - if (kCollectHistory) { - fast_state.IncrementEpoch(); - thr->fast_state = fast_state; - TraceAddEvent(thr, fast_state, EventTypeMop, pc); - cur.IncrementEpoch(); +ALWAYS_INLINE USED void MemoryAccess16(ThreadState* thr, uptr pc, uptr addr, + AccessType typ) { + const uptr size = 16; + FastState fast_state = thr->fast_state; + if (UNLIKELY(fast_state.GetIgnoreBit())) + return; + Shadow cur(fast_state, 0, 8, typ); + RawShadow* shadow_mem = MemToShadow(addr); + bool traced = false; + { + LOAD_CURRENT_SHADOW(cur, shadow_mem); + if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ))) + goto SECOND; + if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ)) + return RestartMemoryAccess16(thr, pc, addr, typ); + traced = true; + if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ))) + return; } +SECOND: + shadow_mem += kShadowCnt; + LOAD_CURRENT_SHADOW(cur, shadow_mem); + if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ))) + return; + if (!traced && !TryTraceMemoryAccessRange(thr, pc, addr, size, typ)) + return RestartMemoryAccess16(thr, pc, addr, typ); + CheckRaces(thr, shadow_mem, cur, shadow, access, typ); +} - MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic, - shadow_mem, cur); +NOINLINE +void RestartUnalignedMemoryAccess(ThreadState* thr, uptr pc, uptr addr, + uptr size, AccessType typ) { + TraceSwitchPart(thr); + UnalignedMemoryAccess(thr, pc, addr, size, typ); } -// Called by MemoryAccessRange in tsan_rtl_thread.cpp -ALWAYS_INLINE USED void MemoryAccessImpl(ThreadState *thr, uptr addr, - int kAccessSizeLog, - bool kAccessIsWrite, bool kIsAtomic, - u64 *shadow_mem, Shadow cur) { - if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), thr->fast_synch_epoch, - kAccessIsWrite))) { +ALWAYS_INLINE USED void UnalignedMemoryAccess(ThreadState* thr, uptr pc, + uptr addr, uptr 
size, + AccessType typ) { + DCHECK_LE(size, 8); + FastState fast_state = thr->fast_state; + if (UNLIKELY(fast_state.GetIgnoreBit())) return; + RawShadow* shadow_mem = MemToShadow(addr); + bool traced = false; + uptr size1 = Min(size, RoundUp(addr + 1, kShadowCell) - addr); + { + Shadow cur(fast_state, addr, size1, typ); + LOAD_CURRENT_SHADOW(cur, shadow_mem); + if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ))) + goto SECOND; + if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ)) + return RestartUnalignedMemoryAccess(thr, pc, addr, size, typ); + traced = true; + if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ))) + return; } +SECOND: + uptr size2 = size - size1; + if (LIKELY(size2 == 0)) + return; + shadow_mem += kShadowCnt; + Shadow cur(fast_state, 0, size2, typ); + LOAD_CURRENT_SHADOW(cur, shadow_mem); + if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ))) + return; + if (!traced && !TryTraceMemoryAccessRange(thr, pc, addr, size, typ)) + return RestartUnalignedMemoryAccess(thr, pc, addr, size, typ); + CheckRaces(thr, shadow_mem, cur, shadow, access, typ); +} - MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic, - shadow_mem, cur); +void ShadowSet(RawShadow* p, RawShadow* end, RawShadow v) { + DCHECK_LE(p, end); + DCHECK(IsShadowMem(p)); + DCHECK(IsShadowMem(end)); + UNUSED const uptr kAlign = kShadowCnt * kShadowSize; + DCHECK_EQ(reinterpret_cast(p) % kAlign, 0); + DCHECK_EQ(reinterpret_cast(end) % kAlign, 0); +#if !TSAN_VECTORIZE + for (; p < end; p += kShadowCnt) { + p[0] = v; + for (uptr i = 1; i < kShadowCnt; i++) p[i] = Shadow::kEmpty; + } +#else + m128 vv = _mm_setr_epi32( + static_cast(v), static_cast(Shadow::kEmpty), + static_cast(Shadow::kEmpty), static_cast(Shadow::kEmpty)); + m128* vp = reinterpret_cast(p); + m128* vend = reinterpret_cast(end); + for (; vp < vend; vp++) _mm_store_si128(vp, vv); +#endif } -static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size, - u64 val) { - (void)thr; - (void)pc; +static void MemoryRangeSet(uptr addr, uptr size, RawShadow val) { if (size == 0) return; - // FIXME: fix me. - uptr offset = addr % kShadowCell; - if (offset) { - offset = kShadowCell - offset; - if (size <= offset) - return; - addr += offset; - size -= offset; - } - DCHECK_EQ(addr % 8, 0); + DCHECK_EQ(addr % kShadowCell, 0); + DCHECK_EQ(size % kShadowCell, 0); // If a user passes some insane arguments (memset(0)), // let it just crash as usual. if (!IsAppMem(addr) || !IsAppMem(addr + size - 1)) return; + RawShadow* begin = MemToShadow(addr); + RawShadow* end = begin + size / kShadowCell * kShadowCnt; // Don't want to touch lots of shadow memory. // If a program maps 10MB stack, there is no need reset the whole range. - size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1); // UnmapOrDie/MmapFixedNoReserve does not work on Windows. - if (SANITIZER_WINDOWS || size < common_flags()->clear_shadow_mmap_threshold) { - RawShadow *p = MemToShadow(addr); - CHECK(IsShadowMem(p)); - CHECK(IsShadowMem(p + size * kShadowCnt / kShadowCell - 1)); - // FIXME: may overwrite a part outside the region - for (uptr i = 0; i < size / kShadowCell * kShadowCnt;) { - p[i++] = val; - for (uptr j = 1; j < kShadowCnt; j++) p[i++] = 0; - } - } else { - // The region is big, reset only beginning and end. 
- const uptr kPageSize = GetPageSizeCached(); - RawShadow *begin = MemToShadow(addr); - RawShadow *end = begin + size / kShadowCell * kShadowCnt; - RawShadow *p = begin; - // Set at least first kPageSize/2 to page boundary. - while ((p < begin + kPageSize / kShadowSize / 2) || ((uptr)p % kPageSize)) { - *p++ = val; - for (uptr j = 1; j < kShadowCnt; j++) *p++ = 0; - } - // Reset middle part. - RawShadow *p1 = p; - p = RoundDown(end, kPageSize); - if (!MmapFixedSuperNoReserve((uptr)p1, (uptr)p - (uptr)p1)) + if (SANITIZER_WINDOWS || + size <= common_flags()->clear_shadow_mmap_threshold) { + ShadowSet(begin, end, val); + return; + } + // The region is big, reset only beginning and end. + const uptr kPageSize = GetPageSizeCached(); + // Set at least first kPageSize/2 to page boundary. + RawShadow* mid1 = + Min(end, reinterpret_cast(RoundUp( + reinterpret_cast(begin) + kPageSize / 2, kPageSize))); + ShadowSet(begin, mid1, val); + // Reset middle part. + RawShadow* mid2 = RoundDown(end, kPageSize); + if (mid2 > mid1) { + if (!MmapFixedSuperNoReserve((uptr)mid1, (uptr)mid2 - (uptr)mid1)) Die(); - // Set the ending. - while (p < end) { - *p++ = val; - for (uptr j = 1; j < kShadowCnt; j++) *p++ = 0; - } } + // Set the ending. + ShadowSet(mid2, end, val); } -void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size) { - MemoryRangeSet(thr, pc, addr, size, 0); +void MemoryResetRange(ThreadState* thr, uptr pc, uptr addr, uptr size) { + uptr addr1 = RoundDown(addr, kShadowCell); + uptr size1 = RoundUp(size + addr - addr1, kShadowCell); + MemoryRangeSet(addr1, size1, Shadow::kEmpty); } -void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size) { - // Processing more than 1k (4k of shadow) is expensive, +void MemoryRangeFreed(ThreadState* thr, uptr pc, uptr addr, uptr size) { + // Callers must lock the slot to ensure synchronization with the reset. + // The problem with "freed" memory is that it's not "monotonic" + // with respect to bug detection: freed memory is bad to access, + // but then if the heap block is reallocated later, it's good to access. + // As the result a garbage "freed" shadow can lead to a false positive + // if it happens to match a real free in the thread trace, + // but the heap block was reallocated before the current memory access, + // so it's still good to access. It's not the case with data races. + DCHECK(thr->slot_locked); + DCHECK_EQ(addr % kShadowCell, 0); + size = RoundUp(size, kShadowCell); + // Processing more than 1k (2k of shadow) is expensive, // can cause excessive memory consumption (user does not necessary touch // the whole range) and most likely unnecessary. 
- if (size > 1024) - size = 1024; - CHECK_EQ(thr->is_freeing, false); - thr->is_freeing = true; - MemoryAccessRange(thr, pc, addr, size, true); - thr->is_freeing = false; - if (kCollectHistory) { - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc); + size = Min(size, 1024); + const AccessType typ = kAccessWrite | kAccessFree | kAccessSlotLocked | + kAccessCheckOnly | kAccessNoRodata; + TraceMemoryAccessRange(thr, pc, addr, size, typ); + RawShadow* shadow_mem = MemToShadow(addr); + Shadow cur(thr->fast_state, 0, kShadowCell, typ); +#if TSAN_VECTORIZE + const m128 access = _mm_set1_epi32(static_cast(cur.raw())); + const m128 freed = _mm_setr_epi32( + static_cast(Shadow::FreedMarker()), + static_cast(Shadow::FreedInfo(cur.sid(), cur.epoch())), 0, 0); + for (; size; size -= kShadowCell, shadow_mem += kShadowCnt) { + const m128 shadow = _mm_load_si128((m128*)shadow_mem); + if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ))) + return; + _mm_store_si128((m128*)shadow_mem, freed); } - Shadow s(thr->fast_state); - s.ClearIgnoreBit(); - s.MarkAsFreed(); - s.SetWrite(true); - s.SetAddr0AndSizeLog(0, 3); - MemoryRangeSet(thr, pc, addr, size, s.raw()); -} - -void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size) { - if (kCollectHistory) { - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc); +#else + for (; size; size -= kShadowCell, shadow_mem += kShadowCnt) { + if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, 0, 0, typ))) + return; + StoreShadow(&shadow_mem[0], Shadow::FreedMarker()); + StoreShadow(&shadow_mem[1], Shadow::FreedInfo(cur.sid(), cur.epoch())); + StoreShadow(&shadow_mem[2], Shadow::kEmpty); + StoreShadow(&shadow_mem[3], Shadow::kEmpty); } - Shadow s(thr->fast_state); - s.ClearIgnoreBit(); - s.SetWrite(true); - s.SetAddr0AndSizeLog(0, 3); - MemoryRangeSet(thr, pc, addr, size, s.raw()); +#endif +} + +void MemoryRangeImitateWrite(ThreadState* thr, uptr pc, uptr addr, uptr size) { + DCHECK_EQ(addr % kShadowCell, 0); + size = RoundUp(size, kShadowCell); + TraceMemoryAccessRange(thr, pc, addr, size, kAccessWrite); + Shadow cur(thr->fast_state, 0, 8, kAccessWrite); + MemoryRangeSet(addr, size, cur.raw()); } -void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr, +void MemoryRangeImitateWriteOrResetRange(ThreadState* thr, uptr pc, uptr addr, uptr size) { if (thr->ignore_reads_and_writes == 0) MemoryRangeImitateWrite(thr, pc, addr, size); @@ -518,14 +655,29 @@ void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr, MemoryResetRange(thr, pc, addr, size); } -void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, - bool is_write) { - if (size == 0) - return; +ALWAYS_INLINE +bool MemoryAccessRangeOne(ThreadState* thr, RawShadow* shadow_mem, Shadow cur, + AccessType typ) { + LOAD_CURRENT_SHADOW(cur, shadow_mem); + if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ))) + return false; + return CheckRaces(thr, shadow_mem, cur, shadow, access, typ); +} + +template +NOINLINE void RestartMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, + uptr size) { + TraceSwitchPart(thr); + MemoryAccessRangeT(thr, pc, addr, size); +} - RawShadow *shadow_mem = MemToShadow(addr); - DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_write=%d\n", thr->tid, - (void *)pc, (void *)addr, (int)size, is_write); +template +void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, uptr size) { + const AccessType typ = + (is_read 
? kAccessRead : kAccessWrite) | kAccessNoRodata; + RawShadow* shadow_mem = MemToShadow(addr); + DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_read=%d\n", thr->tid, + (void*)pc, (void*)addr, (int)size, is_read); #if SANITIZER_DEBUG if (!IsAppMem(addr)) { @@ -537,65 +689,62 @@ void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, DCHECK(IsAppMem(addr + size - 1)); } if (!IsShadowMem(shadow_mem)) { - Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr); + Printf("Bad shadow addr %p (%zx)\n", static_cast(shadow_mem), addr); DCHECK(IsShadowMem(shadow_mem)); } - if (!IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1)) { - Printf("Bad shadow addr %p (%zx)\n", shadow_mem + size * kShadowCnt / 8 - 1, + if (!IsShadowMem(shadow_mem + size * kShadowCnt - 1)) { + Printf("Bad shadow addr %p (%zx)\n", + static_cast(shadow_mem + size * kShadowCnt - 1), addr + size - 1); - DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1)); + DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt - 1)); } #endif - if (*shadow_mem == kShadowRodata) { - DCHECK(!is_write); - // Access to .rodata section, no races here. - // Measurements show that it can be 10-20% of all memory accesses. + // Access to .rodata section, no races here. + // Measurements show that it can be 10-20% of all memory accesses. + // Check here once to not check for every access separately. + // Note: we could (and should) do this only for the is_read case + // (writes shouldn't go to .rodata). But it happens in Chromium tests: + // https://bugs.chromium.org/p/chromium/issues/detail?id=1275581#c19 + // Details are unknown since it happens only on CI machines. + if (*shadow_mem == Shadow::kRodata) return; - } FastState fast_state = thr->fast_state; - if (fast_state.GetIgnoreBit()) + if (UNLIKELY(fast_state.GetIgnoreBit())) return; - fast_state.IncrementEpoch(); - thr->fast_state = fast_state; - TraceAddEvent(thr, fast_state, EventTypeMop, pc); + if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ)) + return RestartMemoryAccessRange(thr, pc, addr, size); - bool unaligned = (addr % kShadowCell) != 0; - - // Handle unaligned beginning, if any. - for (; addr % kShadowCell && size; addr++, size--) { - int const kAccessSizeLog = 0; - Shadow cur(fast_state); - cur.SetWrite(is_write); - cur.SetAddr0AndSizeLog(addr & (kShadowCell - 1), kAccessSizeLog); - MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem, - cur); - } - if (unaligned) + if (UNLIKELY(addr % kShadowCell)) { + // Handle unaligned beginning, if any. + uptr size1 = Min(size, RoundUp(addr, kShadowCell) - addr); + size -= size1; + Shadow cur(fast_state, addr, size1, typ); + if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ))) + return; shadow_mem += kShadowCnt; + } // Handle middle part, if any. - for (; size >= kShadowCell; addr += kShadowCell, size -= kShadowCell) { - int const kAccessSizeLog = 3; - Shadow cur(fast_state); - cur.SetWrite(is_write); - cur.SetAddr0AndSizeLog(0, kAccessSizeLog); - MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem, - cur); - shadow_mem += kShadowCnt; + Shadow cur(fast_state, 0, kShadowCell, typ); + for (; size >= kShadowCell; size -= kShadowCell, shadow_mem += kShadowCnt) { + if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ))) + return; } // Handle ending, if any. 
- for (; size; addr++, size--) { - int const kAccessSizeLog = 0; - Shadow cur(fast_state); - cur.SetWrite(is_write); - cur.SetAddr0AndSizeLog(addr & (kShadowCell - 1), kAccessSizeLog); - MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem, - cur); + if (UNLIKELY(size)) { + Shadow cur(fast_state, 0, size, typ); + if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ))) + return; } } +template void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, + uptr size); +template void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, + uptr size); + } // namespace __tsan #if !SANITIZER_GO diff --git a/libsanitizer/tsan/tsan_rtl_amd64.S b/libsanitizer/tsan/tsan_rtl_amd64.S index c15b01e..f848be9 100644 --- a/libsanitizer/tsan/tsan_rtl_amd64.S +++ b/libsanitizer/tsan/tsan_rtl_amd64.S @@ -9,242 +9,6 @@ .section __TEXT,__text #endif -ASM_HIDDEN(__tsan_trace_switch) -.globl ASM_SYMBOL(__tsan_trace_switch_thunk) -ASM_SYMBOL(__tsan_trace_switch_thunk): - CFI_STARTPROC - _CET_ENDBR - # Save scratch registers. - push %rax - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rax, 0) - push %rcx - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rcx, 0) - push %rdx - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rdx, 0) - push %rsi - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rsi, 0) - push %rdi - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rdi, 0) - push %r8 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r8, 0) - push %r9 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r9, 0) - push %r10 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r10, 0) - push %r11 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r11, 0) - # All XMM registers are caller-saved. - sub $0x100, %rsp - CFI_ADJUST_CFA_OFFSET(0x100) - movdqu %xmm0, 0x0(%rsp) - movdqu %xmm1, 0x10(%rsp) - movdqu %xmm2, 0x20(%rsp) - movdqu %xmm3, 0x30(%rsp) - movdqu %xmm4, 0x40(%rsp) - movdqu %xmm5, 0x50(%rsp) - movdqu %xmm6, 0x60(%rsp) - movdqu %xmm7, 0x70(%rsp) - movdqu %xmm8, 0x80(%rsp) - movdqu %xmm9, 0x90(%rsp) - movdqu %xmm10, 0xa0(%rsp) - movdqu %xmm11, 0xb0(%rsp) - movdqu %xmm12, 0xc0(%rsp) - movdqu %xmm13, 0xd0(%rsp) - movdqu %xmm14, 0xe0(%rsp) - movdqu %xmm15, 0xf0(%rsp) - # Align stack frame. - push %rbx # non-scratch - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rbx, 0) - mov %rsp, %rbx # save current rsp - CFI_DEF_CFA_REGISTER(%rbx) - shr $4, %rsp # clear 4 lsb, align to 16 - shl $4, %rsp - - call ASM_SYMBOL(__tsan_trace_switch) - - # Unalign stack frame back. - mov %rbx, %rsp # restore the original rsp - CFI_DEF_CFA_REGISTER(%rsp) - pop %rbx - CFI_ADJUST_CFA_OFFSET(-8) - # Restore scratch registers. 
- movdqu 0x0(%rsp), %xmm0 - movdqu 0x10(%rsp), %xmm1 - movdqu 0x20(%rsp), %xmm2 - movdqu 0x30(%rsp), %xmm3 - movdqu 0x40(%rsp), %xmm4 - movdqu 0x50(%rsp), %xmm5 - movdqu 0x60(%rsp), %xmm6 - movdqu 0x70(%rsp), %xmm7 - movdqu 0x80(%rsp), %xmm8 - movdqu 0x90(%rsp), %xmm9 - movdqu 0xa0(%rsp), %xmm10 - movdqu 0xb0(%rsp), %xmm11 - movdqu 0xc0(%rsp), %xmm12 - movdqu 0xd0(%rsp), %xmm13 - movdqu 0xe0(%rsp), %xmm14 - movdqu 0xf0(%rsp), %xmm15 - add $0x100, %rsp - CFI_ADJUST_CFA_OFFSET(-0x100) - pop %r11 - CFI_ADJUST_CFA_OFFSET(-8) - pop %r10 - CFI_ADJUST_CFA_OFFSET(-8) - pop %r9 - CFI_ADJUST_CFA_OFFSET(-8) - pop %r8 - CFI_ADJUST_CFA_OFFSET(-8) - pop %rdi - CFI_ADJUST_CFA_OFFSET(-8) - pop %rsi - CFI_ADJUST_CFA_OFFSET(-8) - pop %rdx - CFI_ADJUST_CFA_OFFSET(-8) - pop %rcx - CFI_ADJUST_CFA_OFFSET(-8) - pop %rax - CFI_ADJUST_CFA_OFFSET(-8) - CFI_RESTORE(%rax) - CFI_RESTORE(%rbx) - CFI_RESTORE(%rcx) - CFI_RESTORE(%rdx) - CFI_RESTORE(%rsi) - CFI_RESTORE(%rdi) - CFI_RESTORE(%r8) - CFI_RESTORE(%r9) - CFI_RESTORE(%r10) - CFI_RESTORE(%r11) - ret - CFI_ENDPROC - -ASM_HIDDEN(__tsan_report_race) -.globl ASM_SYMBOL(__tsan_report_race_thunk) -ASM_SYMBOL(__tsan_report_race_thunk): - CFI_STARTPROC - _CET_ENDBR - # Save scratch registers. - push %rax - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rax, 0) - push %rcx - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rcx, 0) - push %rdx - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rdx, 0) - push %rsi - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rsi, 0) - push %rdi - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rdi, 0) - push %r8 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r8, 0) - push %r9 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r9, 0) - push %r10 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r10, 0) - push %r11 - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%r11, 0) - # All XMM registers are caller-saved. - sub $0x100, %rsp - CFI_ADJUST_CFA_OFFSET(0x100) - movdqu %xmm0, 0x0(%rsp) - movdqu %xmm1, 0x10(%rsp) - movdqu %xmm2, 0x20(%rsp) - movdqu %xmm3, 0x30(%rsp) - movdqu %xmm4, 0x40(%rsp) - movdqu %xmm5, 0x50(%rsp) - movdqu %xmm6, 0x60(%rsp) - movdqu %xmm7, 0x70(%rsp) - movdqu %xmm8, 0x80(%rsp) - movdqu %xmm9, 0x90(%rsp) - movdqu %xmm10, 0xa0(%rsp) - movdqu %xmm11, 0xb0(%rsp) - movdqu %xmm12, 0xc0(%rsp) - movdqu %xmm13, 0xd0(%rsp) - movdqu %xmm14, 0xe0(%rsp) - movdqu %xmm15, 0xf0(%rsp) - # Align stack frame. - push %rbx # non-scratch - CFI_ADJUST_CFA_OFFSET(8) - CFI_REL_OFFSET(%rbx, 0) - mov %rsp, %rbx # save current rsp - CFI_DEF_CFA_REGISTER(%rbx) - shr $4, %rsp # clear 4 lsb, align to 16 - shl $4, %rsp - - call ASM_SYMBOL(__tsan_report_race) - - # Unalign stack frame back. - mov %rbx, %rsp # restore the original rsp - CFI_DEF_CFA_REGISTER(%rsp) - pop %rbx - CFI_ADJUST_CFA_OFFSET(-8) - # Restore scratch registers. 
- movdqu 0x0(%rsp), %xmm0 - movdqu 0x10(%rsp), %xmm1 - movdqu 0x20(%rsp), %xmm2 - movdqu 0x30(%rsp), %xmm3 - movdqu 0x40(%rsp), %xmm4 - movdqu 0x50(%rsp), %xmm5 - movdqu 0x60(%rsp), %xmm6 - movdqu 0x70(%rsp), %xmm7 - movdqu 0x80(%rsp), %xmm8 - movdqu 0x90(%rsp), %xmm9 - movdqu 0xa0(%rsp), %xmm10 - movdqu 0xb0(%rsp), %xmm11 - movdqu 0xc0(%rsp), %xmm12 - movdqu 0xd0(%rsp), %xmm13 - movdqu 0xe0(%rsp), %xmm14 - movdqu 0xf0(%rsp), %xmm15 - add $0x100, %rsp - CFI_ADJUST_CFA_OFFSET(-0x100) - pop %r11 - CFI_ADJUST_CFA_OFFSET(-8) - pop %r10 - CFI_ADJUST_CFA_OFFSET(-8) - pop %r9 - CFI_ADJUST_CFA_OFFSET(-8) - pop %r8 - CFI_ADJUST_CFA_OFFSET(-8) - pop %rdi - CFI_ADJUST_CFA_OFFSET(-8) - pop %rsi - CFI_ADJUST_CFA_OFFSET(-8) - pop %rdx - CFI_ADJUST_CFA_OFFSET(-8) - pop %rcx - CFI_ADJUST_CFA_OFFSET(-8) - pop %rax - CFI_ADJUST_CFA_OFFSET(-8) - CFI_RESTORE(%rax) - CFI_RESTORE(%rbx) - CFI_RESTORE(%rcx) - CFI_RESTORE(%rdx) - CFI_RESTORE(%rsi) - CFI_RESTORE(%rdi) - CFI_RESTORE(%r8) - CFI_RESTORE(%r9) - CFI_RESTORE(%r10) - CFI_RESTORE(%r11) - ret - CFI_ENDPROC - ASM_HIDDEN(__tsan_setjmp) #if defined(__NetBSD__) .comm _ZN14__interception15real___setjmp14E,8,8 diff --git a/libsanitizer/tsan/tsan_rtl_mutex.cpp b/libsanitizer/tsan/tsan_rtl_mutex.cpp index 7d6b411..2e97885 100644 --- a/libsanitizer/tsan/tsan_rtl_mutex.cpp +++ b/libsanitizer/tsan/tsan_rtl_mutex.cpp @@ -23,6 +23,8 @@ namespace __tsan { void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r); +void ReportDestroyLocked(ThreadState *thr, uptr pc, uptr addr, + FastState last_lock, StackID creation_stack_id); struct Callback final : public DDCallback { ThreadState *thr; @@ -36,17 +38,17 @@ struct Callback final : public DDCallback { } StackID Unwind() override { return CurrentStackId(thr, pc); } - int UniqueTid() override { return thr->unique_id; } + int UniqueTid() override { return thr->tid; } }; void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s) { Callback cb(thr, pc); ctx->dd->MutexInit(&cb, &s->dd); - s->dd.ctx = s->GetId(); + s->dd.ctx = s->addr; } static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ, - uptr addr, u64 mid) { + uptr addr, StackID creation_stack_id) { // In Go, these misuses are either impossible, or detected by std lib, // or false positives (e.g. unlock in a different thread). if (SANITIZER_GO) @@ -55,7 +57,7 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ, return; ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(typ); - rep.AddMutex(mid); + rep.AddMutex(addr, creation_stack_id); VarSizeStackTrace trace; ObtainCurrentStack(thr, pc, &trace); rep.AddStack(trace, true); @@ -63,95 +65,94 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ, OutputReport(thr, rep); } +static void RecordMutexLock(ThreadState *thr, uptr pc, uptr addr, + StackID stack_id, bool write) { + auto typ = write ? EventType::kLock : EventType::kRLock; + // Note: it's important to trace before modifying mutex set + // because tracing can switch trace part and we write the current + // mutex set in the beginning of each part. + // If we do it in the opposite order, we will write already reduced + // mutex set in the beginning of the part and then trace unlock again. + TraceMutexLock(thr, typ, pc, addr, stack_id); + thr->mset.AddAddr(addr, stack_id, write); +} + +static void RecordMutexUnlock(ThreadState *thr, uptr addr) { + // See the comment in RecordMutexLock re order of operations. 
+ TraceMutexUnlock(thr, addr); + thr->mset.DelAddr(addr); +} + void MutexCreate(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexCreate %zx flagz=0x%x\n", thr->tid, addr, flagz); - if (!(flagz & MutexFlagLinkerInit) && IsAppMem(addr)) { - CHECK(!thr->is_freeing); - thr->is_freeing = true; + if (!(flagz & MutexFlagLinkerInit) && pc && IsAppMem(addr)) MemoryAccess(thr, pc, addr, 1, kAccessWrite); - thr->is_freeing = false; - } - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - Lock l(&s->mtx); + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); s->SetFlags(flagz & MutexCreationFlagMask); // Save stack in the case the sync object was created before as atomic. - if (!SANITIZER_GO && s->creation_stack_id == 0) + if (!SANITIZER_GO && s->creation_stack_id == kInvalidStackID) s->creation_stack_id = CurrentStackId(thr, pc); } void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexDestroy %zx\n", thr->tid, addr); bool unlock_locked = false; - u64 mid = 0; - u64 last_lock = 0; + StackID creation_stack_id; + FastState last_lock; { - SyncVar *s = ctx->metamap.GetSyncIfExists(addr); - if (s == 0) + auto s = ctx->metamap.GetSyncIfExists(addr); + if (!s) return; - Lock l(&s->mtx); - if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) || - ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) { - // Destroy is no-op for linker-initialized mutexes. - return; - } - if (common_flags()->detect_deadlocks) { - Callback cb(thr, pc); - ctx->dd->MutexDestroy(&cb, &s->dd); - ctx->dd->MutexInit(&cb, &s->dd); - } - if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid && - !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - unlock_locked = true; - } - mid = s->GetId(); - last_lock = s->last_lock; - if (!unlock_locked) - s->Reset(thr->proc()); // must not reset it before the report is printed - } - if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked)) { - ThreadRegistryLock l(&ctx->thread_registry); - ScopedReport rep(ReportTypeMutexDestroyLocked); - rep.AddMutex(mid); - VarSizeStackTrace trace; - ObtainCurrentStack(thr, pc, &trace); - rep.AddStack(trace, true); - FastState last(last_lock); - RestoreStack(last.tid(), last.epoch(), &trace, 0); - rep.AddStack(trace, true); - rep.AddLocation(addr, 1); - OutputReport(thr, rep); - - SyncVar *s = ctx->metamap.GetSyncIfExists(addr); - if (s != 0) { - Lock l(&s->mtx); - s->Reset(thr->proc()); + SlotLocker locker(thr); + { + Lock lock(&s->mtx); + creation_stack_id = s->creation_stack_id; + last_lock = s->last_lock; + if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) || + ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) { + // Destroy is no-op for linker-initialized mutexes. + return; + } + if (common_flags()->detect_deadlocks) { + Callback cb(thr, pc); + ctx->dd->MutexDestroy(&cb, &s->dd); + ctx->dd->MutexInit(&cb, &s->dd); + } + if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid && + !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + unlock_locked = true; + } + s->Reset(); } + // Imitate a memory write to catch unlock-destroy races. + if (pc && IsAppMem(addr)) + MemoryAccess(thr, pc, addr, 1, + kAccessWrite | kAccessFree | kAccessSlotLocked); } - thr->mset.Remove(mid); - // Imitate a memory write to catch unlock-destroy races. - // Do this outside of sync mutex, because it can report a race which locks - // sync mutexes. 
- if (IsAppMem(addr)) - MemoryAccess(thr, pc, addr, 1, kAccessWrite | kAccessFree); + if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked)) + ReportDestroyLocked(thr, pc, addr, last_lock, creation_stack_id); + thr->mset.DelAddr(addr, true); // s will be destroyed and freed in MetaMap::FreeBlock. } void MutexPreLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexPreLock %zx flagz=0x%x\n", thr->tid, addr, flagz); - if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - { - ReadLock l(&s->mtx); - s->UpdateFlags(flagz); - if (s->owner_tid != thr->tid) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeLock(&cb, &s->dd, true); - } - } - Callback cb(thr, pc); - ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); + if (flagz & MutexFlagTryLock) + return; + if (!common_flags()->detect_deadlocks) + return; + Callback cb(thr, pc); + { + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + ReadLock lock(&s->mtx); + s->UpdateFlags(flagz); + if (s->owner_tid != thr->tid) + ctx->dd->MutexBeforeLock(&cb, &s->dd, true); } + ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) { @@ -161,48 +162,51 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) { CHECK_GT(rec, 0); else rec = 1; - if (IsAppMem(addr)) + if (pc && IsAppMem(addr)) MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); - u64 mid = 0; + bool report_double_lock = false; bool pre_lock = false; bool first = false; - bool report_double_lock = false; + StackID creation_stack_id = kInvalidStackID; { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - Lock l(&s->mtx); - s->UpdateFlags(flagz); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId()); - if (s->owner_tid == kInvalidTid) { - CHECK_EQ(s->recursion, 0); - s->owner_tid = thr->tid; - s->last_lock = thr->fast_state.raw(); - } else if (s->owner_tid == thr->tid) { - CHECK_GT(s->recursion, 0); - } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_double_lock = true; - } - first = s->recursion == 0; - s->recursion += rec; - if (first) { - AcquireImpl(thr, pc, &s->clock); - AcquireImpl(thr, pc, &s->read_clock); - } else if (!s->IsFlagSet(MutexFlagWriteReentrant)) { - } - thr->mset.Add(s->GetId(), true, thr->fast_state.epoch()); - if (first && common_flags()->detect_deadlocks) { - pre_lock = - (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock); - Callback cb(thr, pc); - if (pre_lock) - ctx->dd->MutexBeforeLock(&cb, &s->dd, true); - ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock); + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + creation_stack_id = s->creation_stack_id; + RecordMutexLock(thr, pc, addr, creation_stack_id, true); + { + Lock lock(&s->mtx); + first = s->recursion == 0; + s->UpdateFlags(flagz); + if (s->owner_tid == kInvalidTid) { + CHECK_EQ(s->recursion, 0); + s->owner_tid = thr->tid; + s->last_lock = thr->fast_state; + } else if (s->owner_tid == thr->tid) { + CHECK_GT(s->recursion, 0); + } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_double_lock = true; + } + s->recursion += rec; + if (first) { + if (!thr->ignore_sync) { + thr->clock.Acquire(s->clock); + 
thr->clock.Acquire(s->read_clock); + } + } + if (first && common_flags()->detect_deadlocks) { + pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) && + !(flagz & MutexFlagTryLock); + Callback cb(thr, pc); + if (pre_lock) + ctx->dd->MutexBeforeLock(&cb, &s->dd, true); + ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock); + } } - mid = s->GetId(); } if (report_double_lock) - ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, mid); + ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, + creation_stack_id); if (first && pre_lock && common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); @@ -211,40 +215,47 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) { int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexUnlock %zx flagz=0x%x\n", thr->tid, addr, flagz); - if (IsAppMem(addr)) + if (pc && IsAppMem(addr)) MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); - u64 mid = 0; + StackID creation_stack_id; + RecordMutexUnlock(thr, addr); bool report_bad_unlock = false; int rec = 0; { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); - if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) { - if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_unlock = true; - } - } else { - rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1; - s->recursion -= rec; - if (s->recursion == 0) { - s->owner_tid = kInvalidTid; - ReleaseStoreImpl(thr, pc, &s->clock); + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + bool released = false; + { + Lock lock(&s->mtx); + creation_stack_id = s->creation_stack_id; + if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) { + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_unlock = true; + } } else { + rec = (flagz & MutexFlagRecursiveUnlock) ? 
s->recursion : 1; + s->recursion -= rec; + if (s->recursion == 0) { + s->owner_tid = kInvalidTid; + if (!thr->ignore_sync) { + thr->clock.ReleaseStore(&s->clock); + released = true; + } + } + } + if (common_flags()->detect_deadlocks && s->recursion == 0 && + !report_bad_unlock) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true); } } - thr->mset.Del(s->GetId(), true); - if (common_flags()->detect_deadlocks && s->recursion == 0 && - !report_bad_unlock) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true); - } - mid = s->GetId(); + if (released) + IncrementEpoch(thr); } if (report_bad_unlock) - ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid); + ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, + creation_stack_id); if (common_flags()->detect_deadlocks && !report_bad_unlock) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); @@ -254,53 +265,56 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexPreReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz); - if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) { - { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - ReadLock l(&s->mtx); - s->UpdateFlags(flagz); - Callback cb(thr, pc); - ctx->dd->MutexBeforeLock(&cb, &s->dd, false); - } - Callback cb(thr, pc); - ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); + if ((flagz & MutexFlagTryLock) || !common_flags()->detect_deadlocks) + return; + Callback cb(thr, pc); + { + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + ReadLock lock(&s->mtx); + s->UpdateFlags(flagz); + ctx->dd->MutexBeforeLock(&cb, &s->dd, false); } + ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); } void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { DPrintf("#%d: MutexPostReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz); - if (IsAppMem(addr)) + if (pc && IsAppMem(addr)) MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); - u64 mid = 0; bool report_bad_lock = false; bool pre_lock = false; + StackID creation_stack_id = kInvalidStackID; { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - ReadLock l(&s->mtx); - s->UpdateFlags(flagz); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId()); - if (s->owner_tid != kInvalidTid) { - if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_lock = true; + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + creation_stack_id = s->creation_stack_id; + RecordMutexLock(thr, pc, addr, creation_stack_id, false); + { + ReadLock lock(&s->mtx); + s->UpdateFlags(flagz); + if (s->owner_tid != kInvalidTid) { + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_lock = true; + } + } + if (!thr->ignore_sync) + thr->clock.Acquire(s->clock); + s->last_lock = thr->fast_state; + if (common_flags()->detect_deadlocks) { + pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) && + !(flagz & MutexFlagTryLock); + Callback cb(thr, pc); + if (pre_lock) + ctx->dd->MutexBeforeLock(&cb, &s->dd, false); + ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock); } } - AcquireImpl(thr, pc, &s->clock); - s->last_lock = thr->fast_state.raw(); - thr->mset.Add(s->GetId(), false, thr->fast_state.epoch()); 
- if (common_flags()->detect_deadlocks) { - pre_lock = - (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock); - Callback cb(thr, pc); - if (pre_lock) - ctx->dd->MutexBeforeLock(&cb, &s->dd, false); - ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock); - } - mid = s->GetId(); } if (report_bad_lock) - ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, mid); + ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, + creation_stack_id); if (pre_lock && common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); @@ -309,31 +323,39 @@ void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) { void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexReadUnlock %zx\n", thr->tid, addr); - if (IsAppMem(addr)) + if (pc && IsAppMem(addr)) MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); - u64 mid = 0; + RecordMutexUnlock(thr, addr); + StackID creation_stack_id; bool report_bad_unlock = false; { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); - if (s->owner_tid != kInvalidTid) { - if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_unlock = true; + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + bool released = false; + { + Lock lock(&s->mtx); + creation_stack_id = s->creation_stack_id; + if (s->owner_tid != kInvalidTid) { + if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_unlock = true; + } + } + if (!thr->ignore_sync) { + thr->clock.Release(&s->read_clock); + released = true; + } + if (common_flags()->detect_deadlocks && s->recursion == 0) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false); } } - ReleaseImpl(thr, pc, &s->read_clock); - if (common_flags()->detect_deadlocks && s->recursion == 0) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false); - } - mid = s->GetId(); + if (released) + IncrementEpoch(thr); } - thr->mset.Del(mid, false); if (report_bad_unlock) - ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr, mid); + ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr, + creation_stack_id); if (common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); @@ -342,44 +364,52 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) { void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexReadOrWriteUnlock %zx\n", thr->tid, addr); - if (IsAppMem(addr)) + if (pc && IsAppMem(addr)) MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic); - u64 mid = 0; + RecordMutexUnlock(thr, addr); + StackID creation_stack_id; bool report_bad_unlock = false; + bool write = true; { - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - Lock l(&s->mtx); - bool write = true; - if (s->owner_tid == kInvalidTid) { - // Seems to be read unlock. - write = false; - thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId()); - ReleaseImpl(thr, pc, &s->read_clock); - } else if (s->owner_tid == thr->tid) { - // Seems to be write unlock. 
- thr->fast_state.IncrementEpoch(); - TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId()); - CHECK_GT(s->recursion, 0); - s->recursion--; - if (s->recursion == 0) { - s->owner_tid = kInvalidTid; - ReleaseStoreImpl(thr, pc, &s->clock); - } else { + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + bool released = false; + { + Lock lock(&s->mtx); + creation_stack_id = s->creation_stack_id; + if (s->owner_tid == kInvalidTid) { + // Seems to be read unlock. + write = false; + if (!thr->ignore_sync) { + thr->clock.Release(&s->read_clock); + released = true; + } + } else if (s->owner_tid == thr->tid) { + // Seems to be write unlock. + CHECK_GT(s->recursion, 0); + s->recursion--; + if (s->recursion == 0) { + s->owner_tid = kInvalidTid; + if (!thr->ignore_sync) { + thr->clock.ReleaseStore(&s->clock); + released = true; + } + } + } else if (!s->IsFlagSet(MutexFlagBroken)) { + s->SetFlags(MutexFlagBroken); + report_bad_unlock = true; + } + if (common_flags()->detect_deadlocks && s->recursion == 0) { + Callback cb(thr, pc); + ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write); } - } else if (!s->IsFlagSet(MutexFlagBroken)) { - s->SetFlags(MutexFlagBroken); - report_bad_unlock = true; - } - thr->mset.Del(s->GetId(), write); - if (common_flags()->detect_deadlocks && s->recursion == 0) { - Callback cb(thr, pc); - ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write); } - mid = s->GetId(); + if (released) + IncrementEpoch(thr); } if (report_bad_unlock) - ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid); + ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, + creation_stack_id); if (common_flags()->detect_deadlocks) { Callback cb(thr, pc); ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb)); @@ -388,143 +418,112 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) { void MutexRepair(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexRepair %zx\n", thr->tid, addr); - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - Lock l(&s->mtx); + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + Lock lock(&s->mtx); s->owner_tid = kInvalidTid; s->recursion = 0; } void MutexInvalidAccess(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: MutexInvalidAccess %zx\n", thr->tid, addr); - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); - ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, s->GetId()); + StackID creation_stack_id = kInvalidStackID; + { + SlotLocker locker(thr); + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true); + if (s) + creation_stack_id = s->creation_stack_id; + } + ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, + creation_stack_id); } void Acquire(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Acquire %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetSyncIfExists(addr); + auto s = ctx->metamap.GetSyncIfExists(addr); if (!s) return; - ReadLock l(&s->mtx); - AcquireImpl(thr, pc, &s->clock); -} - -static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) { - ThreadState *thr = reinterpret_cast(arg); - ThreadContext *tctx = static_cast(tctx_base); - u64 epoch = tctx->epoch1; - if (tctx->status == ThreadStatusRunning) { - epoch = tctx->thr->fast_state.epoch(); - tctx->thr->clock.NoteGlobalAcquire(epoch); - } - thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch); + SlotLocker locker(thr); + if (!s->clock) + return; + ReadLock lock(&s->mtx); + 
thr->clock.Acquire(s->clock); } void AcquireGlobal(ThreadState *thr) { DPrintf("#%d: AcquireGlobal\n", thr->tid); if (thr->ignore_sync) return; - ThreadRegistryLock l(&ctx->thread_registry); - ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateClockCallback, thr); -} - -void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) { - DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr); - if (thr->ignore_sync) - return; - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - ReleaseStoreAcquireImpl(thr, pc, &s->clock); + SlotLocker locker(thr); + for (auto &slot : ctx->slots) thr->clock.Set(slot.sid, slot.epoch()); } void Release(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Release %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - ReleaseImpl(thr, pc, &s->clock); + SlotLocker locker(thr); + { + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); + Lock lock(&s->mtx); + thr->clock.Release(&s->clock); + } + IncrementEpoch(thr); } void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); - Lock l(&s->mtx); - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - ReleaseStoreImpl(thr, pc, &s->clock); -} - -#if !SANITIZER_GO -static void UpdateSleepClockCallback(ThreadContextBase *tctx_base, void *arg) { - ThreadState *thr = reinterpret_cast(arg); - ThreadContext *tctx = static_cast(tctx_base); - u64 epoch = tctx->epoch1; - if (tctx->status == ThreadStatusRunning) - epoch = tctx->thr->fast_state.epoch(); - thr->last_sleep_clock.set(&thr->proc()->clock_cache, tctx->tid, epoch); -} - -void AfterSleep(ThreadState *thr, uptr pc) { - DPrintf("#%d: AfterSleep\n", thr->tid); - if (thr->ignore_sync) - return; - thr->last_sleep_stack_id = CurrentStackId(thr, pc); - ThreadRegistryLock l(&ctx->thread_registry); - ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateSleepClockCallback, - thr); -} -#endif - -void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) { - if (thr->ignore_sync) - return; - thr->clock.set(thr->fast_state.epoch()); - thr->clock.acquire(&thr->proc()->clock_cache, c); -} - -void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) { - if (thr->ignore_sync) - return; - thr->clock.set(thr->fast_state.epoch()); - thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.releaseStoreAcquire(&thr->proc()->clock_cache, c); + SlotLocker locker(thr); + { + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); + Lock lock(&s->mtx); + thr->clock.ReleaseStore(&s->clock); + } + IncrementEpoch(thr); } -void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) { +void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) { + DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr); if (thr->ignore_sync) return; - thr->clock.set(thr->fast_state.epoch()); - thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.release(&thr->proc()->clock_cache, c); + SlotLocker 
locker(thr); + { + auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false); + Lock lock(&s->mtx); + thr->clock.ReleaseStoreAcquire(&s->clock); + } + IncrementEpoch(thr); } -void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) { - if (thr->ignore_sync) - return; - thr->clock.set(thr->fast_state.epoch()); - thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.ReleaseStore(&thr->proc()->clock_cache, c); +void IncrementEpoch(ThreadState *thr) { + DCHECK(!thr->ignore_sync); + DCHECK(thr->slot_locked); + Epoch epoch = EpochInc(thr->fast_state.epoch()); + if (!EpochOverflow(epoch)) { + Sid sid = thr->fast_state.sid(); + thr->clock.Set(sid, epoch); + thr->fast_state.SetEpoch(epoch); + thr->slot->SetEpoch(epoch); + TraceTime(thr); + } } -void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) { +#if !SANITIZER_GO +void AfterSleep(ThreadState *thr, uptr pc) { + DPrintf("#%d: AfterSleep\n", thr->tid); if (thr->ignore_sync) return; - thr->clock.set(thr->fast_state.epoch()); - thr->fast_synch_epoch = thr->fast_state.epoch(); - thr->clock.acq_rel(&thr->proc()->clock_cache, c); + thr->last_sleep_stack_id = CurrentStackId(thr, pc); + thr->last_sleep_clock.Reset(); + SlotLocker locker(thr); + for (auto &slot : ctx->slots) + thr->last_sleep_clock.Set(slot.sid, slot.epoch()); } +#endif void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { if (r == 0 || !ShouldReport(thr, ReportTypeDeadlock)) @@ -532,7 +531,7 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { ThreadRegistryLock l(&ctx->thread_registry); ScopedReport rep(ReportTypeDeadlock); for (int i = 0; i < r->n; i++) { - rep.AddMutex(r->loop[i].mtx_ctx0); + rep.AddMutex(r->loop[i].mtx_ctx0, r->loop[i].stk[0]); rep.AddUniqueTid((int)r->loop[i].thr_ctx); rep.AddThread((int)r->loop[i].thr_ctx); } @@ -540,7 +539,7 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { for (int i = 0; i < r->n; i++) { for (int j = 0; j < (flags()->second_deadlock_stack ? 2 : 1); j++) { u32 stk = r->loop[i].stk[j]; - if (stk && stk != 0xffffffff) { + if (stk && stk != kInvalidStackID) { rep.AddStack(StackDepotGet(stk), true); } else { // Sometimes we fail to extract the stack trace (FIXME: investigate), @@ -552,4 +551,28 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) { OutputReport(thr, rep); } +void ReportDestroyLocked(ThreadState *thr, uptr pc, uptr addr, + FastState last_lock, StackID creation_stack_id) { + // We need to lock the slot during RestoreStack because it protects + // the slot journal. 
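IncrementEpoch above is now the single place where a thread advances its slot epoch after a release; when the epoch would overflow it simply does not advance, leaving the overflow to be handled elsewhere (slot switch/reset). A compilable sketch of that guard, with invented constants standing in for kEpochBits/kEpochLast (the real values may differ):

// Illustrative sketch only: the epoch-per-slot idea behind IncrementEpoch,
// with made-up constants; not the real TSan implementation.
#include <cstdint>
#include <cstdio>

constexpr unsigned kEpochBitsModel = 14;
constexpr uint16_t kEpochLastModel = (1u << kEpochBitsModel) - 1;

struct FastStateModel {
  uint8_t sid;
  uint16_t epoch;
};

// Returns false when the slot epoch would overflow; the runtime then has to
// switch slots or run a reset instead of silently wrapping.
bool IncrementEpochModel(FastStateModel *fs, uint16_t *slot_epoch) {
  if (fs->epoch == kEpochLastModel)
    return false;           // overflow: caller must handle it
  fs->epoch++;              // new accesses are ordered after the release
  *slot_epoch = fs->epoch;  // other threads can now acquire up to this point
  return true;
}

int main() {
  FastStateModel fs{0, 41};
  uint16_t slot_epoch = 41;
  if (IncrementEpochModel(&fs, &slot_epoch))
    std::printf("epoch advanced to %u\n", fs.epoch);  // 42
}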
+ Lock slot_lock(&ctx->slots[static_cast(last_lock.sid())].mtx); + ThreadRegistryLock l0(&ctx->thread_registry); + Lock slots_lock(&ctx->slot_mtx); + ScopedReport rep(ReportTypeMutexDestroyLocked); + rep.AddMutex(addr, creation_stack_id); + VarSizeStackTrace trace; + ObtainCurrentStack(thr, pc, &trace); + rep.AddStack(trace, true); + + Tid tid; + DynamicMutexSet mset; + uptr tag; + if (!RestoreStack(EventType::kLock, last_lock.sid(), last_lock.epoch(), addr, + 0, kAccessWrite, &tid, &trace, mset, &tag)) + return; + rep.AddStack(trace, true); + rep.AddLocation(addr, 1); + OutputReport(thr, rep); +} + } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_rtl_ppc64.S b/libsanitizer/tsan/tsan_rtl_ppc64.S index 9e533a7..8285e21 100644 --- a/libsanitizer/tsan/tsan_rtl_ppc64.S +++ b/libsanitizer/tsan/tsan_rtl_ppc64.S @@ -1,6 +1,5 @@ #include "tsan_ppc_regs.h" - .machine altivec .section .text .hidden __tsan_setjmp .globl _setjmp diff --git a/libsanitizer/tsan/tsan_rtl_proc.cpp b/libsanitizer/tsan/tsan_rtl_proc.cpp index def61cc..5acc396 100644 --- a/libsanitizer/tsan/tsan_rtl_proc.cpp +++ b/libsanitizer/tsan/tsan_rtl_proc.cpp @@ -35,7 +35,6 @@ void ProcDestroy(Processor *proc) { #if !SANITIZER_GO AllocatorProcFinish(proc); #endif - ctx->clock_alloc.FlushCache(&proc->clock_cache); ctx->metamap.OnProcIdle(proc); if (common_flags()->detect_deadlocks) ctx->dd->DestroyPhysicalThread(proc->dd_pt); diff --git a/libsanitizer/tsan/tsan_rtl_report.cpp b/libsanitizer/tsan/tsan_rtl_report.cpp index 811695d..4cf8816 100644 --- a/libsanitizer/tsan/tsan_rtl_report.cpp +++ b/libsanitizer/tsan/tsan_rtl_report.cpp @@ -175,22 +175,26 @@ void ScopedReportBase::AddStack(StackTrace stack, bool suppressable) { } void ScopedReportBase::AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, - StackTrace stack, const MutexSet *mset) { + Tid tid, StackTrace stack, + const MutexSet *mset) { + uptr addr0, size; + AccessType typ; + s.GetAccess(&addr0, &size, &typ); auto *mop = New(); rep_->mops.PushBack(mop); - mop->tid = s.tid(); - mop->addr = addr + s.addr0(); - mop->size = s.size(); - mop->write = s.IsWrite(); - mop->atomic = s.IsAtomic(); + mop->tid = tid; + mop->addr = addr + addr0; + mop->size = size; + mop->write = !(typ & kAccessRead); + mop->atomic = typ & kAccessAtomic; mop->stack = SymbolizeStack(stack); mop->external_tag = external_tag; if (mop->stack) mop->stack->suppressable = true; for (uptr i = 0; i < mset->Size(); i++) { MutexSet::Desc d = mset->Get(i); - u64 mid = this->AddMutex(d.id); - ReportMopMutex mtx = {mid, d.write}; + int id = this->AddMutex(d.addr, d.stack_id); + ReportMopMutex mtx = {id, d.write}; mop->mset.PushBack(mtx); } } @@ -219,18 +223,6 @@ void ScopedReportBase::AddThread(const ThreadContext *tctx, bool suppressable) { } #if !SANITIZER_GO -static bool FindThreadByUidLockedCallback(ThreadContextBase *tctx, void *arg) { - int unique_id = *(int *)arg; - return tctx->unique_id == (u32)unique_id; -} - -static ThreadContext *FindThreadByUidLocked(Tid unique_id) { - ctx->thread_registry.CheckLocked(); - return static_cast( - ctx->thread_registry.FindThreadContextLocked( - FindThreadByUidLockedCallback, &unique_id)); -} - static ThreadContext *FindThreadByTidLocked(Tid tid) { ctx->thread_registry.CheckLocked(); return static_cast( @@ -262,55 +254,24 @@ ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) { } #endif -void ScopedReportBase::AddThread(Tid unique_tid, bool suppressable) { +void ScopedReportBase::AddThread(Tid tid, bool suppressable) { #if !SANITIZER_GO - if (const 
ThreadContext *tctx = FindThreadByUidLocked(unique_tid)) + if (const ThreadContext *tctx = FindThreadByTidLocked(tid)) AddThread(tctx, suppressable); #endif } -void ScopedReportBase::AddMutex(const SyncVar *s) { - for (uptr i = 0; i < rep_->mutexes.Size(); i++) { - if (rep_->mutexes[i]->id == s->uid) - return; - } - auto *rm = New(); - rep_->mutexes.PushBack(rm); - rm->id = s->uid; - rm->addr = s->addr; - rm->destroyed = false; - rm->stack = SymbolizeStackId(s->creation_stack_id); -} - -u64 ScopedReportBase::AddMutex(u64 id) { - u64 uid = 0; - u64 mid = id; - uptr addr = SyncVar::SplitId(id, &uid); - SyncVar *s = ctx->metamap.GetSyncIfExists(addr); - // Check that the mutex is still alive. - // Another mutex can be created at the same address, - // so check uid as well. - if (s && s->CheckId(uid)) { - Lock l(&s->mtx); - mid = s->uid; - AddMutex(s); - } else { - AddDeadMutex(id); - } - return mid; -} - -void ScopedReportBase::AddDeadMutex(u64 id) { +int ScopedReportBase::AddMutex(uptr addr, StackID creation_stack_id) { for (uptr i = 0; i < rep_->mutexes.Size(); i++) { - if (rep_->mutexes[i]->id == id) - return; + if (rep_->mutexes[i]->addr == addr) + return rep_->mutexes[i]->id; } auto *rm = New(); rep_->mutexes.PushBack(rm); - rm->id = id; - rm->addr = 0; - rm->destroyed = true; - rm->stack = 0; + rm->id = rep_->mutexes.Size() - 1; + rm->addr = addr; + rm->stack = SymbolizeStackId(creation_stack_id); + return rm->id; } void ScopedReportBase::AddLocation(uptr addr, uptr size) { @@ -327,7 +288,7 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) { loc->tid = creat_tid; loc->stack = SymbolizeStackId(creat_stack); rep_->locs.PushBack(loc); - ThreadContext *tctx = FindThreadByUidLocked(creat_tid); + ThreadContext *tctx = FindThreadByTidLocked(creat_tid); if (tctx) AddThread(tctx); return; @@ -343,16 +304,15 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) { if (!b) b = JavaHeapBlock(addr, &block_begin); if (b != 0) { - ThreadContext *tctx = FindThreadByTidLocked(b->tid); auto *loc = New(); loc->type = ReportLocationHeap; - loc->heap_chunk_start = (uptr)allocator()->GetBlockBegin((void *)addr); + loc->heap_chunk_start = block_begin; loc->heap_chunk_size = b->siz; loc->external_tag = b->tag; - loc->tid = tctx ? tctx->tid : b->tid; + loc->tid = b->tid; loc->stack = SymbolizeStackId(b->stk); rep_->locs.PushBack(loc); - if (tctx) + if (ThreadContext *tctx = FindThreadByTidLocked(b->tid)) AddThread(tctx); return; } @@ -380,6 +340,8 @@ void ScopedReportBase::AddSleep(StackID stack_id) { void ScopedReportBase::SetCount(int count) { rep_->count = count; } +void ScopedReportBase::SetSigNum(int sig) { rep_->signum = sig; } + const ReportDesc *ScopedReportBase::GetReport() const { return rep_; } ScopedReport::ScopedReport(ReportType typ, uptr tag) @@ -387,71 +349,6 @@ ScopedReport::ScopedReport(ReportType typ, uptr tag) ScopedReport::~ScopedReport() {} -void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk, - MutexSet *mset, uptr *tag) { - // This function restores stack trace and mutex set for the thread/epoch. - // It does so by getting stack trace and mutex set at the beginning of - // trace part, and then replaying the trace till the given epoch. 
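With the SyncVar uid gone, a mutex in a report is identified only by its address and creation stack, and the new AddMutex above deduplicates by address and hands out small sequential ids. A stand-alone sketch of that bookkeeping (ReportMutexModel is an invented stand-in, not the real ReportMutex):

// Illustrative sketch only: dedup report mutexes by address, assign
// sequential ids, remember the creation stack for symbolization.
#include <cstdint>
#include <cstdio>
#include <vector>

struct ReportMutexModel {
  int id;
  uintptr_t addr;
  uint32_t creation_stack_id;
};

int AddMutexModel(std::vector<ReportMutexModel> &mutexes, uintptr_t addr,
                  uint32_t creation_stack_id) {
  for (const auto &m : mutexes)
    if (m.addr == addr)
      return m.id;  // already in the report, reuse its id
  int id = (int)mutexes.size();
  mutexes.push_back({id, addr, creation_stack_id});
  return id;
}

int main() {
  std::vector<ReportMutexModel> mutexes;
  std::printf("%d\n", AddMutexModel(mutexes, 0x1000, 7));  // 0
  std::printf("%d\n", AddMutexModel(mutexes, 0x2000, 9));  // 1
  std::printf("%d\n", AddMutexModel(mutexes, 0x1000, 7));  // 0 again
}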
- Trace* trace = ThreadTrace(tid); - ReadLock l(&trace->mtx); - const int partidx = (epoch / kTracePartSize) % TraceParts(); - TraceHeader* hdr = &trace->headers[partidx]; - if (epoch < hdr->epoch0 || epoch >= hdr->epoch0 + kTracePartSize) - return; - CHECK_EQ(RoundDown(epoch, kTracePartSize), hdr->epoch0); - const u64 epoch0 = RoundDown(epoch, TraceSize()); - const u64 eend = epoch % TraceSize(); - const u64 ebegin = RoundDown(eend, kTracePartSize); - DPrintf("#%d: RestoreStack epoch=%zu ebegin=%zu eend=%zu partidx=%d\n", - tid, (uptr)epoch, (uptr)ebegin, (uptr)eend, partidx); - Vector stack; - stack.Resize(hdr->stack0.size + 64); - for (uptr i = 0; i < hdr->stack0.size; i++) { - stack[i] = hdr->stack0.trace[i]; - DPrintf2(" #%02zu: pc=%zx\n", i, stack[i]); - } - if (mset) - *mset = hdr->mset0; - uptr pos = hdr->stack0.size; - Event *events = (Event*)GetThreadTrace(tid); - for (uptr i = ebegin; i <= eend; i++) { - Event ev = events[i]; - EventType typ = (EventType)(ev >> kEventPCBits); - uptr pc = (uptr)(ev & ((1ull << kEventPCBits) - 1)); - DPrintf2(" %zu typ=%d pc=%zx\n", i, typ, pc); - if (typ == EventTypeMop) { - stack[pos] = pc; - } else if (typ == EventTypeFuncEnter) { - if (stack.Size() < pos + 2) - stack.Resize(pos + 2); - stack[pos++] = pc; - } else if (typ == EventTypeFuncExit) { - if (pos > 0) - pos--; - } - if (mset) { - if (typ == EventTypeLock) { - mset->Add(pc, true, epoch0 + i); - } else if (typ == EventTypeUnlock) { - mset->Del(pc, true); - } else if (typ == EventTypeRLock) { - mset->Add(pc, false, epoch0 + i); - } else if (typ == EventTypeRUnlock) { - mset->Del(pc, false); - } - } - for (uptr j = 0; j <= pos; j++) - DPrintf2(" #%zu: %zx\n", j, stack[j]); - } - if (pos == 0 && stack[0] == 0) - return; - pos++; - stk->Init(&stack[0], pos); - ExtractTagFromStack(stk, tag); -} - -namespace v3 { - // Replays the trace up to last_pos position in the last part // or up to the provided epoch/sid (whichever is earlier) // and calls the provided function f for each event. @@ -469,6 +366,7 @@ void TraceReplay(Trace *trace, TracePart *last, Event *last_pos, Sid sid, Event *end = &part->events[TracePart::kSize - 1]; if (part == last) end = last_pos; + f(kFreeSid, kEpochOver, nullptr); // notify about part start for (Event *evp = &part->events[0]; evp < end; evp++) { Event *evp0 = evp; if (!evp->is_access && !evp->is_func) { @@ -528,21 +426,36 @@ static constexpr bool IsWithinAccess(uptr addr1, uptr size1, uptr addr2, return addr1 >= addr2 && addr1 + size1 <= addr2 + size2; } -// Replays the trace of thread tid up to the target event identified -// by sid/epoch/addr/size/typ and restores and returns stack, mutex set +// Replays the trace of slot sid up to the target event identified +// by epoch/addr/size/typ and restores and returns tid, stack, mutex set // and tag for that event. If there are multiple such events, it returns // the last one. Returns false if the event is not present in the trace. -bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, - uptr size, AccessType typ, VarSizeStackTrace *pstk, +bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size, + AccessType typ, Tid *ptid, VarSizeStackTrace *pstk, MutexSet *pmset, uptr *ptag) { // This function restores stack trace and mutex set for the thread/epoch. // It does so by getting stack trace and mutex set at the beginning of // trace part, and then replaying the trace till the given epoch. 
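Because shadow values now carry a slot id (sid) rather than a thread id, the new RestoreStack first has to translate (sid, epoch) back into a tid by walking the slot's ownership journal, as the loop over slot->journal further down does. A minimal model of that lookup with simplified types:

// Illustrative sketch only: resolve which thread owned a slot at a given
// epoch from a journal of (epoch, tid) ownership records.
#include <cstdio>
#include <vector>

struct JournalEntry { unsigned epoch; int tid; };  // slot owned by tid from this epoch on

// Returns -1 if the journal is empty (no owner recorded).
int TidForEpoch(const std::vector<JournalEntry> &journal, unsigned epoch) {
  for (size_t i = 0; i < journal.size(); i++) {
    // Last entry, or the next ownership change happens after `epoch`:
    if (i + 1 == journal.size() || journal[i + 1].epoch > epoch)
      return journal[i].tid;
  }
  return -1;
}

int main() {
  std::vector<JournalEntry> journal = {{0, 3}, {100, 7}, {250, 12}};
  std::printf("%d\n", TidForEpoch(journal, 180));  // 7
  std::printf("%d\n", TidForEpoch(journal, 300));  // 12
}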
- DPrintf2("RestoreStack: tid=%u sid=%u@%u addr=0x%zx/%zu typ=%x\n", tid, + DPrintf2("RestoreStack: sid=%u@%u addr=0x%zx/%zu typ=%x\n", static_cast(sid), static_cast(epoch), addr, size, static_cast(typ)); ctx->slot_mtx.CheckLocked(); // needed to prevent trace part recycling ctx->thread_registry.CheckLocked(); + TidSlot *slot = &ctx->slots[static_cast(sid)]; + Tid tid = kInvalidTid; + // Need to lock the slot mutex as it protects slot->journal. + slot->mtx.CheckLocked(); + for (uptr i = 0; i < slot->journal.Size(); i++) { + DPrintf2(" journal: epoch=%d tid=%d\n", + static_cast(slot->journal[i].epoch), slot->journal[i].tid); + if (i == slot->journal.Size() - 1 || slot->journal[i + 1].epoch > epoch) { + tid = slot->journal[i].tid; + break; + } + } + if (tid == kInvalidTid) + return false; + *ptid = tid; ThreadContext *tctx = static_cast(ctx->thread_registry.GetThreadLocked(tid)); Trace *trace = &tctx->trace; @@ -553,8 +466,10 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, { Lock lock(&trace->mtx); first_part = trace->parts.Front(); - if (!first_part) + if (!first_part) { + DPrintf2("RestoreStack: tid=%d trace=%p no trace parts\n", tid, trace); return false; + } last_part = trace->parts.Back(); last_pos = trace->final_pos; if (tctx->thr) @@ -567,9 +482,18 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, bool is_read = typ & kAccessRead; bool is_atomic = typ & kAccessAtomic; bool is_free = typ & kAccessFree; + DPrintf2("RestoreStack: tid=%d parts=[%p-%p] last_pos=%p\n", tid, + trace->parts.Front(), last_part, last_pos); TraceReplay( trace, last_part, last_pos, sid, epoch, [&](Sid ev_sid, Epoch ev_epoch, Event *evp) { + if (evp == nullptr) { + // Each trace part is self-consistent, so we reset state. + stack.Resize(0); + mset->Reset(); + prev_pc = 0; + return; + } bool match = ev_sid == sid && ev_epoch == epoch; if (evp->is_access) { if (evp->is_func == 0 && evp->type == EventType::kAccessExt && @@ -592,12 +516,15 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, if (evp->is_func) { auto *ev = reinterpret_cast(evp); if (ev->pc) { - DPrintf2(" FuncEnter: pc=0x%llx\n", ev->pc); + DPrintf2(" FuncEnter: pc=0x%llx\n", ev->pc); stack.PushBack(ev->pc); } else { - DPrintf2(" FuncExit\n"); - CHECK(stack.Size()); - stack.PopBack(); + DPrintf2(" FuncExit\n"); + // We don't log pathologically large stacks in each part, + // if the stack was truncated we can have more func exits than + // entries. 
+ if (stack.Size()) + stack.PopBack(); } return; } @@ -666,8 +593,6 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, return found; } -} // namespace v3 - bool RacyStacks::operator==(const RacyStacks &other) const { if (hash[0] == other.hash[0] && hash[1] == other.hash[1]) return true; @@ -758,10 +683,7 @@ bool OutputReport(ThreadState *thr, const ScopedReport &srep) { ctx->fired_suppressions.push_back(s); } { - bool old_is_freeing = thr->is_freeing; - thr->is_freeing = false; bool suppressed = OnReport(rep, pc_or_addr != 0); - thr->is_freeing = old_is_freeing; if (suppressed) { thr->current_report = nullptr; return false; @@ -808,97 +730,72 @@ static bool IsFiredSuppression(Context *ctx, ReportType type, uptr addr) { return false; } -static bool RaceBetweenAtomicAndFree(ThreadState *thr) { - Shadow s0(thr->racy_state[0]); - Shadow s1(thr->racy_state[1]); - CHECK(!(s0.IsAtomic() && s1.IsAtomic())); - if (!s0.IsAtomic() && !s1.IsAtomic()) - return true; - if (s0.IsAtomic() && s1.IsFreed()) - return true; - if (s1.IsAtomic() && thr->is_freeing) - return true; - return false; -} - -void ReportRace(ThreadState *thr) { +void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old, + AccessType typ0) { CheckedMutex::CheckNoLocks(); // Symbolizer makes lots of intercepted calls. If we try to process them, // at best it will cause deadlocks on internal mutexes. ScopedIgnoreInterceptors ignore; + uptr addr = ShadowToMem(shadow_mem); + DPrintf("#%d: ReportRace %p\n", thr->tid, (void *)addr); if (!ShouldReport(thr, ReportTypeRace)) return; - if (!flags()->report_atomic_races && !RaceBetweenAtomicAndFree(thr)) + uptr addr_off0, size0; + cur.GetAccess(&addr_off0, &size0, nullptr); + uptr addr_off1, size1, typ1; + old.GetAccess(&addr_off1, &size1, &typ1); + if (!flags()->report_atomic_races && + ((typ0 & kAccessAtomic) || (typ1 & kAccessAtomic)) && + !(typ0 & kAccessFree) && !(typ1 & kAccessFree)) return; - bool freed = false; - { - Shadow s(thr->racy_state[1]); - freed = s.GetFreedAndReset(); - thr->racy_state[1] = s.raw(); - } - - uptr addr = ShadowToMem(thr->racy_shadow_addr); - uptr addr_min = 0; - uptr addr_max = 0; - { - uptr a0 = addr + Shadow(thr->racy_state[0]).addr0(); - uptr a1 = addr + Shadow(thr->racy_state[1]).addr0(); - uptr e0 = a0 + Shadow(thr->racy_state[0]).size(); - uptr e1 = a1 + Shadow(thr->racy_state[1]).size(); - addr_min = min(a0, a1); - addr_max = max(e0, e1); - if (IsExpectedReport(addr_min, addr_max - addr_min)) - return; - } + const uptr kMop = 2; + Shadow s[kMop] = {cur, old}; + uptr addr0 = addr + addr_off0; + uptr addr1 = addr + addr_off1; + uptr end0 = addr0 + size0; + uptr end1 = addr1 + size1; + uptr addr_min = min(addr0, addr1); + uptr addr_max = max(end0, end1); + if (IsExpectedReport(addr_min, addr_max - addr_min)) + return; if (HandleRacyAddress(thr, addr_min, addr_max)) return; - ReportType typ = ReportTypeRace; - if (thr->is_vptr_access && freed) - typ = ReportTypeVptrUseAfterFree; - else if (thr->is_vptr_access) - typ = ReportTypeVptrRace; - else if (freed) - typ = ReportTypeUseAfterFree; + ReportType rep_typ = ReportTypeRace; + if ((typ0 & kAccessVptr) && (typ1 & kAccessFree)) + rep_typ = ReportTypeVptrUseAfterFree; + else if (typ0 & kAccessVptr) + rep_typ = ReportTypeVptrRace; + else if (typ1 & kAccessFree) + rep_typ = ReportTypeUseAfterFree; - if (IsFiredSuppression(ctx, typ, addr)) + if (IsFiredSuppression(ctx, rep_typ, addr)) return; - const uptr kMop = 2; VarSizeStackTrace traces[kMop]; - uptr tags[kMop] = 
{kExternalTagNone}; - uptr toppc = TraceTopPC(thr); - if (toppc >> kEventPCBits) { - // This is a work-around for a known issue. - // The scenario where this happens is rather elaborate and requires - // an instrumented __sanitizer_report_error_summary callback and - // a __tsan_symbolize_external callback and a race during a range memory - // access larger than 8 bytes. MemoryAccessRange adds the current PC to - // the trace and starts processing memory accesses. A first memory access - // triggers a race, we report it and call the instrumented - // __sanitizer_report_error_summary, which adds more stuff to the trace - // since it is intrumented. Then a second memory access in MemoryAccessRange - // also triggers a race and we get here and call TraceTopPC to get the - // current PC, however now it contains some unrelated events from the - // callback. Most likely, TraceTopPC will now return a EventTypeFuncExit - // event. Later we subtract -1 from it (in GetPreviousInstructionPc) - // and the resulting PC has kExternalPCBit set, so we pass it to - // __tsan_symbolize_external_ex. __tsan_symbolize_external_ex is within its - // rights to crash since the PC is completely bogus. - // test/tsan/double_race.cpp contains a test case for this. - toppc = 0; - } - ObtainCurrentStack(thr, toppc, &traces[0], &tags[0]); - if (IsFiredSuppression(ctx, typ, traces[0])) + Tid tids[kMop] = {thr->tid, kInvalidTid}; + uptr tags[kMop] = {kExternalTagNone, kExternalTagNone}; + + ObtainCurrentStack(thr, thr->trace_prev_pc, &traces[0], &tags[0]); + if (IsFiredSuppression(ctx, rep_typ, traces[0])) return; - DynamicMutexSet mset2; - Shadow s2(thr->racy_state[1]); - RestoreStack(s2.tid(), s2.epoch(), &traces[1], mset2, &tags[1]); - if (IsFiredSuppression(ctx, typ, traces[1])) + DynamicMutexSet mset1; + MutexSet *mset[kMop] = {&thr->mset, mset1}; + + // We need to lock the slot during RestoreStack because it protects + // the slot journal. + Lock slot_lock(&ctx->slots[static_cast(s[1].sid())].mtx); + ThreadRegistryLock l0(&ctx->thread_registry); + Lock slots_lock(&ctx->slot_mtx); + if (!RestoreStack(EventType::kAccessExt, s[1].sid(), s[1].epoch(), addr1, + size1, typ1, &tids[1], &traces[1], mset[1], &tags[1])) + return; + + if (IsFiredSuppression(ctx, rep_typ, traces[1])) return; if (HandleRacyStacks(thr, traces)) @@ -908,39 +805,41 @@ void ReportRace(ThreadState *thr) { uptr tag = kExternalTagNone; for (uptr i = 0; i < kMop; i++) { if (tags[i] != kExternalTagNone) { - typ = ReportTypeExternalRace; + rep_typ = ReportTypeExternalRace; tag = tags[i]; break; } } - ThreadRegistryLock l0(&ctx->thread_registry); - ScopedReport rep(typ, tag); - for (uptr i = 0; i < kMop; i++) { - Shadow s(thr->racy_state[i]); - rep.AddMemoryAccess(addr, tags[i], s, traces[i], - i == 0 ? 
&thr->mset : mset2); - } + ScopedReport rep(rep_typ, tag); + for (uptr i = 0; i < kMop; i++) + rep.AddMemoryAccess(addr, tags[i], s[i], tids[i], traces[i], mset[i]); for (uptr i = 0; i < kMop; i++) { - FastState s(thr->racy_state[i]); ThreadContext *tctx = static_cast( - ctx->thread_registry.GetThreadLocked(s.tid())); - if (s.epoch() < tctx->epoch0 || s.epoch() > tctx->epoch1) - continue; + ctx->thread_registry.GetThreadLocked(tids[i])); rep.AddThread(tctx); } rep.AddLocation(addr_min, addr_max - addr_min); -#if !SANITIZER_GO - { - Shadow s(thr->racy_state[1]); - if (s.epoch() <= thr->last_sleep_clock.get(s.tid())) - rep.AddSleep(thr->last_sleep_stack_id); + if (flags()->print_full_thread_history) { + const ReportDesc *rep_desc = rep.GetReport(); + for (uptr i = 0; i < rep_desc->threads.Size(); i++) { + Tid parent_tid = rep_desc->threads[i]->parent_tid; + if (parent_tid == kMainTid || parent_tid == kInvalidTid) + continue; + ThreadContext *parent_tctx = static_cast( + ctx->thread_registry.GetThreadLocked(parent_tid)); + rep.AddThread(parent_tctx); + } } -#endif +#if !SANITIZER_GO + if (!((typ0 | typ1) & kAccessFree) && + s[1].epoch() <= thr->last_sleep_clock.Get(s[1].sid())) + rep.AddSleep(thr->last_sleep_stack_id); +#endif OutputReport(thr, rep); } diff --git a/libsanitizer/tsan/tsan_rtl_thread.cpp b/libsanitizer/tsan/tsan_rtl_thread.cpp index 6e652ee..86c8b37 100644 --- a/libsanitizer/tsan/tsan_rtl_thread.cpp +++ b/libsanitizer/tsan/tsan_rtl_thread.cpp @@ -21,20 +21,14 @@ namespace __tsan { // ThreadContext implementation. -ThreadContext::ThreadContext(Tid tid) - : ThreadContextBase(tid), thr(), sync(), epoch0(), epoch1() {} +ThreadContext::ThreadContext(Tid tid) : ThreadContextBase(tid), thr(), sync() {} #if !SANITIZER_GO ThreadContext::~ThreadContext() { } #endif -void ThreadContext::OnReset() { - CHECK_EQ(sync.size(), 0); - uptr trace_p = GetThreadTrace(tid); - ReleaseMemoryPagesToOS(trace_p, trace_p + TraceSize() * sizeof(Event)); - //!!! ReleaseMemoryToOS(GetThreadTraceHeader(tid), sizeof(Trace)); -} +void ThreadContext::OnReset() { CHECK(!sync); } #if !SANITIZER_GO struct ThreadLeak { @@ -57,7 +51,9 @@ static void CollectThreadLeaks(ThreadContextBase *tctx_base, void *arg) { } #endif -#if !SANITIZER_GO +// Disabled on Mac because lldb test TestTsanBasic fails: +// https://reviews.llvm.org/D112603#3163158 +#if !SANITIZER_GO && !SANITIZER_MAC static void ReportIgnoresEnabled(ThreadContext *tctx, IgnoreSet *set) { if (tctx->tid == kMainTid) { Printf("ThreadSanitizer: main thread finished with ignores enabled\n"); @@ -112,30 +108,35 @@ int ThreadCount(ThreadState *thr) { } struct OnCreatedArgs { - ThreadState *thr; - uptr pc; + VectorClock *sync; + uptr sync_epoch; + StackID stack; }; Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) { - OnCreatedArgs args = { thr, pc }; - u32 parent_tid = thr ? thr->tid : kInvalidTid; // No parent for GCD workers. - Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent_tid, &args); - DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent_tid, tid, uid); + // The main thread and GCD workers don't have a parent thread. 
+ Tid parent = kInvalidTid; + OnCreatedArgs arg = {nullptr, 0, kInvalidStackID}; + if (thr) { + parent = thr->tid; + arg.stack = CurrentStackId(thr, pc); + if (!thr->ignore_sync) { + SlotLocker locker(thr); + thr->clock.ReleaseStore(&arg.sync); + arg.sync_epoch = ctx->global_epoch; + IncrementEpoch(thr); + } + } + Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent, &arg); + DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent, tid, uid); return tid; } void ThreadContext::OnCreated(void *arg) { - thr = 0; - if (tid == kMainTid) - return; OnCreatedArgs *args = static_cast(arg); - if (!args->thr) // GCD workers don't have a parent thread. - return; - args->thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0); - ReleaseImpl(args->thr, 0, &sync); - creation_stack_id = CurrentStackId(args->thr, args->pc); + sync = args->sync; + sync_epoch = args->sync_epoch; + creation_stack_id = args->stack; } extern "C" void __tsan_stack_initialization() {} @@ -150,6 +151,15 @@ struct OnStartedArgs { void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, ThreadType thread_type) { + ctx->thread_registry.StartThread(tid, os_id, thread_type, thr); + if (!thr->ignore_sync) { + SlotAttachAndLock(thr); + if (thr->tctx->sync_epoch == ctx->global_epoch) + thr->clock.Acquire(thr->tctx->sync); + SlotUnlock(thr); + } + Free(thr->tctx->sync); + uptr stk_addr = 0; uptr stk_size = 0; uptr tls_addr = 0; @@ -159,12 +169,10 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, GetThreadStackAndTls(tid == kMainTid, &stk_addr, &stk_size, &tls_addr, &tls_size); #endif - - ThreadRegistry *tr = &ctx->thread_registry; - OnStartedArgs args = { thr, stk_addr, stk_size, tls_addr, tls_size }; - tr->StartThread(tid, os_id, thread_type, &args); - - while (!thr->tctx->trace.parts.Empty()) thr->tctx->trace.parts.PopBack(); + thr->stk_addr = stk_addr; + thr->stk_size = stk_size; + thr->tls_addr = tls_addr; + thr->tls_size = tls_size; #if !SANITIZER_GO if (ctx->after_multithreaded_fork) { @@ -192,69 +200,80 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, } void ThreadContext::OnStarted(void *arg) { - OnStartedArgs *args = static_cast(arg); - thr = args->thr; - // RoundUp so that one trace part does not contain events - // from different threads. - epoch0 = RoundUp(epoch1 + 1, kTracePartSize); - epoch1 = (u64)-1; - new (thr) - ThreadState(ctx, tid, unique_id, epoch0, reuse_count, args->stk_addr, - args->stk_size, args->tls_addr, args->tls_size); + thr = static_cast(arg); + DPrintf("#%d: ThreadStart\n", tid); + new (thr) ThreadState(tid); if (common_flags()->detect_deadlocks) - thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id); - thr->fast_state.SetHistorySize(flags()->history_size); - // Commit switch to the new part of the trace. - // TraceAddEvent will reset stack0/mset0 in the new part for us. 
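ThreadCreate/ThreadStart above now hand the child a snapshot of the parent's clock plus the global epoch at which it was taken; the child acquires the snapshot only if no global reset happened in between (sync_epoch == ctx->global_epoch). A simplified, runnable model of that handshake; all names below are stand-ins, not the real TSan types:

// Illustrative sketch only: publish the parent's clock snapshot to the child
// and acquire it only if no global reset invalidated it in the meantime.
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

constexpr int kSlots = 4;
using Clock = std::array<uint64_t, kSlots>;

uint64_t g_global_epoch = 1;  // bumped by DoReset-style global resets

struct CreatedArgs {
  Clock sync;           // parent's clock at thread-creation time
  uint64_t sync_epoch;  // global epoch the snapshot belongs to
};

CreatedArgs OnCreate(const Clock &parent_clock) {
  return {parent_clock, g_global_epoch};
}

void OnStart(Clock &child_clock, const CreatedArgs &args) {
  if (args.sync_epoch != g_global_epoch)
    return;  // snapshot predates a reset and no longer means anything
  for (int i = 0; i < kSlots; i++)
    child_clock[i] = std::max(child_clock[i], args.sync[i]);
}

int main() {
  Clock parent{{3, 0, 0, 0}}, child{};
  CreatedArgs args = OnCreate(parent);  // parent side, at creation
  OnStart(child, args);                 // child side, at start
  std::printf("child sees parent epoch %llu\n", (unsigned long long)child[0]);
}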
- TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - - thr->fast_synch_epoch = epoch0; - AcquireImpl(thr, 0, &sync); - sync.Reset(&thr->proc()->clock_cache); + thr->dd_lt = ctx->dd->CreateLogicalThread(tid); thr->tctx = this; +#if !SANITIZER_GO thr->is_inited = true; - DPrintf( - "#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx " - "tls_addr=%zx tls_size=%zx\n", - tid, (uptr)epoch0, args->stk_addr, args->stk_size, args->tls_addr, - args->tls_size); +#endif } void ThreadFinish(ThreadState *thr) { + DPrintf("#%d: ThreadFinish\n", thr->tid); ThreadCheckIgnore(thr); if (thr->stk_addr && thr->stk_size) DontNeedShadowFor(thr->stk_addr, thr->stk_size); if (thr->tls_addr && thr->tls_size) DontNeedShadowFor(thr->tls_addr, thr->tls_size); thr->is_dead = true; - ctx->thread_registry.FinishThread(thr->tid); -} - -void ThreadContext::OnFinished() { -#if SANITIZER_GO +#if !SANITIZER_GO + thr->is_inited = false; + thr->ignore_interceptors++; + PlatformCleanUpThreadState(thr); +#endif + if (!thr->ignore_sync) { + SlotLocker locker(thr); + ThreadRegistryLock lock(&ctx->thread_registry); + // Note: detached is protected by the thread registry mutex, + // the thread may be detaching concurrently in another thread. + if (!thr->tctx->detached) { + thr->clock.ReleaseStore(&thr->tctx->sync); + thr->tctx->sync_epoch = ctx->global_epoch; + IncrementEpoch(thr); + } + } +#if !SANITIZER_GO + UnmapOrDie(thr->shadow_stack, kShadowStackSize * sizeof(uptr)); +#else Free(thr->shadow_stack); +#endif + thr->shadow_stack = nullptr; thr->shadow_stack_pos = nullptr; thr->shadow_stack_end = nullptr; -#endif - if (!detached) { - thr->fast_state.IncrementEpoch(); - // Can't increment epoch w/o writing to the trace as well. - TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); - ReleaseImpl(thr, 0, &sync); - } - epoch1 = thr->fast_state.epoch(); - if (common_flags()->detect_deadlocks) ctx->dd->DestroyLogicalThread(thr->dd_lt); - thr->clock.ResetCached(&thr->proc()->clock_cache); -#if !SANITIZER_GO - thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache); -#endif -#if !SANITIZER_GO - PlatformCleanUpThreadState(thr); -#endif + SlotDetach(thr); + ctx->thread_registry.FinishThread(thr->tid); thr->~ThreadState(); - thr = 0; +} + +void ThreadContext::OnFinished() { + Lock lock(&ctx->slot_mtx); + Lock lock1(&trace.mtx); + // Queue all trace parts into the global recycle queue. + auto parts = &trace.parts; + while (trace.local_head) { + CHECK(parts->Queued(trace.local_head)); + ctx->trace_part_recycle.PushBack(trace.local_head); + trace.local_head = parts->Next(trace.local_head); + } + ctx->trace_part_recycle_finished += parts->Size(); + if (ctx->trace_part_recycle_finished > Trace::kFinishedThreadHi) { + ctx->trace_part_finished_excess += parts->Size(); + trace.parts_allocated = 0; + } else if (ctx->trace_part_recycle_finished > Trace::kFinishedThreadLo && + parts->Size() > 1) { + ctx->trace_part_finished_excess += parts->Size() - 1; + trace.parts_allocated = 1; + } + // From now on replay will use trace->final_pos. 
+ trace.final_pos = (Event *)atomic_load_relaxed(&thr->trace_pos); + atomic_store_relaxed(&thr->trace_pos, 0); + thr->tctx = nullptr; + thr = nullptr; } struct ConsumeThreadContext { @@ -262,60 +281,47 @@ struct ConsumeThreadContext { ThreadContextBase *tctx; }; -static bool ConsumeThreadByUid(ThreadContextBase *tctx, void *arg) { - ConsumeThreadContext *findCtx = (ConsumeThreadContext *)arg; - if (tctx->user_id == findCtx->uid && tctx->status != ThreadStatusInvalid) { - if (findCtx->tctx) { - // Ensure that user_id is unique. If it's not the case we are screwed. - // Something went wrong before, but now there is no way to recover. - // Returning a wrong thread is not an option, it may lead to very hard - // to debug false positives (e.g. if we join a wrong thread). - Report("ThreadSanitizer: dup thread with used id 0x%zx\n", findCtx->uid); - Die(); - } - findCtx->tctx = tctx; - tctx->user_id = 0; - } - return false; -} - Tid ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid) { - ConsumeThreadContext findCtx = {uid, nullptr}; - ctx->thread_registry.FindThread(ConsumeThreadByUid, &findCtx); - Tid tid = findCtx.tctx ? findCtx.tctx->tid : kInvalidTid; - DPrintf("#%d: ThreadTid uid=%zu tid=%d\n", thr->tid, uid, tid); - return tid; + return ctx->thread_registry.ConsumeThreadUserId(uid); } +struct JoinArg { + VectorClock *sync; + uptr sync_epoch; +}; + void ThreadJoin(ThreadState *thr, uptr pc, Tid tid) { CHECK_GT(tid, 0); - CHECK_LT(tid, kMaxTid); DPrintf("#%d: ThreadJoin tid=%d\n", thr->tid, tid); - ctx->thread_registry.JoinThread(tid, thr); + JoinArg arg = {}; + ctx->thread_registry.JoinThread(tid, &arg); + if (!thr->ignore_sync) { + SlotLocker locker(thr); + if (arg.sync_epoch == ctx->global_epoch) + thr->clock.Acquire(arg.sync); + } + Free(arg.sync); } -void ThreadContext::OnJoined(void *arg) { - ThreadState *caller_thr = static_cast(arg); - AcquireImpl(caller_thr, 0, &sync); - sync.Reset(&caller_thr->proc()->clock_cache); +void ThreadContext::OnJoined(void *ptr) { + auto arg = static_cast(ptr); + arg->sync = sync; + arg->sync_epoch = sync_epoch; + sync = nullptr; + sync_epoch = 0; } -void ThreadContext::OnDead() { CHECK_EQ(sync.size(), 0); } +void ThreadContext::OnDead() { CHECK_EQ(sync, nullptr); } void ThreadDetach(ThreadState *thr, uptr pc, Tid tid) { CHECK_GT(tid, 0); - CHECK_LT(tid, kMaxTid); ctx->thread_registry.DetachThread(tid, thr); } -void ThreadContext::OnDetached(void *arg) { - ThreadState *thr1 = static_cast(arg); - sync.Reset(&thr1->proc()->clock_cache); -} +void ThreadContext::OnDetached(void *arg) { Free(sync); } void ThreadNotJoined(ThreadState *thr, uptr pc, Tid tid, uptr uid) { CHECK_GT(tid, 0); - CHECK_LT(tid, kMaxTid); ctx->thread_registry.SetThreadUserId(tid, uid); } diff --git a/libsanitizer/tsan/tsan_shadow.h b/libsanitizer/tsan/tsan_shadow.h index 8b7bc34..843573e 100644 --- a/libsanitizer/tsan/tsan_shadow.h +++ b/libsanitizer/tsan/tsan_shadow.h @@ -10,223 +10,170 @@ #define TSAN_SHADOW_H #include "tsan_defs.h" -#include "tsan_trace.h" namespace __tsan { -// FastState (from most significant bit): -// ignore : 1 -// tid : kTidBits -// unused : - -// history_size : 3 -// epoch : kClkBits class FastState { public: - FastState(u64 tid, u64 epoch) { - x_ = tid << kTidShift; - x_ |= epoch; - DCHECK_EQ(tid, this->tid()); - DCHECK_EQ(epoch, this->epoch()); - DCHECK_EQ(GetIgnoreBit(), false); - } - - explicit FastState(u64 x) : x_(x) {} - - u64 raw() const { return x_; } - - u64 tid() const { - u64 res = (x_ & ~kIgnoreBit) >> kTidShift; - return res; - } - - u64 
TidWithIgnore() const { - u64 res = x_ >> kTidShift; - return res; - } - - u64 epoch() const { - u64 res = x_ & ((1ull << kClkBits) - 1); - return res; - } + FastState() { Reset(); } - void IncrementEpoch() { - u64 old_epoch = epoch(); - x_ += 1; - DCHECK_EQ(old_epoch + 1, epoch()); - (void)old_epoch; + void Reset() { + part_.unused0_ = 0; + part_.sid_ = static_cast(kFreeSid); + part_.epoch_ = static_cast(kEpochLast); + part_.unused1_ = 0; + part_.ignore_accesses_ = false; } - void SetIgnoreBit() { x_ |= kIgnoreBit; } - void ClearIgnoreBit() { x_ &= ~kIgnoreBit; } - bool GetIgnoreBit() const { return (s64)x_ < 0; } + void SetSid(Sid sid) { part_.sid_ = static_cast(sid); } - void SetHistorySize(int hs) { - CHECK_GE(hs, 0); - CHECK_LE(hs, 7); - x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift); - } + Sid sid() const { return static_cast(part_.sid_); } - ALWAYS_INLINE - int GetHistorySize() const { - return (int)((x_ >> kHistoryShift) & kHistoryMask); - } + Epoch epoch() const { return static_cast(part_.epoch_); } - void ClearHistorySize() { SetHistorySize(0); } + void SetEpoch(Epoch epoch) { part_.epoch_ = static_cast(epoch); } - ALWAYS_INLINE - u64 GetTracePos() const { - const int hs = GetHistorySize(); - // When hs == 0, the trace consists of 2 parts. - const u64 mask = (1ull << (kTracePartSizeBits + hs + 1)) - 1; - return epoch() & mask; - } + void SetIgnoreBit() { part_.ignore_accesses_ = 1; } + void ClearIgnoreBit() { part_.ignore_accesses_ = 0; } + bool GetIgnoreBit() const { return part_.ignore_accesses_; } private: friend class Shadow; - static const int kTidShift = 64 - kTidBits - 1; - static const u64 kIgnoreBit = 1ull << 63; - static const u64 kFreedBit = 1ull << 63; - static const u64 kHistoryShift = kClkBits; - static const u64 kHistoryMask = 7; - u64 x_; + struct Parts { + u32 unused0_ : 8; + u32 sid_ : 8; + u32 epoch_ : kEpochBits; + u32 unused1_ : 1; + u32 ignore_accesses_ : 1; + }; + union { + Parts part_; + u32 raw_; + }; }; -// Shadow (from most significant bit): -// freed : 1 -// tid : kTidBits -// is_atomic : 1 -// is_read : 1 -// size_log : 2 -// addr0 : 3 -// epoch : kClkBits -class Shadow : public FastState { - public: - explicit Shadow(u64 x) : FastState(x) {} +static_assert(sizeof(FastState) == kShadowSize, "bad FastState size"); - explicit Shadow(const FastState &s) : FastState(s.x_) { ClearHistorySize(); } - - void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) { - DCHECK_EQ((x_ >> kClkBits) & 31, 0); - DCHECK_LE(addr0, 7); - DCHECK_LE(kAccessSizeLog, 3); - x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits; - DCHECK_EQ(kAccessSizeLog, size_log()); - DCHECK_EQ(addr0, this->addr0()); - } - - void SetWrite(unsigned kAccessIsWrite) { - DCHECK_EQ(x_ & kReadBit, 0); - if (!kAccessIsWrite) - x_ |= kReadBit; - DCHECK_EQ(kAccessIsWrite, IsWrite()); - } - - void SetAtomic(bool kIsAtomic) { - DCHECK(!IsAtomic()); - if (kIsAtomic) - x_ |= kAtomicBit; - DCHECK_EQ(IsAtomic(), kIsAtomic); - } - - bool IsAtomic() const { return x_ & kAtomicBit; } - - bool IsZero() const { return x_ == 0; } - - static inline bool TidsAreEqual(const Shadow s1, const Shadow s2) { - u64 shifted_xor = (s1.x_ ^ s2.x_) >> kTidShift; - DCHECK_EQ(shifted_xor == 0, s1.TidWithIgnore() == s2.TidWithIgnore()); - return shifted_xor == 0; - } - - static ALWAYS_INLINE bool Addr0AndSizeAreEqual(const Shadow s1, - const Shadow s2) { - u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31; - return masked_xor == 0; +class Shadow { + public: + static constexpr RawShadow kEmpty = 
static_cast(0); + + Shadow(FastState state, u32 addr, u32 size, AccessType typ) { + raw_ = state.raw_; + DCHECK_GT(size, 0); + DCHECK_LE(size, 8); + UNUSED Sid sid0 = part_.sid_; + UNUSED u16 epoch0 = part_.epoch_; + raw_ |= (!!(typ & kAccessAtomic) << kIsAtomicShift) | + (!!(typ & kAccessRead) << kIsReadShift) | + (((((1u << size) - 1) << (addr & 0x7)) & 0xff) << kAccessShift); + // Note: we don't check kAccessAtomic because it overlaps with + // FastState::ignore_accesses_ and it may be set spuriously. + DCHECK_EQ(part_.is_read_, !!(typ & kAccessRead)); + DCHECK_EQ(sid(), sid0); + DCHECK_EQ(epoch(), epoch0); + } + + explicit Shadow(RawShadow x = Shadow::kEmpty) { raw_ = static_cast(x); } + + RawShadow raw() const { return static_cast(raw_); } + Sid sid() const { return part_.sid_; } + Epoch epoch() const { return static_cast(part_.epoch_); } + u8 access() const { return part_.access_; } + + void GetAccess(uptr *addr, uptr *size, AccessType *typ) const { + DCHECK(part_.access_ != 0 || raw_ == static_cast(Shadow::kRodata)); + if (addr) + *addr = part_.access_ ? __builtin_ffs(part_.access_) - 1 : 0; + if (size) + *size = part_.access_ == kFreeAccess ? kShadowCell + : __builtin_popcount(part_.access_); + if (typ) + *typ = (part_.is_read_ ? kAccessRead : kAccessWrite) | + (part_.is_atomic_ ? kAccessAtomic : 0) | + (part_.access_ == kFreeAccess ? kAccessFree : 0); } - static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2, - unsigned kS2AccessSize) { - bool res = false; - u64 diff = s1.addr0() - s2.addr0(); - if ((s64)diff < 0) { // s1.addr0 < s2.addr0 - // if (s1.addr0() + size1) > s2.addr0()) return true; - if (s1.size() > -diff) - res = true; - } else { - // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true; - if (kS2AccessSize > diff) - res = true; - } - DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2)); - DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1)); + ALWAYS_INLINE + bool IsBothReadsOrAtomic(AccessType typ) const { + u32 is_read = !!(typ & kAccessRead); + u32 is_atomic = !!(typ & kAccessAtomic); + bool res = + raw_ & ((is_atomic << kIsAtomicShift) | (is_read << kIsReadShift)); + DCHECK_EQ(res, + (part_.is_read_ && is_read) || (part_.is_atomic_ && is_atomic)); return res; } - u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; } - u64 ALWAYS_INLINE size() const { return 1ull << size_log(); } - bool ALWAYS_INLINE IsWrite() const { return !IsRead(); } - bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; } - - // The idea behind the freed bit is as follows. - // When the memory is freed (or otherwise unaccessible) we write to the shadow - // values with tid/epoch related to the free and the freed bit set. - // During memory accesses processing the freed bit is considered - // as msb of tid. So any access races with shadow with freed bit set - // (it is as if write from a thread with which we never synchronized before). - // This allows us to detect accesses to freed memory w/o additional - // overheads in memory access processing and at the same time restore - // tid/epoch of free. 
- void MarkAsFreed() { x_ |= kFreedBit; } - - bool IsFreed() const { return x_ & kFreedBit; } - - bool GetFreedAndReset() { - bool res = x_ & kFreedBit; - x_ &= ~kFreedBit; + ALWAYS_INLINE + bool IsRWWeakerOrEqual(AccessType typ) const { + u32 is_read = !!(typ & kAccessRead); + u32 is_atomic = !!(typ & kAccessAtomic); + UNUSED u32 res0 = + (part_.is_atomic_ > is_atomic) || + (part_.is_atomic_ == is_atomic && part_.is_read_ >= is_read); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + const u32 kAtomicReadMask = (1 << kIsAtomicShift) | (1 << kIsReadShift); + bool res = (raw_ & kAtomicReadMask) >= + ((is_atomic << kIsAtomicShift) | (is_read << kIsReadShift)); + + DCHECK_EQ(res, res0); return res; +#else + return res0; +#endif } - bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const { - bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift) | - (u64(kIsAtomic) << kAtomicShift)); - DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic)); - return v; - } - - bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const { - bool v = ((x_ >> kReadShift) & 3) <= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); - DCHECK_EQ(v, (IsAtomic() < kIsAtomic) || - (IsAtomic() == kIsAtomic && !IsWrite() <= !kIsWrite)); - return v; + // The FreedMarker must not pass "the same access check" so that we don't + // return from the race detection algorithm early. + static RawShadow FreedMarker() { + FastState fs; + fs.SetSid(kFreeSid); + fs.SetEpoch(kEpochLast); + Shadow s(fs, 0, 8, kAccessWrite); + return s.raw(); } - bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const { - bool v = ((x_ >> kReadShift) & 3) >= u64((kIsWrite ^ 1) | (kIsAtomic << 1)); - DCHECK_EQ(v, (IsAtomic() > kIsAtomic) || - (IsAtomic() == kIsAtomic && !IsWrite() >= !kIsWrite)); - return v; + static RawShadow FreedInfo(Sid sid, Epoch epoch) { + Shadow s; + s.part_.sid_ = sid; + s.part_.epoch_ = static_cast(epoch); + s.part_.access_ = kFreeAccess; + return s.raw(); } private: - static const u64 kReadShift = 5 + kClkBits; - static const u64 kReadBit = 1ull << kReadShift; - static const u64 kAtomicShift = 6 + kClkBits; - static const u64 kAtomicBit = 1ull << kAtomicShift; - - u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; } - - static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) { - if (s1.addr0() == s2.addr0()) - return true; - if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0()) - return true; - if (s2.addr0() < s1.addr0() && s2.addr0() + s2.size() > s1.addr0()) - return true; - return false; - } + struct Parts { + u8 access_; + Sid sid_; + u16 epoch_ : kEpochBits; + u16 is_read_ : 1; + u16 is_atomic_ : 1; + }; + union { + Parts part_; + u32 raw_; + }; + + static constexpr u8 kFreeAccess = 0x81; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + static constexpr uptr kAccessShift = 0; + static constexpr uptr kIsReadShift = 30; + static constexpr uptr kIsAtomicShift = 31; +#else + static constexpr uptr kAccessShift = 24; + static constexpr uptr kIsReadShift = 1; + static constexpr uptr kIsAtomicShift = 0; +#endif + + public: + // .rodata shadow marker, see MapRodata and ContainsSameAccessFast. 
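The new Shadow above packs an access as one bit per byte of the 8-byte shadow cell, so the offset and size are recovered with ffs/popcount and the overlap of two accesses reduces to a single AND of their masks. A small sketch of that encoding, outside of the real bitfield layout:

// Illustrative sketch only: the byte-mask encoding of (offset, size) used by
// the new Shadow, and how offset/size are decoded again.
#include <cstdint>
#include <cstdio>

uint8_t EncodeAccess(unsigned addr_in_cell, unsigned size) {
  // size in [1,8], addr_in_cell in [0,7]; bits outside the cell are dropped.
  return (uint8_t)((((1u << size) - 1) << (addr_in_cell & 0x7)) & 0xff);
}

void DecodeAccess(uint8_t access, unsigned *addr_in_cell, unsigned *size) {
  *addr_in_cell = access ? __builtin_ffs(access) - 1 : 0;  // lowest set bit
  *size = __builtin_popcount(access);                      // number of bytes
}

bool Overlap(uint8_t a, uint8_t b) { return (a & b) != 0; }  // byte ranges touch

int main() {
  uint8_t a = EncodeAccess(2, 4);  // bytes 2..5 -> 0b00111100
  uint8_t b = EncodeAccess(5, 2);  // bytes 5..6 -> 0b01100000
  unsigned off, sz;
  DecodeAccess(a, &off, &sz);
  std::printf("off=%u size=%u overlap=%d\n", off, sz, Overlap(a, b));  // 2 4 1
}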
+ static constexpr RawShadow kRodata = + static_cast(1 << kIsReadShift); }; -const RawShadow kShadowRodata = (RawShadow)-1; // .rodata shadow marker +static_assert(sizeof(Shadow) == kShadowSize, "bad Shadow size"); } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_sync.cpp b/libsanitizer/tsan/tsan_sync.cpp index f042aba..09d4178 100644 --- a/libsanitizer/tsan/tsan_sync.cpp +++ b/libsanitizer/tsan/tsan_sync.cpp @@ -18,43 +18,31 @@ namespace __tsan { void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s); -SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(0); } +SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(); } -void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid, - bool save_stack) { +void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, bool save_stack) { + Reset(); this->addr = addr; - this->uid = uid; - this->next = 0; - - creation_stack_id = kInvalidStackID; + next = 0; if (save_stack && !SANITIZER_GO) // Go does not use them creation_stack_id = CurrentStackId(thr, pc); if (common_flags()->detect_deadlocks) DDMutexInit(thr, pc, this); } -void SyncVar::Reset(Processor *proc) { - uid = 0; +void SyncVar::Reset() { + CHECK(!ctx->resetting); creation_stack_id = kInvalidStackID; owner_tid = kInvalidTid; - last_lock = 0; + last_lock.Reset(); recursion = 0; atomic_store_relaxed(&flags, 0); - - if (proc == 0) { - CHECK_EQ(clock.size(), 0); - CHECK_EQ(read_clock.size(), 0); - } else { - clock.Reset(&proc->clock_cache); - read_clock.Reset(&proc->clock_cache); - } + Free(clock); + Free(read_clock); } MetaMap::MetaMap() - : block_alloc_(LINKER_INITIALIZED, "heap block allocator"), - sync_alloc_(LINKER_INITIALIZED, "sync allocator") { - atomic_store(&uid_gen_, 0, memory_order_relaxed); -} + : block_alloc_("heap block allocator"), sync_alloc_("sync allocator") {} void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) { u32 idx = block_alloc_.Alloc(&thr->proc()->block_cache); @@ -68,16 +56,16 @@ void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) { *meta = idx | kFlagBlock; } -uptr MetaMap::FreeBlock(Processor *proc, uptr p) { +uptr MetaMap::FreeBlock(Processor *proc, uptr p, bool reset) { MBlock* b = GetBlock(p); if (b == 0) return 0; uptr sz = RoundUpTo(b->siz, kMetaShadowCell); - FreeRange(proc, p, sz); + FreeRange(proc, p, sz, reset); return sz; } -bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) { +bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz, bool reset) { bool has_something = false; u32 *meta = MemToMeta(p); u32 *end = MemToMeta(p + sz); @@ -99,7 +87,8 @@ bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) { DCHECK(idx & kFlagSync); SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask); u32 next = s->next; - s->Reset(proc); + if (reset) + s->Reset(); sync_alloc_.Free(&proc->sync_cache, idx & ~kFlagMask); idx = next; } else { @@ -116,30 +105,30 @@ bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) { // which can be huge. The function probes pages one-by-one until it finds a page // without meta objects, at this point it stops freeing meta objects. Because // thread stacks grow top-down, we do the same starting from end as well. -void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) { +void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz, bool reset) { if (SANITIZER_GO) { // UnmapOrDie/MmapFixedNoReserve does not work on Windows, // so we do the optimization only for C/C++. 
- FreeRange(proc, p, sz); + FreeRange(proc, p, sz, reset); return; } const uptr kMetaRatio = kMetaShadowCell / kMetaShadowSize; const uptr kPageSize = GetPageSizeCached() * kMetaRatio; if (sz <= 4 * kPageSize) { // If the range is small, just do the normal free procedure. - FreeRange(proc, p, sz); + FreeRange(proc, p, sz, reset); return; } // First, round both ends of the range to page size. uptr diff = RoundUp(p, kPageSize) - p; if (diff != 0) { - FreeRange(proc, p, diff); + FreeRange(proc, p, diff, reset); p += diff; sz -= diff; } diff = p + sz - RoundDown(p + sz, kPageSize); if (diff != 0) { - FreeRange(proc, p + sz - diff, diff); + FreeRange(proc, p + sz - diff, diff, reset); sz -= diff; } // Now we must have a non-empty page-aligned range. @@ -150,7 +139,7 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) { const uptr sz0 = sz; // Probe start of the range. for (uptr checked = 0; sz > 0; checked += kPageSize) { - bool has_something = FreeRange(proc, p, kPageSize); + bool has_something = FreeRange(proc, p, kPageSize, reset); p += kPageSize; sz -= kPageSize; if (!has_something && checked > (128 << 10)) @@ -158,7 +147,7 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) { } // Probe end of the range. for (uptr checked = 0; sz > 0; checked += kPageSize) { - bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize); + bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize, reset); sz -= kPageSize; // Stacks grow down, so sync object are most likely at the end of the region // (if it is a stack). The very end of the stack is TLS and tsan increases @@ -177,6 +166,27 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) { Die(); } +void MetaMap::ResetClocks() { + // This can be called from the background thread + // which does not have proc/cache. + // The cache is too large for stack. 
+ static InternalAllocatorCache cache; + internal_memset(&cache, 0, sizeof(cache)); + internal_allocator()->InitCache(&cache); + sync_alloc_.ForEach([&](SyncVar *s) { + if (s->clock) { + InternalFree(s->clock, &cache); + s->clock = nullptr; + } + if (s->read_clock) { + InternalFree(s->read_clock, &cache); + s->read_clock = nullptr; + } + s->last_lock.Reset(); + }); + internal_allocator()->DestroyCache(&cache); +} + MBlock* MetaMap::GetBlock(uptr p) { u32 *meta = MemToMeta(p); u32 idx = *meta; @@ -193,6 +203,7 @@ MBlock* MetaMap::GetBlock(uptr p) { SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create, bool save_stack) { + DCHECK(!create || thr->slot_locked); u32 *meta = MemToMeta(addr); u32 idx0 = *meta; u32 myidx = 0; @@ -203,7 +214,7 @@ SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create, SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask); if (LIKELY(s->addr == addr)) { if (UNLIKELY(myidx != 0)) { - mys->Reset(thr->proc()); + mys->Reset(); sync_alloc_.Free(&thr->proc()->sync_cache, myidx); } return s; @@ -218,10 +229,9 @@ SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create, } if (LIKELY(myidx == 0)) { - const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed); myidx = sync_alloc_.Alloc(&thr->proc()->sync_cache); mys = sync_alloc_.Map(myidx); - mys->Init(thr, pc, addr, uid, save_stack); + mys->Init(thr, pc, addr, save_stack); } mys->next = idx0; if (atomic_compare_exchange_strong((atomic_uint32_t*)meta, &idx0, diff --git a/libsanitizer/tsan/tsan_sync.h b/libsanitizer/tsan/tsan_sync.h index fc8fa28..67d3c0b 100644 --- a/libsanitizer/tsan/tsan_sync.h +++ b/libsanitizer/tsan/tsan_sync.h @@ -16,8 +16,9 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_deadlock_detector_interface.h" #include "tsan_defs.h" -#include "tsan_clock.h" #include "tsan_dense_alloc.h" +#include "tsan_shadow.h" +#include "tsan_vector_clock.h" namespace __tsan { @@ -53,34 +54,18 @@ struct SyncVar { uptr addr; // overwritten by DenseSlabAlloc freelist Mutex mtx; - u64 uid; // Globally unique id. StackID creation_stack_id; Tid owner_tid; // Set only by exclusive owners. - u64 last_lock; + FastState last_lock; int recursion; atomic_uint32_t flags; u32 next; // in MetaMap DDMutex dd; - SyncClock read_clock; // Used for rw mutexes only. - // The clock is placed last, so that it is situated on a different cache line - // with the mtx. This reduces contention for hot sync objects. - SyncClock clock; + VectorClock *read_clock; // Used for rw mutexes only. + VectorClock *clock; - void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid, bool save_stack); - void Reset(Processor *proc); - - u64 GetId() const { - // 48 lsb is addr, then 14 bits is low part of uid, then 2 zero bits. 
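SyncVar now holds VectorClock pointers that are released into on demand and simply freed by Reset(), instead of the old inline SyncClock members. Assuming the clocks are allocated lazily on first release (not fully visible in this hunk), a rough sketch of the pattern with a stand-in MiniClock type:

// Illustrative sketch only: "allocate the clock lazily, free it on Reset".
// MiniClock is a stand-in; the real VectorClock is different.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct MiniClock { uint64_t c[4] = {}; };

struct SyncVarModel {
  MiniClock *clock = nullptr;       // written on Release/ReleaseStore
  MiniClock *read_clock = nullptr;  // rw-mutexes only

  void ReleaseStore(const MiniClock &thr_clock) {
    if (!clock) clock = new MiniClock();  // allocated only when first needed
    *clock = thr_clock;
  }
  void Acquire(MiniClock *thr_clock) const {
    if (!clock) return;                   // nothing released yet
    for (int i = 0; i < 4; i++)
      thr_clock->c[i] = std::max(thr_clock->c[i], clock->c[i]);
  }
  void Reset() {                          // on destroy/reuse: drop the clocks
    delete clock;       clock = nullptr;
    delete read_clock;  read_clock = nullptr;
  }
};

int main() {
  SyncVarModel s;
  MiniClock t0{{7, 0, 0, 0}}, t1{};
  s.ReleaseStore(t0);
  s.Acquire(&t1);
  std::printf("%llu\n", (unsigned long long)t1.c[0]);  // 7
  s.Reset();
}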
- return GetLsb((u64)addr | (uid << 48), 60); - } - bool CheckId(u64 uid) const { - CHECK_EQ(uid, GetLsb(uid, 14)); - return GetLsb(this->uid, 14) == uid; - } - static uptr SplitId(u64 id, u64 *uid) { - *uid = id >> 48; - return (uptr)GetLsb(id, 48); - } + void Init(ThreadState *thr, uptr pc, uptr addr, bool save_stack); + void Reset(); bool IsFlagSet(u32 f) const { return atomic_load_relaxed(&flags) & f; @@ -110,9 +95,20 @@ class MetaMap { MetaMap(); void AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz); - uptr FreeBlock(Processor *proc, uptr p); - bool FreeRange(Processor *proc, uptr p, uptr sz); - void ResetRange(Processor *proc, uptr p, uptr sz); + + // FreeBlock resets all sync objects in the range if reset=true and must not + // run concurrently with ResetClocks which resets all sync objects + // w/o any synchronization (as part of DoReset). + // If we don't have a thread slot (very early/late in thread lifetime or + // Go/Java callbacks) or the slot is not locked, then reset must be set to + // false. In such case sync object clocks will be reset later (when it's + // reused or during the next ResetClocks). + uptr FreeBlock(Processor *proc, uptr p, bool reset); + bool FreeRange(Processor *proc, uptr p, uptr sz, bool reset); + void ResetRange(Processor *proc, uptr p, uptr sz, bool reset); + // Reset vector clocks of all sync objects. + // Must be called when no other threads access sync objects. + void ResetClocks(); MBlock* GetBlock(uptr p); SyncVar *GetSyncOrCreate(ThreadState *thr, uptr pc, uptr addr, @@ -142,7 +138,6 @@ class MetaMap { typedef DenseSlabAlloc SyncAlloc; BlockAlloc block_alloc_; SyncAlloc sync_alloc_; - atomic_uint64_t uid_gen_; SyncVar *GetSync(ThreadState *thr, uptr pc, uptr addr, bool create, bool save_stack); diff --git a/libsanitizer/tsan/tsan_trace.h b/libsanitizer/tsan/tsan_trace.h index a771ad9..01bb7b3 100644 --- a/libsanitizer/tsan/tsan_trace.h +++ b/libsanitizer/tsan/tsan_trace.h @@ -19,57 +19,6 @@ namespace __tsan { -const int kTracePartSizeBits = 13; -const int kTracePartSize = 1 << kTracePartSizeBits; -const int kTraceParts = 2 * 1024 * 1024 / kTracePartSize; -const int kTraceSize = kTracePartSize * kTraceParts; - -// Must fit into 3 bits. -enum EventType { - EventTypeMop, - EventTypeFuncEnter, - EventTypeFuncExit, - EventTypeLock, - EventTypeUnlock, - EventTypeRLock, - EventTypeRUnlock -}; - -// Represents a thread event (from most significant bit): -// u64 typ : 3; // EventType. -// u64 addr : 61; // Associated pc. -typedef u64 Event; - -const uptr kEventPCBits = 61; - -struct TraceHeader { -#if !SANITIZER_GO - BufferedStackTrace stack0; // Start stack for the trace. -#else - VarSizeStackTrace stack0; -#endif - u64 epoch0; // Start epoch for the trace. - MutexSet mset0; - - TraceHeader() : stack0(), epoch0() {} -}; - -struct Trace { - Mutex mtx; -#if !SANITIZER_GO - // Must be last to catch overflow as paging fault. - // Go shadow stack is dynamically allocated. - uptr shadow_stack[kShadowStackSize]; -#endif - // Must be the last field, because we unmap the unused part in - // CreateThreadContext. - TraceHeader headers[kTraceParts]; - - Trace() : mtx(MutexTypeTrace) {} -}; - -namespace v3 { - enum class EventType : u64 { kAccessExt, kAccessRange, @@ -99,6 +48,8 @@ static constexpr Event NopEvent = {1, 0, EventType::kAccessExt, 0}; // close enough to each other. Otherwise we fall back to EventAccessExt. 
struct EventAccess { static constexpr uptr kPCBits = 15; + static_assert(kPCBits + kCompressedAddrBits + 5 == 64, + "unused bits in EventAccess"); u64 is_access : 1; // = 1 u64 is_read : 1; @@ -119,13 +70,23 @@ static_assert(sizeof(EventFunc) == 8, "bad EventFunc size"); // Extended memory access with full PC. struct EventAccessExt { + // Note: precisely specifying the unused parts of the bitfield is critical for + // performance. If we don't specify them, compiler will generate code to load + // the old value and shuffle it to extract the unused bits to apply to the new + // value. If we specify the unused part and store 0 in there, all that + // unnecessary code goes away (store of the 0 const is combined with other + // constant parts). + static constexpr uptr kUnusedBits = 11; + static_assert(kCompressedAddrBits + kUnusedBits + 9 == 64, + "unused bits in EventAccessExt"); + u64 is_access : 1; // = 0 u64 is_func : 1; // = 0 EventType type : 3; // = EventType::kAccessExt u64 is_read : 1; u64 is_atomic : 1; u64 size_log : 2; - u64 _ : 11; + u64 _ : kUnusedBits; u64 addr : kCompressedAddrBits; u64 pc; }; @@ -134,6 +95,8 @@ static_assert(sizeof(EventAccessExt) == 16, "bad EventAccessExt size"); // Access to a memory range. struct EventAccessRange { static constexpr uptr kSizeLoBits = 13; + static_assert(kCompressedAddrBits + kSizeLoBits + 7 == 64, + "unused bits in EventAccessRange"); u64 is_access : 1; // = 0 u64 is_func : 1; // = 0 @@ -150,6 +113,13 @@ static_assert(sizeof(EventAccessRange) == 16, "bad EventAccessRange size"); // Mutex lock. struct EventLock { static constexpr uptr kStackIDLoBits = 15; + static constexpr uptr kStackIDHiBits = + sizeof(StackID) * kByteBits - kStackIDLoBits; + static constexpr uptr kUnusedBits = 3; + static_assert(kCompressedAddrBits + kStackIDLoBits + 5 == 64, + "unused bits in EventLock"); + static_assert(kCompressedAddrBits + kStackIDHiBits + kUnusedBits == 64, + "unused bits in EventLock"); u64 is_access : 1; // = 0 u64 is_func : 1; // = 0 @@ -157,29 +127,37 @@ struct EventLock { u64 pc : kCompressedAddrBits; u64 stack_lo : kStackIDLoBits; u64 stack_hi : sizeof(StackID) * kByteBits - kStackIDLoBits; - u64 _ : 3; + u64 _ : kUnusedBits; u64 addr : kCompressedAddrBits; }; static_assert(sizeof(EventLock) == 16, "bad EventLock size"); // Mutex unlock. struct EventUnlock { + static constexpr uptr kUnusedBits = 15; + static_assert(kCompressedAddrBits + kUnusedBits + 5 == 64, + "unused bits in EventUnlock"); + u64 is_access : 1; // = 0 u64 is_func : 1; // = 0 EventType type : 3; // = EventType::kUnlock - u64 _ : 15; + u64 _ : kUnusedBits; u64 addr : kCompressedAddrBits; }; static_assert(sizeof(EventUnlock) == 8, "bad EventUnlock size"); // Time change event. 
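
The static_asserts added to the event structs above all follow one pattern: give the leftover bits of the 64-bit word an explicit kUnusedBits field and prove that the declared widths sum to 64, so that storing a constant 0 into the unused part lets the compiler emit a single constant store instead of a load-and-shuffle, as the EventAccessExt comment explains. A self-contained sketch of the pattern; the widths here are assumptions, not the real tsan layout:

#include <cstdint>
#include <cstdio>
#include <cstring>

using u64 = uint64_t;

struct ExampleEvent {
  static constexpr unsigned kAddrBits = 44;    // assumed compressed-address width
  static constexpr unsigned kUnusedBits = 13;  // must account for every leftover bit
  static_assert(1 + 1 + 3 + 2 + kUnusedBits + kAddrBits == 64,
                "unused bits in ExampleEvent");

  u64 is_access : 1;
  u64 is_read : 1;
  u64 type : 3;
  u64 size_log : 2;
  u64 _ : kUnusedBits;  // named explicitly so it can be written as a constant 0
  u64 addr : kAddrBits;
};
static_assert(sizeof(ExampleEvent) == 8, "bad ExampleEvent size");

int main() {
  ExampleEvent ev = {};
  ev.is_access = 1;
  ev.is_read = 0;
  ev.type = 0;
  ev.size_log = 3;
  ev._ = 0;  // writing the unused bits too keeps the store a single constant
  ev.addr = 0x1234;

  u64 raw;
  std::memcpy(&raw, &ev, sizeof raw);
  std::printf("raw event word: %#llx\n", (unsigned long long)raw);
  return 0;
}
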
struct EventTime { + static constexpr uptr kUnusedBits = 37; + static_assert(kUnusedBits + sizeof(Sid) * kByteBits + kEpochBits + 5 == 64, + "unused bits in EventTime"); + u64 is_access : 1; // = 0 u64 is_func : 1; // = 0 EventType type : 3; // = EventType::kTime u64 sid : sizeof(Sid) * kByteBits; u64 epoch : kEpochBits; - u64 _ : 64 - 5 - sizeof(Sid) * kByteBits - kEpochBits; + u64 _ : kUnusedBits; }; static_assert(sizeof(EventTime) == 8, "bad EventTime size"); @@ -188,10 +166,12 @@ struct Trace; struct TraceHeader { Trace* trace = nullptr; // back-pointer to Trace containing this part INode trace_parts; // in Trace::parts + INode global; // in Contex::trace_part_recycle }; struct TracePart : TraceHeader { - static constexpr uptr kByteSize = 256 << 10; + // There are a lot of goroutines in Go, so we use smaller parts. + static constexpr uptr kByteSize = (SANITIZER_GO ? 128 : 256) << 10; static constexpr uptr kSize = (kByteSize - sizeof(TraceHeader)) / sizeof(Event); // TraceAcquire does a fast event pointer overflow check by comparing @@ -209,13 +189,26 @@ static_assert(sizeof(TracePart) == TracePart::kByteSize, "bad TracePart size"); struct Trace { Mutex mtx; IList parts; - Event* final_pos = - nullptr; // final position in the last part for finished threads + // First node non-queued into ctx->trace_part_recycle. + TracePart* local_head; + // Final position in the last part for finished threads. + Event* final_pos = nullptr; + // Number of trace parts allocated on behalf of this trace specifically. + // Total number of parts in this trace can be larger if we retake some + // parts from other traces. + uptr parts_allocated = 0; Trace() : mtx(MutexTypeTrace) {} -}; -} // namespace v3 + // We need at least 3 parts per thread, because we want to keep at last + // 2 parts per thread that are not queued into ctx->trace_part_recycle + // (the current one being filled and one full part that ensures that + // we always have at least one part worth of previous memory accesses). + static constexpr uptr kMinParts = 3; + + static constexpr uptr kFinishedThreadLo = 16; + static constexpr uptr kFinishedThreadHi = 64; +}; } // namespace __tsan diff --git a/libsanitizer/tsan/tsan_update_shadow_word.inc b/libsanitizer/tsan/tsan_update_shadow_word.inc deleted file mode 100644 index a58ef0f..0000000 --- a/libsanitizer/tsan/tsan_update_shadow_word.inc +++ /dev/null @@ -1,59 +0,0 @@ -//===-- tsan_update_shadow_word.inc -----------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer (TSan), a race detector. -// -// Body of the hottest inner loop. -// If we wrap this body into a function, compilers (both gcc and clang) -// produce sligtly less efficient code. -//===----------------------------------------------------------------------===// -do { - const unsigned kAccessSize = 1 << kAccessSizeLog; - u64 *sp = &shadow_mem[idx]; - old = LoadShadow(sp); - if (LIKELY(old.IsZero())) { - if (!stored) { - StoreIfNotYetStored(sp, &store_word); - stored = true; - } - break; - } - // is the memory access equal to the previous? - if (LIKELY(Shadow::Addr0AndSizeAreEqual(cur, old))) { - // same thread? 
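
TracePart::kSize above is derived from the part's byte budget: everything after the TraceHeader is an array of 8-byte events, with extended events occupying two consecutive slots. A small worked sketch of that arithmetic; the header size below is an assumption, since sizeof(TraceHeader) depends on the INode/IList types not shown in this hunk:

#include <cstdio>

int main() {
  constexpr unsigned long kByteSize = 256UL << 10;  // 128 << 10 when SANITIZER_GO
  constexpr unsigned long kHeaderBytes = 40;        // assumed sizeof(TraceHeader)
  constexpr unsigned long kEventBytes = 8;          // basic events are 8 bytes
  constexpr unsigned long kSize = (kByteSize - kHeaderBytes) / kEventBytes;
  std::printf("events per 256 KB part: %lu\n", kSize);  // roughly 32K slots
  return 0;
}
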
- if (LIKELY(Shadow::TidsAreEqual(old, cur))) { - if (LIKELY(old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic))) { - StoreIfNotYetStored(sp, &store_word); - stored = true; - } - break; - } - if (HappensBefore(old, thr)) { - if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic)) { - StoreIfNotYetStored(sp, &store_word); - stored = true; - } - break; - } - if (LIKELY(old.IsBothReadsOrAtomic(kAccessIsWrite, kIsAtomic))) - break; - goto RACE; - } - // Do the memory access intersect? - if (Shadow::TwoRangesIntersect(old, cur, kAccessSize)) { - if (Shadow::TidsAreEqual(old, cur)) - break; - if (old.IsBothReadsOrAtomic(kAccessIsWrite, kIsAtomic)) - break; - if (LIKELY(HappensBefore(old, thr))) - break; - goto RACE; - } - // The accesses do not intersect. - break; -} while (0); diff --git a/libsanitizer/ubsan/ubsan_diag.cpp b/libsanitizer/ubsan/ubsan_diag.cpp index 8de51bc..3673e66 100644 --- a/libsanitizer/ubsan/ubsan_diag.cpp +++ b/libsanitizer/ubsan/ubsan_diag.cpp @@ -32,15 +32,13 @@ using namespace __ubsan; // Windows. // TODO(yln): This is a temporary workaround. GetStackTrace functions will be // removed in the future. -void ubsan_GetStackTrace(BufferedStackTrace *stack, uptr max_depth, - uptr pc, uptr bp, void *context, bool fast) { +void ubsan_GetStackTrace(BufferedStackTrace *stack, uptr max_depth, uptr pc, + uptr bp, void *context, bool request_fast) { uptr top = 0; uptr bottom = 0; - if (StackTrace::WillUseFastUnwind(fast)) { - GetThreadStackTopAndBottom(false, &top, &bottom); - stack->Unwind(max_depth, pc, bp, nullptr, top, bottom, true); - } else - stack->Unwind(max_depth, pc, bp, context, 0, 0, false); + GetThreadStackTopAndBottom(false, &top, &bottom); + bool fast = StackTrace::WillUseFastUnwind(request_fast); + stack->Unwind(max_depth, pc, bp, context, top, bottom, fast); } static void MaybePrintStackTrace(uptr pc, uptr bp) { diff --git a/libsanitizer/ubsan/ubsan_flags.cpp b/libsanitizer/ubsan/ubsan_flags.cpp index 9a66bd3..25cefd4 100644 --- a/libsanitizer/ubsan/ubsan_flags.cpp +++ b/libsanitizer/ubsan/ubsan_flags.cpp @@ -50,7 +50,6 @@ void InitializeFlags() { { CommonFlags cf; cf.CopyFrom(*common_flags()); - cf.print_summary = false; cf.external_symbolizer_path = GetFlag("UBSAN_SYMBOLIZER_PATH"); OverrideCommonFlags(cf); } diff --git a/libsanitizer/ubsan/ubsan_handlers.cpp b/libsanitizer/ubsan/ubsan_handlers.cpp index 2184625..e201e6b 100644 --- a/libsanitizer/ubsan/ubsan_handlers.cpp +++ b/libsanitizer/ubsan/ubsan_handlers.cpp @@ -894,21 +894,6 @@ void __ubsan_handle_cfi_bad_type(CFICheckFailData *Data, ValueHandle Vtable, } // namespace __ubsan -void __ubsan::__ubsan_handle_cfi_bad_icall(CFIBadIcallData *CallData, - ValueHandle Function) { - GET_REPORT_OPTIONS(false); - CFICheckFailData Data = {CFITCK_ICall, CallData->Loc, CallData->Type}; - handleCFIBadIcall(&Data, Function, Opts); -} - -void __ubsan::__ubsan_handle_cfi_bad_icall_abort(CFIBadIcallData *CallData, - ValueHandle Function) { - GET_REPORT_OPTIONS(true); - CFICheckFailData Data = {CFITCK_ICall, CallData->Loc, CallData->Type}; - handleCFIBadIcall(&Data, Function, Opts); - Die(); -} - void __ubsan::__ubsan_handle_cfi_check_fail(CFICheckFailData *Data, ValueHandle Value, uptr ValidVtable) { diff --git a/libsanitizer/ubsan/ubsan_handlers.h b/libsanitizer/ubsan/ubsan_handlers.h index 9f41235..219fb15 100644 --- a/libsanitizer/ubsan/ubsan_handlers.h +++ b/libsanitizer/ubsan/ubsan_handlers.h @@ -215,20 +215,12 @@ enum CFITypeCheckKind : unsigned char { CFITCK_VMFCall, }; -struct CFIBadIcallData { - SourceLocation Loc; - 
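
For reference, the deleted tsan_update_shadow_word.inc above boiled down to a fixed decision ladder per shadow slot, now superseded by the trace and vector-clock machinery introduced elsewhere in this merge. An editorial summary of that ladder, with plain booleans standing in for the removed Shadow predicates:

#include <cstdio>

enum Outcome { kStore, kSkip, kReportRace };

// Each boolean mirrors one predicate called in the removed inner loop.
static Outcome CheckOneSlot(bool slot_empty, bool same_cell_and_size,
                            bool same_thread, bool happens_before,
                            bool old_is_weaker_or_equal,
                            bool both_reads_or_atomic, bool ranges_intersect) {
  if (slot_empty)                       // old.IsZero()
    return kStore;
  if (same_cell_and_size) {             // Shadow::Addr0AndSizeAreEqual
    if (same_thread || happens_before)  // TidsAreEqual / HappensBefore
      return old_is_weaker_or_equal ? kStore : kSkip;  // IsRWWeakerOrEqual
    if (both_reads_or_atomic)           // IsBothReadsOrAtomic
      return kSkip;
    return kReportRace;
  }
  if (ranges_intersect) {               // Shadow::TwoRangesIntersect
    if (same_thread || both_reads_or_atomic || happens_before)
      return kSkip;
    return kReportRace;
  }
  return kSkip;                         // disjoint accesses
}

int main() {
  // Two unordered writes from different threads to the same cell.
  Outcome o = CheckOneSlot(false, true, false, false, false, false, true);
  std::printf("outcome: %s\n", o == kReportRace ? "race" : "no race");
  return 0;
}
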
const TypeDescriptor &Type; -}; - struct CFICheckFailData { CFITypeCheckKind CheckKind; SourceLocation Loc; const TypeDescriptor &Type; }; -/// \brief Handle control flow integrity failure for indirect function calls. -RECOVERABLE(cfi_bad_icall, CFIBadIcallData *Data, ValueHandle Function) - /// \brief Handle control flow integrity failures. RECOVERABLE(cfi_check_fail, CFICheckFailData *Data, ValueHandle Function, uptr VtableIsValid) diff --git a/libsanitizer/ubsan/ubsan_handlers_cxx.h b/libsanitizer/ubsan/ubsan_handlers_cxx.h index f7b9fc5..fd534c2 100644 --- a/libsanitizer/ubsan/ubsan_handlers_cxx.h +++ b/libsanitizer/ubsan/ubsan_handlers_cxx.h @@ -51,4 +51,4 @@ __ubsan_handle_function_type_mismatch_v1_abort(FunctionTypeMismatchData *Data, ValueHandle fnRTTI); } -#endif // UBSAN_HANDLERS_H +#endif // UBSAN_HANDLERS_CXX_H diff --git a/libsanitizer/ubsan/ubsan_init.cpp b/libsanitizer/ubsan/ubsan_init.cpp index 9931d85..5802d58 100644 --- a/libsanitizer/ubsan/ubsan_init.cpp +++ b/libsanitizer/ubsan/ubsan_init.cpp @@ -12,13 +12,14 @@ #include "ubsan_platform.h" #if CAN_SANITIZE_UB -#include "ubsan_diag.h" -#include "ubsan_init.h" -#include "ubsan_flags.h" #include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_interface_internal.h" #include "sanitizer_common/sanitizer_libc.h" #include "sanitizer_common/sanitizer_mutex.h" #include "sanitizer_common/sanitizer_symbolizer.h" +#include "ubsan_diag.h" +#include "ubsan_flags.h" +#include "ubsan_init.h" using namespace __ubsan; diff --git a/libsanitizer/ubsan/ubsan_platform.h b/libsanitizer/ubsan/ubsan_platform.h index ad3e883..d2cc2e1 100644 --- a/libsanitizer/ubsan/ubsan_platform.h +++ b/libsanitizer/ubsan/ubsan_platform.h @@ -12,7 +12,6 @@ #ifndef UBSAN_PLATFORM_H #define UBSAN_PLATFORM_H -#ifndef CAN_SANITIZE_UB // Other platforms should be easy to add, and probably work as-is. #if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) || \ defined(__NetBSD__) || defined(__DragonFly__) || \ @@ -22,6 +21,5 @@ #else # define CAN_SANITIZE_UB 0 #endif -#endif //CAN_SANITIZE_UB #endif
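
The final ubsan hunks are header hygiene: the closing guard comment in ubsan_handlers_cxx.h now names the macro the file actually defines, and ubsan_platform.h drops the outer #ifndef so the header itself always defines CAN_SANITIZE_UB from the platform check. A minimal sketch of the resulting pattern, with made-up names:

// example_platform.h (hypothetical header, mirroring the pattern above)
#ifndef EXAMPLE_PLATFORM_H
#define EXAMPLE_PLATFORM_H

// The platform gate is evaluated unconditionally; this header is the single
// place that decides the macro's value.
#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
# define EXAMPLE_CAN_SANITIZE 1
#else
# define EXAMPLE_CAN_SANITIZE 0
#endif

#endif  // EXAMPLE_PLATFORM_H
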