From afdcc96d9f0a3325a70d8433cd103efdb56340a9 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov
Date: Fri, 30 May 2014 13:36:29 +0000
Subject: [PATCH] tsan: optimize memory access functions

The optimization is two-fold:
First, the algorithm now uses SSE instructions to handle all 4 shadow
slots at once. This makes processing faster.
Second, if the shadow already contains the same access, we do not store
the event into the trace. This increases the effective trace size, that
is, tsan can remember up to 10x more previous memory accesses.

Performance impact:
Before:
[       OK ] DISABLED_BENCH.Mop8Read (2461 ms)
[       OK ] DISABLED_BENCH.Mop8Write (1836 ms)
After:
[       OK ] DISABLED_BENCH.Mop8Read (1204 ms)
[       OK ] DISABLED_BENCH.Mop8Write (976 ms)
But this measures only the fast path.
On large real applications the speedup is ~20%.

Trace size impact:
On app1:
 Memory accesses                   : 1163265870
  Including same                   : 791312905 (68%)
On app2:
 Memory accesses                   : 166875345
  Including same                   : 150449689 (90%)
Filtering out 90% of events means that the trace effectively covers 10x
more memory accesses.

llvm-svn: 209897
---
 .../tests/sanitizer_deadlock_detector_test.cc      |   5 +
 compiler-rt/lib/tsan/check_analyze.sh              |  20 ++-
 compiler-rt/lib/tsan/rtl/Makefile.old              |   2 +-
 compiler-rt/lib/tsan/rtl/tsan_defs.h               |   1 +
 compiler-rt/lib/tsan/rtl/tsan_rtl.cc               | 152 +++++++++++++++++++--
 compiler-rt/lib/tsan/rtl/tsan_rtl.h                |  85 +++++-------
 compiler-rt/lib/tsan/rtl/tsan_stat.cc              |   1 +
 compiler-rt/lib/tsan/rtl/tsan_stat.h               |   1 +
 .../lib/tsan/rtl/tsan_update_shadow_word_inl.h     |  13 +-
 compiler-rt/lib/tsan/tests/unit/tsan_mman_test.cc  |   5 +
 10 files changed, 202 insertions(+), 83 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_deadlock_detector_test.cc b/compiler-rt/lib/sanitizer_common/tests/sanitizer_deadlock_detector_test.cc
index ac19dcf..8c83633 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_deadlock_detector_test.cc
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_deadlock_detector_test.cc
@@ -268,9 +268,14 @@ void RunMultipleEpochsTest() {
   }
   EXPECT_EQ(d.testOnlyGetEpoch(), 4 * d.size());
 
+#if TSAN_DEBUG == 0
+  // EXPECT_DEATH clones a thread with 4K stack,
+  // which is overflown by tsan memory accesses functions in debug mode.
+
   // Can not handle the locks from the previous epoch.
   // The caller should update the lock id.
   EXPECT_DEATH(d.onLock(&dtls, l0), "CHECK failed.*current_epoch_");
+#endif
 }
 
 TEST(DeadlockDetector, MultipleEpochsTest) {
diff --git a/compiler-rt/lib/tsan/check_analyze.sh b/compiler-rt/lib/tsan/check_analyze.sh
index 39d570b..08bfc7a 100755
--- a/compiler-rt/lib/tsan/check_analyze.sh
+++ b/compiler-rt/lib/tsan/check_analyze.sh
@@ -8,11 +8,11 @@ PrintRes() {
 
 PrintRes
 
-mops="write1 \
+wmops="write1 \
       write2 \
      write4 \
-      write8 \
-      read1 \
+      write8"
+rmops="read1 \
       read2 \
       read4 \
       read8"
@@ -27,10 +27,16 @@ check() {
   fi
 }
 
-for f in $mops; do
-  check $f rsp 1   # To read caller pc.
-  check $f push 0
-  check $f pop 0
+for f in $wmops; do
+  check $f rsp 3
+  check $f push 1
+  check $f pop 5
+done
+
+for f in $rmops; do
+  check $f rsp 3
+  check $f push 1
+  check $f pop 4
 done
 
 for f in $func; do
diff --git a/compiler-rt/lib/tsan/rtl/Makefile.old b/compiler-rt/lib/tsan/rtl/Makefile.old
index 2a71869..79c761c 100644
--- a/compiler-rt/lib/tsan/rtl/Makefile.old
+++ b/compiler-rt/lib/tsan/rtl/Makefile.old
@@ -1,4 +1,4 @@
-CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
+CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -msse3 -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
 CLANG=clang
 ifeq ($(DEBUG), 0)
   CXXFLAGS += -O3
diff --git a/compiler-rt/lib/tsan/rtl/tsan_defs.h b/compiler-rt/lib/tsan/rtl/tsan_defs.h
index 35f4e94..969d09f 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_defs.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_defs.h
@@ -54,6 +54,7 @@ const uptr kShadowCnt = TSAN_SHADOW_COUNT;
 # endif
 #else
 // Count of shadow values in a shadow cell.
+#define TSAN_SHADOW_COUNT 4
 const uptr kShadowCnt = 4;
 #endif
 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cc b/compiler-rt/lib/tsan/rtl/tsan_rtl.cc
index 7faf61a..310a069 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cc
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cc
@@ -25,6 +25,16 @@
 #include "tsan_suppressions.h"
 #include "tsan_symbolize.h"
 
+#ifdef __SSE3__
+// <emmintrin.h> transitively includes <stdlib.h>,
+// and it's prohibited to include std headers into tsan runtime.
+// So we do this dirty trick.
+#define _MM_MALLOC_H_INCLUDED
+#define __MM_MALLOC_H
+#include <emmintrin.h>
+typedef __m128i m128;
+#endif
+
 volatile int __tsan_resumed = 0;
 
 extern "C" void __tsan_resume() {
@@ -471,7 +481,8 @@ void StoreIfNotYetStored(u64 *sp, u64 *s) {
   *s = 0;
 }
 
-static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
+ALWAYS_INLINE
+void HandleRace(ThreadState *thr, u64 *shadow_mem,
                               Shadow cur, Shadow old) {
   thr->racy_state[0] = cur.raw();
   thr->racy_state[1] = old.raw();
@@ -483,16 +494,12 @@ static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
 #endif
 }
 
-static inline bool OldIsInSameSynchEpoch(Shadow old, ThreadState *thr) {
-  return old.epoch() >= thr->fast_synch_epoch;
-}
-
 static inline bool HappensBefore(Shadow old, ThreadState *thr) {
   return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
 }
 
-ALWAYS_INLINE USED
-void MemoryAccessImpl(ThreadState *thr, uptr addr,
+ALWAYS_INLINE
+void MemoryAccessImpl1(ThreadState *thr, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
     u64 *shadow_mem, Shadow cur) {
   StatInc(thr, StatMop);
@@ -586,6 +593,90 @@ void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr,
   }
 }
 
+ALWAYS_INLINE
+bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+  Shadow cur(a);
+  for (uptr i = 0; i < kShadowCnt; i++) {
+    Shadow old(LoadShadow(&s[i]));
+    if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
+        old.TidWithIgnore() == cur.TidWithIgnore() &&
+        old.epoch() > sync_epoch &&
+        old.IsAtomic() == cur.IsAtomic() &&
+        old.IsRead() <= cur.IsRead())
+      return true;
+  }
+  return false;
+}
+
+#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
+#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
+    _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
+    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
+ALWAYS_INLINE
+bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+  // This is an optimized version of ContainsSameAccessSlow.
+  // load current access into access[0:63]
+  const m128 access = _mm_cvtsi64_si128(a);
+  // duplicate high part of access in addr0:
+  // addr0[0:31]        = access[32:63]
+  // addr0[32:63]       = access[32:63]
+  // addr0[64:95]       = access[32:63]
+  // addr0[96:127]      = access[32:63]
+  const m128 addr0 = SHUF(access, access, 1, 1, 1, 1);
+  // load 4 shadow slots
+  const m128 shadow0 = _mm_load_si128((__m128i*)s);
+  const m128 shadow1 = _mm_load_si128((__m128i*)s + 1);
+  // load high parts of 4 shadow slots into addr_vect:
+  // addr_vect[0:31]    = shadow0[32:63]
+  // addr_vect[32:63]   = shadow0[96:127]
+  // addr_vect[64:95]   = shadow1[32:63]
+  // addr_vect[96:127]  = shadow1[96:127]
+  m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);
+  if (!is_write) {
+    // set IsRead bit in addr_vect
+    const m128 rw_mask1 = _mm_cvtsi64_si128(1<<15);
+    const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
+    addr_vect = _mm_or_si128(addr_vect, rw_mask);
+  }
+  // addr0 == addr_vect?
+  const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect);
+  // epoch1[0:63]       = sync_epoch
+  const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch);
+  // epoch[0:31]        = sync_epoch[0:31]
+  // epoch[32:63]       = sync_epoch[0:31]
+  // epoch[64:95]       = sync_epoch[0:31]
+  // epoch[96:127]      = sync_epoch[0:31]
+  const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);
+  // load low parts of shadow cell epochs into epoch_vect:
+  // epoch_vect[0:31]   = shadow0[0:31]
+  // epoch_vect[32:63]  = shadow0[64:95]
+  // epoch_vect[64:95]  = shadow1[0:31]
+  // epoch_vect[96:127] = shadow1[64:95]
+  const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
+  // epoch_vect >= sync_epoch?
+  const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch);
+  // addr_res & epoch_res
+  const m128 res = _mm_and_si128(addr_res, epoch_res);
+  // mask[0] = res[7]
+  // mask[1] = res[15]
+  // ...
+  // mask[15] = res[127]
+  const int mask = _mm_movemask_epi8(res);
+  return mask != 0;
+}
+#endif
+
+ALWAYS_INLINE
+bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
+  bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
+  DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
+  return res;
+#else
+  return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
+#endif
+}
+
 ALWAYS_INLINE USED
 void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) {
@@ -618,14 +709,12 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
   }
 
   FastState fast_state = thr->fast_state;
-  if (fast_state.GetIgnoreBit())
+  if (fast_state.GetIgnoreBit()) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopIgnored);
     return;
-  if (kCollectHistory) {
-    fast_state.IncrementEpoch();
-    thr->fast_state = fast_state;
-    // We must not store to the trace if we do not store to the shadow.
-    // That is, this call must be moved somewhere below.
-    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
   }
 
   Shadow cur(fast_state);
@@ -633,7 +722,40 @@ void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
   cur.SetWrite(kAccessIsWrite);
   cur.SetAtomic(kIsAtomic);
 
-  MemoryAccessImpl(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
+      thr->fast_synch_epoch, kAccessIsWrite))) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopSame);
+    return;
+  }
+
+  if (kCollectHistory) {
+    fast_state.IncrementEpoch();
+    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
+    thr->fast_state = fast_state;
+    cur.IncrementEpoch();
+  }
+
+  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+      shadow_mem, cur);
+}
+
+// Called by MemoryAccessRange in tsan_rtl_thread.cc
+void MemoryAccessImpl(ThreadState *thr, uptr addr,
+    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
+    u64 *shadow_mem, Shadow cur) {
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
+      thr->fast_synch_epoch, kAccessIsWrite))) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopSame);
+    return;
+  }
+
+  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
       shadow_mem, cur);
 }
 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index 9a44a04..1b590b8 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -78,14 +78,14 @@ const u64 kShadowRodata = (u64)-1;  // .rodata shadow marker
 // FastState (from most significant bit):
 //   ignore          : 1
 //   tid             : kTidBits
-//   epoch           : kClkBits
 //   unused          : -
 //   history_size    : 3
+//   epoch           : kClkBits
 class FastState {
  public:
   FastState(u64 tid, u64 epoch) {
     x_ = tid << kTidShift;
-    x_ |= epoch << kClkShift;
+    x_ |= epoch;
     DCHECK_EQ(tid, this->tid());
     DCHECK_EQ(epoch, this->epoch());
     DCHECK_EQ(GetIgnoreBit(), false);
@@ -110,13 +110,13 @@ class FastState {
   }
 
   u64 epoch() const {
-    u64 res = (x_ << (kTidBits + 1)) >> (64 - kClkBits);
+    u64 res = x_ & ((1ull << kClkBits) - 1);
     return res;
   }
 
   void IncrementEpoch() {
     u64 old_epoch = epoch();
-    x_ += 1 << kClkShift;
+    x_ += 1;
     DCHECK_EQ(old_epoch + 1, epoch());
     (void)old_epoch;
   }
@@ -128,17 +128,19 @@ class FastState {
   void SetHistorySize(int hs) {
     CHECK_GE(hs, 0);
     CHECK_LE(hs, 7);
-    x_ = (x_ & ~7) | hs;
+    x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift);
   }
 
+  ALWAYS_INLINE
   int GetHistorySize() const {
-    return (int)(x_ & 7);
+    return (int)((x_ >> kHistoryShift) & kHistoryMask);
   }
 
   void ClearHistorySize() {
-    x_ &= ~7;
+    SetHistorySize(0);
   }
 
+  ALWAYS_INLINE
   u64 GetTracePos() const {
     const int hs = GetHistorySize();
     // When hs == 0, the trace consists of 2 parts.
@@ -149,20 +151,21 @@ class FastState {
  private:
   friend class Shadow;
   static const int kTidShift = 64 - kTidBits - 1;
-  static const int kClkShift = kTidShift - kClkBits;
   static const u64 kIgnoreBit = 1ull << 63;
   static const u64 kFreedBit = 1ull << 63;
+  static const u64 kHistoryShift = kClkBits;
+  static const u64 kHistoryMask = 7;
   u64 x_;
 };
 
 // Shadow (from most significant bit):
 //   freed           : 1
 //   tid             : kTidBits
-//   epoch           : kClkBits
 //   is_atomic       : 1
 //   is_read         : 1
 //   size_log        : 2
 //   addr0           : 3
+//   epoch           : kClkBits
 class Shadow : public FastState {
  public:
   explicit Shadow(u64 x)
@@ -175,10 +178,10 @@ class Shadow : public FastState {
   }
 
   void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) {
-    DCHECK_EQ(x_ & 31, 0);
+    DCHECK_EQ((x_ >> kClkBits) & 31, 0);
     DCHECK_LE(addr0, 7);
     DCHECK_LE(kAccessSizeLog, 3);
-    x_ |= (kAccessSizeLog << 3) | addr0;
+    x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits;
     DCHECK_EQ(kAccessSizeLog, size_log());
     DCHECK_EQ(addr0, this->addr0());
   }
@@ -211,47 +214,34 @@ class Shadow : public FastState {
     return shifted_xor == 0;
   }
 
-  static inline bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) {
-    u64 masked_xor = (s1.x_ ^ s2.x_) & 31;
+  static ALWAYS_INLINE
+  bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) {
+    u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31;
     return masked_xor == 0;
   }
 
-  static inline bool TwoRangesIntersect(Shadow s1, Shadow s2,
+  static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2,
       unsigned kS2AccessSize) {
     bool res = false;
     u64 diff = s1.addr0() - s2.addr0();
     if ((s64)diff < 0) {  // s1.addr0 < s2.addr0  // NOLINT
       // if (s1.addr0() + size1) > s2.addr0()) return true;
-      if (s1.size() > -diff) res = true;
+      if (s1.size() > -diff)
+        res = true;
     } else {
       // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true;
-      if (kS2AccessSize > diff) res = true;
+      if (kS2AccessSize > diff)
+        res = true;
     }
-    DCHECK_EQ(res, TwoRangesIntersectSLOW(s1, s2));
-    DCHECK_EQ(res, TwoRangesIntersectSLOW(s2, s1));
+    DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2));
+    DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1));
     return res;
   }
 
-  // The idea behind the offset is as follows.
-  // Consider that we have 8 bool's contained within a single 8-byte block
-  // (mapped to a single shadow "cell"). Now consider that we write to the bools
-  // from a single thread (which we consider the common case).
-  // W/o offsetting each access will have to scan 4 shadow values at average
-  // to find the corresponding shadow value for the bool.
-  // With offsetting we start scanning shadow with the offset so that
-  // each access hits necessary shadow straight off (at least in an expected
-  // optimistic case).
-  // This logic works seamlessly for any layout of user data. For example,
-  // if user data is {int, short, char, char}, then accesses to the int are
-  // offsetted to 0, short - 4, 1st char - 6, 2nd char - 7. Hopefully, accesses
-  // from a single thread won't need to scan all 8 shadow values.
-  unsigned ComputeSearchOffset() {
-    return x_ & 7;
-  }
-  u64 addr0() const { return x_ & 7; }
-  u64 size() const { return 1ull << size_log(); }
-  bool IsWrite() const { return !IsRead(); }
-  bool IsRead() const { return x_ & kReadBit; }
+  u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; }
+  u64 ALWAYS_INLINE size() const { return 1ull << size_log(); }
+  bool ALWAYS_INLINE IsWrite() const { return !IsRead(); }
+  bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; }
 
   // The idea behind the freed bit is as follows.
   // When the memory is freed (or otherwise unaccessible) we write to the shadow
@@ -276,15 +266,14 @@ class Shadow : public FastState {
     return res;
   }
 
-  bool IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
-    // analyzes 5-th bit (is_read) and 6-th bit (is_atomic)
-    bool v = x_ & u64(((kIsWrite ^ 1) << kReadShift)
-        | (kIsAtomic << kAtomicShift));
+  bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
+    bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift)
+        | (u64(kIsAtomic) << kAtomicShift));
     DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic));
     return v;
   }
 
-  bool IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
+  bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
     bool v = ((x_ >> kReadShift) & 3)
         <= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
     DCHECK_EQ(v, (IsAtomic() < kIsAtomic) ||
@@ -292,7 +281,7 @@ class Shadow : public FastState {
     return v;
   }
 
-  bool IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
+  bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
     bool v = ((x_ >> kReadShift) & 3)
         >= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
     DCHECK_EQ(v, (IsAtomic() > kIsAtomic) ||
@@ -301,14 +290,14 @@ class Shadow : public FastState {
   }
 
  private:
-  static const u64 kReadShift   = 5;
+  static const u64 kReadShift   = 5 + kClkBits;
   static const u64 kReadBit     = 1ull << kReadShift;
-  static const u64 kAtomicShift = 6;
+  static const u64 kAtomicShift = 6 + kClkBits;
   static const u64 kAtomicBit   = 1ull << kAtomicShift;
 
-  u64 size_log() const { return (x_ >> 3) & 3; }
+  u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; }
 
-  static bool TwoRangesIntersectSLOW(const Shadow s1, const Shadow s2) {
+  static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) {
     if (s1.addr0() == s2.addr0()) return true;
     if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0())
       return true;
diff --git a/compiler-rt/lib/tsan/rtl/tsan_stat.cc b/compiler-rt/lib/tsan/rtl/tsan_stat.cc
index b42348a..350a2ba 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_stat.cc
+++ b/compiler-rt/lib/tsan/rtl/tsan_stat.cc
@@ -37,6 +37,7 @@ void StatOutput(u64 *stat) {
   name[StatMop4]                         = "            size 4                ";
   name[StatMop8]                         = "            size 8                ";
   name[StatMopSame]                      = "  Including same                  ";
+  name[StatMopIgnored]                   = "  Including ignored               ";
   name[StatMopRange]                     = "  Including range                 ";
   name[StatMopRodata]                    = "  Including .rodata               ";
   name[StatMopRangeRodata]               = "  Including .rodata range         ";
diff --git a/compiler-rt/lib/tsan/rtl/tsan_stat.h b/compiler-rt/lib/tsan/rtl/tsan_stat.h
index 8cdf146..0bd949e 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_stat.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_stat.h
@@ -26,6 +26,7 @@ enum StatType {
   StatMop4,
   StatMop8,
   StatMopSame,
+  StatMopIgnored,
   StatMopRange,
   StatMopRodata,
   StatMopRangeRodata,
diff --git a/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word_inl.h b/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word_inl.h
index a11c9bc..c80e0a8 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word_inl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word_inl.h
@@ -16,8 +16,7 @@
 do {
   StatInc(thr, StatShadowProcessed);
   const unsigned kAccessSize = 1 << kAccessSizeLog;
-  unsigned off = cur.ComputeSearchOffset();
-  u64 *sp = &shadow_mem[(idx + off) % kShadowCnt];
+  u64 *sp = &shadow_mem[idx];
   old = LoadShadow(sp);
   if (old.IsZero()) {
     StatInc(thr, StatShadowZero);
@@ -33,16 +32,6 @@ do {
     // same thread?
     if (Shadow::TidsAreEqual(old, cur)) {
       StatInc(thr, StatShadowSameThread);
-      if (OldIsInSameSynchEpoch(old, thr)) {
-        if (old.IsRWNotWeaker(kAccessIsWrite, kIsAtomic)) {
-          // found a slot that holds effectively the same info
-          // (that is, same tid, same sync epoch and same size)
-          StatInc(thr, StatMopSame);
-          return;
-        }
-        StoreIfNotYetStored(sp, &store_word);
-        break;
-      }
       if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic))
         StoreIfNotYetStored(sp, &store_word);
       break;
diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_mman_test.cc b/compiler-rt/lib/tsan/tests/unit/tsan_mman_test.cc
index d57ecbf..d8afeaf 100644
--- a/compiler-rt/lib/tsan/tests/unit/tsan_mman_test.cc
+++ b/compiler-rt/lib/tsan/tests/unit/tsan_mman_test.cc
@@ -144,6 +144,11 @@ TEST(Mman, Stats) {
 }
 
 TEST(Mman, CallocOverflow) {
+#if TSAN_DEBUG
+  // EXPECT_DEATH clones a thread with 4K stack,
+  // which is overflown by tsan memory accesses functions in debug mode.
+  return;
+#endif
   size_t kArraySize = 4096;
   volatile size_t kMaxSizeT = std::numeric_limits<size_t>::max();
   volatile size_t kArraySize2 = kMaxSizeT / kArraySize + 10;
-- 
2.7.4
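
Illustrative sketch (not part of the patch): the core of the new fast path is a "does any of the 4 shadow slots already record this access with an epoch newer than the thread's synchronization epoch" check, done with one shuffle/compare/movemask sequence instead of a scalar loop. The standalone program below reimplements that idea outside the runtime, with a simplified 64-bit slot layout (low 32 bits: epoch; high 32 bits: access descriptor, with a read bit at bit 15 of the high half) and invented names (MakeShadow, ContainsSlow, ContainsFast). It assumes x86-64 with SSE2 and asserts that the scalar and SIMD versions agree, mirroring the patch's DCHECK_EQ(ContainsSameAccessFast(...), ContainsSameAccessSlow(...)).

// Build sketch (assumed toolchain): g++ -std=c++11 -O2 -msse2 same_access_demo.cc
#include <emmintrin.h>   // SSE2: _mm_cmpeq_epi32, _mm_cmpgt_epi32, _mm_movemask_epi8, ...
#include <cassert>
#include <cstdint>
#include <cstdio>

typedef __m128i m128;

// Simplified slot layout used only by this sketch:
//   bits  0..31  epoch
//   bits 32..63  access descriptor (tid/addr0/size); bit 47 = is_read
static const uint64_t kReadBit = 1ull << 47;

static uint64_t MakeShadow(uint32_t descr, bool is_read, uint32_t epoch) {
  uint64_t x = ((uint64_t)descr << 32) | epoch;
  return is_read ? (x | kReadBit) : (x & ~kReadBit);
}

// Scalar reference: some slot has the same descriptor, an epoch newer than
// sync_epoch, and an access type that subsumes the current one.
static bool ContainsSlow(const uint64_t s[4], uint64_t a, uint32_t sync_epoch,
                         bool is_write) {
  for (int i = 0; i < 4; i++) {
    uint64_t old_descr = s[i] >> 32;
    uint64_t cur_descr = a >> 32;
    if (!is_write)
      old_descr |= kReadBit >> 32;  // a read is satisfied by a read or a write
    if (old_descr == cur_descr &&
        (int32_t)(uint32_t)s[i] > (int32_t)sync_epoch)
      return true;
  }
  return false;
}

// Same predicate with one SSE pass over all 4 slots, following the patch's
// shuffle/compare/movemask structure.
#define SHUF(v0, v1, i0, i1, i2, i3)                         \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v0),      \
                                  _mm_castsi128_ps(v1),      \
                                  (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))

static bool ContainsFast(const uint64_t s[4], uint64_t a, uint32_t sync_epoch,
                         bool is_write) {
  const m128 access = _mm_cvtsi64_si128((long long)a);
  const m128 cur_descr = SHUF(access, access, 1, 1, 1, 1);     // broadcast high half of a
  const m128 shadow0 = _mm_loadu_si128((const m128 *)s);       // slots 0, 1
  const m128 shadow1 = _mm_loadu_si128((const m128 *)s + 1);   // slots 2, 3
  m128 descr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);        // high halves of the slots
  if (!is_write) {
    const m128 rw1 = _mm_cvtsi64_si128(1 << 15);               // read bit within the high half
    descr_vect = _mm_or_si128(descr_vect, SHUF(rw1, rw1, 0, 0, 0, 0));
  }
  const m128 descr_eq = _mm_cmpeq_epi32(cur_descr, descr_vect);
  const m128 epoch1 = _mm_cvtsi64_si128((long long)sync_epoch);
  const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);         // broadcast sync_epoch
  const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);  // low halves of the slots
  const m128 epoch_gt = _mm_cmpgt_epi32(epoch_vect, epoch);    // slot epoch > sync_epoch
  return _mm_movemask_epi8(_mm_and_si128(descr_eq, epoch_gt)) != 0;
}

int main() {
  const uint64_t shadow[4] = {
      MakeShadow(0x1234, /*is_read=*/false, /*epoch=*/100),
      MakeShadow(0x1234, /*is_read=*/true,  /*epoch=*/90),
      MakeShadow(0x5678, /*is_read=*/false, /*epoch=*/120),
      0,
  };
  const struct { uint64_t a; uint32_t sync; bool is_write; } tests[] = {
      {MakeShadow(0x1234, true,  101), 95,  false},  // read satisfied by the write slot
      {MakeShadow(0x1234, false, 101), 95,  true},   // write matches the write slot
      {MakeShadow(0x1234, false, 101), 110, true},   // slot epoch not newer than sync
      {MakeShadow(0x9999, true,  101), 0,   false},  // no slot with this descriptor
  };
  for (const auto &t : tests) {
    bool slow = ContainsSlow(shadow, t.a, t.sync, t.is_write);
    bool fast = ContainsFast(shadow, t.a, t.sync, t.is_write);
    assert(slow == fast);  // same invariant as the patch's DCHECK_EQ
    printf("is_write=%d sync_epoch=%u -> hit=%d\n", (int)t.is_write, t.sync, (int)slow);
  }
  return 0;
}

As in the patch, the epoch comparison looks only at the low 32 bits of each slot and uses a signed compare; the sketch keeps epochs small so that approximation never changes the answer.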
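
Illustrative sketch (not part of the patch): the tsan_rtl.h changes move the epoch into the low kClkBits of both FastState and Shadow, so the epoch occupies the low 32-bit lane and the tid/addr0/size/read/atomic bits the high lane of each 64-bit slot — which is what lets the SSE path compare lanes directly, and also turns IncrementEpoch() into a plain x_ += 1. The stand-in below (MiniShadow, with assumed widths kClkBits = 42 and kTidBits = 13) is not the real class; it only mirrors the new packing and checks that the accessors decode what was encoded.

// Build sketch (assumed toolchain): g++ -std=c++11 -O2 layout_demo.cc
#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors the patched layout (most significant bit first):
//   ignore/freed : 1, tid : 13, is_atomic : 1, is_read : 1,
//   size_log : 2, addr0 : 3, epoch : 42 (low bits)
struct MiniShadow {
  static const unsigned kClkBits = 42;            // assumed width of the epoch field
  static const unsigned kTidBits = 13;            // assumed width of the tid field
  static const unsigned kTidShift = 64 - kTidBits - 1;
  uint64_t x;

  MiniShadow(uint64_t tid, uint64_t epoch) : x((tid << kTidShift) | epoch) {}

  uint64_t tid() const { return (x >> kTidShift) & ((1ull << kTidBits) - 1); }
  uint64_t epoch() const { return x & ((1ull << kClkBits) - 1); }
  void IncrementEpoch() { x += 1; }               // epoch is now the low field: plain +1

  void SetAddr0AndSizeLog(uint64_t addr0, unsigned size_log) {
    // addr0 (3 bits) and size_log (2 bits) sit just above the epoch field.
    x |= ((uint64_t(size_log) << 3) | addr0) << kClkBits;
  }
  uint64_t addr0() const { return (x >> kClkBits) & 7; }
  uint64_t size_log() const { return (x >> (3 + kClkBits)) & 3; }
};

int main() {
  MiniShadow s(/*tid=*/5, /*epoch=*/1000);
  s.SetAddr0AndSizeLog(/*addr0=*/4, /*size_log=*/2);   // a 4-byte access at offset 4

  assert(s.tid() == 5);
  assert(s.epoch() == 1000);
  assert(s.addr0() == 4 && s.size_log() == 2);

  s.IncrementEpoch();                                  // only the low (epoch) field changes
  assert(s.epoch() == 1001);
  assert(s.tid() == 5 && s.addr0() == 4 && s.size_log() == 2);

  printf("tid=%llu epoch=%llu addr0=%llu size_log=%llu\n",
         (unsigned long long)s.tid(), (unsigned long long)s.epoch(),
         (unsigned long long)s.addr0(), (unsigned long long)s.size_log());
  return 0;
}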