From f77a8bee633f8fae4b3ee85afff619e11226d79b Mon Sep 17 00:00:00 2001
From: Vladimir Sadov
Date: Thu, 20 Jun 2019 19:54:34 -0700
Subject: [PATCH] ensure process-wide fence when updating GC write barrier on
 ARM64 (#25130)

* ensure process-wide fences when updating GC write barrier on ARM64
---
 src/gc/sample/GCSample.cpp |  2 +-
 src/vm/gcenv.ee.cpp        | 94 ++++++++++++++++++++++++++++++++--------------
 src/vm/gchelpers.cpp       | 25 ++++++------
 src/vm/gchelpers.inl       |  7 ++--
 4 files changed, 84 insertions(+), 44 deletions(-)

diff --git a/src/gc/sample/GCSample.cpp b/src/gc/sample/GCSample.cpp
index 4e22850..d7db90d 100644
--- a/src/gc/sample/GCSample.cpp
+++ b/src/gc/sample/GCSample.cpp
@@ -95,7 +95,7 @@ inline void ErectWriteBarrier(Object ** dst, Object * ref)
         return;
 
     // volatile is used here to prevent fetch of g_card_table from being reordered
-    // with g_lowest/highest_address check above. See comment in code:gc_heap::grow_brick_card_tables.
+    // with g_lowest/highest_address check above. See comments in StompWriteBarrier.
     uint8_t* pCardByte = (uint8_t *)*(volatile uint8_t **)(&g_gc_card_table) + card_byte((uint8_t *)dst);
     if(*pCardByte != 0xFF)
         *pCardByte = 0xFF;
diff --git a/src/vm/gcenv.ee.cpp b/src/vm/gcenv.ee.cpp
index 871efd1..e4d3153 100644
--- a/src/vm/gcenv.ee.cpp
+++ b/src/vm/gcenv.ee.cpp
@@ -803,7 +803,7 @@ void GCToEEInterface::DiagWalkBGCSurvivors(void* gcContext)
 void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
 {
     int stompWBCompleteActions = SWB_PASS;
-    bool is_runtime_suspended = false;
+    bool is_runtime_suspended = args->is_runtime_suspended;
 
     assert(args != nullptr);
     switch (args->operation)
@@ -815,7 +815,13 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
         assert(args->lowest_address != nullptr);
         assert(args->highest_address != nullptr);
 
-        g_card_table = args->card_table;
+        // We are sensitive to the order of writes here (more comments on this further in the method).
+        // In particular g_card_table must be written before writing the heap bounds.
+        // For platforms with weak memory ordering we will issue fences; for x64/x86 we are ok
+        // as long as the compiler does not reorder these writes.
+        // That is unlikely since we have method calls in between.
+        // Just to be robust against possible refactoring/inlining we will do a compiler-fenced store here.
+        VolatileStoreWithoutBarrier(&g_card_table, args->card_table);
 
 #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
         assert(args->card_bundle_table != nullptr);
@@ -830,53 +836,84 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
         }
 #endif // FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
 
-        stompWBCompleteActions |= ::StompWriteBarrierResize(args->is_runtime_suspended, args->requires_upper_bounds_check);
-
-        // We need to make sure that other threads executing checked write barriers
-        // will see the g_card_table update before g_lowest/highest_address updates.
-        // Otherwise, the checked write barrier may AV accessing the old card table
-        // with address that it does not cover.
-        //
-        // Even x86's total store ordering is insufficient here because threads reading
-        // g_card_table do so via the instruction cache, whereas g_lowest/highest_address
-        // are read via the data cache.
-        //
-        // The g_card_table update is covered by section 8.1.3 of the Intel Software
-        // Development Manual, Volume 3A (System Programming Guide, Part 1), about
-        // "cross-modifying code": We need all _executing_ threads to invalidate
-        // their instruction cache, which FlushProcessWriteBuffers achieves by sending
-        // an IPI (inter-process interrupt).
+        stompWBCompleteActions |= ::StompWriteBarrierResize(is_runtime_suspended, args->requires_upper_bounds_check);
+        is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || is_runtime_suspended;
 
         if (stompWBCompleteActions & SWB_ICACHE_FLUSH)
         {
-            // flushing icache on current processor (thread)
+            // flushing/invalidating the write barrier's body for the current process
+            // NOTE: the underlying API may flush more than needed, or nothing at all if the Icache is coherent.
             ::FlushWriteBarrierInstructionCache();
-            // asking other processors (threads) to invalidate their icache
+        }
+
+        // IMPORTANT: managed heap segments may surround unmanaged/stack segments. In such cases adding another managed
+        // heap segment may put a stack/unmanaged write inside the new heap range. However, the old card table would
+        // not cover it. Therefore we must ensure that the write barriers see the new table before seeing the new bounds.
+        //
+        // On architectures with strong ordering we only need to prevent compiler reordering.
+        // Otherwise we put a process-wide fence here (so that we can use an ordinary read in the barrier).
+
+#if defined(_ARM64_) || defined(_ARM_)
+        if (!is_runtime_suspended)
+        {
+            // If the runtime is not suspended, force all threads to see the changed table before seeing the updated heap boundaries.
+            // See: http://vstfdevdiv:8080/DevDiv2/DevDiv/_workitems/edit/346765
             FlushProcessWriteBuffers();
         }
+#endif
 
         g_lowest_address = args->lowest_address;
-        VolatileStore(&g_highest_address, args->highest_address);
+        g_highest_address = args->highest_address;
 
 #if defined(_ARM64_) || defined(_ARM_)
         // Need to reupdate for changes to g_highest_address g_lowest_address
-        is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || args->is_runtime_suspended;
         stompWBCompleteActions |= ::StompWriteBarrierResize(is_runtime_suspended, args->requires_upper_bounds_check);
 
 #ifdef _ARM_
         if (stompWBCompleteActions & SWB_ICACHE_FLUSH)
         {
+            // flushing/invalidating the write barrier's body for the current process
+            // NOTE: the underlying API may flush more than needed, or nothing at all if the Icache is coherent.
             ::FlushWriteBarrierInstructionCache();
         }
 #endif
+#endif
+
+        // At this point either the old or the new set of globals (card_table, bounds, etc.) can be used. Card tables and card bundles allow such use.
+        // When card tables are de-published (at EE suspension) all the info will be merged, so the information will not be lost.
+        // Another point - we should not yet have any managed objects/addresses outside of the former bounds, so either the old or the new bounds are fine.
+        // That is because bounds can only become wider and we are not yet done with widening.
+        //
+        // However!!
+        // Once we are done, a new object can (and likely will) be allocated outside of the former bounds.
+        // So, before such an object can be used in a write barrier, we must ensure that the barrier also uses the new bounds.
+        //
+        // This is easy to arrange for architectures with strong memory ordering.
+        // We only need to ensure that
+        // - the object is allocated/published _after_ we publish the bounds here
+        // - the write barrier reads the bounds after reading the new object locations
+        //
+        // For architectures with strong memory ordering (x86/x64) both conditions above are naturally guaranteed.
+        // Systems with weak ordering are more interesting. We could either:
+        // a) issue a write fence here and pair it with a read fence in the write barrier, or
+        // b) issue a process-wide full fence here and do ordinary reads in the barrier.
+        //
+        // We will do "b" because executing the write barrier is by far more common than updating the card table.
+        //
+        // I.e. - for weak architectures we have to do a process-wide fence.
+        //
+        // NOTE: suspending/resuming the EE works the same as a process-wide fence for our purposes here.
+        // (we care only about managed threads, and suspend/resume will do full fences - good enough for us).
+        //
 
-        is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || args->is_runtime_suspended;
-        if(!is_runtime_suspended)
+#if defined(_ARM64_) || defined(_ARM_)
+        is_runtime_suspended = (stompWBCompleteActions & SWB_EE_RESTART) || is_runtime_suspended;
+        if (!is_runtime_suspended)
         {
-            // If runtime is not suspended, force updated state to be visible to all threads
-            MemoryBarrier();
+            // If the runtime is not suspended, force all threads to see the changed state before observing future allocations.
+            FlushProcessWriteBuffers();
         }
 #endif
+
         if (stompWBCompleteActions & SWB_EE_RESTART)
         {
             assert(!args->is_runtime_suspended &&
@@ -885,6 +922,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
         }
         return; // unlike other branches we have already done cleanup so bailing out here
     case WriteBarrierOp::StompEphemeral:
+        assert(args->is_runtime_suspended && "the runtime must be suspended here!");
         // StompEphemeral requires a new ephemeral low and a new ephemeral high
         assert(args->ephemeral_low != nullptr);
         assert(args->ephemeral_high != nullptr);
@@ -893,6 +931,7 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
         stompWBCompleteActions |= ::StompWriteBarrierEphemeral(args->is_runtime_suspended);
         break;
     case WriteBarrierOp::Initialize:
+        assert(args->is_runtime_suspended && "the runtime must be suspended here!");
         // This operation should only be invoked once, upon initialization.
         assert(g_card_table == nullptr);
         assert(g_lowest_address == nullptr);
@@ -902,7 +941,6 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
         assert(args->highest_address != nullptr);
         assert(args->ephemeral_low != nullptr);
         assert(args->ephemeral_high != nullptr);
-        assert(args->is_runtime_suspended && "the runtime must be suspended here!");
         assert(!args->requires_upper_bounds_check && "the ephemeral generation must be at the top of the heap!");
 
         g_card_table = args->card_table;
@@ -926,8 +964,8 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args)
         break;
     case WriteBarrierOp::SwitchToWriteWatch:
 #ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
-        assert(args->write_watch_table != nullptr);
         assert(args->is_runtime_suspended && "the runtime must be suspended here!");
+        assert(args->write_watch_table != nullptr);
         g_sw_ww_table = args->write_watch_table;
         g_sw_ww_enabled_for_gc_heap = true;
         stompWBCompleteActions |= ::SwitchToWriteWatchBarrier(true);
diff --git a/src/vm/gchelpers.cpp b/src/vm/gchelpers.cpp
index 22a1a5d..07429a0 100644
--- a/src/vm/gchelpers.cpp
+++ b/src/vm/gchelpers.cpp
@@ -1373,7 +1373,7 @@ extern "C" HCIMPL2_RAW(VOID, JIT_CheckedWriteBarrier, Object **dst, Object *ref)
 
     // no HELPER_METHOD_FRAME because we are MODE_COOPERATIVE, GC_NOTRIGGER
 
-    *dst = ref;
+    VolatileStore(dst, ref);
 
     // if the dst is outside of the heap (unboxed value classes) then we
     //     simply exit
@@ -1407,8 +1407,8 @@ extern "C" HCIMPL2_RAW(VOID, JIT_CheckedWriteBarrier, Object **dst, Object *ref)
         CheckedAfterRefInEphemFilter++;
 #endif
         // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
-        // with g_lowest/highest_address check above. See comment in code:gc_heap::grow_brick_card_tables.
-        BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+        // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+        BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
         if(*pCardByte != 0xFF)
         {
 #ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
@@ -1440,7 +1440,7 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrier, Object **dst, Object *ref)
 #endif
 
     // no HELPER_METHOD_FRAME because we are MODE_COOPERATIVE, GC_NOTRIGGER
-    *dst = ref;
+    VolatileStore(dst, ref);
 
     // If the store above succeeded, "dst" should be in the heap.
     assert(GCHeapUtilities::GetGCHeap()->IsHeapPointer((void*)dst));
@@ -1467,7 +1467,9 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrier, Object **dst, Object *ref)
 #ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
         UncheckedAfterRefInEphemFilter++;
 #endif
-        BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+        // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
+        // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+        BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
         if(*pCardByte != 0xFF)
         {
 #ifdef FEATURE_COUNT_GC_WRITE_BARRIERS
@@ -1478,7 +1480,6 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrier, Object **dst, Object *ref)
 #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
             SetCardBundleByte((BYTE*)dst);
 #endif
-
         }
     }
 }
@@ -1498,6 +1499,7 @@ extern "C" HCIMPL2_RAW(VOID, JIT_WriteBarrierEnsureNonHeapTarget, Object **dst,
 
     // no HELPER_METHOD_FRAME because we are MODE_COOPERATIVE, GC_NOTRIGGER
 
+    // not a release store because NonHeap.
     *dst = ref;
 }
 HCIMPLEND_RAW
@@ -1531,8 +1533,8 @@ void ErectWriteBarrier(OBJECTREF *dst, OBJECTREF ref)
     if ((BYTE*) OBJECTREFToObject(ref) >= g_ephemeral_low && (BYTE*) OBJECTREFToObject(ref) < g_ephemeral_high)
     {
         // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
-        // with g_lowest/highest_address check above. See comment in code:gc_heap::grow_brick_card_tables.
-        BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+        // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+        BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
         if (*pCardByte != 0xFF)
         {
             *pCardByte = 0xFF;
@@ -1540,7 +1542,6 @@ void ErectWriteBarrier(OBJECTREF *dst, OBJECTREF ref)
 #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
             SetCardBundleByte((BYTE*)dst);
 #endif
-
         }
     }
 }
@@ -1571,8 +1572,9 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref)
         BYTE *refObject = *(BYTE **)((MethodTable*)ref)->GetLoaderAllocatorObjectHandle();
         if((BYTE*) refObject >= g_ephemeral_low && (BYTE*) refObject < g_ephemeral_high)
         {
-            // See comment above
-            BYTE* pCardByte = (BYTE *)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
+            // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
+            // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+            BYTE* pCardByte = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + card_byte((BYTE *)dst);
             if( !((*pCardByte) & card_bit((BYTE *)dst)) )
             {
                 *pCardByte = 0xFF;
@@ -1580,7 +1582,6 @@ void ErectWriteBarrierForMT(MethodTable **dst, MethodTable *ref)
 #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
                 SetCardBundleByte((BYTE*)dst);
 #endif
-
             }
         }
     }
diff --git a/src/vm/gchelpers.inl b/src/vm/gchelpers.inl
index 1b14077..fed186d 100644
--- a/src/vm/gchelpers.inl
+++ b/src/vm/gchelpers.inl
@@ -66,9 +66,10 @@ FORCEINLINE void InlinedSetCardsAfterBulkCopyHelper(Object **start, size_t len)
 
     // calculate the number of clumps to mark (round_up(end) - start)
     size_t clumpCount = endingClump - startingClump;
-    // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
-    // with g_lowest/highest_address check at the beginning of this function.
-    uint8_t* card = ((uint8_t*)VolatileLoadWithoutBarrier(&g_card_table)) + startingClump;
+
+    // VolatileLoadWithoutBarrier() is used here to prevent fetch of g_card_table from being reordered
+    // with g_lowest/highest_address check above. See comment in StompWriteBarrier.
+    BYTE* card = (BYTE*)VolatileLoadWithoutBarrier(&g_card_table) + startingClump;
 
     // Fill the cards. To avoid cache line thrashing we check whether the cards have already been set before
     // writing.
-- 
2.7.4
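
Editor's illustration (not part of the patch): a minimal, self-contained C++ sketch of the
publication protocol the comments in StompWriteBarrier describe. The global names mirror the
runtime's, but everything else (card size, function names) is made up for the example. It models
option "a" - a release fence in the updater paired with a read fence in the barrier - since that
is the portable way to express the invariant; the patch itself chooses option "b", dropping the
reader-side fence and using a process-wide fence (FlushProcessWriteBuffers) in the updater instead.

#include <atomic>
#include <cstdint>

static constexpr std::uintptr_t kCardSize = 256;      // hypothetical card granularity
static std::atomic<std::uint8_t*>  g_card_table{nullptr};
static std::atomic<std::uintptr_t> g_highest_address{0};

// Updater (the resize path): the new card table must become visible before the
// widened bounds, or a barrier that passes the bounds check could index an old,
// too-small table and AV.
void publish_table_then_bounds(std::uint8_t* new_table, std::uintptr_t new_highest)
{
    g_card_table.store(new_table, std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_release);  // option "a": write fence here
    g_highest_address.store(new_highest, std::memory_order_relaxed);
}

// Mutator (checked write barrier): if the bounds check observes the new
// g_highest_address, the paired fences guarantee the load below observes
// the new g_card_table.
void checked_write_barrier(std::uintptr_t dst)
{
    if (dst >= g_highest_address.load(std::memory_order_relaxed))
        return;                                           // not in the GC heap: no card to mark
    std::atomic_thread_fence(std::memory_order_acquire);  // option "a": paired read fence
    std::uint8_t* table = g_card_table.load(std::memory_order_relaxed);
    table[dst / kCardSize] = 0xFF;                        // mark the card covering dst
}

Option "b" deletes the acquire fence from checked_write_barrier (barriers run constantly, table
updates are rare), which is only sound if the updater's fence is process-wide rather than local
to one thread - hence FlushProcessWriteBuffers in the patch.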
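Likewise not part of the patch: FlushProcessWriteBuffers() is the process-wide fence that makes
option "b" work. On Windows it is a documented Win32 API; on Linux a comparable effect can be
obtained from the membarrier(2) syscall. The sketch below is an assumption-laden approximation
(it assumes a kernel >= 4.16 for MEMBARRIER_CMD_GLOBAL and calls the raw syscall, since older
libcs ship no wrapper); it is not necessarily how the CoreCLR PAL implements the primitive.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstdio>

// Drive every currently running thread of the process through a full memory
// barrier, then return.
static long flush_process_write_buffers()
{
    return syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0);
}

int main()
{
    // MEMBARRIER_CMD_QUERY returns a bitmask of the commands this kernel supports.
    long supported = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);
    if (supported < 0 || !(supported & MEMBARRIER_CMD_GLOBAL))
    {
        std::fprintf(stderr, "membarrier GLOBAL unavailable; another process-wide fence is needed\n");
        return 1;
    }
    return flush_process_write_buffers() == 0 ? 0 : 1;
}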