src/gc/unix/gcenv.unix.cpp

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 #include <cstdint>
   6 #include <cstddef>
   7 #include <cassert>
   8 #include <memory>
   9 #include <pthread.h>
  10 #include <signal.h>
  11
  12 #include "config.h"
  13 #include "common.h"
  14
  15 #include "gcenv.structs.h"
  16 #include "gcenv.base.h"
  17 #include "gcenv.os.h"
  18 #include "gcenv.unix.inl"
  19 #include "volatile.h"
  20
  21 #if HAVE_SYS_TIME_H
  22  #include <sys/time.h>
  23 #else
  24  #error "sys/time.h required by GC PAL for the time being"
#endif // HAVE_SYS_TIME_H
  26
  27 #if HAVE_SYS_MMAN_H
  28  #include <sys/mman.h>
  29 #else
  30  #error "sys/mman.h required by GC PAL"
  31 #endif // HAVE_SYS_MMAN_H
  32
  33 #ifdef __linux__
  34 #include <sys/syscall.h> // __NR_membarrier
  35 // Ensure __NR_membarrier is defined for portable builds.
  36 # if !defined(__NR_membarrier)
  37 #  if defined(__amd64__)
  38 #   define __NR_membarrier  324
  39 #  elif defined(__i386__)
  40 #   define __NR_membarrier  375
  41 #  elif defined(__arm__)
  42 #   define __NR_membarrier  389
  43 #  elif defined(__aarch64__)
  44 #   define __NR_membarrier  283
  45 #  elif
  46 #   error Unknown architecture
  47 #  endif
  48 # endif
  49 #endif
  50
  51 #include <time.h> // nanosleep
  52 #include <sched.h> // sched_yield
  53 #include <errno.h>
  54 #include <unistd.h> // sysconf
  55 #include "globals.h"
  56 #include "cgroup.h"
  57
  58 #if HAVE_NUMA_H
  59
  60 #include <numa.h>
  61 #include <numaif.h>
  62 #include <dlfcn.h>
  63
  64 // List of all functions from the numa library that are used
  65 #define FOR_ALL_NUMA_FUNCTIONS \
  66     PER_FUNCTION_BLOCK(mbind) \
  67     PER_FUNCTION_BLOCK(numa_available) \
  68     PER_FUNCTION_BLOCK(numa_max_node) \
  69     PER_FUNCTION_BLOCK(numa_node_of_cpu)
  70
  71 // Declare pointers to all the used numa functions
  72 #define PER_FUNCTION_BLOCK(fn) extern decltype(fn)* fn##_ptr;
  73 FOR_ALL_NUMA_FUNCTIONS
  74 #undef PER_FUNCTION_BLOCK
  75
  76 // Redefine all calls to numa functions as calls through pointers that are set
  77 // to the functions of libnuma in the initialization.
  78 #define mbind(...) mbind_ptr(__VA_ARGS__)
  79 #define numa_available() numa_available_ptr()
  80 #define numa_max_node() numa_max_node_ptr()
  81 #define numa_node_of_cpu(...) numa_node_of_cpu_ptr(__VA_ARGS__)
  82
  83 #endif // HAVE_NUMA_H
  84
  85 #if defined(_ARM_) || defined(_ARM64_)
  86 #define SYSCONF_GET_NUMPROCS _SC_NPROCESSORS_CONF
  87 #else
  88 #define SYSCONF_GET_NUMPROCS _SC_NPROCESSORS_ONLN
  89 #endif
  90
  91 // The cached number of logical CPUs observed.
  92 static uint32_t g_logicalCpuCount = 0;
  93
  94 // The cached number of CPUs available for the current process.
  95 static uint32_t g_currentProcessCpuCount = 0;
  96
  97 //
  98 // Helper membarrier function
  99 //
 100 #ifdef __NR_membarrier
 101 # define membarrier(...)  syscall(__NR_membarrier, __VA_ARGS__)
 102 #else
 103 # define membarrier(...)  -ENOSYS
 104 #endif
 105
 106 enum membarrier_cmd
 107 {
 108     MEMBARRIER_CMD_QUERY                                 = 0,
 109     MEMBARRIER_CMD_GLOBAL                                = (1 << 0),
 110     MEMBARRIER_CMD_GLOBAL_EXPEDITED                      = (1 << 1),
 111     MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED             = (1 << 2),
 112     MEMBARRIER_CMD_PRIVATE_EXPEDITED                     = (1 << 3),
 113     MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED            = (1 << 4),
 114     MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE           = (1 << 5),
 115     MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE  = (1 << 6)
 116 };
 117
 118 //
 119 // Tracks if the OS supports FlushProcessWriteBuffers using membarrier
 120 //
 121 static int s_flushUsingMemBarrier = 0;
 122
 123 // Helper memory page used by the FlushProcessWriteBuffers
 124 static uint8_t* g_helperPage = 0;
 125
 126 // Mutex to make the FlushProcessWriteBuffersMutex thread safe
 127 static pthread_mutex_t g_flushProcessWriteBuffersMutex;
 128
 129 size_t GetRestrictedPhysicalMemoryLimit();
 130 bool GetPhysicalMemoryUsed(size_t* val);
 131 bool GetCpuLimit(uint32_t* val);
 132
 133 static size_t g_RestrictedPhysicalMemoryLimit = 0;
 134
 135 uint32_t g_pageSizeUnixInl = 0;
 136
 137 AffinitySet g_processAffinitySet;
 138
 139 #if HAVE_CPUSET_T
 140 typedef cpuset_t cpu_set_t;
 141 #endif
 142
 143 // The highest NUMA node available
 144 int g_highestNumaNode = 0;
 145 // Is numa available
 146 bool g_numaAvailable = false;
 147
 148 void* g_numaHandle = nullptr;
 149
 150 #if HAVE_NUMA_H
 151 #define PER_FUNCTION_BLOCK(fn) decltype(fn)* fn##_ptr;
 152 FOR_ALL_NUMA_FUNCTIONS
 153 #undef PER_FUNCTION_BLOCK
 154 #endif // HAVE_NUMA_H
 155
 156
 157 // Initialize data structures for getting and setting thread affinities to processors and
 158 // querying NUMA related processor information.
 159 // On systems with no NUMA support, it behaves as if there was a single NUMA node with
 160 // a single group of processors.
 161 void NUMASupportInitialize()
 162 {
 163 #if HAVE_NUMA_H
 164     g_numaHandle = dlopen("libnuma.so", RTLD_LAZY);
 165     if (g_numaHandle == 0)
 166     {
 167         g_numaHandle = dlopen("libnuma.so.1", RTLD_LAZY);
 168     }
 169     if (g_numaHandle != 0)
 170     {
 171         dlsym(g_numaHandle, "numa_allocate_cpumask");
 172 #define PER_FUNCTION_BLOCK(fn) \
 173     fn##_ptr = (decltype(fn)*)dlsym(g_numaHandle, #fn); \
 174     if (fn##_ptr == NULL) { fprintf(stderr, "Cannot get symbol " #fn " from libnuma\n"); abort(); }
 175 FOR_ALL_NUMA_FUNCTIONS
 176 #undef PER_FUNCTION_BLOCK
 177
 178         if (numa_available() == -1)
 179         {
 180             dlclose(g_numaHandle);
 181         }
 182         else
 183         {
 184             g_numaAvailable = true;
 185             g_highestNumaNode = numa_max_node();
 186         }
 187     }
 188 #endif // HAVE_NUMA_H
 189     if (!g_numaAvailable)
 190     {
 191         // No NUMA
 192         g_highestNumaNode = 0;
 193     }
 194 }
 195
 196 // Cleanup of the NUMA support data structures
 197 void NUMASupportCleanup()
 198 {
 199 #if HAVE_NUMA_H
 200     if (g_numaAvailable)
 201     {
 202         dlclose(g_numaHandle);
 203     }
 204 #endif // HAVE_NUMA_H
 205 }
 206
 207 // Initialize the interface implementation
 208 // Return:
 209 //  true if it has succeeded, false if it has failed
 210 bool GCToOSInterface::Initialize()
 211 {
 212     int pageSize = sysconf( _SC_PAGE_SIZE );
 213
 214     g_pageSizeUnixInl = uint32_t((pageSize > 0) ? pageSize : 0x1000);
 215
 216     // Calculate and cache the number of processors on this machine
 217     int cpuCount = sysconf(SYSCONF_GET_NUMPROCS);
 218     if (cpuCount == -1)
 219     {
 220         return false;
 221     }
 222
 223     g_logicalCpuCount = cpuCount;
 224
 225     //
 226     // support for FlusProcessWriteBuffers
 227     //
 228
 229     assert(s_flushUsingMemBarrier == 0);
 230
 231     // Starting with Linux kernel 4.14, process memory barriers can be generated
 232     // using MEMBARRIER_CMD_PRIVATE_EXPEDITED.
 233     int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);
 234     if (mask >= 0 &&
 235         mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED &&
 236         // Register intent to use the private expedited command.
 237         membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0) == 0)
 238     {
 239         s_flushUsingMemBarrier = TRUE;
 240     }
 241     else
 242     {
 243         assert(g_helperPage == 0);
 244
 245         g_helperPage = static_cast<uint8_t*>(mmap(0, OS_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
 246
 247         if (g_helperPage == MAP_FAILED)
 248         {
 249             return false;
 250         }
 251
 252         // Verify that the s_helperPage is really aligned to the g_SystemInfo.dwPageSize
 253         assert((((size_t)g_helperPage) & (OS_PAGE_SIZE - 1)) == 0);
 254
 255         // Locking the page ensures that it stays in memory during the two mprotect
 256         // calls in the FlushProcessWriteBuffers below. If the page was unmapped between
 257         // those calls, they would not have the expected effect of generating IPI.
 258         int status = mlock(g_helperPage, OS_PAGE_SIZE);
 259
 260         if (status != 0)
 261         {
 262             return false;
 263         }
 264
 265         status = pthread_mutex_init(&g_flushProcessWriteBuffersMutex, NULL);
 266         if (status != 0)
 267         {
 268             munlock(g_helperPage, OS_PAGE_SIZE);
 269             return false;
 270         }
 271     }
 272
 273 #if HAVE_MACH_ABSOLUTE_TIME
 274     kern_return_t machRet;
 275     if ((machRet = mach_timebase_info(&g_TimebaseInfo)) != KERN_SUCCESS)
 276     {
 277         return false;
 278     }
 279 #endif // HAVE_MACH_ABSOLUTE_TIME
 280
 281     InitializeCGroup();
 282
 283 #if HAVE_SCHED_GETAFFINITY
 284
 285     g_currentProcessCpuCount = 0;
 286
 287     cpu_set_t cpuSet;
 288     int st = sched_getaffinity(0, sizeof(cpu_set_t), &cpuSet);
 289
 290     if (st == 0)
 291     {
 292         for (size_t i = 0; i < g_logicalCpuCount; i++)
 293         {
 294             if (CPU_ISSET(i, &cpuSet))
 295             {
 296                 g_currentProcessCpuCount++;
 297                 g_processAffinitySet.Add(i);
 298             }
 299         }
 300     }
 301     else
 302     {
 303         // We should not get any of the errors that the sched_getaffinity can return since none
 304         // of them applies for the current thread, so this is an unexpected kind of failure.
 305         assert(false);
 306     }
 307
 308 #else // HAVE_SCHED_GETAFFINITY
 309
 310     g_currentProcessCpuCount = g_logicalCpuCount;
 311
 312     for (size_t i = 0; i < g_logicalCpuCount; i++)
 313     {
 314         g_processAffinitySet.Add(i);
 315     }
 316
 317 #endif // HAVE_SCHED_GETAFFINITY
 318
 319     NUMASupportInitialize();
 320
 321     return true;
 322 }
 323
 324 // Shutdown the interface implementation
 325 void GCToOSInterface::Shutdown()
 326 {
 327     int ret = munlock(g_helperPage, OS_PAGE_SIZE);
 328     assert(ret == 0);
 329     ret = pthread_mutex_destroy(&g_flushProcessWriteBuffersMutex);
 330     assert(ret == 0);
 331
 332     munmap(g_helperPage, OS_PAGE_SIZE);
 333
 334     CleanupCGroup();
 335     NUMASupportCleanup();
 336 }
 337
 338 // Get numeric id of the current thread if possible on the
 339 // current platform. It is indended for logging purposes only.
 340 // Return:
 341 //  Numeric id of the current thread, as best we can retrieve it.
 342 uint64_t GCToOSInterface::GetCurrentThreadIdForLogging()
 343 {
 344 #if defined(__linux__)
 345     return (uint64_t)syscall(SYS_gettid);
 346 #elif HAVE_PTHREAD_GETTHREADID_NP
 347     return (uint64_t)pthread_getthreadid_np();
 348 #elif HAVE_PTHREAD_THREADID_NP
 349     unsigned long long tid;
 350     pthread_threadid_np(pthread_self(), &tid);
 351     return (uint64_t)tid;
 352 #else
 353     // Fallback in case we don't know how to get integer thread id on the current platform
 354     return (uint64_t)pthread_self();
 355 #endif
 356 }
 357
 358 // Get the process ID of the process.
 359 uint32_t GCToOSInterface::GetCurrentProcessId()
 360 {
 361     return getpid();
 362 }
 363
 364 // Set ideal processor for the current thread
 365 // Parameters:
 366 //  srcProcNo - processor number the thread currently runs on
 367 //  dstProcNo - processor number the thread should be migrated to
 368 // Return:
 369 //  true if it has succeeded, false if it has failed
 370 bool GCToOSInterface::SetCurrentThreadIdealAffinity(uint16_t srcProcNo, uint16_t dstProcNo)
 371 {
 372     return GCToOSInterface::SetThreadAffinity(dstProcNo);
 373 }
 374
 375 // Get the number of the current processor
 376 uint32_t GCToOSInterface::GetCurrentProcessorNumber()
 377 {
 378 #if HAVE_SCHED_GETCPU
 379     int processorNumber = sched_getcpu();
 380     assert(processorNumber != -1);
 381     return processorNumber;
 382 #else
 383     return 0;
 384 #endif
 385 }
 386
 387 // Check if the OS supports getting current processor number
 388 bool GCToOSInterface::CanGetCurrentProcessorNumber()
 389 {
 390     return HAVE_SCHED_GETCPU;
 391 }
 392
 393 // Flush write buffers of processors that are executing threads of the current process
 394 void GCToOSInterface::FlushProcessWriteBuffers()
 395 {
 396     if (s_flushUsingMemBarrier)
 397     {
 398         int status = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
 399         assert(status == 0 && "Failed to flush using membarrier");
 400     }
 401     else
 402     {
 403         int status = pthread_mutex_lock(&g_flushProcessWriteBuffersMutex);
 404         assert(status == 0 && "Failed to lock the flushProcessWriteBuffersMutex lock");
 405
 406         // Changing a helper memory page protection from read / write to no access
 407         // causes the OS to issue IPI to flush TLBs on all processors. This also
 408         // results in flushing the processor buffers.
 409         status = mprotect(g_helperPage, OS_PAGE_SIZE, PROT_READ | PROT_WRITE);
 410         assert(status == 0 && "Failed to change helper page protection to read / write");
 411
 412         // Ensure that the page is dirty before we change the protection so that
 413         // we prevent the OS from skipping the global TLB flush.
 414         __sync_add_and_fetch((size_t*)g_helperPage, 1);
 415
 416         status = mprotect(g_helperPage, OS_PAGE_SIZE, PROT_NONE);
 417         assert(status == 0 && "Failed to change helper page protection to no access");
 418
 419         status = pthread_mutex_unlock(&g_flushProcessWriteBuffersMutex);
 420         assert(status == 0 && "Failed to unlock the flushProcessWriteBuffersMutex lock");
 421     }
 422 }
 423
 424 // Break into a debugger. Uses a compiler intrinsic if one is available,
 425 // otherwise raises a SIGTRAP.
 426 void GCToOSInterface::DebugBreak()
 427 {
 428     // __has_builtin is only defined by clang. GCC doesn't have a debug
 429     // trap intrinsic anyway.
 430 #ifndef __has_builtin
 431  #define __has_builtin(x) 0
 432 #endif // __has_builtin
 433
 434 #if __has_builtin(__builtin_debugtrap)
 435     __builtin_debugtrap();
 436 #else
 437     raise(SIGTRAP);
 438 #endif
 439 }
 440
 441 // Causes the calling thread to sleep for the specified number of milliseconds
 442 // Parameters:
 443 //  sleepMSec   - time to sleep before switching to another thread
 444 void GCToOSInterface::Sleep(uint32_t sleepMSec)
 445 {
 446     if (sleepMSec == 0)
 447     {
 448         return;
 449     }
 450
 451     timespec requested;
 452     requested.tv_sec = sleepMSec / tccSecondsToMilliSeconds;
 453     requested.tv_nsec = (sleepMSec - requested.tv_sec * tccSecondsToMilliSeconds) * tccMilliSecondsToNanoSeconds;
 454
 455     timespec remaining;
 456     while (nanosleep(&requested, &remaining) == EINTR)
 457     {
 458         requested = remaining;
 459     }
 460 }
 461
 462 // Causes the calling thread to yield execution to another thread that is ready to run on the current processor.
 463 // Parameters:
 464 //  switchCount - number of times the YieldThread was called in a loop
 465 void GCToOSInterface::YieldThread(uint32_t switchCount)
 466 {
 467     int ret = sched_yield();
 468
 469     // sched_yield never fails on Linux, unclear about other OSes
 470     assert(ret == 0);
 471 }
 472
 473 // Reserve virtual memory range.
 474 // Parameters:
 475 //  size      - size of the virtual memory range
 476 //  alignment - requested memory alignment, 0 means no specific alignment requested
 477 //  flags     - flags to control special settings like write watching
 478 // Return:
 479 //  Starting virtual address of the reserved range
 480 static void* VirtualReserveInner(size_t size, size_t alignment, uint32_t flags, uint32_t hugePagesFlag = 0)
 481 {
 482     assert(!(flags & VirtualReserveFlags::WriteWatch) && "WriteWatch not supported on Unix");
 483     if (alignment == 0)
 484     {
 485         alignment = OS_PAGE_SIZE;
 486     }
 487
 488     size_t alignedSize = size + (alignment - OS_PAGE_SIZE);
 489     void * pRetVal = mmap(nullptr, alignedSize, PROT_NONE, MAP_ANON | MAP_PRIVATE | hugePagesFlag, -1, 0);
 490
 491     if (pRetVal != NULL)
 492     {
 493         void * pAlignedRetVal = (void *)(((size_t)pRetVal + (alignment - 1)) & ~(alignment - 1));
 494         size_t startPadding = (size_t)pAlignedRetVal - (size_t)pRetVal;
 495         if (startPadding != 0)
 496         {
 497             int ret = munmap(pRetVal, startPadding);
 498             assert(ret == 0);
 499         }
 500
 501         size_t endPadding = alignedSize - (startPadding + size);
 502         if (endPadding != 0)
 503         {
 504             int ret = munmap((void *)((size_t)pAlignedRetVal + size), endPadding);
 505             assert(ret == 0);
 506         }
 507
 508         pRetVal = pAlignedRetVal;
 509     }
 510
 511     return pRetVal;
 512 }
 513
 514 // Reserve virtual memory range.
 515 // Parameters:
 516 //  size      - size of the virtual memory range
 517 //  alignment - requested memory alignment, 0 means no specific alignment requested
 518 //  flags     - flags to control special settings like write watching
 519 // Return:
 520 //  Starting virtual address of the reserved range
 521 void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
 522 {
 523     return VirtualReserveInner(size, alignment, flags);
 524 }
 525
 526 // Release virtual memory range previously reserved using VirtualReserve
 527 // Parameters:
 528 //  address - starting virtual address
 529 //  size    - size of the virtual memory range
 530 // Return:
 531 //  true if it has succeeded, false if it has failed
 532 bool GCToOSInterface::VirtualRelease(void* address, size_t size)
 533 {
 534     int ret = munmap(address, size);
 535
 536     return (ret == 0);
 537 }
 538
 539 // Commit virtual memory range.
 540 // Parameters:
 541 //  size      - size of the virtual memory range
 542 // Return:
 543 //  Starting virtual address of the committed range
 544 void* GCToOSInterface::VirtualReserveAndCommitLargePages(size_t size)
 545 {
 546 #if HAVE_MAP_HUGETLB
 547     uint32_t largePagesFlag = MAP_HUGETLB;
 548 #else
 549     uint32_t largePagesFlag = 0;
 550 #endif
 551
 552     void* pRetVal = VirtualReserveInner(size, OS_PAGE_SIZE, 0, largePagesFlag);
 553     if (VirtualCommit(pRetVal, size, NUMA_NODE_UNDEFINED))
 554     {
 555         return pRetVal;
 556     }
 557
 558     return nullptr;
 559 }
 560
 561 // Commit virtual memory range. It must be part of a range reserved using VirtualReserve.
 562 // Parameters:
 563 //  address - starting virtual address
 564 //  size    - size of the virtual memory range
 565 // Return:
 566 //  true if it has succeeded, false if it has failed
 567 bool GCToOSInterface::VirtualCommit(void* address, size_t size, uint16_t node)
 568 {
 569     bool success = mprotect(address, size, PROT_WRITE | PROT_READ) == 0;
 570
 571 #if HAVE_NUMA_H
 572     if (success && g_numaAvailable && (node != NUMA_NODE_UNDEFINED))
 573     {
 574         if ((int)node <= g_highestNumaNode)
 575         {
 576             int usedNodeMaskBits = g_highestNumaNode + 1;
 577             int nodeMaskLength = (usedNodeMaskBits + sizeof(unsigned long) - 1) / sizeof(unsigned long);
 578             unsigned long nodeMask[nodeMaskLength];
 579             memset(nodeMask, 0, sizeof(nodeMask));
 580
 581             int index = node / sizeof(unsigned long);
 582             nodeMask[index] = ((unsigned long)1) << (node & (sizeof(unsigned long) - 1));
 583
 584             int st = mbind(address, size, MPOL_PREFERRED, nodeMask, usedNodeMaskBits, 0);
 585             assert(st == 0);
 586             // If the mbind fails, we still return the allocated memory since the node is just a hint
 587         }
 588     }
 589 #endif // HAVE_NUMA_H
 590
 591     return success;
 592 }
 593
 594 // Decomit virtual memory range.
 595 // Parameters:
 596 //  address - starting virtual address
 597 //  size    - size of the virtual memory range
 598 // Return:
 599 //  true if it has succeeded, false if it has failed
 600 bool GCToOSInterface::VirtualDecommit(void* address, size_t size)
 601 {
 602     // TODO: This can fail, however the GC does not handle the failure gracefully
 603     // Explicitly calling mmap instead of mprotect here makes it
 604     // that much more clear to the operating system that we no
 605     // longer need these pages. Also, GC depends on re-commited pages to
 606     // be zeroed-out.
 607     return mmap(address, size, PROT_NONE, MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0) != NULL;
 608 }
 609
 610 // Reset virtual memory range. Indicates that data in the memory range specified by address and size is no
 611 // longer of interest, but it should not be decommitted.
 612 // Parameters:
 613 //  address - starting virtual address
 614 //  size    - size of the virtual memory range
 615 //  unlock  - true if the memory range should also be unlocked
 616 // Return:
 617 //  true if it has succeeded, false if it has failed
 618 bool GCToOSInterface::VirtualReset(void * address, size_t size, bool unlock)
 619 {
 620     int st;
 621 #if HAVE_MADV_FREE
 622     // Try to use MADV_FREE if supported. It tells the kernel that the application doesn't
 623     // need the pages in the range. Freeing the pages can be delayed until a memory pressure
 624     // occurs.
 625     st = madvise(address, size, MADV_FREE);
 626     if (st != 0)
 627 #endif
 628     {
 629         // In case the MADV_FREE is not supported, use MADV_DONTNEED
 630         st = madvise(address, size, MADV_DONTNEED);
 631     }
 632
 633     return (st == 0);
 634 }
 635
 636 // Check if the OS supports write watching
 637 bool GCToOSInterface::SupportsWriteWatch()
 638 {
 639     return false;
 640 }
 641
 642 // Reset the write tracking state for the specified virtual memory range.
 643 // Parameters:
 644 //  address - starting virtual address
 645 //  size    - size of the virtual memory range
 646 void GCToOSInterface::ResetWriteWatch(void* address, size_t size)
 647 {
 648     assert(!"should never call ResetWriteWatch on Unix");
 649 }
 650
 651 // Retrieve addresses of the pages that are written to in a region of virtual memory
 652 // Parameters:
 653 //  resetState         - true indicates to reset the write tracking state
 654 //  address            - starting virtual address
 655 //  size               - size of the virtual memory range
 656 //  pageAddresses      - buffer that receives an array of page addresses in the memory region
 657 //  pageAddressesCount - on input, size of the lpAddresses array, in array elements
 658 //                       on output, the number of page addresses that are returned in the array.
 659 // Return:
 660 //  true if it has succeeded, false if it has failed
 661 bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size, void** pageAddresses, uintptr_t* pageAddressesCount)
 662 {
 663     assert(!"should never call GetWriteWatch on Unix");
 664     return false;
 665 }
 666
 667 // Get size of the largest cache on the processor die
 668 // Parameters:
 669 //  trueSize - true to return true cache size, false to return scaled up size based on
 670 //             the processor architecture
 671 // Return:
 672 //  Size of the cache
 673 size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
 674 {
 675     // TODO(segilles) processor detection
 676     return 0;
 677 }
 678
 679 // Sets the calling thread's affinity to only run on the processor specified
 680 // Parameters:
 681 //  procNo - The requested processor for the calling thread.
 682 // Return:
 683 //  true if setting the affinity was successful, false otherwise.
 684 bool GCToOSInterface::SetThreadAffinity(uint16_t procNo)
 685 {
 686 #if HAVE_PTHREAD_GETAFFINITY_NP
 687     cpu_set_t cpuSet;
 688     CPU_ZERO(&cpuSet);
 689     CPU_SET((int)procNo, &cpuSet);
 690
 691     int st = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuSet);
 692
 693     return (st == 0);
 694
 695 #else  // HAVE_PTHREAD_GETAFFINITY_NP
 696     // There is no API to manage thread affinity, so let's ignore the request
 697     return false;
 698 #endif // HAVE_PTHREAD_GETAFFINITY_NP
 699 }
 700
 701 // Boosts the calling thread's thread priority to a level higher than the default
 702 // for new threads.
 703 // Parameters:
 704 //  None.
 705 // Return:
 706 //  true if the priority boost was successful, false otherwise.
 707 bool GCToOSInterface::BoostThreadPriority()
 708 {
 709     // [LOCALGC TODO] Thread priority for unix
 710     return false;
 711 }
 712
 713 // Set the set of processors enabled for GC threads for the current process based on config specified affinity mask and set
 714 // Parameters:
 715 //  configAffinityMask - mask specified by the GCHeapAffinitizeMask config
 716 //  configAffinitySet  - affinity set specified by the GCHeapAffinitizeRanges config
 717 // Return:
 718 //  set of enabled processors
 719 const AffinitySet* GCToOSInterface::SetGCThreadsAffinitySet(uintptr_t configAffinityMask, const AffinitySet* configAffinitySet)
 720 {
 721     if (!configAffinitySet->IsEmpty())
 722     {
 723         // Update the process affinity set using the configured set
 724         for (size_t i = 0; i < MAX_SUPPORTED_CPUS; i++)
 725         {
 726             if (g_processAffinitySet.Contains(i) && !configAffinitySet->Contains(i))
 727             {
 728                 g_processAffinitySet.Remove(i);
 729             }
 730         }
 731     }
 732
 733     return &g_processAffinitySet;
 734 }
 735
 736 // Get number of processors assigned to the current process
 737 // Return:
 738 //  The number of processors
 739 uint32_t GCToOSInterface::GetCurrentProcessCpuCount()
 740 {
 741     return g_currentProcessCpuCount;
 742 }
 743
 744 // Return the size of the user-mode portion of the virtual address space of this process.
 745 // Return:
 746 //  non zero if it has succeeded, 0 if it has failed
 747 size_t GCToOSInterface::GetVirtualMemoryLimit()
 748 {
 749 #ifdef BIT64
 750     // There is no API to get the total virtual address space size on
 751     // Unix, so we use a constant value representing 128TB, which is
 752     // the approximate size of total user virtual address space on
 753     // the currently supported Unix systems.
 754     static const uint64_t _128TB = (1ull << 47);
 755     return _128TB;
 756 #else
 757     return (size_t)-1;
 758 #endif
 759 }
 760
 761 // Get the physical memory that this process can use.
 762 // Return:
 763 //  non zero if it has succeeded, 0 if it has failed
 764 // Remarks:
 765 //  If a process runs with a restricted memory limit, it returns the limit. If there's no limit
 766 //  specified, it returns amount of actual physical memory.
 767 uint64_t GCToOSInterface::GetPhysicalMemoryLimit(bool* is_restricted)
 768 {
 769     size_t restricted_limit;
 770     if (is_restricted)
 771         *is_restricted = false;
 772
 773     // The limit was not cached
 774     if (g_RestrictedPhysicalMemoryLimit == 0)
 775     {
 776         restricted_limit = GetRestrictedPhysicalMemoryLimit();
 777         VolatileStore(&g_RestrictedPhysicalMemoryLimit, restricted_limit);
 778     }
 779     restricted_limit = g_RestrictedPhysicalMemoryLimit;
 780
 781     if (restricted_limit != 0 && restricted_limit != SIZE_T_MAX)
 782     {
 783         if (is_restricted)
 784             *is_restricted = true;
 785         return restricted_limit;
 786     }
 787
 788     long pages = sysconf(_SC_PHYS_PAGES);
 789     if (pages == -1)
 790     {
 791         return 0;
 792     }
 793
 794     long pageSize = sysconf(_SC_PAGE_SIZE);
 795     if (pageSize == -1)
 796     {
 797         return 0;
 798     }
 799
 800     return pages * pageSize;
 801 }
 802
 803 // Get memory status
 804 // Parameters:
 805 //  memory_load - A number between 0 and 100 that specifies the approximate percentage of physical memory
 806 //      that is in use (0 indicates no memory use and 100 indicates full memory use).
 807 //  available_physical - The amount of physical memory currently available, in bytes.
 808 //  available_page_file - The maximum amount of memory the current process can commit, in bytes.
 809 void GCToOSInterface::GetMemoryStatus(uint32_t* memory_load, uint64_t* available_physical, uint64_t* available_page_file)
 810 {
 811     if (memory_load != nullptr || available_physical != nullptr)
 812     {
 813         uint64_t total = GetPhysicalMemoryLimit();
 814
 815         uint64_t available = 0;
 816         uint32_t load = 0;
 817         size_t used;
 818
 819         // Get the physical memory in use - from it, we can get the physical memory available.
 820         // We do this only when we have the total physical memory available.
 821         if (total > 0 && GetPhysicalMemoryUsed(&used))
 822         {
 823             available = total > used ? total-used : 0;
 824             load = (uint32_t)(((float)used * 100) / (float)total);
 825         }
 826
 827         if (memory_load != nullptr)
 828             *memory_load = load;
 829         if (available_physical != nullptr)
 830             *available_physical = available;
 831     }
 832
 833     if (available_page_file != nullptr)
 834         *available_page_file = 0;
 835 }
 836
 837 // Get a high precision performance counter
 838 // Return:
 839 //  The counter value
 840 int64_t GCToOSInterface::QueryPerformanceCounter()
 841 {
 842     // TODO: This is not a particularly efficient implementation - we certainly could
 843     // do much more specific platform-dependent versions if we find that this method
 844     // runs hot. However, most likely it does not.
 845     struct timeval tv;
 846     if (gettimeofday(&tv, NULL) == -1)
 847     {
 848         assert(!"gettimeofday() failed");
 849         // TODO (segilles) unconditional asserts
 850         return 0;
 851     }
 852     return (int64_t) tv.tv_sec * (int64_t) tccSecondsToMicroSeconds + (int64_t) tv.tv_usec;
 853 }
 854
 855 // Get a frequency of the high precision performance counter
 856 // Return:
 857 //  The counter frequency
 858 int64_t GCToOSInterface::QueryPerformanceFrequency()
 859 {
 860     // The counter frequency of gettimeofday is in microseconds.
 861     return tccSecondsToMicroSeconds;
 862 }
 863
 864 // Get a time stamp with a low precision
 865 // Return:
 866 //  Time stamp in milliseconds
 867 uint32_t GCToOSInterface::GetLowPrecisionTimeStamp()
 868 {
 869     // TODO(segilles) this is pretty naive, we can do better
 870     uint64_t retval = 0;
 871     struct timeval tv;
 872     if (gettimeofday(&tv, NULL) == 0)
 873     {
 874         retval = (tv.tv_sec * tccSecondsToMilliSeconds) + (tv.tv_usec / tccMilliSecondsToMicroSeconds);
 875     }
 876     else
 877     {
 878         assert(!"gettimeofday() failed\n");
 879     }
 880
 881     return retval;
 882 }
 883
 884 // Gets the total number of processors on the machine, not taking
 885 // into account current process affinity.
 886 // Return:
 887 //  Number of processors on the machine
 888 uint32_t GCToOSInterface::GetTotalProcessorCount()
 889 {
 890     // Calculated in GCToOSInterface::Initialize using
 891     // sysconf(_SC_NPROCESSORS_ONLN)
 892     return g_logicalCpuCount;
 893 }
 894
 895 bool GCToOSInterface::CanEnableGCNumaAware()
 896 {
 897     return g_numaAvailable;
 898 }
 899
 900 // Get processor number and optionally its NUMA node number for the specified heap number
 901 // Parameters:
 902 //  heap_number - heap number to get the result for
 903 //  proc_no     - set to the selected processor number
 904 //  node_no     - set to the NUMA node of the selected processor or to NUMA_NODE_UNDEFINED
 905 // Return:
 906 //  true if it succeeded
 907 bool GCToOSInterface::GetProcessorForHeap(uint16_t heap_number, uint16_t* proc_no, uint16_t* node_no)
 908 {
 909     bool success = false;
 910
 911     uint16_t availableProcNumber = 0;
 912     for (size_t procNumber = 0; procNumber < g_logicalCpuCount; procNumber++)
 913     {
 914         if (g_processAffinitySet.Contains(procNumber))
 915         {
 916             if (availableProcNumber == heap_number)
 917             {
 918                 *proc_no = procNumber;
 919 #if HAVE_NUMA_H
 920                 if (GCToOSInterface::CanEnableGCNumaAware())
 921                 {
 922                     int result = numa_node_of_cpu(procNumber);
 923                     *node_no = (result >= 0) ? (uint16_t)result : NUMA_NODE_UNDEFINED;
 924                 }
 925                 else
 926 #endif // HAVE_NUMA_H
 927                 {
 928                     *node_no = NUMA_NODE_UNDEFINED;
 929                 }
 930
 931                 success = true;
 932                 break;
 933             }
 934             availableProcNumber++;
 935         }
 936     }
 937
 938     return success;
 939 }
 940
 941 // Parse the confing string describing affinitization ranges and update the passed in affinitySet accordingly
 942 // Parameters:
 943 //  config_string - string describing the affinitization range, platform specific
 944 //  start_index  - the range start index extracted from the config_string
 945 //  end_index    - the range end index extracted from the config_string, equal to the start_index if only an index and not a range was passed in
 946 // Return:
 947 //  true if the configString was successfully parsed, false if it was not correct
 948 bool GCToOSInterface::ParseGCHeapAffinitizeRangesEntry(const char** config_string, size_t* start_index, size_t* end_index)
 949 {
 950     return ParseIndexOrRange(config_string, start_index, end_index);
 951 }
 952
 953 // Initialize the critical section
 954 void CLRCriticalSection::Initialize()
 955 {
 956     int st = pthread_mutex_init(&m_cs.mutex, NULL);
 957     assert(st == 0);
 958 }
 959
 960 // Destroy the critical section
 961 void CLRCriticalSection::Destroy()
 962 {
 963     int st = pthread_mutex_destroy(&m_cs.mutex);
 964     assert(st == 0);
 965 }
 966
 967 // Enter the critical section. Blocks until the section can be entered.
 968 void CLRCriticalSection::Enter()
 969 {
 970     pthread_mutex_lock(&m_cs.mutex);
 971 }
 972
 973 // Leave the critical section
 974 void CLRCriticalSection::Leave()
 975 {
 976     pthread_mutex_unlock(&m_cs.mutex);
 977 }