libgo/go/runtime/mgc.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Garbage collector (GC).
   6 //
   7 // The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
   8 // GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
   9 // non-generational and non-compacting. Allocation is done using size segregated per P allocation
  10 // areas to minimize fragmentation while eliminating locks in the common case.
  11 //
  12 // The algorithm decomposes into several steps.
  13 // This is a high level description of the algorithm being used. For an overview of GC a good
  14 // place to start is Richard Jones' gchandbook.org.
  15 //
  16 // The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
  17 // Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
  18 // On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
  19 // 966-975.
  20 // For journal quality proofs that these steps are complete, correct, and terminate see
  21 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
  22 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
  23 //
  24 // 1. GC performs sweep termination.
  25 //
  26 //    a. Stop the world. This causes all Ps to reach a GC safe-point.
  27 //
  28 //    b. Sweep any unswept spans. There will only be unswept spans if
  29 //    this GC cycle was forced before the expected time.
  30 //
  31 // 2. GC performs the mark phase.
  32 //
  33 //    a. Prepare for the mark phase by setting gcphase to _GCmark
  34 //    (from _GCoff), enabling the write barrier, enabling mutator
  35 //    assists, and enqueueing root mark jobs. No objects may be
  36 //    scanned until all Ps have enabled the write barrier, which is
  37 //    accomplished using STW.
  38 //
  39 //    b. Start the world. From this point, GC work is done by mark
  40 //    workers started by the scheduler and by assists performed as
  41 //    part of allocation. The write barrier shades both the
  42 //    overwritten pointer and the new pointer value for any pointer
  43 //    writes (see mbarrier.go for details). Newly allocated objects
  44 //    are immediately marked black.
  45 //
  46 //    c. GC performs root marking jobs. This includes scanning all
  47 //    stacks, shading all globals, and shading any heap pointers in
  48 //    off-heap runtime data structures. Scanning a stack stops a
  49 //    goroutine, shades any pointers found on its stack, and then
  50 //    resumes the goroutine.
  51 //
  52 //    d. GC drains the work queue of grey objects, scanning each grey
  53 //    object to black and shading all pointers found in the object
  54 //    (which in turn may add those pointers to the work queue).
  55 //
  56 //    e. Because GC work is spread across local caches, GC uses a
  57 //    distributed termination algorithm to detect when there are no
  58 //    more root marking jobs or grey objects (see gcMarkDone). At this
  59 //    point, GC transitions to mark termination.
  60 //
  61 // 3. GC performs mark termination.
  62 //
  63 //    a. Stop the world.
  64 //
  65 //    b. Set gcphase to _GCmarktermination, and disable workers and
  66 //    assists.
  67 //
  68 //    c. Perform housekeeping like flushing mcaches.
  69 //
  70 // 4. GC performs the sweep phase.
  71 //
  72 //    a. Prepare for the sweep phase by setting gcphase to _GCoff,
  73 //    setting up sweep state and disabling the write barrier.
  74 //
  75 //    b. Start the world. From this point on, newly allocated objects
  76 //    are white, and allocating sweeps spans before use if necessary.
  77 //
  78 //    c. GC does concurrent sweeping in the background and in response
  79 //    to allocation. See description below.
  80 //
  81 // 5. When sufficient allocation has taken place, replay the sequence
  82 // starting with 1 above. See discussion of GC rate below.
  83
  84 // Concurrent sweep.
  85 //
  86 // The sweep phase proceeds concurrently with normal program execution.
  87 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
  88 // and concurrently in a background goroutine (this helps programs that are not CPU bound).
  89 // At the end of STW mark termination all spans are marked as "needs sweeping".
  90 //
  91 // The background sweeper goroutine simply sweeps spans one-by-one.
  92 //
  93 // To avoid requesting more OS memory while there are unswept spans, when a
  94 // goroutine needs another span, it first attempts to reclaim that much memory
  95 // by sweeping. When a goroutine needs to allocate a new small-object span, it
  96 // sweeps small-object spans for the same object size until it frees at least
  97 // one object. When a goroutine needs to allocate large-object span from heap,
  98 // it sweeps spans until it frees at least that many pages into heap. There is
  99 // one case where this may not suffice: if a goroutine sweeps and frees two
 100 // nonadjacent one-page spans to the heap, it will allocate a new two-page
 101 // span, but there can still be other one-page unswept spans which could be
 102 // combined into a two-page span.
 103 //
 104 // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
 105 // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
 106 // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
 107 // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
 108 // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
 109 // The finalizer goroutine is kicked off only when all spans are swept.
 110 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
 111
 112 // GC rate.
 113 // Next GC is after we've allocated an extra amount of memory proportional to
 114 // the amount already in use. The proportion is controlled by GOGC environment variable
 115 // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
 116 // (this mark is tracked in next_gc variable). This keeps the GC cost in linear
 117 // proportion to the allocation cost. Adjusting GOGC just changes the linear constant
 118 // (and also the amount of extra memory used).
 119
 120 // Oblets
 121 //
 122 // In order to prevent long pauses while scanning large objects and to
 123 // improve parallelism, the garbage collector breaks up scan jobs for
 124 // objects larger than maxObletBytes into "oblets" of at most
 125 // maxObletBytes. When scanning encounters the beginning of a large
 126 // object, it scans only the first oblet and enqueues the remaining
 127 // oblets as new scan jobs.
 128
 129 package runtime
 130
 131 import (
 132         "internal/cpu"
 133         "runtime/internal/atomic"
 134         "unsafe"
 135 )
 136
 137 const (
 138         _DebugGC         = 0
 139         _ConcurrentSweep = true
 140         _FinBlockSize    = 4 * 1024
 141
 142         // sweepMinHeapDistance is a lower bound on the heap distance
 143         // (in bytes) reserved for concurrent sweeping between GC
 144         // cycles.
 145         sweepMinHeapDistance = 1024 * 1024
 146 )
 147
 148 // heapminimum is the minimum heap size at which to trigger GC.
 149 // For small heaps, this overrides the usual GOGC*live set rule.
 150 //
 151 // When there is a very small live set but a lot of allocation, simply
 152 // collecting when the heap reaches GOGC*live results in many GC
 153 // cycles and high total per-GC overhead. This minimum amortizes this
 154 // per-GC overhead while keeping the heap reasonably small.
 155 //
 156 // During initialization this is set to 4MB*GOGC/100. In the case of
 157 // GOGC==0, this will set heapminimum to 0, resulting in constant
 158 // collection even when the heap size is small, which is useful for
 159 // debugging.
 160 var heapminimum uint64 = defaultHeapMinimum
 161
 162 // defaultHeapMinimum is the value of heapminimum for GOGC==100.
 163 const defaultHeapMinimum = 4 << 20
 164
 165 // Initialized from $GOGC.  GOGC=off means no GC.
 166 var gcpercent int32
 167
 168 func gcinit() {
 169         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
 170                 throw("size of Workbuf is suboptimal")
 171         }
 172
 173         // No sweep on the first cycle.
 174         mheap_.sweepdone = 1
 175
 176         // Set a reasonable initial GC trigger.
 177         memstats.triggerRatio = 7 / 8.0
 178
 179         // Fake a heap_marked value so it looks like a trigger at
 180         // heapminimum is the appropriate growth from heap_marked.
 181         // This will go into computing the initial GC goal.
 182         memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
 183
 184         // Set gcpercent from the environment. This will also compute
 185         // and set the GC trigger and goal.
 186         _ = setGCPercent(readgogc())
 187
 188         work.startSema = 1
 189         work.markDoneSema = 1
 190 }
 191
 192 func readgogc() int32 {
 193         p := gogetenv("GOGC")
 194         if p == "off" {
 195                 return -1
 196         }
 197         if n, ok := atoi32(p); ok {
 198                 return n
 199         }
 200         return 100
 201 }
 202
 203 // gcenable is called after the bulk of the runtime initialization,
 204 // just before we're about to start letting user code run.
 205 // It kicks off the background sweeper goroutine, the background
 206 // scavenger goroutine, and enables GC.
 207 func gcenable() {
 208         // Kick off sweeping and scavenging.
 209         c := make(chan int, 2)
 210         expectSystemGoroutine()
 211         go bgsweep(c)
 212         expectSystemGoroutine()
 213         go bgscavenge(c)
 214         <-c
 215         <-c
 216         memstats.enablegc = true // now that runtime is initialized, GC is okay
 217 }
 218
 219 //go:linkname setGCPercent runtime..z2fdebug.setGCPercent
 220 func setGCPercent(in int32) (out int32) {
 221         // Run on the system stack since we grab the heap lock.
 222         systemstack(func() {
 223                 lock(&mheap_.lock)
 224                 out = gcpercent
 225                 if in < 0 {
 226                         in = -1
 227                 }
 228                 gcpercent = in
 229                 heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
 230                 // Update pacing in response to gcpercent change.
 231                 gcSetTriggerRatio(memstats.triggerRatio)
 232                 unlock(&mheap_.lock)
 233         })
 234
 235         // If we just disabled GC, wait for any concurrent GC mark to
 236         // finish so we always return with no GC running.
 237         if in < 0 {
 238                 gcWaitOnMark(atomic.Load(&work.cycles))
 239         }
 240
 241         return out
 242 }
 243
 244 // Garbage collector phase.
 245 // Indicates to write barrier and synchronization task to perform.
 246 var gcphase uint32
 247
 248 // The compiler knows about this variable.
 249 // If you change it, you must change gofrontend/wb.cc, too.
 250 // If you change the first four bytes, you must also change the write
 251 // barrier insertion code.
 252 var writeBarrier struct {
 253         enabled bool    // compiler emits a check of this before calling write barrier
 254         pad     [3]byte // compiler uses 32-bit load for "enabled" field
 255         needed  bool    // whether we need a write barrier for current GC phase
 256         cgo     bool    // whether we need a write barrier for a cgo check
 257         alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
 258 }
 259
 260 // gcBlackenEnabled is 1 if mutator assists and background mark
 261 // workers are allowed to blacken objects. This must only be set when
 262 // gcphase == _GCmark.
 263 var gcBlackenEnabled uint32
 264
 265 const (
 266         _GCoff             = iota // GC not running; sweeping in background, write barrier disabled
 267         _GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
 268         _GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 269 )
 270
 271 //go:nosplit
 272 func setGCPhase(x uint32) {
 273         atomic.Store(&gcphase, x)
 274         writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
 275         writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
 276 }
 277
 278 // gcMarkWorkerMode represents the mode that a concurrent mark worker
 279 // should operate in.
 280 //
 281 // Concurrent marking happens through four different mechanisms. One
 282 // is mutator assists, which happen in response to allocations and are
 283 // not scheduled. The other three are variations in the per-P mark
 284 // workers and are distinguished by gcMarkWorkerMode.
 285 type gcMarkWorkerMode int
 286
 287 const (
 288         // gcMarkWorkerDedicatedMode indicates that the P of a mark
 289         // worker is dedicated to running that mark worker. The mark
 290         // worker should run without preemption.
 291         gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
 292
 293         // gcMarkWorkerFractionalMode indicates that a P is currently
 294         // running the "fractional" mark worker. The fractional worker
 295         // is necessary when GOMAXPROCS*gcBackgroundUtilization is not
 296         // an integer. The fractional worker should run until it is
 297         // preempted and will be scheduled to pick up the fractional
 298         // part of GOMAXPROCS*gcBackgroundUtilization.
 299         gcMarkWorkerFractionalMode
 300
 301         // gcMarkWorkerIdleMode indicates that a P is running the mark
 302         // worker because it has nothing else to do. The idle worker
 303         // should run until it is preempted and account its time
 304         // against gcController.idleMarkTime.
 305         gcMarkWorkerIdleMode
 306 )
 307
 308 // gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
 309 // to use in execution traces.
 310 var gcMarkWorkerModeStrings = [...]string{
 311         "GC (dedicated)",
 312         "GC (fractional)",
 313         "GC (idle)",
 314 }
 315
 316 // gcController implements the GC pacing controller that determines
 317 // when to trigger concurrent garbage collection and how much marking
 318 // work to do in mutator assists and background marking.
 319 //
 320 // It uses a feedback control algorithm to adjust the memstats.gc_trigger
 321 // trigger based on the heap growth and GC CPU utilization each cycle.
 322 // This algorithm optimizes for heap growth to match GOGC and for CPU
 323 // utilization between assist and background marking to be 25% of
 324 // GOMAXPROCS. The high-level design of this algorithm is documented
 325 // at https://golang.org/s/go15gcpacing.
 326 //
 327 // All fields of gcController are used only during a single mark
 328 // cycle.
 329 var gcController gcControllerState
 330
 331 type gcControllerState struct {
 332         // scanWork is the total scan work performed this cycle. This
 333         // is updated atomically during the cycle. Updates occur in
 334         // bounded batches, since it is both written and read
 335         // throughout the cycle. At the end of the cycle, this is how
 336         // much of the retained heap is scannable.
 337         //
 338         // Currently this is the bytes of heap scanned. For most uses,
 339         // this is an opaque unit of work, but for estimation the
 340         // definition is important.
 341         scanWork int64
 342
 343         // bgScanCredit is the scan work credit accumulated by the
 344         // concurrent background scan. This credit is accumulated by
 345         // the background scan and stolen by mutator assists. This is
 346         // updated atomically. Updates occur in bounded batches, since
 347         // it is both written and read throughout the cycle.
 348         bgScanCredit int64
 349
 350         // assistTime is the nanoseconds spent in mutator assists
 351         // during this cycle. This is updated atomically. Updates
 352         // occur in bounded batches, since it is both written and read
 353         // throughout the cycle.
 354         assistTime int64
 355
 356         // dedicatedMarkTime is the nanoseconds spent in dedicated
 357         // mark workers during this cycle. This is updated atomically
 358         // at the end of the concurrent mark phase.
 359         dedicatedMarkTime int64
 360
 361         // fractionalMarkTime is the nanoseconds spent in the
 362         // fractional mark worker during this cycle. This is updated
 363         // atomically throughout the cycle and will be up-to-date if
 364         // the fractional mark worker is not currently running.
 365         fractionalMarkTime int64
 366
 367         // idleMarkTime is the nanoseconds spent in idle marking
 368         // during this cycle. This is updated atomically throughout
 369         // the cycle.
 370         idleMarkTime int64
 371
 372         // markStartTime is the absolute start time in nanoseconds
 373         // that assists and background mark workers started.
 374         markStartTime int64
 375
 376         // dedicatedMarkWorkersNeeded is the number of dedicated mark
 377         // workers that need to be started. This is computed at the
 378         // beginning of each cycle and decremented atomically as
 379         // dedicated mark workers get started.
 380         dedicatedMarkWorkersNeeded int64
 381
 382         // assistWorkPerByte is the ratio of scan work to allocated
 383         // bytes that should be performed by mutator assists. This is
 384         // computed at the beginning of each cycle and updated every
 385         // time heap_scan is updated.
 386         assistWorkPerByte float64
 387
 388         // assistBytesPerWork is 1/assistWorkPerByte.
 389         assistBytesPerWork float64
 390
 391         // fractionalUtilizationGoal is the fraction of wall clock
 392         // time that should be spent in the fractional mark worker on
 393         // each P that isn't running a dedicated worker.
 394         //
 395         // For example, if the utilization goal is 25% and there are
 396         // no dedicated workers, this will be 0.25. If the goal is
 397         // 25%, there is one dedicated worker, and GOMAXPROCS is 5,
 398         // this will be 0.05 to make up the missing 5%.
 399         //
 400         // If this is zero, no fractional workers are needed.
 401         fractionalUtilizationGoal float64
 402
 403         _ cpu.CacheLinePad
 404 }
 405
 406 // startCycle resets the GC controller's state and computes estimates
 407 // for a new GC cycle. The caller must hold worldsema.
 408 func (c *gcControllerState) startCycle() {
 409         c.scanWork = 0
 410         c.bgScanCredit = 0
 411         c.assistTime = 0
 412         c.dedicatedMarkTime = 0
 413         c.fractionalMarkTime = 0
 414         c.idleMarkTime = 0
 415
 416         // Ensure that the heap goal is at least a little larger than
 417         // the current live heap size. This may not be the case if GC
 418         // start is delayed or if the allocation that pushed heap_live
 419         // over gc_trigger is large or if the trigger is really close to
 420         // GOGC. Assist is proportional to this distance, so enforce a
 421         // minimum distance, even if it means going over the GOGC goal
 422         // by a tiny bit.
 423         if memstats.next_gc < memstats.heap_live+1024*1024 {
 424                 memstats.next_gc = memstats.heap_live + 1024*1024
 425         }
 426
 427         // Compute the background mark utilization goal. In general,
 428         // this may not come out exactly. We round the number of
 429         // dedicated workers so that the utilization is closest to
 430         // 25%. For small GOMAXPROCS, this would introduce too much
 431         // error, so we add fractional workers in that case.
 432         totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
 433         c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
 434         utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
 435         const maxUtilError = 0.3
 436         if utilError < -maxUtilError || utilError > maxUtilError {
 437                 // Rounding put us more than 30% off our goal. With
 438                 // gcBackgroundUtilization of 25%, this happens for
 439                 // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
 440                 // workers to compensate.
 441                 if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
 442                         // Too many dedicated workers.
 443                         c.dedicatedMarkWorkersNeeded--
 444                 }
 445                 c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
 446         } else {
 447                 c.fractionalUtilizationGoal = 0
 448         }
 449
 450         // In STW mode, we just want dedicated workers.
 451         if debug.gcstoptheworld > 0 {
 452                 c.dedicatedMarkWorkersNeeded = int64(gomaxprocs)
 453                 c.fractionalUtilizationGoal = 0
 454         }
 455
 456         // Clear per-P state
 457         for _, p := range allp {
 458                 p.gcAssistTime = 0
 459                 p.gcFractionalMarkTime = 0
 460         }
 461
 462         // Compute initial values for controls that are updated
 463         // throughout the cycle.
 464         c.revise()
 465
 466         if debug.gcpacertrace > 0 {
 467                 print("pacer: assist ratio=", c.assistWorkPerByte,
 468                         " (scan ", memstats.heap_scan>>20, " MB in ",
 469                         work.initialHeapLive>>20, "->",
 470                         memstats.next_gc>>20, " MB)",
 471                         " workers=", c.dedicatedMarkWorkersNeeded,
 472                         "+", c.fractionalUtilizationGoal, "\n")
 473         }
 474 }
 475
 476 // revise updates the assist ratio during the GC cycle to account for
 477 // improved estimates. This should be called either under STW or
 478 // whenever memstats.heap_scan, memstats.heap_live, or
 479 // memstats.next_gc is updated (with mheap_.lock held).
 480 //
 481 // It should only be called when gcBlackenEnabled != 0 (because this
 482 // is when assists are enabled and the necessary statistics are
 483 // available).
 484 func (c *gcControllerState) revise() {
 485         gcpercent := gcpercent
 486         if gcpercent < 0 {
 487                 // If GC is disabled but we're running a forced GC,
 488                 // act like GOGC is huge for the below calculations.
 489                 gcpercent = 100000
 490         }
 491         live := atomic.Load64(&memstats.heap_live)
 492
 493         var heapGoal, scanWorkExpected int64
 494         if live <= memstats.next_gc {
 495                 // We're under the soft goal. Pace GC to complete at
 496                 // next_gc assuming the heap is in steady-state.
 497                 heapGoal = int64(memstats.next_gc)
 498
 499                 // Compute the expected scan work remaining.
 500                 //
 501                 // This is estimated based on the expected
 502                 // steady-state scannable heap. For example, with
 503                 // GOGC=100, only half of the scannable heap is
 504                 // expected to be live, so that's what we target.
 505                 //
 506                 // (This is a float calculation to avoid overflowing on
 507                 // 100*heap_scan.)
 508                 scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent))
 509         } else {
 510                 // We're past the soft goal. Pace GC so that in the
 511                 // worst case it will complete by the hard goal.
 512                 const maxOvershoot = 1.1
 513                 heapGoal = int64(float64(memstats.next_gc) * maxOvershoot)
 514
 515                 // Compute the upper bound on the scan work remaining.
 516                 scanWorkExpected = int64(memstats.heap_scan)
 517         }
 518
 519         // Compute the remaining scan work estimate.
 520         //
 521         // Note that we currently count allocations during GC as both
 522         // scannable heap (heap_scan) and scan work completed
 523         // (scanWork), so allocation will change this difference will
 524         // slowly in the soft regime and not at all in the hard
 525         // regime.
 526         scanWorkRemaining := scanWorkExpected - c.scanWork
 527         if scanWorkRemaining < 1000 {
 528                 // We set a somewhat arbitrary lower bound on
 529                 // remaining scan work since if we aim a little high,
 530                 // we can miss by a little.
 531                 //
 532                 // We *do* need to enforce that this is at least 1,
 533                 // since marking is racy and double-scanning objects
 534                 // may legitimately make the remaining scan work
 535                 // negative, even in the hard goal regime.
 536                 scanWorkRemaining = 1000
 537         }
 538
 539         // Compute the heap distance remaining.
 540         heapRemaining := heapGoal - int64(live)
 541         if heapRemaining <= 0 {
 542                 // This shouldn't happen, but if it does, avoid
 543                 // dividing by zero or setting the assist negative.
 544                 heapRemaining = 1
 545         }
 546
 547         // Compute the mutator assist ratio so by the time the mutator
 548         // allocates the remaining heap bytes up to next_gc, it will
 549         // have done (or stolen) the remaining amount of scan work.
 550         c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining)
 551         c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining)
 552 }
 553
 554 // endCycle computes the trigger ratio for the next cycle.
 555 func (c *gcControllerState) endCycle() float64 {
 556         if work.userForced {
 557                 // Forced GC means this cycle didn't start at the
 558                 // trigger, so where it finished isn't good
 559                 // information about how to adjust the trigger.
 560                 // Just leave it where it is.
 561                 return memstats.triggerRatio
 562         }
 563
 564         // Proportional response gain for the trigger controller. Must
 565         // be in [0, 1]. Lower values smooth out transient effects but
 566         // take longer to respond to phase changes. Higher values
 567         // react to phase changes quickly, but are more affected by
 568         // transient changes. Values near 1 may be unstable.
 569         const triggerGain = 0.5
 570
 571         // Compute next cycle trigger ratio. First, this computes the
 572         // "error" for this cycle; that is, how far off the trigger
 573         // was from what it should have been, accounting for both heap
 574         // growth and GC CPU utilization. We compute the actual heap
 575         // growth during this cycle and scale that by how far off from
 576         // the goal CPU utilization we were (to estimate the heap
 577         // growth if we had the desired CPU utilization). The
 578         // difference between this estimate and the GOGC-based goal
 579         // heap growth is the error.
 580         goalGrowthRatio := gcEffectiveGrowthRatio()
 581         actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
 582         assistDuration := nanotime() - c.markStartTime
 583
 584         // Assume background mark hit its utilization goal.
 585         utilization := gcBackgroundUtilization
 586         // Add assist utilization; avoid divide by zero.
 587         if assistDuration > 0 {
 588                 utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
 589         }
 590
 591         triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
 592
 593         // Finally, we adjust the trigger for next time by this error,
 594         // damped by the proportional gain.
 595         triggerRatio := memstats.triggerRatio + triggerGain*triggerError
 596
 597         if debug.gcpacertrace > 0 {
 598                 // Print controller state in terms of the design
 599                 // document.
 600                 H_m_prev := memstats.heap_marked
 601                 h_t := memstats.triggerRatio
 602                 H_T := memstats.gc_trigger
 603                 h_a := actualGrowthRatio
 604                 H_a := memstats.heap_live
 605                 h_g := goalGrowthRatio
 606                 H_g := int64(float64(H_m_prev) * (1 + h_g))
 607                 u_a := utilization
 608                 u_g := gcGoalUtilization
 609                 W_a := c.scanWork
 610                 print("pacer: H_m_prev=", H_m_prev,
 611                         " h_t=", h_t, " H_T=", H_T,
 612                         " h_a=", h_a, " H_a=", H_a,
 613                         " h_g=", h_g, " H_g=", H_g,
 614                         " u_a=", u_a, " u_g=", u_g,
 615                         " W_a=", W_a,
 616                         " goalΔ=", goalGrowthRatio-h_t,
 617                         " actualΔ=", h_a-h_t,
 618                         " u_a/u_g=", u_a/u_g,
 619                         "\n")
 620         }
 621
 622         return triggerRatio
 623 }
 624
 625 // enlistWorker encourages another dedicated mark worker to start on
 626 // another P if there are spare worker slots. It is used by putfull
 627 // when more work is made available.
 628 //
 629 //go:nowritebarrier
 630 func (c *gcControllerState) enlistWorker() {
 631         // If there are idle Ps, wake one so it will run an idle worker.
 632         // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
 633         //
 634         //      if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
 635         //              wakep()
 636         //              return
 637         //      }
 638
 639         // There are no idle Ps. If we need more dedicated workers,
 640         // try to preempt a running P so it will switch to a worker.
 641         if c.dedicatedMarkWorkersNeeded <= 0 {
 642                 return
 643         }
 644         // Pick a random other P to preempt.
 645         if gomaxprocs <= 1 {
 646                 return
 647         }
 648         gp := getg()
 649         if gp == nil || gp.m == nil || gp.m.p == 0 {
 650                 return
 651         }
 652         myID := gp.m.p.ptr().id
 653         for tries := 0; tries < 5; tries++ {
 654                 id := int32(fastrandn(uint32(gomaxprocs - 1)))
 655                 if id >= myID {
 656                         id++
 657                 }
 658                 p := allp[id]
 659                 if p.status != _Prunning {
 660                         continue
 661                 }
 662                 if preemptone(p) {
 663                         return
 664                 }
 665         }
 666 }
 667
 668 // findRunnableGCWorker returns the background mark worker for _p_ if it
 669 // should be run. This must only be called when gcBlackenEnabled != 0.
 670 func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 671         if gcBlackenEnabled == 0 {
 672                 throw("gcControllerState.findRunnable: blackening not enabled")
 673         }
 674         if _p_.gcBgMarkWorker == 0 {
 675                 // The mark worker associated with this P is blocked
 676                 // performing a mark transition. We can't run it
 677                 // because it may be on some other run or wait queue.
 678                 return nil
 679         }
 680
 681         if !gcMarkWorkAvailable(_p_) {
 682                 // No work to be done right now. This can happen at
 683                 // the end of the mark phase when there are still
 684                 // assists tapering off. Don't bother running a worker
 685                 // now because it'll just return immediately.
 686                 return nil
 687         }
 688
 689         decIfPositive := func(ptr *int64) bool {
 690                 if *ptr > 0 {
 691                         if atomic.Xaddint64(ptr, -1) >= 0 {
 692                                 return true
 693                         }
 694                         // We lost a race
 695                         atomic.Xaddint64(ptr, +1)
 696                 }
 697                 return false
 698         }
 699
 700         if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
 701                 // This P is now dedicated to marking until the end of
 702                 // the concurrent mark phase.
 703                 _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
 704         } else if c.fractionalUtilizationGoal == 0 {
 705                 // No need for fractional workers.
 706                 return nil
 707         } else {
 708                 // Is this P behind on the fractional utilization
 709                 // goal?
 710                 //
 711                 // This should be kept in sync with pollFractionalWorkerExit.
 712                 delta := nanotime() - gcController.markStartTime
 713                 if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
 714                         // Nope. No need to run a fractional worker.
 715                         return nil
 716                 }
 717                 // Run a fractional worker.
 718                 _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
 719         }
 720
 721         // Run the background mark worker
 722         gp := _p_.gcBgMarkWorker.ptr()
 723         casgstatus(gp, _Gwaiting, _Grunnable)
 724         if trace.enabled {
 725                 traceGoUnpark(gp, 0)
 726         }
 727         return gp
 728 }
 729
 730 // pollFractionalWorkerExit reports whether a fractional mark worker
 731 // should self-preempt. It assumes it is called from the fractional
 732 // worker.
 733 func pollFractionalWorkerExit() bool {
 734         // This should be kept in sync with the fractional worker
 735         // scheduler logic in findRunnableGCWorker.
 736         now := nanotime()
 737         delta := now - gcController.markStartTime
 738         if delta <= 0 {
 739                 return true
 740         }
 741         p := getg().m.p.ptr()
 742         selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
 743         // Add some slack to the utilization goal so that the
 744         // fractional worker isn't behind again the instant it exits.
 745         return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
 746 }
 747
 748 // gcSetTriggerRatio sets the trigger ratio and updates everything
 749 // derived from it: the absolute trigger, the heap goal, mark pacing,
 750 // and sweep pacing.
 751 //
 752 // This can be called any time. If GC is the in the middle of a
 753 // concurrent phase, it will adjust the pacing of that phase.
 754 //
 755 // This depends on gcpercent, memstats.heap_marked, and
 756 // memstats.heap_live. These must be up to date.
 757 //
 758 // mheap_.lock must be held or the world must be stopped.
 759 func gcSetTriggerRatio(triggerRatio float64) {
 760         // Compute the next GC goal, which is when the allocated heap
 761         // has grown by GOGC/100 over the heap marked by the last
 762         // cycle.
 763         goal := ^uint64(0)
 764         if gcpercent >= 0 {
 765                 goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
 766         }
 767
 768         // Set the trigger ratio, capped to reasonable bounds.
 769         if triggerRatio < 0 {
 770                 // This can happen if the mutator is allocating very
 771                 // quickly or the GC is scanning very slowly.
 772                 triggerRatio = 0
 773         } else if gcpercent >= 0 {
 774                 // Ensure there's always a little margin so that the
 775                 // mutator assist ratio isn't infinity.
 776                 maxTriggerRatio := 0.95 * float64(gcpercent) / 100
 777                 if triggerRatio > maxTriggerRatio {
 778                         triggerRatio = maxTriggerRatio
 779                 }
 780         }
 781         memstats.triggerRatio = triggerRatio
 782
 783         // Compute the absolute GC trigger from the trigger ratio.
 784         //
 785         // We trigger the next GC cycle when the allocated heap has
 786         // grown by the trigger ratio over the marked heap size.
 787         trigger := ^uint64(0)
 788         if gcpercent >= 0 {
 789                 trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
 790                 // Don't trigger below the minimum heap size.
 791                 minTrigger := heapminimum
 792                 if !isSweepDone() {
 793                         // Concurrent sweep happens in the heap growth
 794                         // from heap_live to gc_trigger, so ensure
 795                         // that concurrent sweep has some heap growth
 796                         // in which to perform sweeping before we
 797                         // start the next GC cycle.
 798                         sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance
 799                         if sweepMin > minTrigger {
 800                                 minTrigger = sweepMin
 801                         }
 802                 }
 803                 if trigger < minTrigger {
 804                         trigger = minTrigger
 805                 }
 806                 if int64(trigger) < 0 {
 807                         print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
 808                         throw("gc_trigger underflow")
 809                 }
 810                 if trigger > goal {
 811                         // The trigger ratio is always less than GOGC/100, but
 812                         // other bounds on the trigger may have raised it.
 813                         // Push up the goal, too.
 814                         goal = trigger
 815                 }
 816         }
 817
 818         // Commit to the trigger and goal.
 819         memstats.gc_trigger = trigger
 820         memstats.next_gc = goal
 821         if trace.enabled {
 822                 traceNextGC()
 823         }
 824
 825         // Update mark pacing.
 826         if gcphase != _GCoff {
 827                 gcController.revise()
 828         }
 829
 830         // Update sweep pacing.
 831         if isSweepDone() {
 832                 mheap_.sweepPagesPerByte = 0
 833         } else {
 834                 // Concurrent sweep needs to sweep all of the in-use
 835                 // pages by the time the allocated heap reaches the GC
 836                 // trigger. Compute the ratio of in-use pages to sweep
 837                 // per byte allocated, accounting for the fact that
 838                 // some might already be swept.
 839                 heapLiveBasis := atomic.Load64(&memstats.heap_live)
 840                 heapDistance := int64(trigger) - int64(heapLiveBasis)
 841                 // Add a little margin so rounding errors and
 842                 // concurrent sweep are less likely to leave pages
 843                 // unswept when GC starts.
 844                 heapDistance -= 1024 * 1024
 845                 if heapDistance < _PageSize {
 846                         // Avoid setting the sweep ratio extremely high
 847                         heapDistance = _PageSize
 848                 }
 849                 pagesSwept := atomic.Load64(&mheap_.pagesSwept)
 850                 sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept)
 851                 if sweepDistancePages <= 0 {
 852                         mheap_.sweepPagesPerByte = 0
 853                 } else {
 854                         mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
 855                         mheap_.sweepHeapLiveBasis = heapLiveBasis
 856                         // Write pagesSweptBasis last, since this
 857                         // signals concurrent sweeps to recompute
 858                         // their debt.
 859                         atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
 860                 }
 861         }
 862
 863         gcPaceScavenger()
 864 }
 865
 866 // gcEffectiveGrowthRatio returns the current effective heap growth
 867 // ratio (GOGC/100) based on heap_marked from the previous GC and
 868 // next_gc for the current GC.
 869 //
 870 // This may differ from gcpercent/100 because of various upper and
 871 // lower bounds on gcpercent. For example, if the heap is smaller than
 872 // heapminimum, this can be higher than gcpercent/100.
 873 //
 874 // mheap_.lock must be held or the world must be stopped.
 875 func gcEffectiveGrowthRatio() float64 {
 876         egogc := float64(memstats.next_gc-memstats.heap_marked) / float64(memstats.heap_marked)
 877         if egogc < 0 {
 878                 // Shouldn't happen, but just in case.
 879                 egogc = 0
 880         }
 881         return egogc
 882 }
 883
 884 // gcGoalUtilization is the goal CPU utilization for
 885 // marking as a fraction of GOMAXPROCS.
 886 const gcGoalUtilization = 0.30
 887
 888 // gcBackgroundUtilization is the fixed CPU utilization for background
 889 // marking. It must be <= gcGoalUtilization. The difference between
 890 // gcGoalUtilization and gcBackgroundUtilization will be made up by
 891 // mark assists. The scheduler will aim to use within 50% of this
 892 // goal.
 893 //
 894 // Setting this to < gcGoalUtilization avoids saturating the trigger
 895 // feedback controller when there are no assists, which allows it to
 896 // better control CPU and heap growth. However, the larger the gap,
 897 // the more mutator assists are expected to happen, which impact
 898 // mutator latency.
 899 const gcBackgroundUtilization = 0.25
 900
 901 // gcCreditSlack is the amount of scan work credit that can
 902 // accumulate locally before updating gcController.scanWork and,
 903 // optionally, gcController.bgScanCredit. Lower values give a more
 904 // accurate assist ratio and make it more likely that assists will
 905 // successfully steal background credit. Higher values reduce memory
 906 // contention.
 907 const gcCreditSlack = 2000
 908
 909 // gcAssistTimeSlack is the nanoseconds of mutator assist time that
 910 // can accumulate on a P before updating gcController.assistTime.
 911 const gcAssistTimeSlack = 5000
 912
 913 // gcOverAssistWork determines how many extra units of scan work a GC
 914 // assist does when an assist happens. This amortizes the cost of an
 915 // assist by pre-paying for this many bytes of future allocations.
 916 const gcOverAssistWork = 64 << 10
 917
 918 var work struct {
 919         full  lfstack          // lock-free list of full blocks workbuf
 920         empty lfstack          // lock-free list of empty blocks workbuf
 921         pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
 922
 923         wbufSpans struct {
 924                 lock mutex
 925                 // free is a list of spans dedicated to workbufs, but
 926                 // that don't currently contain any workbufs.
 927                 free mSpanList
 928                 // busy is a list of all spans containing workbufs on
 929                 // one of the workbuf lists.
 930                 busy mSpanList
 931         }
 932
 933         // Restore 64-bit alignment on 32-bit.
 934         _ uint32
 935
 936         // bytesMarked is the number of bytes marked this cycle. This
 937         // includes bytes blackened in scanned objects, noscan objects
 938         // that go straight to black, and permagrey objects scanned by
 939         // markroot during the concurrent scan phase. This is updated
 940         // atomically during the cycle. Updates may be batched
 941         // arbitrarily, since the value is only read at the end of the
 942         // cycle.
 943         //
 944         // Because of benign races during marking, this number may not
 945         // be the exact number of marked bytes, but it should be very
 946         // close.
 947         //
 948         // Put this field here because it needs 64-bit atomic access
 949         // (and thus 8-byte alignment even on 32-bit architectures).
 950         bytesMarked uint64
 951
 952         markrootNext uint32 // next markroot job
 953         markrootJobs uint32 // number of markroot jobs
 954
 955         nproc  uint32
 956         tstart int64
 957         nwait  uint32
 958         ndone  uint32
 959
 960         // Number of roots of various root types. Set by gcMarkRootPrepare.
 961         nFlushCacheRoots                    int
 962         nDataRoots, nSpanRoots, nStackRoots int
 963
 964         // Each type of GC state transition is protected by a lock.
 965         // Since multiple threads can simultaneously detect the state
 966         // transition condition, any thread that detects a transition
 967         // condition must acquire the appropriate transition lock,
 968         // re-check the transition condition and return if it no
 969         // longer holds or perform the transition if it does.
 970         // Likewise, any transition must invalidate the transition
 971         // condition before releasing the lock. This ensures that each
 972         // transition is performed by exactly one thread and threads
 973         // that need the transition to happen block until it has
 974         // happened.
 975         //
 976         // startSema protects the transition from "off" to mark or
 977         // mark termination.
 978         startSema uint32
 979         // markDoneSema protects transitions from mark to mark termination.
 980         markDoneSema uint32
 981
 982         bgMarkReady note   // signal background mark worker has started
 983         bgMarkDone  uint32 // cas to 1 when at a background mark completion point
 984         // Background mark completion signaling
 985
 986         // mode is the concurrency mode of the current GC cycle.
 987         mode gcMode
 988
 989         // userForced indicates the current GC cycle was forced by an
 990         // explicit user call.
 991         userForced bool
 992
 993         // totaltime is the CPU nanoseconds spent in GC since the
 994         // program started if debug.gctrace > 0.
 995         totaltime int64
 996
 997         // initialHeapLive is the value of memstats.heap_live at the
 998         // beginning of this GC cycle.
 999         initialHeapLive uint64
1000
1001         // assistQueue is a queue of assists that are blocked because
1002         // there was neither enough credit to steal or enough work to
1003         // do.
1004         assistQueue struct {
1005                 lock mutex
1006                 q    gQueue
1007         }
1008
1009         // sweepWaiters is a list of blocked goroutines to wake when
1010         // we transition from mark termination to sweep.
1011         sweepWaiters struct {
1012                 lock mutex
1013                 list gList
1014         }
1015
1016         // cycles is the number of completed GC cycles, where a GC
1017         // cycle is sweep termination, mark, mark termination, and
1018         // sweep. This differs from memstats.numgc, which is
1019         // incremented at mark termination.
1020         cycles uint32
1021
1022         // Timing/utilization stats for this cycle.
1023         stwprocs, maxprocs                 int32
1024         tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
1025
1026         pauseNS    int64 // total STW time this cycle
1027         pauseStart int64 // nanotime() of last STW
1028
1029         // debug.gctrace heap sizes for this cycle.
1030         heap0, heap1, heap2, heapGoal uint64
1031 }
1032
1033 // GC runs a garbage collection and blocks the caller until the
1034 // garbage collection is complete. It may also block the entire
1035 // program.
1036 func GC() {
1037         // We consider a cycle to be: sweep termination, mark, mark
1038         // termination, and sweep. This function shouldn't return
1039         // until a full cycle has been completed, from beginning to
1040         // end. Hence, we always want to finish up the current cycle
1041         // and start a new one. That means:
1042         //
1043         // 1. In sweep termination, mark, or mark termination of cycle
1044         // N, wait until mark termination N completes and transitions
1045         // to sweep N.
1046         //
1047         // 2. In sweep N, help with sweep N.
1048         //
1049         // At this point we can begin a full cycle N+1.
1050         //
1051         // 3. Trigger cycle N+1 by starting sweep termination N+1.
1052         //
1053         // 4. Wait for mark termination N+1 to complete.
1054         //
1055         // 5. Help with sweep N+1 until it's done.
1056         //
1057         // This all has to be written to deal with the fact that the
1058         // GC may move ahead on its own. For example, when we block
1059         // until mark termination N, we may wake up in cycle N+2.
1060
1061         // Wait until the current sweep termination, mark, and mark
1062         // termination complete.
1063         n := atomic.Load(&work.cycles)
1064         gcWaitOnMark(n)
1065
1066         // We're now in sweep N or later. Trigger GC cycle N+1, which
1067         // will first finish sweep N if necessary and then enter sweep
1068         // termination N+1.
1069         gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
1070
1071         // Wait for mark termination N+1 to complete.
1072         gcWaitOnMark(n + 1)
1073
1074         // Finish sweep N+1 before returning. We do this both to
1075         // complete the cycle and because runtime.GC() is often used
1076         // as part of tests and benchmarks to get the system into a
1077         // relatively stable and isolated state.
1078         for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) {
1079                 sweep.nbgsweep++
1080                 Gosched()
1081         }
1082
1083         // Callers may assume that the heap profile reflects the
1084         // just-completed cycle when this returns (historically this
1085         // happened because this was a STW GC), but right now the
1086         // profile still reflects mark termination N, not N+1.
1087         //
1088         // As soon as all of the sweep frees from cycle N+1 are done,
1089         // we can go ahead and publish the heap profile.
1090         //
1091         // First, wait for sweeping to finish. (We know there are no
1092         // more spans on the sweep queue, but we may be concurrently
1093         // sweeping spans, so we have to wait.)
1094         for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
1095                 Gosched()
1096         }
1097
1098         // Now we're really done with sweeping, so we can publish the
1099         // stable heap profile. Only do this if we haven't already hit
1100         // another mark termination.
1101         mp := acquirem()
1102         cycle := atomic.Load(&work.cycles)
1103         if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
1104                 mProf_PostSweep()
1105         }
1106         releasem(mp)
1107 }
1108
1109 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
1110 // already completed this mark phase, it returns immediately.
1111 func gcWaitOnMark(n uint32) {
1112         for {
1113                 // Disable phase transitions.
1114                 lock(&work.sweepWaiters.lock)
1115                 nMarks := atomic.Load(&work.cycles)
1116                 if gcphase != _GCmark {
1117                         // We've already completed this cycle's mark.
1118                         nMarks++
1119                 }
1120                 if nMarks > n {
1121                         // We're done.
1122                         unlock(&work.sweepWaiters.lock)
1123                         return
1124                 }
1125
1126                 // Wait until sweep termination, mark, and mark
1127                 // termination of cycle N complete.
1128                 work.sweepWaiters.list.push(getg())
1129                 goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
1130         }
1131 }
1132
1133 // gcMode indicates how concurrent a GC cycle should be.
1134 type gcMode int
1135
1136 const (
1137         gcBackgroundMode gcMode = iota // concurrent GC and sweep
1138         gcForceMode                    // stop-the-world GC now, concurrent sweep
1139         gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
1140 )
1141
1142 // A gcTrigger is a predicate for starting a GC cycle. Specifically,
1143 // it is an exit condition for the _GCoff phase.
1144 type gcTrigger struct {
1145         kind gcTriggerKind
1146         now  int64  // gcTriggerTime: current time
1147         n    uint32 // gcTriggerCycle: cycle number to start
1148 }
1149
1150 type gcTriggerKind int
1151
1152 const (
1153         // gcTriggerHeap indicates that a cycle should be started when
1154         // the heap size reaches the trigger heap size computed by the
1155         // controller.
1156         gcTriggerHeap gcTriggerKind = iota
1157
1158         // gcTriggerTime indicates that a cycle should be started when
1159         // it's been more than forcegcperiod nanoseconds since the
1160         // previous GC cycle.
1161         gcTriggerTime
1162
1163         // gcTriggerCycle indicates that a cycle should be started if
1164         // we have not yet started cycle number gcTrigger.n (relative
1165         // to work.cycles).
1166         gcTriggerCycle
1167 )
1168
1169 // test reports whether the trigger condition is satisfied, meaning
1170 // that the exit condition for the _GCoff phase has been met. The exit
1171 // condition should be tested when allocating.
1172 func (t gcTrigger) test() bool {
1173         if !memstats.enablegc || panicking != 0 || gcphase != _GCoff {
1174                 return false
1175         }
1176         switch t.kind {
1177         case gcTriggerHeap:
1178                 // Non-atomic access to heap_live for performance. If
1179                 // we are going to trigger on this, this thread just
1180                 // atomically wrote heap_live anyway and we'll see our
1181                 // own write.
1182                 return memstats.heap_live >= memstats.gc_trigger
1183         case gcTriggerTime:
1184                 if gcpercent < 0 {
1185                         return false
1186                 }
1187                 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
1188                 return lastgc != 0 && t.now-lastgc > forcegcperiod
1189         case gcTriggerCycle:
1190                 // t.n > work.cycles, but accounting for wraparound.
1191                 return int32(t.n-work.cycles) > 0
1192         }
1193         return true
1194 }
1195
1196 // gcStart starts the GC. It transitions from _GCoff to _GCmark (if
1197 // debug.gcstoptheworld == 0) or performs all of GC (if
1198 // debug.gcstoptheworld != 0).
1199 //
1200 // This may return without performing this transition in some cases,
1201 // such as when called on a system stack or with locks held.
1202 func gcStart(trigger gcTrigger) {
1203         // Since this is called from malloc and malloc is called in
1204         // the guts of a number of libraries that might be holding
1205         // locks, don't attempt to start GC in non-preemptible or
1206         // potentially unstable situations.
1207         mp := acquirem()
1208         if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
1209                 releasem(mp)
1210                 return
1211         }
1212         releasem(mp)
1213         mp = nil
1214
1215         // Pick up the remaining unswept/not being swept spans concurrently
1216         //
1217         // This shouldn't happen if we're being invoked in background
1218         // mode since proportional sweep should have just finished
1219         // sweeping everything, but rounding errors, etc, may leave a
1220         // few spans unswept. In forced mode, this is necessary since
1221         // GC can be forced at any point in the sweeping cycle.
1222         //
1223         // We check the transition condition continuously here in case
1224         // this G gets delayed in to the next GC cycle.
1225         for trigger.test() && sweepone() != ^uintptr(0) {
1226                 sweep.nbgsweep++
1227         }
1228
1229         // Perform GC initialization and the sweep termination
1230         // transition.
1231         semacquire(&work.startSema)
1232         // Re-check transition condition under transition lock.
1233         if !trigger.test() {
1234                 semrelease(&work.startSema)
1235                 return
1236         }
1237
1238         // For stats, check if this GC was forced by the user.
1239         work.userForced = trigger.kind == gcTriggerCycle
1240
1241         // In gcstoptheworld debug mode, upgrade the mode accordingly.
1242         // We do this after re-checking the transition condition so
1243         // that multiple goroutines that detect the heap trigger don't
1244         // start multiple STW GCs.
1245         mode := gcBackgroundMode
1246         if debug.gcstoptheworld == 1 {
1247                 mode = gcForceMode
1248         } else if debug.gcstoptheworld == 2 {
1249                 mode = gcForceBlockMode
1250         }
1251
1252         // Ok, we're doing it! Stop everybody else
1253         semacquire(&worldsema)
1254
1255         if trace.enabled {
1256                 traceGCStart()
1257         }
1258
1259         // Check that all Ps have finished deferred mcache flushes.
1260         for _, p := range allp {
1261                 if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen {
1262                         println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
1263                         throw("p mcache not flushed")
1264                 }
1265         }
1266
1267         gcBgMarkStartWorkers()
1268
1269         systemstack(gcResetMarkState)
1270
1271         work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
1272         if work.stwprocs > ncpu {
1273                 // This is used to compute CPU time of the STW phases,
1274                 // so it can't be more than ncpu, even if GOMAXPROCS is.
1275                 work.stwprocs = ncpu
1276         }
1277         work.heap0 = atomic.Load64(&memstats.heap_live)
1278         work.pauseNS = 0
1279         work.mode = mode
1280
1281         now := nanotime()
1282         work.tSweepTerm = now
1283         work.pauseStart = now
1284         if trace.enabled {
1285                 traceGCSTWStart(1)
1286         }
1287         systemstack(stopTheWorldWithSema)
1288         // Finish sweep before we start concurrent scan.
1289         systemstack(func() {
1290                 finishsweep_m()
1291         })
1292         // clearpools before we start the GC. If we wait they memory will not be
1293         // reclaimed until the next GC cycle.
1294         clearpools()
1295
1296         work.cycles++
1297
1298         gcController.startCycle()
1299         work.heapGoal = memstats.next_gc
1300
1301         // In STW mode, disable scheduling of user Gs. This may also
1302         // disable scheduling of this goroutine, so it may block as
1303         // soon as we start the world again.
1304         if mode != gcBackgroundMode {
1305                 schedEnableUser(false)
1306         }
1307
1308         // Enter concurrent mark phase and enable
1309         // write barriers.
1310         //
1311         // Because the world is stopped, all Ps will
1312         // observe that write barriers are enabled by
1313         // the time we start the world and begin
1314         // scanning.
1315         //
1316         // Write barriers must be enabled before assists are
1317         // enabled because they must be enabled before
1318         // any non-leaf heap objects are marked. Since
1319         // allocations are blocked until assists can
1320         // happen, we want enable assists as early as
1321         // possible.
1322         setGCPhase(_GCmark)
1323
1324         gcBgMarkPrepare() // Must happen before assist enable.
1325         gcMarkRootPrepare()
1326
1327         // Mark all active tinyalloc blocks. Since we're
1328         // allocating from these, they need to be black like
1329         // other allocations. The alternative is to blacken
1330         // the tiny block on every allocation from it, which
1331         // would slow down the tiny allocator.
1332         gcMarkTinyAllocs()
1333
1334         // At this point all Ps have enabled the write
1335         // barrier, thus maintaining the no white to
1336         // black invariant. Enable mutator assists to
1337         // put back-pressure on fast allocating
1338         // mutators.
1339         atomic.Store(&gcBlackenEnabled, 1)
1340
1341         // Assists and workers can start the moment we start
1342         // the world.
1343         gcController.markStartTime = now
1344
1345         // Concurrent mark.
1346         systemstack(func() {
1347                 now = startTheWorldWithSema(trace.enabled)
1348                 work.pauseNS += now - work.pauseStart
1349                 work.tMark = now
1350         })
1351         // In STW mode, we could block the instant systemstack
1352         // returns, so don't do anything important here. Make sure we
1353         // block rather than returning to user code.
1354         if mode != gcBackgroundMode {
1355                 Gosched()
1356         }
1357
1358         semrelease(&work.startSema)
1359 }
1360
1361 // gcMarkDoneFlushed counts the number of P's with flushed work.
1362 //
1363 // Ideally this would be a captured local in gcMarkDone, but forEachP
1364 // escapes its callback closure, so it can't capture anything.
1365 //
1366 // This is protected by markDoneSema.
1367 var gcMarkDoneFlushed uint32
1368
1369 // debugCachedWork enables extra checks for debugging premature mark
1370 // termination.
1371 //
1372 // For debugging issue #27993.
1373 const debugCachedWork = false
1374
1375 // gcWorkPauseGen is for debugging the mark completion algorithm.
1376 // gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen.
1377 // Only used if debugCachedWork is true.
1378 //
1379 // For debugging issue #27993.
1380 var gcWorkPauseGen uint32 = 1
1381
1382 // gcMarkDone transitions the GC from mark to mark termination if all
1383 // reachable objects have been marked (that is, there are no grey
1384 // objects and can be no more in the future). Otherwise, it flushes
1385 // all local work to the global queues where it can be discovered by
1386 // other workers.
1387 //
1388 // This should be called when all local mark work has been drained and
1389 // there are no remaining workers. Specifically, when
1390 //
1391 //   work.nwait == work.nproc && !gcMarkWorkAvailable(p)
1392 //
1393 // The calling context must be preemptible.
1394 //
1395 // Flushing local work is important because idle Ps may have local
1396 // work queued. This is the only way to make that work visible and
1397 // drive GC to completion.
1398 //
1399 // It is explicitly okay to have write barriers in this function. If
1400 // it does transition to mark termination, then all reachable objects
1401 // have been marked, so the write barrier cannot shade any more
1402 // objects.
1403 func gcMarkDone() {
1404         // Ensure only one thread is running the ragged barrier at a
1405         // time.
1406         semacquire(&work.markDoneSema)
1407
1408 top:
1409         // Re-check transition condition under transition lock.
1410         //
1411         // It's critical that this checks the global work queues are
1412         // empty before performing the ragged barrier. Otherwise,
1413         // there could be global work that a P could take after the P
1414         // has passed the ragged barrier.
1415         if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
1416                 semrelease(&work.markDoneSema)
1417                 return
1418         }
1419
1420         // Flush all local buffers and collect flushedWork flags.
1421         gcMarkDoneFlushed = 0
1422         systemstack(func() {
1423                 gp := getg().m.curg
1424                 // Mark the user stack as preemptible so that it may be scanned.
1425                 // Otherwise, our attempt to force all P's to a safepoint could
1426                 // result in a deadlock as we attempt to preempt a worker that's
1427                 // trying to preempt us (e.g. for a stack scan).
1428                 casgstatus(gp, _Grunning, _Gwaiting)
1429                 forEachP(func(_p_ *p) {
1430                         // Flush the write barrier buffer, since this may add
1431                         // work to the gcWork.
1432                         wbBufFlush1(_p_)
1433                         // For debugging, shrink the write barrier
1434                         // buffer so it flushes immediately.
1435                         // wbBuf.reset will keep it at this size as
1436                         // long as throwOnGCWork is set.
1437                         if debugCachedWork {
1438                                 b := &_p_.wbBuf
1439                                 b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
1440                                 b.debugGen = gcWorkPauseGen
1441                         }
1442                         // Flush the gcWork, since this may create global work
1443                         // and set the flushedWork flag.
1444                         //
1445                         // TODO(austin): Break up these workbufs to
1446                         // better distribute work.
1447                         _p_.gcw.dispose()
1448                         // Collect the flushedWork flag.
1449                         if _p_.gcw.flushedWork {
1450                                 atomic.Xadd(&gcMarkDoneFlushed, 1)
1451                                 _p_.gcw.flushedWork = false
1452                         } else if debugCachedWork {
1453                                 // For debugging, freeze the gcWork
1454                                 // until we know whether we've reached
1455                                 // completion or not. If we think
1456                                 // we've reached completion, but
1457                                 // there's a paused gcWork, then
1458                                 // that's a bug.
1459                                 _p_.gcw.pauseGen = gcWorkPauseGen
1460                                 // Capture the G's stack.
1461                                 for i := range _p_.gcw.pauseStack {
1462                                         _p_.gcw.pauseStack[i].pc = 0
1463                                 }
1464                                 callers(1, _p_.gcw.pauseStack[:])
1465                         }
1466                 })
1467                 casgstatus(gp, _Gwaiting, _Grunning)
1468         })
1469
1470         if gcMarkDoneFlushed != 0 {
1471                 if debugCachedWork {
1472                         // Release paused gcWorks.
1473                         atomic.Xadd(&gcWorkPauseGen, 1)
1474                 }
1475                 // More grey objects were discovered since the
1476                 // previous termination check, so there may be more
1477                 // work to do. Keep going. It's possible the
1478                 // transition condition became true again during the
1479                 // ragged barrier, so re-check it.
1480                 goto top
1481         }
1482
1483         if debugCachedWork {
1484                 throwOnGCWork = true
1485                 // Release paused gcWorks. If there are any, they
1486                 // should now observe throwOnGCWork and panic.
1487                 atomic.Xadd(&gcWorkPauseGen, 1)
1488         }
1489
1490         // There was no global work, no local work, and no Ps
1491         // communicated work since we took markDoneSema. Therefore
1492         // there are no grey objects and no more objects can be
1493         // shaded. Transition to mark termination.
1494         now := nanotime()
1495         work.tMarkTerm = now
1496         work.pauseStart = now
1497         getg().m.preemptoff = "gcing"
1498         if trace.enabled {
1499                 traceGCSTWStart(0)
1500         }
1501         systemstack(stopTheWorldWithSema)
1502         // The gcphase is _GCmark, it will transition to _GCmarktermination
1503         // below. The important thing is that the wb remains active until
1504         // all marking is complete. This includes writes made by the GC.
1505
1506         if debugCachedWork {
1507                 // For debugging, double check that no work was added after we
1508                 // went around above and disable write barrier buffering.
1509                 for _, p := range allp {
1510                         gcw := &p.gcw
1511                         if !gcw.empty() {
1512                                 printlock()
1513                                 print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
1514                                 if gcw.wbuf1 == nil {
1515                                         print(" wbuf1=<nil>")
1516                                 } else {
1517                                         print(" wbuf1.n=", gcw.wbuf1.nobj)
1518                                 }
1519                                 if gcw.wbuf2 == nil {
1520                                         print(" wbuf2=<nil>")
1521                                 } else {
1522                                         print(" wbuf2.n=", gcw.wbuf2.nobj)
1523                                 }
1524                                 print("\n")
1525                                 if gcw.pauseGen == gcw.putGen {
1526                                         println("runtime: checkPut already failed at this generation")
1527                                 }
1528                                 throw("throwOnGCWork")
1529                         }
1530                 }
1531         } else {
1532                 // For unknown reasons (see issue #27993), there is
1533                 // sometimes work left over when we enter mark
1534                 // termination. Detect this and resume concurrent
1535                 // mark. This is obviously unfortunate.
1536                 //
1537                 // Switch to the system stack to call wbBufFlush1,
1538                 // though in this case it doesn't matter because we're
1539                 // non-preemptible anyway.
1540                 restart := false
1541                 systemstack(func() {
1542                         for _, p := range allp {
1543                                 wbBufFlush1(p)
1544                                 if !p.gcw.empty() {
1545                                         restart = true
1546                                         break
1547                                 }
1548                         }
1549                 })
1550                 if restart {
1551                         getg().m.preemptoff = ""
1552                         systemstack(func() {
1553                                 now := startTheWorldWithSema(true)
1554                                 work.pauseNS += now - work.pauseStart
1555                         })
1556                         goto top
1557                 }
1558         }
1559
1560         // Disable assists and background workers. We must do
1561         // this before waking blocked assists.
1562         atomic.Store(&gcBlackenEnabled, 0)
1563
1564         // Wake all blocked assists. These will run when we
1565         // start the world again.
1566         gcWakeAllAssists()
1567
1568         // Likewise, release the transition lock. Blocked
1569         // workers and assists will run when we start the
1570         // world again.
1571         semrelease(&work.markDoneSema)
1572
1573         // In STW mode, re-enable user goroutines. These will be
1574         // queued to run after we start the world.
1575         schedEnableUser(true)
1576
1577         // endCycle depends on all gcWork cache stats being flushed.
1578         // The termination algorithm above ensured that up to
1579         // allocations since the ragged barrier.
1580         nextTriggerRatio := gcController.endCycle()
1581
1582         // Perform mark termination. This will restart the world.
1583         gcMarkTermination(nextTriggerRatio)
1584 }
1585
1586 func gcMarkTermination(nextTriggerRatio float64) {
1587         // World is stopped.
1588         // Start marktermination which includes enabling the write barrier.
1589         atomic.Store(&gcBlackenEnabled, 0)
1590         setGCPhase(_GCmarktermination)
1591
1592         work.heap1 = memstats.heap_live
1593         startTime := nanotime()
1594
1595         mp := acquirem()
1596         mp.preemptoff = "gcing"
1597         _g_ := getg()
1598         _g_.m.traceback = 2
1599         gp := _g_.m.curg
1600         casgstatus(gp, _Grunning, _Gwaiting)
1601         gp.waitreason = waitReasonGarbageCollection
1602
1603         // Run gc on the g0 stack. We do this so that the g stack
1604         // we're currently running on will no longer change. Cuts
1605         // the root set down a bit (g0 stacks are not scanned, and
1606         // we don't need to scan gc's internal state).  We also
1607         // need to switch to g0 so we can shrink the stack.
1608         systemstack(func() {
1609                 gcMark(startTime)
1610                 // Must return immediately.
1611                 // The outer function's stack may have moved
1612                 // during gcMark (it shrinks stacks, including the
1613                 // outer function's stack), so we must not refer
1614                 // to any of its variables. Return back to the
1615                 // non-system stack to pick up the new addresses
1616                 // before continuing.
1617         })
1618
1619         systemstack(func() {
1620                 work.heap2 = work.bytesMarked
1621                 if debug.gccheckmark > 0 {
1622                         // Run a full non-parallel, stop-the-world
1623                         // mark using checkmark bits, to check that we
1624                         // didn't forget to mark anything during the
1625                         // concurrent mark process.
1626                         gcResetMarkState()
1627                         initCheckmarks()
1628                         gcw := &getg().m.p.ptr().gcw
1629                         gcDrain(gcw, 0)
1630                         wbBufFlush1(getg().m.p.ptr())
1631                         gcw.dispose()
1632                         clearCheckmarks()
1633                 }
1634
1635                 // marking is complete so we can turn the write barrier off
1636                 setGCPhase(_GCoff)
1637                 gcSweep(work.mode)
1638         })
1639
1640         _g_.m.traceback = 0
1641         casgstatus(gp, _Gwaiting, _Grunning)
1642
1643         if trace.enabled {
1644                 traceGCDone()
1645         }
1646
1647         // all done
1648         mp.preemptoff = ""
1649
1650         if gcphase != _GCoff {
1651                 throw("gc done but gcphase != _GCoff")
1652         }
1653
1654         // Update GC trigger and pacing for the next cycle.
1655         gcSetTriggerRatio(nextTriggerRatio)
1656
1657         // Update timing memstats
1658         now := nanotime()
1659         sec, nsec, _ := time_now()
1660         unixNow := sec*1e9 + int64(nsec)
1661         work.pauseNS += now - work.pauseStart
1662         work.tEnd = now
1663         atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1664         atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
1665         memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1666         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1667         memstats.pause_total_ns += uint64(work.pauseNS)
1668
1669         // Update work.totaltime.
1670         sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
1671         // We report idle marking time below, but omit it from the
1672         // overall utilization here since it's "free".
1673         markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
1674         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1675         cycleCpu := sweepTermCpu + markCpu + markTermCpu
1676         work.totaltime += cycleCpu
1677
1678         // Compute overall GC CPU utilization.
1679         totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1680         memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
1681
1682         // Reset sweep state.
1683         sweep.nbgsweep = 0
1684         sweep.npausesweep = 0
1685
1686         if work.userForced {
1687                 memstats.numforcedgc++
1688         }
1689
1690         // Bump GC cycle count and wake goroutines waiting on sweep.
1691         lock(&work.sweepWaiters.lock)
1692         memstats.numgc++
1693         injectglist(&work.sweepWaiters.list)
1694         unlock(&work.sweepWaiters.lock)
1695
1696         // Finish the current heap profiling cycle and start a new
1697         // heap profiling cycle. We do this before starting the world
1698         // so events don't leak into the wrong cycle.
1699         mProf_NextCycle()
1700
1701         systemstack(func() { startTheWorldWithSema(true) })
1702
1703         // Flush the heap profile so we can start a new cycle next GC.
1704         // This is relatively expensive, so we don't do it with the
1705         // world stopped.
1706         mProf_Flush()
1707
1708         // Prepare workbufs for freeing by the sweeper. We do this
1709         // asynchronously because it can take non-trivial time.
1710         prepareFreeWorkbufs()
1711
1712         // Ensure all mcaches are flushed. Each P will flush its own
1713         // mcache before allocating, but idle Ps may not. Since this
1714         // is necessary to sweep all spans, we need to ensure all
1715         // mcaches are flushed before we start the next GC cycle.
1716         systemstack(func() {
1717                 forEachP(func(_p_ *p) {
1718                         _p_.mcache.prepareForSweep()
1719                 })
1720         })
1721
1722         // Print gctrace before dropping worldsema. As soon as we drop
1723         // worldsema another cycle could start and smash the stats
1724         // we're trying to print.
1725         if debug.gctrace > 0 {
1726                 util := int(memstats.gc_cpu_fraction * 100)
1727
1728                 var sbuf [24]byte
1729                 printlock()
1730                 print("gc ", memstats.numgc,
1731                         " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1732                         util, "%: ")
1733                 prev := work.tSweepTerm
1734                 for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1735                         if i != 0 {
1736                                 print("+")
1737                         }
1738                         print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1739                         prev = ns
1740                 }
1741                 print(" ms clock, ")
1742                 for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
1743                         if i == 2 || i == 3 {
1744                                 // Separate mark time components with /.
1745                                 print("/")
1746                         } else if i != 0 {
1747                                 print("+")
1748                         }
1749                         print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1750                 }
1751                 print(" ms cpu, ",
1752                         work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1753                         work.heapGoal>>20, " MB goal, ",
1754                         work.maxprocs, " P")
1755                 if work.userForced {
1756                         print(" (forced)")
1757                 }
1758                 print("\n")
1759                 printunlock()
1760         }
1761
1762         semrelease(&worldsema)
1763         // Careful: another GC cycle may start now.
1764
1765         releasem(mp)
1766         mp = nil
1767
1768         // now that gc is done, kick off finalizer thread if needed
1769         if !concurrentSweep {
1770                 // give the queued finalizers, if any, a chance to run
1771                 Gosched()
1772         }
1773 }
1774
1775 // gcBgMarkStartWorkers prepares background mark worker goroutines.
1776 // These goroutines will not run until the mark phase, but they must
1777 // be started while the work is not stopped and from a regular G
1778 // stack. The caller must hold worldsema.
1779 func gcBgMarkStartWorkers() {
1780         // Background marking is performed by per-P G's. Ensure that
1781         // each P has a background GC G.
1782         for _, p := range allp {
1783                 if p.gcBgMarkWorker == 0 {
1784                         expectSystemGoroutine()
1785                         go gcBgMarkWorker(p)
1786                         notetsleepg(&work.bgMarkReady, -1)
1787                         noteclear(&work.bgMarkReady)
1788                 }
1789         }
1790 }
1791
1792 // gcBgMarkPrepare sets up state for background marking.
1793 // Mutator assists must not yet be enabled.
1794 func gcBgMarkPrepare() {
1795         // Background marking will stop when the work queues are empty
1796         // and there are no more workers (note that, since this is
1797         // concurrent, this may be a transient state, but mark
1798         // termination will clean it up). Between background workers
1799         // and assists, we don't really know how many workers there
1800         // will be, so we pretend to have an arbitrarily large number
1801         // of workers, almost all of which are "waiting". While a
1802         // worker is working it decrements nwait. If nproc == nwait,
1803         // there are no workers.
1804         work.nproc = ^uint32(0)
1805         work.nwait = ^uint32(0)
1806 }
1807
1808 func gcBgMarkWorker(_p_ *p) {
1809         setSystemGoroutine()
1810
1811         gp := getg()
1812
1813         type parkInfo struct {
1814                 m      muintptr // Release this m on park.
1815                 attach puintptr // If non-nil, attach to this p on park.
1816         }
1817         // We pass park to a gopark unlock function, so it can't be on
1818         // the stack (see gopark). Prevent deadlock from recursively
1819         // starting GC by disabling preemption.
1820         gp.m.preemptoff = "GC worker init"
1821         park := new(parkInfo)
1822         gp.m.preemptoff = ""
1823
1824         park.m.set(acquirem())
1825         park.attach.set(_p_)
1826         // Inform gcBgMarkStartWorkers that this worker is ready.
1827         // After this point, the background mark worker is scheduled
1828         // cooperatively by gcController.findRunnable. Hence, it must
1829         // never be preempted, as this would put it into _Grunnable
1830         // and put it on a run queue. Instead, when the preempt flag
1831         // is set, this puts itself into _Gwaiting to be woken up by
1832         // gcController.findRunnable at the appropriate time.
1833         notewakeup(&work.bgMarkReady)
1834
1835         for {
1836                 // Go to sleep until woken by gcController.findRunnable.
1837                 // We can't releasem yet since even the call to gopark
1838                 // may be preempted.
1839                 gopark(func(g *g, parkp unsafe.Pointer) bool {
1840                         park := (*parkInfo)(parkp)
1841
1842                         // The worker G is no longer running, so it's
1843                         // now safe to allow preemption.
1844                         releasem(park.m.ptr())
1845
1846                         // If the worker isn't attached to its P,
1847                         // attach now. During initialization and after
1848                         // a phase change, the worker may have been
1849                         // running on a different P. As soon as we
1850                         // attach, the owner P may schedule the
1851                         // worker, so this must be done after the G is
1852                         // stopped.
1853                         if park.attach != 0 {
1854                                 p := park.attach.ptr()
1855                                 park.attach.set(nil)
1856                                 // cas the worker because we may be
1857                                 // racing with a new worker starting
1858                                 // on this P.
1859                                 if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
1860                                         // The P got a new worker.
1861                                         // Exit this worker.
1862                                         return false
1863                                 }
1864                         }
1865                         return true
1866                 }, unsafe.Pointer(park), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
1867
1868                 // Loop until the P dies and disassociates this
1869                 // worker (the P may later be reused, in which case
1870                 // it will get a new worker) or we failed to associate.
1871                 if _p_.gcBgMarkWorker.ptr() != gp {
1872                         break
1873                 }
1874
1875                 // Disable preemption so we can use the gcw. If the
1876                 // scheduler wants to preempt us, we'll stop draining,
1877                 // dispose the gcw, and then preempt.
1878                 park.m.set(acquirem())
1879
1880                 if gcBlackenEnabled == 0 {
1881                         throw("gcBgMarkWorker: blackening not enabled")
1882                 }
1883
1884                 startTime := nanotime()
1885                 _p_.gcMarkWorkerStartTime = startTime
1886
1887                 decnwait := atomic.Xadd(&work.nwait, -1)
1888                 if decnwait == work.nproc {
1889                         println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1890                         throw("work.nwait was > work.nproc")
1891                 }
1892
1893                 systemstack(func() {
1894                         // Mark our goroutine preemptible so its stack
1895                         // can be scanned. This lets two mark workers
1896                         // scan each other (otherwise, they would
1897                         // deadlock). We must not modify anything on
1898                         // the G stack. However, stack shrinking is
1899                         // disabled for mark workers, so it is safe to
1900                         // read from the G stack.
1901                         casgstatus(gp, _Grunning, _Gwaiting)
1902                         switch _p_.gcMarkWorkerMode {
1903                         default:
1904                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1905                         case gcMarkWorkerDedicatedMode:
1906                                 gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1907                                 if gp.preempt {
1908                                         // We were preempted. This is
1909                                         // a useful signal to kick
1910                                         // everything out of the run
1911                                         // queue so it can run
1912                                         // somewhere else.
1913                                         lock(&sched.lock)
1914                                         for {
1915                                                 gp, _ := runqget(_p_)
1916                                                 if gp == nil {
1917                                                         break
1918                                                 }
1919                                                 globrunqput(gp)
1920                                         }
1921                                         unlock(&sched.lock)
1922                                 }
1923                                 // Go back to draining, this time
1924                                 // without preemption.
1925                                 gcDrain(&_p_.gcw, gcDrainFlushBgCredit)
1926                         case gcMarkWorkerFractionalMode:
1927                                 gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1928                         case gcMarkWorkerIdleMode:
1929                                 gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1930                         }
1931                         casgstatus(gp, _Gwaiting, _Grunning)
1932                 })
1933
1934                 // Account for time.
1935                 duration := nanotime() - startTime
1936                 switch _p_.gcMarkWorkerMode {
1937                 case gcMarkWorkerDedicatedMode:
1938                         atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
1939                         atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
1940                 case gcMarkWorkerFractionalMode:
1941                         atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
1942                         atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration)
1943                 case gcMarkWorkerIdleMode:
1944                         atomic.Xaddint64(&gcController.idleMarkTime, duration)
1945                 }
1946
1947                 // Was this the last worker and did we run out
1948                 // of work?
1949                 incnwait := atomic.Xadd(&work.nwait, +1)
1950                 if incnwait > work.nproc {
1951                         println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
1952                                 "work.nwait=", incnwait, "work.nproc=", work.nproc)
1953                         throw("work.nwait > work.nproc")
1954                 }
1955
1956                 // If this worker reached a background mark completion
1957                 // point, signal the main GC goroutine.
1958                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1959                         // Make this G preemptible and disassociate it
1960                         // as the worker for this P so
1961                         // findRunnableGCWorker doesn't try to
1962                         // schedule it.
1963                         _p_.gcBgMarkWorker.set(nil)
1964                         releasem(park.m.ptr())
1965
1966                         gcMarkDone()
1967
1968                         // Disable preemption and prepare to reattach
1969                         // to the P.
1970                         //
1971                         // We may be running on a different P at this
1972                         // point, so we can't reattach until this G is
1973                         // parked.
1974                         park.m.set(acquirem())
1975                         park.attach.set(_p_)
1976                 }
1977         }
1978 }
1979
1980 // gcMarkWorkAvailable reports whether executing a mark worker
1981 // on p is potentially useful. p may be nil, in which case it only
1982 // checks the global sources of work.
1983 func gcMarkWorkAvailable(p *p) bool {
1984         if p != nil && !p.gcw.empty() {
1985                 return true
1986         }
1987         if !work.full.empty() {
1988                 return true // global work available
1989         }
1990         if work.markrootNext < work.markrootJobs {
1991                 return true // root scan work available
1992         }
1993         return false
1994 }
1995
1996 // gcMark runs the mark (or, for concurrent GC, mark termination)
1997 // All gcWork caches must be empty.
1998 // STW is in effect at this point.
1999 func gcMark(start_time int64) {
2000         if debug.allocfreetrace > 0 {
2001                 tracegc()
2002         }
2003
2004         if gcphase != _GCmarktermination {
2005                 throw("in gcMark expecting to see gcphase as _GCmarktermination")
2006         }
2007         work.tstart = start_time
2008
2009         // Check that there's no marking work remaining.
2010         if work.full != 0 || work.markrootNext < work.markrootJobs {
2011                 print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
2012                 panic("non-empty mark queue after concurrent mark")
2013         }
2014
2015         if debug.gccheckmark > 0 {
2016                 // This is expensive when there's a large number of
2017                 // Gs, so only do it if checkmark is also enabled.
2018                 gcMarkRootCheck()
2019         }
2020         if work.full != 0 {
2021                 throw("work.full != 0")
2022         }
2023
2024         // Clear out buffers and double-check that all gcWork caches
2025         // are empty. This should be ensured by gcMarkDone before we
2026         // enter mark termination.
2027         //
2028         // TODO: We could clear out buffers just before mark if this
2029         // has a non-negligible impact on STW time.
2030         for _, p := range allp {
2031                 // The write barrier may have buffered pointers since
2032                 // the gcMarkDone barrier. However, since the barrier
2033                 // ensured all reachable objects were marked, all of
2034                 // these must be pointers to black objects. Hence we
2035                 // can just discard the write barrier buffer.
2036                 if debug.gccheckmark > 0 || throwOnGCWork {
2037                         // For debugging, flush the buffer and make
2038                         // sure it really was all marked.
2039                         wbBufFlush1(p)
2040                 } else {
2041                         p.wbBuf.reset()
2042                 }
2043
2044                 gcw := &p.gcw
2045                 if !gcw.empty() {
2046                         printlock()
2047                         print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
2048                         if gcw.wbuf1 == nil {
2049                                 print(" wbuf1=<nil>")
2050                         } else {
2051                                 print(" wbuf1.n=", gcw.wbuf1.nobj)
2052                         }
2053                         if gcw.wbuf2 == nil {
2054                                 print(" wbuf2=<nil>")
2055                         } else {
2056                                 print(" wbuf2.n=", gcw.wbuf2.nobj)
2057                         }
2058                         print("\n")
2059                         throw("P has cached GC work at end of mark termination")
2060                 }
2061                 // There may still be cached empty buffers, which we
2062                 // need to flush since we're going to free them. Also,
2063                 // there may be non-zero stats because we allocated
2064                 // black after the gcMarkDone barrier.
2065                 gcw.dispose()
2066         }
2067
2068         throwOnGCWork = false
2069
2070         cachestats()
2071
2072         // Update the marked heap stat.
2073         memstats.heap_marked = work.bytesMarked
2074
2075         // Update other GC heap size stats. This must happen after
2076         // cachestats (which flushes local statistics to these) and
2077         // flushallmcaches (which modifies heap_live).
2078         memstats.heap_live = work.bytesMarked
2079         memstats.heap_scan = uint64(gcController.scanWork)
2080
2081         if trace.enabled {
2082                 traceHeapAlloc()
2083         }
2084 }
2085
2086 // gcSweep must be called on the system stack because it acquires the heap
2087 // lock. See mheap for details.
2088 //go:systemstack
2089 func gcSweep(mode gcMode) {
2090         if gcphase != _GCoff {
2091                 throw("gcSweep being done but phase is not GCoff")
2092         }
2093
2094         lock(&mheap_.lock)
2095         mheap_.sweepgen += 2
2096         mheap_.sweepdone = 0
2097         if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
2098                 // We should have drained this list during the last
2099                 // sweep phase. We certainly need to start this phase
2100                 // with an empty swept list.
2101                 throw("non-empty swept list")
2102         }
2103         mheap_.pagesSwept = 0
2104         mheap_.sweepArenas = mheap_.allArenas
2105         mheap_.reclaimIndex = 0
2106         mheap_.reclaimCredit = 0
2107         unlock(&mheap_.lock)
2108
2109         if !_ConcurrentSweep || mode == gcForceBlockMode {
2110                 // Special case synchronous sweep.
2111                 // Record that no proportional sweeping has to happen.
2112                 lock(&mheap_.lock)
2113                 mheap_.sweepPagesPerByte = 0
2114                 unlock(&mheap_.lock)
2115                 // Sweep all spans eagerly.
2116                 for sweepone() != ^uintptr(0) {
2117                         sweep.npausesweep++
2118                 }
2119                 // Free workbufs eagerly.
2120                 prepareFreeWorkbufs()
2121                 for freeSomeWbufs(false) {
2122                 }
2123                 // All "free" events for this mark/sweep cycle have
2124                 // now happened, so we can make this profile cycle
2125                 // available immediately.
2126                 mProf_NextCycle()
2127                 mProf_Flush()
2128                 return
2129         }
2130
2131         // Background sweep.
2132         lock(&sweep.lock)
2133         if sweep.parked {
2134                 sweep.parked = false
2135                 ready(sweep.g, 0, true)
2136         }
2137         unlock(&sweep.lock)
2138 }
2139
2140 // gcResetMarkState resets global state prior to marking (concurrent
2141 // or STW) and resets the stack scan state of all Gs.
2142 //
2143 // This is safe to do without the world stopped because any Gs created
2144 // during or after this will start out in the reset state.
2145 //
2146 // gcResetMarkState must be called on the system stack because it acquires
2147 // the heap lock. See mheap for details.
2148 //
2149 //go:systemstack
2150 func gcResetMarkState() {
2151         // This may be called during a concurrent phase, so make sure
2152         // allgs doesn't change.
2153         lock(&allglock)
2154         for _, gp := range allgs {
2155                 gp.gcscandone = false  // set to true in gcphasework
2156                 gp.gcscanvalid = false // stack has not been scanned
2157                 gp.gcAssistBytes = 0
2158         }
2159         unlock(&allglock)
2160
2161         // Clear page marks. This is just 1MB per 64GB of heap, so the
2162         // time here is pretty trivial.
2163         lock(&mheap_.lock)
2164         arenas := mheap_.allArenas
2165         unlock(&mheap_.lock)
2166         for _, ai := range arenas {
2167                 ha := mheap_.arenas[ai.l1()][ai.l2()]
2168                 for i := range ha.pageMarks {
2169                         ha.pageMarks[i] = 0
2170                 }
2171         }
2172
2173         work.bytesMarked = 0
2174         work.initialHeapLive = atomic.Load64(&memstats.heap_live)
2175 }
2176
2177 // Hooks for other packages
2178
2179 var poolcleanup func()
2180
2181 //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
2182 func sync_runtime_registerPoolCleanup(f func()) {
2183         poolcleanup = f
2184 }
2185
2186 func clearpools() {
2187         // clear sync.Pools
2188         if poolcleanup != nil {
2189                 poolcleanup()
2190         }
2191
2192         // Clear central sudog cache.
2193         // Leave per-P caches alone, they have strictly bounded size.
2194         // Disconnect cached list before dropping it on the floor,
2195         // so that a dangling ref to one entry does not pin all of them.
2196         lock(&sched.sudoglock)
2197         var sg, sgnext *sudog
2198         for sg = sched.sudogcache; sg != nil; sg = sgnext {
2199                 sgnext = sg.next
2200                 sg.next = nil
2201         }
2202         sched.sudogcache = nil
2203         unlock(&sched.sudoglock)
2204
2205         // Clear central defer pools.
2206         // Leave per-P pools alone, they have strictly bounded size.
2207         lock(&sched.deferlock)
2208         // disconnect cached list before dropping it on the floor,
2209         // so that a dangling ref to one entry does not pin all of them.
2210         var d, dlink *_defer
2211         for d = sched.deferpool; d != nil; d = dlink {
2212                 dlink = d.link
2213                 d.link = nil
2214         }
2215         sched.deferpool = nil
2216         unlock(&sched.deferlock)
2217 }
2218
2219 // Timing
2220
2221 // itoaDiv formats val/(10**dec) into buf.
2222 func itoaDiv(buf []byte, val uint64, dec int) []byte {
2223         i := len(buf) - 1
2224         idec := i - dec
2225         for val >= 10 || i >= idec {
2226                 buf[i] = byte(val%10 + '0')
2227                 i--
2228                 if i == idec {
2229                         buf[i] = '.'
2230                         i--
2231                 }
2232                 val /= 10
2233         }
2234         buf[i] = byte(val + '0')
2235         return buf[i:]
2236 }
2237
2238 // fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
2239 func fmtNSAsMS(buf []byte, ns uint64) []byte {
2240         if ns >= 10e6 {
2241                 // Format as whole milliseconds.
2242                 return itoaDiv(buf, ns/1e6, 0)
2243         }
2244         // Format two digits of precision, with at most three decimal places.
2245         x := ns / 1e3
2246         if x == 0 {
2247                 buf[0] = '0'
2248                 return buf[:1]
2249         }
2250         dec := 3
2251         for x >= 100 {
2252                 x /= 10
2253                 dec--
2254         }
2255         return itoaDiv(buf, x, dec)
2256 }