Importing Upstream version 4.8.2
[platform/upstream/gcc48.git] libgo/runtime/mgc0.c
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Garbage collector.
6
7 #include <unistd.h>
8
9 #include "runtime.h"
10 #include "arch.h"
11 #include "malloc.h"
12 #include "mgc0.h"
13 #include "race.h"
14 #include "go-type.h"
15
16 // Map gccgo field names to gc field names.
17 // Slice aka __go_open_array.
18 #define array __values
19 #define cap __capacity
20 // Iface aka __go_interface
21 #define tab __methods
22 // Eface aka __go_empty_interface.
23 #define type __type_descriptor
24 // Hmap aka __go_map
25 typedef struct __go_map Hmap;
26 // Type aka __go_type_descriptor
27 #define kind __code
28 #define string __reflection
29 #define KindPtr GO_PTR
30 #define KindNoPointers GO_NO_POINTERS
31 // PtrType aka __go_ptr_type
32 #define elem __element_type
33
34 #ifdef USING_SPLIT_STACK
35
36 extern void * __splitstack_find (void *, void *, size_t *, void **, void **,
37                                  void **);
38
39 extern void * __splitstack_find_context (void *context[10], size_t *, void **,
40                                          void **, void **);
41
42 #endif
43
44 enum {
45         Debug = 0,
46         DebugMark = 0,  // run second pass to check mark
47         CollectStats = 0,
48         ScanStackByFrames = 0,
49         IgnorePreciseGC = 0,
50
51         // Four bits per word (see #defines below).
52         wordsPerBitmapWord = sizeof(void*)*8/4,
53         bitShift = sizeof(void*)*8/4,
54
55         handoffThreshold = 4,
56         IntermediateBufferCapacity = 64,
57
58         // Bits in type information
59         PRECISE = 1,
60         LOOP = 2,
61         PC_BITS = PRECISE | LOOP,
62 };
63
64 // Bits in per-word bitmap.
65 // #defines because enum might not be able to hold the values.
66 //
67 // Each word in the bitmap describes wordsPerBitmapWord words
68 // of heap memory.  There are 4 bitmap bits dedicated to each heap word,
69 // so on a 64-bit system there is one bitmap word per 16 heap words.
70 // The bits in the word are packed together by type first, then by
71 // heap location, so each 64-bit bitmap word consists of, from top to bottom,
72 // the 16 bitSpecial bits for the corresponding heap words, then the 16 bitMarked bits,
73 // then the 16 bitNoPointers/bitBlockBoundary bits, then the 16 bitAllocated bits.
74 // This layout makes it easier to iterate over the bits of a given type.
75 //
76 // The bitmap starts at mheap.arena_start and extends *backward* from
77 // there.  On a 64-bit system the off'th word in the arena is tracked by
78 // the off/16+1'th word before mheap.arena_start.  (On a 32-bit system,
79 // the only difference is that the divisor is 8.)
80 //
81 // To pull out the bits corresponding to a given pointer p, we use:
82 //
83 //      off = p - (uintptr*)mheap.arena_start;  // word offset
84 //      b = (uintptr*)mheap.arena_start - off/wordsPerBitmapWord - 1;
85 //      shift = off % wordsPerBitmapWord
86 //      bits = *b >> shift;
87 //      /* then test bits & bitAllocated, bits & bitMarked, etc. */
88 //
89 #define bitAllocated            ((uintptr)1<<(bitShift*0))
90 #define bitNoPointers           ((uintptr)1<<(bitShift*1))      /* when bitAllocated is set */
91 #define bitMarked               ((uintptr)1<<(bitShift*2))      /* when bitAllocated is set */
92 #define bitSpecial              ((uintptr)1<<(bitShift*3))      /* when bitAllocated is set - has finalizer or being profiled */
93 #define bitBlockBoundary        ((uintptr)1<<(bitShift*1))      /* when bitAllocated is NOT set */
94
95 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial)
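// For example, to decide whether the word at p is an allocated but not yet
// marked object (the test markonly and flushptrbuf perform before setting
// the mark bit), the recipe above is applied as:
//
//      off = (uintptr*)p - (uintptr*)mheap.arena_start;
//      b = (uintptr*)mheap.arena_start - off/wordsPerBitmapWord - 1;
//      shift = off % wordsPerBitmapWord;
//      if(((*b >> shift) & (bitAllocated|bitMarked)) == bitAllocated)
//              *b |= bitMarked<<shift;  // single-threaded case; see markonly for the CAS loop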
96
97 // Holding worldsema grants an M the right to try to stop the world.
98 // The procedure is:
99 //
100 //      runtime_semacquire(&runtime_worldsema);
101 //      m->gcing = 1;
102 //      runtime_stoptheworld();
103 //
104 //      ... do stuff ...
105 //
106 //      m->gcing = 0;
107 //      runtime_semrelease(&runtime_worldsema);
108 //      runtime_starttheworld();
109 //
110 uint32 runtime_worldsema = 1;
111
112 static int32 gctrace;
113
114 // The size of Workbuf is N*PageSize.
115 typedef struct Workbuf Workbuf;
116 struct Workbuf
117 {
118 #define SIZE (2*PageSize-sizeof(LFNode)-sizeof(uintptr))
119         LFNode  node; // must be first
120         uintptr nobj;
121         Obj     obj[SIZE/sizeof(Obj) - 1];
122         uint8   _padding[SIZE%sizeof(Obj) + sizeof(Obj)];
123 #undef SIZE
124 };
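// The field sizes work out so that sizeof(Workbuf) is exactly 2*PageSize
// (assuming no extra padding): node and nobj account for
// sizeof(LFNode)+sizeof(uintptr) bytes, and obj[] plus _padding[] together
// account for SIZE bytes.  scanblock verifies this at run time by throwing
// if sizeof(Workbuf) % PageSize != 0.  A hypothetical compile-time
// equivalent (illustration only, not part of the original source) would be:
//
//      typedef char workbuf_size_check[sizeof(Workbuf) % PageSize == 0 ? 1 : -1];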
125
126 typedef struct Finalizer Finalizer;
127 struct Finalizer
128 {
129         FuncVal *fn;
130         void *arg;
131         const struct __go_func_type *ft;
132 };
133
134 typedef struct FinBlock FinBlock;
135 struct FinBlock
136 {
137         FinBlock *alllink;
138         FinBlock *next;
139         int32 cnt;
140         int32 cap;
141         Finalizer fin[1];
142 };
143
144 static G *fing;
145 static FinBlock *finq; // list of finalizers that are to be executed
146 static FinBlock *finc; // cache of free blocks
147 static FinBlock *allfin; // list of all blocks
148 static Lock finlock;
149 static int32 fingwait;
150
151 static void runfinq(void*);
152 static Workbuf* getempty(Workbuf*);
153 static Workbuf* getfull(Workbuf*);
154 static void     putempty(Workbuf*);
155 static Workbuf* handoff(Workbuf*);
156 static void     gchelperstart(void);
157
158 static struct {
159         uint64  full;  // lock-free list of full blocks
160         uint64  empty; // lock-free list of empty blocks
161         byte    pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
162         uint32  nproc;
163         volatile uint32 nwait;
164         volatile uint32 ndone;
165         volatile uint32 debugmarkdone;
166         Note    alldone;
167         ParFor  *markfor;
168         ParFor  *sweepfor;
169
170         Lock;
171         byte    *chunk;
172         uintptr nchunk;
173
174         Obj     *roots;
175         uint32  nroot;
176         uint32  rootcap;
177 } work;
178
179 enum {
180         GC_DEFAULT_PTR = GC_NUM_INSTR,
181         GC_MAP_NEXT,
182         GC_CHAN,
183
184         GC_NUM_INSTR2
185 };
186
187 static struct {
188         struct {
189                 uint64 sum;
190                 uint64 cnt;
191         } ptr;
192         uint64 nbytes;
193         struct {
194                 uint64 sum;
195                 uint64 cnt;
196                 uint64 notype;
197                 uint64 typelookup;
198         } obj;
199         uint64 rescan;
200         uint64 rescanbytes;
201         uint64 instr[GC_NUM_INSTR2];
202         uint64 putempty;
203         uint64 getfull;
204 } gcstats;
205
206 // markonly marks an object. It returns true if the object
207 // has been marked by this function, false otherwise.
208 // This function doesn't append the object to any buffer.
209 static bool
210 markonly(void *obj)
211 {
212         byte *p;
213         uintptr *bitp, bits, shift, x, xbits, off;
214         MSpan *s;
215         PageID k;
216
217         // Words outside the arena cannot be pointers.
218         if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used)
219                 return false;
220
221         // obj may be a pointer to a live object.
222         // Try to find the beginning of the object.
223
224         // Round down to word boundary.
225         obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
226
227         // Find bits for this word.
228         off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start;
229         bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
230         shift = off % wordsPerBitmapWord;
231         xbits = *bitp;
232         bits = xbits >> shift;
233
234         // Pointing at the beginning of a block?
235         if((bits & (bitAllocated|bitBlockBoundary)) != 0)
236                 goto found;
237
238         // Otherwise consult span table to find beginning.
239         // (Manually inlined copy of MHeap_LookupMaybe.)
240         k = (uintptr)obj>>PageShift;
241         x = k;
242         if(sizeof(void*) == 8)
243                 x -= (uintptr)runtime_mheap->arena_start>>PageShift;
244         s = runtime_mheap->map[x];
245         if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
246                 return false;
247         p = (byte*)((uintptr)s->start<<PageShift);
248         if(s->sizeclass == 0) {
249                 obj = p;
250         } else {
251                 if((byte*)obj >= (byte*)s->limit)
252                         return false;
253                 uintptr size = s->elemsize;
254                 int32 i = ((byte*)obj - p)/size;
255                 obj = p+i*size;
256         }
257
258         // Now that we know the object header, reload bits.
259         off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start;
260         bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
261         shift = off % wordsPerBitmapWord;
262         xbits = *bitp;
263         bits = xbits >> shift;
264
265 found:
266         // Now we have bits, bitp, and shift correct for
267         // obj pointing at the base of the object.
268         // Only care about allocated and not marked.
269         if((bits & (bitAllocated|bitMarked)) != bitAllocated)
270                 return false;
271         if(work.nproc == 1)
272                 *bitp |= bitMarked<<shift;
273         else {
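                // Several procs may be updating bits in the same bitmap word
                // concurrently, so set the mark bit with a CAS loop rather than
                // a plain OR, and bail out if another proc marked the object first.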
274                 for(;;) {
275                         x = *bitp;
276                         if(x & (bitMarked<<shift))
277                                 return false;
278                         if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
279                                 break;
280                 }
281         }
282
283         // The object is now marked
284         return true;
285 }
286
287 // PtrTarget is a structure used by intermediate buffers.
288 // The intermediate buffers hold GC data before it
289 // is moved/flushed to the work buffer (Workbuf).
290 // The size of an intermediate buffer is very small,
291 // such as 32 or 64 elements.
292 typedef struct PtrTarget PtrTarget;
293 struct PtrTarget
294 {
295         void *p;
296         uintptr ti;
297 };
298
299 typedef struct BufferList BufferList;
300 struct BufferList
301 {
302         PtrTarget ptrtarget[IntermediateBufferCapacity];
303         Obj obj[IntermediateBufferCapacity];
304         uint32 busy;
305         byte pad[CacheLineSize];
306 };
307 static BufferList bufferList[MaxGcproc];
308
309 static Type *itabtype;
310
311 static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj);
312
313 // flushptrbuf moves data from the PtrTarget buffer to the work buffer.
314 // The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
315 // while the work buffer contains blocks which have been marked
316 // and are prepared to be scanned by the garbage collector.
317 //
318 // _wp, _wbuf, _nobj are input/output parameters specifying the work buffer.
319 //
320 // A simplified drawing of how the todo list moves from one structure to another:
321 //
322 //     scanblock
323 //  (find pointers)
324 //    Obj ------> PtrTarget (pointer targets)
325 //     ↑          |
326 //     |          |
327 //     `----------'
328 //     flushptrbuf
329 //  (find block start, mark and enqueue)
330 static void
331 flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj)
332 {
333         byte *p, *arena_start, *obj;
334         uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n;
335         MSpan *s;
336         PageID k;
337         Obj *wp;
338         Workbuf *wbuf;
339         PtrTarget *ptrbuf_end;
340
341         arena_start = runtime_mheap->arena_start;
342
343         wp = *_wp;
344         wbuf = *_wbuf;
345         nobj = *_nobj;
346
347         ptrbuf_end = *ptrbufpos;
348         n = ptrbuf_end - ptrbuf;
349         *ptrbufpos = ptrbuf;
350
351         if(CollectStats) {
352                 runtime_xadd64(&gcstats.ptr.sum, n);
353                 runtime_xadd64(&gcstats.ptr.cnt, 1);
354         }
355
356         // If buffer is nearly full, get a new one.
357         if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
358                 if(wbuf != nil)
359                         wbuf->nobj = nobj;
360                 wbuf = getempty(wbuf);
361                 wp = wbuf->obj;
362                 nobj = 0;
363
364                 if(n >= nelem(wbuf->obj))
365                         runtime_throw("ptrbuf has to be smaller than WorkBuf");
366         }
367
368         // TODO(atom): This block is a branch of an if-then-else statement.
369                 //             The single-threaded branch may be added in a later CL.
370         {
371                 // Multi-threaded version.
372
373                 while(ptrbuf < ptrbuf_end) {
374                         obj = ptrbuf->p;
375                         ti = ptrbuf->ti;
376                         ptrbuf++;
377
378                         // obj belongs to interval [mheap.arena_start, mheap.arena_used).
379                         if(Debug > 1) {
380                                 if(obj < runtime_mheap->arena_start || obj >= runtime_mheap->arena_used)
381                                         runtime_throw("object is outside of mheap");
382                         }
383
384                         // obj may be a pointer to a live object.
385                         // Try to find the beginning of the object.
386
387                         // Round down to word boundary.
388                         if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
389                                 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
390                                 ti = 0;
391                         }
392
393                         // Find bits for this word.
394                         off = (uintptr*)obj - (uintptr*)arena_start;
395                         bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
396                         shift = off % wordsPerBitmapWord;
397                         xbits = *bitp;
398                         bits = xbits >> shift;
399
400                         // Pointing at the beginning of a block?
401                         if((bits & (bitAllocated|bitBlockBoundary)) != 0)
402                                 goto found;
403
404                         ti = 0;
405
406                         // Pointing just past the beginning?
407                         // Scan backward a little to find a block boundary.
408                         for(j=shift; j-->0; ) {
409                                 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
410                                         obj = (byte*)obj - (shift-j)*PtrSize;
411                                         shift = j;
412                                         bits = xbits>>shift;
413                                         goto found;
414                                 }
415                         }
416
417                         // Otherwise consult span table to find beginning.
418                         // (Manually inlined copy of MHeap_LookupMaybe.)
419                         k = (uintptr)obj>>PageShift;
420                         x = k;
421                         if(sizeof(void*) == 8)
422                                 x -= (uintptr)arena_start>>PageShift;
423                         s = runtime_mheap->map[x];
424                         if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
425                                 continue;
426                         p = (byte*)((uintptr)s->start<<PageShift);
427                         if(s->sizeclass == 0) {
428                                 obj = p;
429                         } else {
430                                 if((byte*)obj >= (byte*)s->limit)
431                                         continue;
432                                 size = s->elemsize;
433                                 int32 i = ((byte*)obj - p)/size;
434                                 obj = p+i*size;
435                         }
436
437                         // Now that we know the object header, reload bits.
438                         off = (uintptr*)obj - (uintptr*)arena_start;
439                         bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
440                         shift = off % wordsPerBitmapWord;
441                         xbits = *bitp;
442                         bits = xbits >> shift;
443
444                 found:
445                         // Now we have bits, bitp, and shift correct for
446                         // obj pointing at the base of the object.
447                         // Only care about allocated and not marked.
448                         if((bits & (bitAllocated|bitMarked)) != bitAllocated)
449                                 continue;
450                         if(work.nproc == 1)
451                                 *bitp |= bitMarked<<shift;
452                         else {
453                                 for(;;) {
454                                         x = *bitp;
455                                         if(x & (bitMarked<<shift))
456                                                 goto continue_obj;
457                                         if(runtime_casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
458                                                 break;
459                                 }
460                         }
461
462                         // If object has no pointers, don't need to scan further.
463                         if((bits & bitNoPointers) != 0)
464                                 continue;
465
466                         // Ask span about size class.
467                         // (Manually inlined copy of MHeap_Lookup.)
468                         x = (uintptr)obj >> PageShift;
469                         if(sizeof(void*) == 8)
470                                 x -= (uintptr)arena_start>>PageShift;
471                         s = runtime_mheap->map[x];
472
473                         PREFETCH(obj);
474
475                         *wp = (Obj){obj, s->elemsize, ti};
476                         wp++;
477                         nobj++;
478                 continue_obj:;
479                 }
480
481                 // If another proc wants a pointer, give it some.
482                 if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
483                         wbuf->nobj = nobj;
484                         wbuf = handoff(wbuf);
485                         nobj = wbuf->nobj;
486                         wp = wbuf->obj + nobj;
487                 }
488         }
489
490         *_wp = wp;
491         *_wbuf = wbuf;
492         *_nobj = nobj;
493 }
494
495 static void
496 flushobjbuf(Obj *objbuf, Obj **objbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj)
497 {
498         uintptr nobj, off;
499         Obj *wp, obj;
500         Workbuf *wbuf;
501         Obj *objbuf_end;
502
503         wp = *_wp;
504         wbuf = *_wbuf;
505         nobj = *_nobj;
506
507         objbuf_end = *objbufpos;
508         *objbufpos = objbuf;
509
510         while(objbuf < objbuf_end) {
511                 obj = *objbuf++;
512
513                 // Align obj.b to a word boundary.
514                 off = (uintptr)obj.p & (PtrSize-1);
515                 if(off != 0) {
516                         obj.p += PtrSize - off;
517                         obj.n -= PtrSize - off;
518                         obj.ti = 0;
519                 }
520
521                 if(obj.p == nil || obj.n == 0)
522                         continue;
523
524                 // If buffer is full, get a new one.
525                 if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
526                         if(wbuf != nil)
527                                 wbuf->nobj = nobj;
528                         wbuf = getempty(wbuf);
529                         wp = wbuf->obj;
530                         nobj = 0;
531                 }
532
533                 *wp = obj;
534                 wp++;
535                 nobj++;
536         }
537
538         // If another proc wants a pointer, give it some.
539         if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
540                 wbuf->nobj = nobj;
541                 wbuf = handoff(wbuf);
542                 nobj = wbuf->nobj;
543                 wp = wbuf->obj + nobj;
544         }
545
546         *_wp = wp;
547         *_wbuf = wbuf;
548         *_nobj = nobj;
549 }
550
551 // Program that scans the whole block and treats every block element as a potential pointer
552 static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
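// The first word of a GC program is the object/element size (pc[0]); the
// opcodes follow.  scanblock skips the size word (pc++) before interpreting
// the opcodes, so defaultProg simply executes GC_DEFAULT_PTR, which walks the
// block one word at a time and treats every word as a potential pointer.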
553
554 #if 0
555 // Hashmap iterator program
556 static uintptr mapProg[2] = {0, GC_MAP_NEXT};
557
558 // Hchan program
559 static uintptr chanProg[2] = {0, GC_CHAN};
560 #endif
561
562 // Local variables of a program fragment or loop
563 typedef struct Frame Frame;
564 struct Frame {
565         uintptr count, elemsize, b;
566         uintptr *loop_or_ret;
567 };
568
569 // Sanity check for the derived type info objti.
570 static void
571 checkptr(void *obj, uintptr objti)
572 {
573         uintptr type, tisize, i, x;
574         byte *objstart;
575         Type *t;
576         MSpan *s;
577
578         if(!Debug)
579                 runtime_throw("checkptr is debug only");
580
581         if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used)
582                 return;
583         type = runtime_gettype(obj);
584         t = (Type*)(type & ~(uintptr)(PtrSize-1));
585         if(t == nil)
586                 return;
587         x = (uintptr)obj >> PageShift;
588         if(sizeof(void*) == 8)
589                 x -= (uintptr)(runtime_mheap->arena_start)>>PageShift;
590         s = runtime_mheap->map[x];
591         objstart = (byte*)((uintptr)s->start<<PageShift);
592         if(s->sizeclass != 0) {
593                 i = ((byte*)obj - objstart)/s->elemsize;
594                 objstart += i*s->elemsize;
595         }
596         tisize = *(uintptr*)objti;
597         // Sanity check for object size: it should fit into the memory block.
598         if((byte*)obj + tisize > objstart + s->elemsize)
599                 runtime_throw("invalid gc type info");
600         if(obj != objstart)
601                 return;
602         // If obj points to the beginning of the memory block,
603         // check type info as well.
604         if(t->string == nil ||
605                 // Gob allocates unsafe pointers for indirection.
606                 (runtime_strcmp((const char *)t->string->str, (const char*)"unsafe.Pointer") &&
607                 // Runtime and gc think differently about closures.
608                  runtime_strstr((const char *)t->string->str, (const char*)"struct { F uintptr") != (const char *)t->string->str)) {
609 #if 0
610                 pc1 = (uintptr*)objti;
611                 pc2 = (uintptr*)t->gc;
612                 // A simple best-effort check until first GC_END.
613                 for(j = 1; pc1[j] != GC_END && pc2[j] != GC_END; j++) {
614                         if(pc1[j] != pc2[j]) {
615                                 runtime_printf("invalid gc type info for '%s' at %p, type info %p, block info %p\n",
616                                         t->string ? (const int8*)t->string->str : (const int8*)"?", j, pc1[j], pc2[j]);
617                                 runtime_throw("invalid gc type info");
618                         }
619                 }
620 #endif
621         }
622 }
623
624 // scanblock scans a block of n bytes starting at pointer b for references
625 // to other objects, scanning any it finds recursively until there are no
626 // unscanned objects left.  Instead of using an explicit recursion, it keeps
627 // a work list in the Workbuf* structures and loops in the main function
628 // body.  Keeping an explicit work list is easier on the stack allocator and
629 // more efficient.
630 //
631 // wbuf: current work buffer
632 // wp:   storage for next queued pointer (write pointer)
633 // nobj: number of queued objects
634 static void
635 scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking)
636 {
637         byte *b, *arena_start, *arena_used;
638         uintptr n, i, end_b, elemsize, size, ti, objti, count /* , type */;
639         uintptr *pc, precise_type, nominal_size;
640 #if 0
641         uintptr *map_ret, mapkey_size, mapval_size, mapkey_ti, mapval_ti, *chan_ret, chancap;
642 #endif
643         void *obj;
644         const Type *t;
645         Slice *sliceptr;
646         Frame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4];
647         BufferList *scanbuffers;
648         PtrTarget *ptrbuf, *ptrbuf_end, *ptrbufpos;
649         Obj *objbuf, *objbuf_end, *objbufpos;
650         Eface *eface;
651         Iface *iface;
652 #if 0
653         Hmap *hmap;
654         MapType *maptype;
655         bool mapkey_kind, mapval_kind;
656         struct hash_gciter map_iter;
657         struct hash_gciter_data d;
658         Hchan *chan;
659         ChanType *chantype;
660 #endif
661
662         if(sizeof(Workbuf) % PageSize != 0)
663                 runtime_throw("scanblock: size of Workbuf is suboptimal");
664
665         // Memory arena parameters.
666         arena_start = runtime_mheap->arena_start;
667         arena_used = runtime_mheap->arena_used;
668
669         stack_ptr = stack+nelem(stack)-1;
670         
671         precise_type = false;
672         nominal_size = 0;
673
674         // Allocate ptrbuf
675         {
676                 scanbuffers = &bufferList[runtime_m()->helpgc];
677                 ptrbuf = &scanbuffers->ptrtarget[0];
678                 ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget);
679                 objbuf = &scanbuffers->obj[0];
680                 objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj);
681         }
682
683         ptrbufpos = ptrbuf;
684         objbufpos = objbuf;
685
686         // (Silence the compiler)
687 #if 0
688         map_ret = nil;
689         mapkey_size = mapval_size = 0;
690         mapkey_kind = mapval_kind = false;
691         mapkey_ti = mapval_ti = 0;
692         chan = nil;
693         chantype = nil;
694         chan_ret = nil;
695 #endif
696
697         goto next_block;
698
699         for(;;) {
700                 // Each iteration scans the block b of length n, queueing pointers in
701                 // the work buffer.
702                 if(Debug > 1) {
703                         runtime_printf("scanblock %p %D\n", b, (int64)n);
704                 }
705
706                 if(CollectStats) {
707                         runtime_xadd64(&gcstats.nbytes, n);
708                         runtime_xadd64(&gcstats.obj.sum, nobj);
709                         runtime_xadd64(&gcstats.obj.cnt, 1);
710                 }
711
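                // NOTE: the "&& false" in the two conditions below disables
                // precise type-information scanning in this gccgo port, so pc
                // always ends up pointing at defaultProg and every block is
                // scanned conservatively.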
712                 if(ti != 0 && false) {
713                         pc = (uintptr*)(ti & ~(uintptr)PC_BITS);
714                         precise_type = (ti & PRECISE);
715                         stack_top.elemsize = pc[0];
716                         if(!precise_type)
717                                 nominal_size = pc[0];
718                         if(ti & LOOP) {
719                                 stack_top.count = 0;    // 0 means an infinite number of iterations
720                                 stack_top.loop_or_ret = pc+1;
721                         } else {
722                                 stack_top.count = 1;
723                         }
724                         if(Debug) {
725                                 // Simple sanity check for provided type info ti:
726                                 // The declared size of the object must not be larger than the actual size
727                                 // (it can be smaller due to inferior pointers).
728                                 // It's difficult to make a comprehensive check due to inferior pointers,
729                                 // reflection, gob, etc.
730                                 if(pc[0] > n) {
731                                         runtime_printf("invalid gc type info: type info size %p, block size %p\n", pc[0], n);
732                                         runtime_throw("invalid gc type info");
733                                 }
734                         }
735                 } else if(UseSpanType && false) {
736                         if(CollectStats)
737                                 runtime_xadd64(&gcstats.obj.notype, 1);
738
739 #if 0
740                         type = runtime_gettype(b);
741                         if(type != 0) {
742                                 if(CollectStats)
743                                         runtime_xadd64(&gcstats.obj.typelookup, 1);
744
745                                 t = (Type*)(type & ~(uintptr)(PtrSize-1));
746                                 switch(type & (PtrSize-1)) {
747                                 case TypeInfo_SingleObject:
748                                         pc = (uintptr*)t->gc;
749                                         precise_type = true;  // type information about 'b' is precise
750                                         stack_top.count = 1;
751                                         stack_top.elemsize = pc[0];
752                                         break;
753                                 case TypeInfo_Array:
754                                         pc = (uintptr*)t->gc;
755                                         if(pc[0] == 0)
756                                                 goto next_block;
757                                         precise_type = true;  // type information about 'b' is precise
758                                         stack_top.count = 0;  // 0 means an infinite number of iterations
759                                         stack_top.elemsize = pc[0];
760                                         stack_top.loop_or_ret = pc+1;
761                                         break;
762                                 case TypeInfo_Map:
763                                         hmap = (Hmap*)b;
764                                         maptype = (MapType*)t;
765                                         if(hash_gciter_init(hmap, &map_iter)) {
766                                                 mapkey_size = maptype->key->size;
767                                                 mapkey_kind = maptype->key->kind;
768                                                 mapkey_ti   = (uintptr)maptype->key->gc | PRECISE;
769                                                 mapval_size = maptype->elem->size;
770                                                 mapval_kind = maptype->elem->kind;
771                                                 mapval_ti   = (uintptr)maptype->elem->gc | PRECISE;
772
773                                                 map_ret = nil;
774                                                 pc = mapProg;
775                                         } else {
776                                                 goto next_block;
777                                         }
778                                         break;
779                                 case TypeInfo_Chan:
780                                         chan = (Hchan*)b;
781                                         chantype = (ChanType*)t;
782                                         chan_ret = nil;
783                                         pc = chanProg;
784                                         break;
785                                 default:
786                                         runtime_throw("scanblock: invalid type");
787                                         return;
788                                 }
789                         } else {
790                                 pc = defaultProg;
791                         }
792 #endif
793                 } else {
794                         pc = defaultProg;
795                 }
796
797                 if(IgnorePreciseGC)
798                         pc = defaultProg;
799
800                 pc++;
801                 stack_top.b = (uintptr)b;
802
803                 end_b = (uintptr)b + n - PtrSize;
804
805         for(;;) {
806                 if(CollectStats)
807                         runtime_xadd64(&gcstats.instr[pc[0]], 1);
808
809                 obj = nil;
810                 objti = 0;
811                 switch(pc[0]) {
812                 case GC_PTR:
813                         obj = *(void**)(stack_top.b + pc[1]);
814                         objti = pc[2];
815                         pc += 3;
816                         if(Debug)
817                                 checkptr(obj, objti);
818                         break;
819
820                 case GC_SLICE:
821                         sliceptr = (Slice*)(stack_top.b + pc[1]);
822                         if(sliceptr->cap != 0) {
823                                 obj = sliceptr->array;
824                                 // Can't use slice element type for scanning,
825                                 // because if it points to an array embedded
826                                 // in the beginning of a struct,
827                                 // we will scan the whole struct as the slice.
828                                 // So just obtain type info from heap.
829                         }
830                         pc += 3;
831                         break;
832
833                 case GC_APTR:
834                         obj = *(void**)(stack_top.b + pc[1]);
835                         pc += 2;
836                         break;
837
838                 case GC_STRING:
839                         obj = *(void**)(stack_top.b + pc[1]);
840                         markonly(obj);
841                         pc += 2;
842                         continue;
843
844                 case GC_EFACE:
845                         eface = (Eface*)(stack_top.b + pc[1]);
846                         pc += 2;
847                         if(eface->type == nil)
848                                 continue;
849
850                         // eface->type
851                         t = eface->type;
852                         if((const byte*)t >= arena_start && (const byte*)t < arena_used) {
853                                 union { const Type *tc; Type *tr; } u;
854                                 u.tc = t;
855                                 *ptrbufpos++ = (struct PtrTarget){(void*)u.tr, 0};
856                                 if(ptrbufpos == ptrbuf_end)
857                                         flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj);
858                         }
859
860                         // eface->__object
861                         if((byte*)eface->__object >= arena_start && (byte*)eface->__object < arena_used) {
862                                 if(t->__size <= sizeof(void*)) {
863                                         if((t->kind & KindNoPointers))
864                                                 continue;
865
866                                         obj = eface->__object;
867                                         if((t->kind & ~KindNoPointers) == KindPtr)
868                                                 // objti = (uintptr)((PtrType*)t)->elem->gc;
869                                                 objti = 0;
870                                 } else {
871                                         obj = eface->__object;
872                                         // objti = (uintptr)t->gc;
873                                         objti = 0;
874                                 }
875                         }
876                         break;
877
878                 case GC_IFACE:
879                         iface = (Iface*)(stack_top.b + pc[1]);
880                         pc += 2;
881                         if(iface->tab == nil)
882                                 continue;
883                         
884                         // iface->tab
885                         if((byte*)iface->tab >= arena_start && (byte*)iface->tab < arena_used) {
886                                 // *ptrbufpos++ = (struct PtrTarget){iface->tab, (uintptr)itabtype->gc};
887                                 *ptrbufpos++ = (struct PtrTarget){iface->tab, 0};
888                                 if(ptrbufpos == ptrbuf_end)
889                                         flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj);
890                         }
891
892                         // iface->data
893                         if((byte*)iface->__object >= arena_start && (byte*)iface->__object < arena_used) {
894                                 // t = iface->tab->type;
895                                 t = nil;
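                                // The real lookup (iface->tab->type) is commented out
                                // and t is left nil.  In practice this case is not
                                // reached: with precise type info disabled above,
                                // scanblock only executes defaultProg/GC_DEFAULT_PTR,
                                // so the nil t is never dereferenced.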
896                                 if(t->__size <= sizeof(void*)) {
897                                         if((t->kind & KindNoPointers))
898                                                 continue;
899
900                                         obj = iface->__object;
901                                         if((t->kind & ~KindNoPointers) == KindPtr)
902                                                 // objti = (uintptr)((const PtrType*)t)->elem->gc;
903                                                 objti = 0;
904                                 } else {
905                                         obj = iface->__object;
906                                         // objti = (uintptr)t->gc;
907                                         objti = 0;
908                                 }
909                         }
910                         break;
911
912                 case GC_DEFAULT_PTR:
913                         while(stack_top.b <= end_b) {
914                                 obj = *(byte**)stack_top.b;
915                                 stack_top.b += PtrSize;
916                                 if((byte*)obj >= arena_start && (byte*)obj < arena_used) {
917                                         *ptrbufpos++ = (struct PtrTarget){obj, 0};
918                                         if(ptrbufpos == ptrbuf_end)
919                                                 flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj);
920                                 }
921                         }
922                         goto next_block;
923
924                 case GC_END:
925                         if(--stack_top.count != 0) {
926                                 // Next iteration of a loop if possible.
927                                 stack_top.b += stack_top.elemsize;
928                                 if(stack_top.b + stack_top.elemsize <= end_b+PtrSize) {
929                                         pc = stack_top.loop_or_ret;
930                                         continue;
931                                 }
932                                 i = stack_top.b;
933                         } else {
934                                 // Stack pop if possible.
935                                 if(stack_ptr+1 < stack+nelem(stack)) {
936                                         pc = stack_top.loop_or_ret;
937                                         stack_top = *(++stack_ptr);
938                                         continue;
939                                 }
940                                 i = (uintptr)b + nominal_size;
941                         }
942                         if(!precise_type) {
943                                 // Quickly scan [b+i,b+n) for possible pointers.
944                                 for(; i<=end_b; i+=PtrSize) {
945                                         if(*(byte**)i != nil) {
946                                                 // Found a value that may be a pointer.
947                                                 // Do a rescan of the entire block.
948                                                 enqueue((Obj){b, n, 0}, &wbuf, &wp, &nobj);
949                                                 if(CollectStats) {
950                                                         runtime_xadd64(&gcstats.rescan, 1);
951                                                         runtime_xadd64(&gcstats.rescanbytes, n);
952                                                 }
953                                                 break;
954                                         }
955                                 }
956                         }
957                         goto next_block;
958
959                 case GC_ARRAY_START:
960                         i = stack_top.b + pc[1];
961                         count = pc[2];
962                         elemsize = pc[3];
963                         pc += 4;
964
965                         // Stack push.
966                         *stack_ptr-- = stack_top;
967                         stack_top = (Frame){count, elemsize, i, pc};
968                         continue;
969
970                 case GC_ARRAY_NEXT:
971                         if(--stack_top.count != 0) {
972                                 stack_top.b += stack_top.elemsize;
973                                 pc = stack_top.loop_or_ret;
974                         } else {
975                                 // Stack pop.
976                                 stack_top = *(++stack_ptr);
977                                 pc += 1;
978                         }
979                         continue;
980
981                 case GC_CALL:
982                         // Stack push.
983                         *stack_ptr-- = stack_top;
984                         stack_top = (Frame){1, 0, stack_top.b + pc[1], pc+3 /*return address*/};
985                         pc = (uintptr*)((byte*)pc + *(int32*)(pc+2));  // target of the CALL instruction
986                         continue;
987
988 #if 0
989                 case GC_MAP_PTR:
990                         hmap = *(Hmap**)(stack_top.b + pc[1]);
991                         if(hmap == nil) {
992                                 pc += 3;
993                                 continue;
994                         }
995                         if(markonly(hmap)) {
996                                 maptype = (MapType*)pc[2];
997                                 if(hash_gciter_init(hmap, &map_iter)) {
998                                         mapkey_size = maptype->key->size;
999                                         mapkey_kind = maptype->key->kind;
1000                                         mapkey_ti   = (uintptr)maptype->key->gc | PRECISE;
1001                                         mapval_size = maptype->elem->size;
1002                                         mapval_kind = maptype->elem->kind;
1003                                         mapval_ti   = (uintptr)maptype->elem->gc | PRECISE;
1004
1005                                         // Start mapProg.
1006                                         map_ret = pc+3;
1007                                         pc = mapProg+1;
1008                                 } else {
1009                                         pc += 3;
1010                                 }
1011                         } else {
1012                                 pc += 3;
1013                         }
1014                         continue;
1015
1016                 case GC_MAP_NEXT:
1017                         // Add all keys and values to buffers, mark all subtables.
1018                         while(hash_gciter_next(&map_iter, &d)) {
1019                                 // buffers: reserve space for 2 objects.
1020                                 if(ptrbufpos+2 >= ptrbuf_end)
1021                                         flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj);
1022                                 if(objbufpos+2 >= objbuf_end)
1023                                         flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj);
1024
1025                                 if(d.st != nil)
1026                                         markonly(d.st);
1027
1028                                 if(d.key_data != nil) {
1029                                         if(!(mapkey_kind & KindNoPointers) || d.indirectkey) {
1030                                                 if(!d.indirectkey)
1031                                                         *objbufpos++ = (Obj){d.key_data, mapkey_size, mapkey_ti};
1032                                                 else {
1033                                                         if(Debug) {
1034                                                                 obj = *(void**)d.key_data;
1035                                                                 if(!(arena_start <= obj && obj < arena_used))
1036                                                                         runtime_throw("scanblock: inconsistent hashmap");
1037                                                         }
1038                                                         *ptrbufpos++ = (struct PtrTarget){*(void**)d.key_data, mapkey_ti};
1039                                                 }
1040                                         }
1041                                         if(!(mapval_kind & KindNoPointers) || d.indirectval) {
1042                                                 if(!d.indirectval)
1043                                                         *objbufpos++ = (Obj){d.val_data, mapval_size, mapval_ti};
1044                                                 else {
1045                                                         if(Debug) {
1046                                                                 obj = *(void**)d.val_data;
1047                                                                 if(!(arena_start <= obj && obj < arena_used))
1048                                                                         runtime_throw("scanblock: inconsistent hashmap");
1049                                                         }
1050                                                         *ptrbufpos++ = (struct PtrTarget){*(void**)d.val_data, mapval_ti};
1051                                                 }
1052                                         }
1053                                 }
1054                         }
1055                         if(map_ret == nil)
1056                                 goto next_block;
1057                         pc = map_ret;
1058                         continue;
1059 #endif
1060
1061                 case GC_REGION:
1062                         obj = (void*)(stack_top.b + pc[1]);
1063                         size = pc[2];
1064                         objti = pc[3];
1065                         pc += 4;
1066
1067                         *objbufpos++ = (Obj){obj, size, objti};
1068                         if(objbufpos == objbuf_end)
1069                                 flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj);
1070                         continue;
1071
1072 #if 0
1073                 case GC_CHAN_PTR:
1074                         // Similar to GC_MAP_PTR
1075                         chan = *(Hchan**)(stack_top.b + pc[1]);
1076                         if(chan == nil) {
1077                                 pc += 3;
1078                                 continue;
1079                         }
1080                         if(markonly(chan)) {
1081                                 chantype = (ChanType*)pc[2];
1082                                 if(!(chantype->elem->kind & KindNoPointers)) {
1083                                         // Start chanProg.
1084                                         chan_ret = pc+3;
1085                                         pc = chanProg+1;
1086                                         continue;
1087                                 }
1088                         }
1089                         pc += 3;
1090                         continue;
1091
1092                 case GC_CHAN:
1093                         // There are no heap pointers in struct Hchan,
1094                         // so we can ignore the leading sizeof(Hchan) bytes.
1095                         if(!(chantype->elem->kind & KindNoPointers)) {
1096                                 // Channel's buffer follows Hchan immediately in memory.
1097                                 // Size of buffer (cap(c)) is second int in the chan struct.
1098                                 chancap = ((uintgo*)chan)[1];
1099                                 if(chancap > 0) {
1100                                         // TODO(atom): split into two chunks so that only the
1101                                         // in-use part of the circular buffer is scanned.
1102                                         // (Channel routines zero the unused part, so the current
1103                                         // code does not lead to leaks, it's just a little inefficient.)
1104                                         *objbufpos++ = (Obj){(byte*)chan+runtime_Hchansize, chancap*chantype->elem->size,
1105                                                 (uintptr)chantype->elem->gc | PRECISE | LOOP};
1106                                         if(objbufpos == objbuf_end)
1107                                                 flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj);
1108                                 }
1109                         }
1110                         if(chan_ret == nil)
1111                                 goto next_block;
1112                         pc = chan_ret;
1113                         continue;
1114 #endif
1115
1116                 default:
1117                         runtime_throw("scanblock: invalid GC instruction");
1118                         return;
1119                 }
1120
1121                 if((byte*)obj >= arena_start && (byte*)obj < arena_used) {
1122                         *ptrbufpos++ = (struct PtrTarget){obj, objti};
1123                         if(ptrbufpos == ptrbuf_end)
1124                                 flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj);
1125                 }
1126         }
1127
1128         next_block:
1129                 // Done scanning [b, b+n).  Prepare for the next iteration of
1130                 // the loop by setting b, n, ti to the parameters for the next block.
1131
1132                 if(nobj == 0) {
1133                         flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj);
1134                         flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj);
1135
1136                         if(nobj == 0) {
1137                                 if(!keepworking) {
1138                                         if(wbuf)
1139                                                 putempty(wbuf);
1140                                         goto endscan;
1141                                 }
1142                                 // Emptied our buffer: refill.
1143                                 wbuf = getfull(wbuf);
1144                                 if(wbuf == nil)
1145                                         goto endscan;
1146                                 nobj = wbuf->nobj;
1147                                 wp = wbuf->obj + wbuf->nobj;
1148                         }
1149                 }
1150
1151                 // Fetch b from the work buffer.
1152                 --wp;
1153                 b = wp->p;
1154                 n = wp->n;
1155                 ti = wp->ti;
1156                 nobj--;
1157         }
1158
1159 endscan:;
1160 }
1161
1162 // debug_scanblock is the debug copy of scanblock.
1163 // It is simpler, slower, single-threaded, recursive,
1164 // and uses bitSpecial as the mark bit.
1165 static void
1166 debug_scanblock(byte *b, uintptr n)
1167 {
1168         byte *obj, *p;
1169         void **vp;
1170         uintptr size, *bitp, bits, shift, i, xbits, off;
1171         MSpan *s;
1172
1173         if(!DebugMark)
1174                 runtime_throw("debug_scanblock without DebugMark");
1175
1176         if((intptr)n < 0) {
1177                 runtime_printf("debug_scanblock %p %D\n", b, (int64)n);
1178                 runtime_throw("debug_scanblock");
1179         }
1180
1181         // Align b to a word boundary.
1182         off = (uintptr)b & (PtrSize-1);
1183         if(off != 0) {
1184                 b += PtrSize - off;
1185                 n -= PtrSize - off;
1186         }
1187
1188         vp = (void**)b;
1189         n /= PtrSize;
1190         for(i=0; i<(uintptr)n; i++) {
1191                 obj = (byte*)vp[i];
1192
1193                 // Words outside the arena cannot be pointers.
1194                 if((byte*)obj < runtime_mheap->arena_start || (byte*)obj >= runtime_mheap->arena_used)
1195                         continue;
1196
1197                 // Round down to word boundary.
1198                 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
1199
1200                 // Consult span table to find beginning.
1201                 s = runtime_MHeap_LookupMaybe(runtime_mheap, obj);
1202                 if(s == nil)
1203                         continue;
1204
1205                 p =  (byte*)((uintptr)s->start<<PageShift);
1206                 size = s->elemsize;
1207                 if(s->sizeclass == 0) {
1208                         obj = p;
1209                 } else {
1210                         if((byte*)obj >= (byte*)s->limit)
1211                                 continue;
1212                         int32 i = ((byte*)obj - p)/size;
1213                         obj = p+i*size;
1214                 }
1215
1216                 // Now that we know the object header, reload bits.
1217                 off = (uintptr*)obj - (uintptr*)runtime_mheap->arena_start;
1218                 bitp = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
1219                 shift = off % wordsPerBitmapWord;
1220                 xbits = *bitp;
1221                 bits = xbits >> shift;
1222
1223                 // Now we have bits, bitp, and shift correct for
1224                 // obj pointing at the base of the object.
1225                 // If not allocated or already marked, done.
1226                 if((bits & bitAllocated) == 0 || (bits & bitSpecial) != 0)  // NOTE: bitSpecial not bitMarked
1227                         continue;
1228                 *bitp |= bitSpecial<<shift;
1229                 if(!(bits & bitMarked))
1230                         runtime_printf("found unmarked block %p in %p\n", obj, vp+i);
1231
1232                 // If object has no pointers, don't need to scan further.
1233                 if((bits & bitNoPointers) != 0)
1234                         continue;
1235
1236                 debug_scanblock(obj, size);
1237         }
1238 }
1239
1240 // Append obj to the work buffer.
1241 // _wbuf, _wp, _nobj are input/output parameters specifying the work buffer.
1242 static void
1243 enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
1244 {
1245         uintptr nobj, off;
1246         Obj *wp;
1247         Workbuf *wbuf;
1248
1249         if(Debug > 1)
1250                 runtime_printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
1251
1252         // Align obj.b to a word boundary.
1253         off = (uintptr)obj.p & (PtrSize-1);
1254         if(off != 0) {
1255                 obj.p += PtrSize - off;
1256                 obj.n -= PtrSize - off;
1257                 obj.ti = 0;
1258         }
1259
1260         if(obj.p == nil || obj.n == 0)
1261                 return;
1262
1263         // Load work buffer state
1264         wp = *_wp;
1265         wbuf = *_wbuf;
1266         nobj = *_nobj;
1267
1268         // If another proc wants a pointer, give it some.
1269         if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
1270                 wbuf->nobj = nobj;
1271                 wbuf = handoff(wbuf);
1272                 nobj = wbuf->nobj;
1273                 wp = wbuf->obj + nobj;
1274         }
1275
1276         // If buffer is full, get a new one.
1277         if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
1278                 if(wbuf != nil)
1279                         wbuf->nobj = nobj;
1280                 wbuf = getempty(wbuf);
1281                 wp = wbuf->obj;
1282                 nobj = 0;
1283         }
1284
1285         *wp = obj;
1286         wp++;
1287         nobj++;
1288
1289         // Save work buffer state
1290         *_wp = wp;
1291         *_wbuf = wbuf;
1292         *_nobj = nobj;
1293 }
1294
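// markroot scans a single root: it seeds an empty work buffer with
// work.roots[i] via enqueue and drains it with scanblock. This is the
// typical enqueue usage: start with wbuf = nil, wp = nil, nobj = 0,
// append objects, then scan.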
1295 static void
1296 markroot(ParFor *desc, uint32 i)
1297 {
1298         Obj *wp;
1299         Workbuf *wbuf;
1300         uintptr nobj;
1301
1302         USED(&desc);
1303         wp = nil;
1304         wbuf = nil;
1305         nobj = 0;
1306         enqueue(work.roots[i], &wbuf, &wp, &nobj);
1307         scanblock(wbuf, wp, nobj, false);
1308 }
1309
1310 // Get an empty work buffer off the work.empty list,
1311 // allocating new buffers as needed.
1312 static Workbuf*
1313 getempty(Workbuf *b)
1314 {
1315         if(b != nil)
1316                 runtime_lfstackpush(&work.full, &b->node);
1317         b = (Workbuf*)runtime_lfstackpop(&work.empty);
1318         if(b == nil) {
1319                 // Need to allocate.
1320                 runtime_lock(&work);
1321                 if(work.nchunk < sizeof *b) {
1322                         work.nchunk = 1<<20;
1323                         work.chunk = runtime_SysAlloc(work.nchunk);
1324                         if(work.chunk == nil)
1325                                 runtime_throw("runtime: cannot allocate memory");
1326                 }
1327                 b = (Workbuf*)work.chunk;
1328                 work.chunk += sizeof *b;
1329                 work.nchunk -= sizeof *b;
1330                 runtime_unlock(&work);
1331         }
1332         b->nobj = 0;
1333         return b;
1334 }
1335
1336 static void
1337 putempty(Workbuf *b)
1338 {
1339         if(CollectStats)
1340                 runtime_xadd64(&gcstats.putempty, 1);
1341
1342         runtime_lfstackpush(&work.empty, &b->node);
1343 }
1344
1345 // Get a full work buffer off the work.full list, or return nil.
1346 static Workbuf*
1347 getfull(Workbuf *b)
1348 {
1349         M *m;
1350         int32 i;
1351
1352         if(CollectStats)
1353                 runtime_xadd64(&gcstats.getfull, 1);
1354
1355         if(b != nil)
1356                 runtime_lfstackpush(&work.empty, &b->node);
1357         b = (Workbuf*)runtime_lfstackpop(&work.full);
1358         if(b != nil || work.nproc == 1)
1359                 return b;
1360
1361         m = runtime_m();
1362         runtime_xadd(&work.nwait, +1);
1363         for(i=0;; i++) {
1364                 if(work.full != 0) {
1365                         runtime_xadd(&work.nwait, -1);
1366                         b = (Workbuf*)runtime_lfstackpop(&work.full);
1367                         if(b != nil)
1368                                 return b;
1369                         runtime_xadd(&work.nwait, +1);
1370                 }
1371                 if(work.nwait == work.nproc)
1372                         return nil;
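                // Back off progressively: spin (procyield) for the first 10
                // iterations, yield the OS thread for the next 10, then sleep
                // 100us per iteration until work appears or all procs are waiting.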
1373                 if(i < 10) {
1374                         m->gcstats.nprocyield++;
1375                         runtime_procyield(20);
1376                 } else if(i < 20) {
1377                         m->gcstats.nosyield++;
1378                         runtime_osyield();
1379                 } else {
1380                         m->gcstats.nsleep++;
1381                         runtime_usleep(100);
1382                 }
1383         }
1384 }
1385
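// handoff splits b in two: the upper half of its objects moves to a fresh
// buffer, which is returned for this proc to keep working on, while b
// (holding the lower half) is pushed onto work.full so another proc can
// steal it.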
1386 static Workbuf*
1387 handoff(Workbuf *b)
1388 {
1389         M *m;
1390         int32 n;
1391         Workbuf *b1;
1392
1393         m = runtime_m();
1394
1395         // Make new buffer with half of b's pointers.
1396         b1 = getempty(nil);
1397         n = b->nobj/2;
1398         b->nobj -= n;
1399         b1->nobj = n;
1400         runtime_memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
1401         m->gcstats.nhandoff++;
1402         m->gcstats.nhandoffcnt += n;
1403
1404         // Put b on full list - let first half of b get stolen.
1405         runtime_lfstackpush(&work.full, &b->node);
1406         return b1;
1407 }
1408
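// addroot records obj in the global root list, growing work.roots
// (doubling its capacity, starting at one page of Objs) as needed.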
1409 static void
1410 addroot(Obj obj)
1411 {
1412         uint32 cap;
1413         Obj *new;
1414
1415         if(work.nroot >= work.rootcap) {
1416                 cap = PageSize/sizeof(Obj);
1417                 if(cap < 2*work.rootcap)
1418                         cap = 2*work.rootcap;
1419                 new = (Obj*)runtime_SysAlloc(cap*sizeof(Obj));
1420                 if(new == nil)
1421                         runtime_throw("runtime: cannot allocate memory");
1422                 if(work.roots != nil) {
1423                         runtime_memmove(new, work.roots, work.rootcap*sizeof(Obj));
1424                         runtime_SysFree(work.roots, work.rootcap*sizeof(Obj));
1425                 }
1426                 work.roots = new;
1427                 work.rootcap = cap;
1428         }
1429         work.roots[work.nroot] = obj;
1430         work.nroot++;
1431 }
1432
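// addstackroots adds gp's stack to the root set. With split stacks each
// stack segment is discovered through the __splitstack_find interfaces;
// otherwise a single [bottom, top) range is added, in whichever direction
// the stack grows.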
1433 static void
1434 addstackroots(G *gp)
1435 {
1436 #ifdef USING_SPLIT_STACK
1437         M *mp;
1438         void* sp;
1439         size_t spsize;
1440         void* next_segment;
1441         void* next_sp;
1442         void* initial_sp;
1443
1444         if(gp == runtime_g()) {
1445                 // Scanning our own stack.
1446                 sp = __splitstack_find(nil, nil, &spsize, &next_segment,
1447                                        &next_sp, &initial_sp);
1448         } else if((mp = gp->m) != nil && mp->helpgc) {
1449                 // gchelper's stack is in active use and has no interesting pointers.
1450                 return;
1451         } else {
1452                 // Scanning another goroutine's stack.
1453                 // The goroutine is usually asleep (the world is stopped).
1454
1455                 // The exception is that if the goroutine is about to enter or might
1456                 // have just exited a system call, it may be executing code such
1457                 // as schedlock and may have needed to start a new stack segment.
1458                 // Use the stack segment and stack pointer at the time of
1459                 // the system call instead, since that won't change underfoot.
1460                 if(gp->gcstack != nil) {
1461                         sp = gp->gcstack;
1462                         spsize = gp->gcstack_size;
1463                         next_segment = gp->gcnext_segment;
1464                         next_sp = gp->gcnext_sp;
1465                         initial_sp = gp->gcinitial_sp;
1466                 } else {
1467                         sp = __splitstack_find_context(&gp->stack_context[0],
1468                                                        &spsize, &next_segment,
1469                                                        &next_sp, &initial_sp);
1470                 }
1471         }
1472         if(sp != nil) {
1473                 addroot((Obj){sp, spsize, 0});
1474                 while((sp = __splitstack_find(next_segment, next_sp,
1475                                               &spsize, &next_segment,
1476                                               &next_sp, &initial_sp)) != nil)
1477                         addroot((Obj){sp, spsize, 0});
1478         }
1479 #else
1480         M *mp;
1481         byte* bottom;
1482         byte* top;
1483
1484         if(gp == runtime_g()) {
1485                 // Scanning our own stack.
1486                 bottom = (byte*)&gp;
1487         } else if((mp = gp->m) != nil && mp->helpgc) {
1488                 // gchelper's stack is in active use and has no interesting pointers.
1489                 return;
1490         } else {
1491                 // Scanning another goroutine's stack.
1492                 // The goroutine is usually asleep (the world is stopped).
1493                 bottom = (byte*)gp->gcnext_sp;
1494                 if(bottom == nil)
1495                         return;
1496         }
1497         top = (byte*)gp->gcinitial_sp + gp->gcstack_size;
1498         if(top > bottom)
1499                 addroot((Obj){bottom, top - bottom, 0});
1500         else
1501                 addroot((Obj){top, bottom - top, 0});
1502 #endif
1503 }
1504
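// addfinroots is called (via runtime_walkfintab) for each object that has a
// finalizer registered. It looks up the object's block and adds the whole
// block as a root, so whatever the block points at stays reachable.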
1505 static void
1506 addfinroots(void *v)
1507 {
1508         uintptr size;
1509         void *base;
1510
1511         size = 0;
1512         if(!runtime_mlookup(v, (byte**)&base, &size, nil) || !runtime_blockspecial(base))
1513                 runtime_throw("mark - finalizer inconsistency");
1514
1515         // do not mark the finalizer block itself.  just mark the things it points at.
1516         addroot((Obj){base, size, 0});
1517 }
1518
1519 static struct root_list* roots;
1520
1521 void
1522 __go_register_gc_roots (struct root_list* r)
1523 {
1524         // FIXME: This needs locking if multiple goroutines can call
1525         // dlopen simultaneously.
1526         r->next = roots;
1527         roots = r;
1528 }
1529
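// addroots builds the root set for this collection: registered data/bss
// ranges, runtime globals (m0, g0, allg, allm, allp), scheduler, profiler,
// and timer data, goroutine stacks, finalizer targets and the queued
// finalizer arrays, and the work structure itself. It also marks the
// MTypes_Words/MTypes_Bytes type-table blocks so the tables are retained.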
1530 static void
1531 addroots(void)
1532 {
1533         struct root_list *pl;
1534         G *gp;
1535         FinBlock *fb;
1536         MSpan *s, **allspans;
1537         uint32 spanidx;
1538
1539         work.nroot = 0;
1540
1541         // mark data+bss.
1542         for(pl = roots; pl != nil; pl = pl->next) {
1543                 struct root* pr = &pl->roots[0];
1544                 while(1) {
1545                         void *decl = pr->decl;
1546                         if(decl == nil)
1547                                 break;
1548                         addroot((Obj){decl, pr->size, 0});
1549                         pr++;
1550                 }
1551         }
1552
1553         addroot((Obj){(byte*)&runtime_m0, sizeof runtime_m0, 0});
1554         addroot((Obj){(byte*)&runtime_g0, sizeof runtime_g0, 0});
1555         addroot((Obj){(byte*)&runtime_allg, sizeof runtime_allg, 0});
1556         addroot((Obj){(byte*)&runtime_allm, sizeof runtime_allm, 0});
1557         addroot((Obj){(byte*)&runtime_allp, sizeof runtime_allp, 0});
1558         runtime_proc_scan(addroot);
1559         runtime_MProf_Mark(addroot);
1560         runtime_time_scan(addroot);
1561
1562         // MSpan.types
1563         allspans = runtime_mheap->allspans;
1564         for(spanidx=0; spanidx<runtime_mheap->nspan; spanidx++) {
1565                 s = allspans[spanidx];
1566                 if(s->state == MSpanInUse) {
1567                         // The garbage collector ignores type pointers stored in MSpan.types:
1568                         //  - Compiler-generated types are stored outside of heap.
1569                         //  - The reflect package has runtime-generated types cached in its data structures.
1570                         //    The garbage collector relies on finding the references via that cache.
1571                         switch(s->types.compression) {
1572                         case MTypes_Empty:
1573                         case MTypes_Single:
1574                                 break;
1575                         case MTypes_Words:
1576                         case MTypes_Bytes:
1577                                 markonly((byte*)s->types.data);
1578                                 break;
1579                         }
1580                 }
1581         }
1582
1583         // stacks
1584         for(gp=runtime_allg; gp!=nil; gp=gp->alllink) {
1585                 switch(gp->status){
1586                 default:
1587                         runtime_printf("unexpected G.status %d\n", gp->status);
1588                         runtime_throw("mark - bad status");
1589                 case Gdead:
1590                         break;
1591                 case Grunning:
1592                         if(gp != runtime_g())
1593                                 runtime_throw("mark - world not stopped");
1594                         addstackroots(gp);
1595                         break;
1596                 case Grunnable:
1597                 case Gsyscall:
1598                 case Gwaiting:
1599                         addstackroots(gp);
1600                         break;
1601                 }
1602         }
1603
1604         runtime_walkfintab(addfinroots, addroot);
1605
1606         for(fb=allfin; fb; fb=fb->alllink)
1607                 addroot((Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
1608
1609         addroot((Obj){(byte*)&work, sizeof work, 0});
1610 }
1611
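// handlespecial is called during sweep for an unmarked block whose special
// bit is set. If the block has a finalizer, the finalizer is queued on finq
// and the block is kept for now (returns true); otherwise the special bit
// and any profiling record are dropped and the block may be freed
// (returns false).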
1612 static bool
1613 handlespecial(byte *p, uintptr size)
1614 {
1615         FuncVal *fn;
1616         const struct __go_func_type *ft;
1617         FinBlock *block;
1618         Finalizer *f;
1619         
1620         if(!runtime_getfinalizer(p, true, &fn, &ft)) {
1621                 runtime_setblockspecial(p, false);
1622                 runtime_MProf_Free(p, size);
1623                 return false;
1624         }
1625
1626         runtime_lock(&finlock);
1627         if(finq == nil || finq->cnt == finq->cap) {
1628                 if(finc == nil) {
1629                         finc = runtime_SysAlloc(PageSize);
1630                         if(finc == nil)
1631                                 runtime_throw("runtime: cannot allocate memory");
1632                         finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
1633                         finc->alllink = allfin;
1634                         allfin = finc;
1635                 }
1636                 block = finc;
1637                 finc = block->next;
1638                 block->next = finq;
1639                 finq = block;
1640         }
1641         f = &finq->fin[finq->cnt];
1642         finq->cnt++;
1643         f->fn = fn;
1644         f->ft = ft;
1645         f->arg = p;
1646         runtime_unlock(&finlock);
1647         return true;
1648 }
1649
1650 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
1651 // It clears the mark bits in preparation for the next GC round.
1652 static void
1653 sweepspan(ParFor *desc, uint32 idx)
1654 {
1655         M *m;
1656         int32 cl, n, npages;
1657         uintptr size;
1658         byte *p;
1659         MCache *c;
1660         byte *arena_start;
1661         MLink head, *end;
1662         int32 nfree;
1663         byte *type_data;
1664         byte compression;
1665         uintptr type_data_inc;
1666         MSpan *s;
1667
1668         m = runtime_m();
1669
1670         USED(&desc);
1671         s = runtime_mheap->allspans[idx];
1672         if(s->state != MSpanInUse)
1673                 return;
1674         arena_start = runtime_mheap->arena_start;
1675         p = (byte*)(s->start << PageShift);
1676         cl = s->sizeclass;
1677         size = s->elemsize;
1678         if(cl == 0) {
1679                 n = 1;
1680         } else {
1681                 // Chunk full of small blocks.
1682                 npages = runtime_class_to_allocnpages[cl];
1683                 n = (npages << PageShift) / size;
1684         }
1685         nfree = 0;
1686         end = &head;
1687         c = m->mcache;
1688         
1689         type_data = (byte*)s->types.data;
1690         type_data_inc = sizeof(uintptr);
1691         compression = s->types.compression;
1692         switch(compression) {
1693         case MTypes_Bytes:
1694                 type_data += 8*sizeof(uintptr);
1695                 type_data_inc = 1;
1696                 break;
1697         }
1698
1699         // Sweep through n objects of given size starting at p.
1700         // This thread owns the span now, so it can manipulate
1701         // the block bitmap without atomic operations.
1702         for(; n > 0; n--, p += size, type_data+=type_data_inc) {
1703                 uintptr off, *bitp, shift, bits;
1704
1705                 off = (uintptr*)p - (uintptr*)arena_start;
1706                 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
1707                 shift = off % wordsPerBitmapWord;
1708                 bits = *bitp>>shift;
1709
1710                 if((bits & bitAllocated) == 0)
1711                         continue;
1712
1713                 if((bits & bitMarked) != 0) {
1714                         if(DebugMark) {
1715                                 if(!(bits & bitSpecial))
1716                                         runtime_printf("found spurious mark on %p\n", p);
1717                                 *bitp &= ~(bitSpecial<<shift);
1718                         }
1719                         *bitp &= ~(bitMarked<<shift);
1720                         continue;
1721                 }
1722
1723                 // Special means it has a finalizer or is being profiled.
1724                 // In DebugMark mode, the bit has been coopted so
1725                 // we have to assume all blocks are special.
1726                 if(DebugMark || (bits & bitSpecial) != 0) {
1727                         if(handlespecial(p, size))
1728                                 continue;
1729                 }
1730
1731                 // Mark freed; restore block boundary bit.
1732                 *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
1733
1734                 if(cl == 0) {
1735                         // Free large span.
1736                         runtime_unmarkspan(p, 1<<PageShift);
1737                         *(uintptr*)p = (uintptr)0xdeaddeaddeaddeadll;   // needs zeroing
1738                         runtime_MHeap_Free(runtime_mheap, s, 1);
1739                         c->local_alloc -= size;
1740                         c->local_nfree++;
1741                 } else {
1742                         // Free small object.
1743                         switch(compression) {
1744                         case MTypes_Words:
1745                                 *(uintptr*)type_data = 0;
1746                                 break;
1747                         case MTypes_Bytes:
1748                                 *(byte*)type_data = 0;
1749                                 break;
1750                         }
1751                         if(size > sizeof(uintptr))
1752                                 ((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll;       // mark as "needs to be zeroed"
1753                         
1754                         end->next = (MLink*)p;
1755                         end = (MLink*)p;
1756                         nfree++;
1757                 }
1758         }
1759
1760         if(nfree) {
1761                 c->local_by_size[cl].nfree += nfree;
1762                 c->local_alloc -= size * nfree;
1763                 c->local_nfree += nfree;
1764                 c->local_cachealloc -= nfree * size;
1765                 c->local_objects -= nfree;
1766                 runtime_MCentral_FreeSpan(&runtime_mheap->central[cl], s, nfree, head.next, end);
1767         }
1768 }
1769
1770 static void
1771 dumpspan(uint32 idx)
1772 {
1773         int32 sizeclass, n, npages, i, column;
1774         uintptr size;
1775         byte *p;
1776         byte *arena_start;
1777         MSpan *s;
1778         bool allocated, special;
1779
1780         s = runtime_mheap->allspans[idx];
1781         if(s->state != MSpanInUse)
1782                 return;
1783         arena_start = runtime_mheap->arena_start;
1784         p = (byte*)(s->start << PageShift);
1785         sizeclass = s->sizeclass;
1786         size = s->elemsize;
1787         if(sizeclass == 0) {
1788                 n = 1;
1789         } else {
1790                 npages = runtime_class_to_allocnpages[sizeclass];
1791                 n = (npages << PageShift) / size;
1792         }
1793         
1794         runtime_printf("%p .. %p:\n", p, p+n*size);
1795         column = 0;
1796         for(; n>0; n--, p+=size) {
1797                 uintptr off, *bitp, shift, bits;
1798
1799                 off = (uintptr*)p - (uintptr*)arena_start;
1800                 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
1801                 shift = off % wordsPerBitmapWord;
1802                 bits = *bitp>>shift;
1803
1804                 allocated = ((bits & bitAllocated) != 0);
1805                 special = ((bits & bitSpecial) != 0);
1806
1807                 for(i=0; (uint32)i<size; i+=sizeof(void*)) {
1808                         if(column == 0) {
1809                                 runtime_printf("\t");
1810                         }
1811                         if(i == 0) {
1812                                 runtime_printf(allocated ? "(" : "[");
1813                                 runtime_printf(special ? "@" : "");
1814                                 runtime_printf("%p: ", p+i);
1815                         } else {
1816                                 runtime_printf(" ");
1817                         }
1818
1819                         runtime_printf("%p", *(void**)(p+i));
1820
1821                         if(i+sizeof(void*) >= size) {
1822                                 runtime_printf(allocated ? ") " : "] ");
1823                         }
1824
1825                         column++;
1826                         if(column == 8) {
1827                                 runtime_printf("\n");
1828                                 column = 0;
1829                         }
1830                 }
1831         }
1832         runtime_printf("\n");
1833 }
1834
1835 // A debugging function to dump the contents of memory
1836 void
1837 runtime_memorydump(void)
1838 {
1839         uint32 spanidx;
1840
1841         for(spanidx=0; spanidx<runtime_mheap->nspan; spanidx++) {
1842                 dumpspan(spanidx);
1843         }
1844 }
1845
1846 void
1847 runtime_gchelper(void)
1848 {
1849         gchelperstart();
1850
1851         // run the parallel mark phase over the gc roots
1852         runtime_parfordo(work.markfor);
1853
1854         // help other threads scan secondary blocks
1855         scanblock(nil, nil, 0, true);
1856
1857         if(DebugMark) {
1858                 // wait while the main thread executes mark(debug_scanblock)
1859                 while(runtime_atomicload(&work.debugmarkdone) == 0)
1860                         runtime_usleep(10);
1861         }
1862
1863         runtime_parfordo(work.sweepfor);
1864         bufferList[runtime_m()->helpgc].busy = 0;
1865         if(runtime_xadd(&work.ndone, +1) == work.nproc-1)
1866                 runtime_notewakeup(&work.alldone);
1867 }
1868
1869 #define GcpercentUnknown (-2)
1870
1871 // Initialized from $GOGC.  GOGC=off means no gc.
1872 //
1873 // Next gc is after we've allocated an extra amount of
1874 // memory proportional to the amount already in use.
1875 // If gcpercent=100 and we're using 4M, we'll gc again
1876 // when we get to 8M.  This keeps the gc cost in linear
1877 // proportion to the allocation cost.  Adjusting gcpercent
1878 // just changes the linear constant (and also the amount of
1879 // extra memory used).
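// Concretely, gc() below sets
// next_gc = heap_alloc + (heap_alloc - runtime_stacks_sys)*gcpercent/100.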
1880 static int32 gcpercent = GcpercentUnknown;
1881
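// cachestats flushes the per-M GCStats and per-P MCache counters into the
// global mstats; if stats is non-nil, the per-M GC stats are accumulated
// into it before being cleared.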
1882 static void
1883 cachestats(GCStats *stats)
1884 {
1885         M *mp;
1886         MCache *c;
1887         P *p, **pp;
1888         uint32 i;
1889         uint64 stacks_inuse;
1890         uint64 *src, *dst;
1891
1892         if(stats)
1893                 runtime_memclr((byte*)stats, sizeof(*stats));
1894         stacks_inuse = 0;
1895         for(mp=runtime_allm; mp; mp=mp->alllink) {
1896                 //stacks_inuse += mp->stackinuse*FixedStack;
1897                 if(stats) {
1898                         src = (uint64*)&mp->gcstats;
1899                         dst = (uint64*)stats;
1900                         for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
1901                                 dst[i] += src[i];
1902                         runtime_memclr((byte*)&mp->gcstats, sizeof(mp->gcstats));
1903                 }
1904         }
1905         for(pp=runtime_allp; (p=*pp) != nil; pp++) {
1906                 c = p->mcache;
1907                 if(c==nil)
1908                         continue;
1909                 runtime_purgecachedstats(c);
1910                 for(i=0; i<nelem(c->local_by_size); i++) {
1911                         mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc;
1912                         c->local_by_size[i].nmalloc = 0;
1913                         mstats.by_size[i].nfree += c->local_by_size[i].nfree;
1914                         c->local_by_size[i].nfree = 0;
1915                 }
1916         }
1917         mstats.stacks_inuse = stacks_inuse;
1918 }
1919
1920 // Structure of arguments passed to function gc().
1921 // This allows the arguments to be passed via reflect_call.
1922 struct gc_args
1923 {
1924         int32 force;
1925 };
1926
1927 static void gc(struct gc_args *args);
1928
1929 static int32
1930 readgogc(void)
1931 {
1932         const byte *p;
1933
1934         p = runtime_getenv("GOGC");
1935         if(p == nil || p[0] == '\0')
1936                 return 100;
1937         if(runtime_strcmp((const char *)p, "off") == 0)
1938                 return -1;
1939         return runtime_atoi(p);
1940 }
1941
1942 void
1943 runtime_gc(int32 force)
1944 {
1945         M *m;
1946         const byte *p;
1947         struct gc_args a, *ap;
1948
1949         // The atomic operations are not atomic if the uint64s
1950         // are not aligned on uint64 boundaries. This has been
1951         // a problem in the past.
1952         if((((uintptr)&work.empty) & 7) != 0)
1953                 runtime_throw("runtime: gc work buffer is misaligned");
1954         if((((uintptr)&work.full) & 7) != 0)
1955                 runtime_throw("runtime: gc work buffer is misaligned");
1956
1957         // Make sure all registers are saved on stack so that
1958         // scanstack sees them.
1959         __builtin_unwind_init();
1960
1961         // The gc is turned off (via enablegc) until
1962         // the bootstrap has completed.
1963         // Also, malloc gets called in the guts
1964         // of a number of libraries that might be
1965         // holding locks.  To avoid priority inversion
1966         // problems, don't bother trying to run gc
1967         // while holding a lock.  The next mallocgc
1968         // without a lock will do the gc instead.
1969         m = runtime_m();
1970         if(!mstats.enablegc || m->locks > 0 || runtime_panicking)
1971                 return;
1972
1973         if(gcpercent == GcpercentUnknown) {     // first time through
1974                 gcpercent = readgogc();
1975
1976                 p = runtime_getenv("GOGCTRACE");
1977                 if(p != nil)
1978                         gctrace = runtime_atoi(p);
1979         }
1980         if(gcpercent < 0)
1981                 return;
1982
1983         // Run gc on a bigger stack to eliminate
1984         // a potentially large number of calls to runtime_morestack.
1985         // But not when using gccgo.
1986         a.force = force;
1987         ap = &a;
1988         gc(ap);
1989
1990         if(gctrace > 1 && !force) {
1991                 a.force = 1;
1992                 gc(&a);
1993         }
1994 }
1995
1996 static void
1997 gc(struct gc_args *args)
1998 {
1999         M *m;
2000         int64 t0, t1, t2, t3, t4;
2001         uint64 heap0, heap1, obj0, obj1, ninstr;
2002         GCStats stats;
2003         M *mp;
2004         uint32 i;
2005         // Eface eface;
2006
2007         runtime_semacquire(&runtime_worldsema);
2008         if(!args->force && mstats.heap_alloc < mstats.next_gc) {
2009                 runtime_semrelease(&runtime_worldsema);
2010                 return;
2011         }
2012
2013         m = runtime_m();
2014
2015         t0 = runtime_nanotime();
2016
2017         m->gcing = 1;
2018         runtime_stoptheworld();
2019
2020         if(CollectStats)
2021                 runtime_memclr((byte*)&gcstats, sizeof(gcstats));
2022
2023         for(mp=runtime_allm; mp; mp=mp->alllink)
2024                 runtime_settype_flush(mp, false);
2025
2026         heap0 = 0;
2027         obj0 = 0;
2028         if(gctrace) {
2029                 cachestats(nil);
2030                 heap0 = mstats.heap_alloc;
2031                 obj0 = mstats.nmalloc - mstats.nfree;
2032         }
2033
2034         m->locks++;     // disable gc during mallocs in parforalloc
2035         if(work.markfor == nil)
2036                 work.markfor = runtime_parforalloc(MaxGcproc);
2037         if(work.sweepfor == nil)
2038                 work.sweepfor = runtime_parforalloc(MaxGcproc);
2039         m->locks--;
2040
2041         if(itabtype == nil) {
2042                 // get C pointer to the Go type "itab"
2043                 // runtime_gc_itab_ptr(&eface);
2044                 // itabtype = ((PtrType*)eface.type)->elem;
2045         }
2046
2047         work.nwait = 0;
2048         work.ndone = 0;
2049         work.debugmarkdone = 0;
2050         work.nproc = runtime_gcprocs();
2051         addroots();
2052         runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
2053         runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap->nspan, nil, true, sweepspan);
2054         if(work.nproc > 1) {
2055                 runtime_noteclear(&work.alldone);
2056                 runtime_helpgc(work.nproc);
2057         }
2058
2059         t1 = runtime_nanotime();
2060
2061         gchelperstart();
2062         runtime_parfordo(work.markfor);
2063         scanblock(nil, nil, 0, true);
2064
2065         if(DebugMark) {
2066                 for(i=0; i<work.nroot; i++)
2067                         debug_scanblock(work.roots[i].p, work.roots[i].n);
2068                 runtime_atomicstore(&work.debugmarkdone, 1);
2069         }
2070         t2 = runtime_nanotime();
2071
2072         runtime_parfordo(work.sweepfor);
2073         bufferList[m->helpgc].busy = 0;
2074         t3 = runtime_nanotime();
2075
2076         if(work.nproc > 1)
2077                 runtime_notesleep(&work.alldone);
2078
2079         cachestats(&stats);
2080
2081         stats.nprocyield += work.sweepfor->nprocyield;
2082         stats.nosyield += work.sweepfor->nosyield;
2083         stats.nsleep += work.sweepfor->nsleep;
2084
2085         mstats.next_gc = mstats.heap_alloc+(mstats.heap_alloc-runtime_stacks_sys)*gcpercent/100;
2086         m->gcing = 0;
2087
2088         if(finq != nil) {
2089                 m->locks++;     // disable gc during the mallocs in newproc
2090                 // kick off or wake up goroutine to run queued finalizers
2091                 if(fing == nil)
2092                         fing = __go_go(runfinq, nil);
2093                 else if(fingwait) {
2094                         fingwait = 0;
2095                         runtime_ready(fing);
2096                 }
2097                 m->locks--;
2098         }
2099
2100         heap1 = mstats.heap_alloc;
2101         obj1 = mstats.nmalloc - mstats.nfree;
2102
2103         t4 = runtime_nanotime();
2104         mstats.last_gc = t4;
2105         mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0;
2106         mstats.pause_total_ns += t4 - t0;
2107         mstats.numgc++;
2108         if(mstats.debuggc)
2109                 runtime_printf("pause %D\n", t4-t0);
2110
2111         if(gctrace) {
2112                 runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
2113                                 " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
2114                         mstats.numgc, work.nproc, (t2-t1)/1000000, (t3-t2)/1000000, (t1-t0+t4-t3)/1000000,
2115                         heap0>>20, heap1>>20, obj0, obj1,
2116                         mstats.nmalloc, mstats.nfree,
2117                         stats.nhandoff, stats.nhandoffcnt,
2118                         work.sweepfor->nsteal, work.sweepfor->nstealcnt,
2119                         stats.nprocyield, stats.nosyield, stats.nsleep);
2120                 if(CollectStats) {
2121                         runtime_printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n",
2122                                 gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup);
2123                         if(gcstats.ptr.cnt != 0)
2124                                 runtime_printf("avg ptrbufsize: %D (%D/%D)\n",
2125                                         gcstats.ptr.sum/gcstats.ptr.cnt, gcstats.ptr.sum, gcstats.ptr.cnt);
2126                         if(gcstats.obj.cnt != 0)
2127                                 runtime_printf("avg nobj: %D (%D/%D)\n",
2128                                         gcstats.obj.sum/gcstats.obj.cnt, gcstats.obj.sum, gcstats.obj.cnt);
2129                         runtime_printf("rescans: %D, %D bytes\n", gcstats.rescan, gcstats.rescanbytes);
2130
2131                         runtime_printf("instruction counts:\n");
2132                         ninstr = 0;
2133                         for(i=0; i<nelem(gcstats.instr); i++) {
2134                                 runtime_printf("\t%d:\t%D\n", i, gcstats.instr[i]);
2135                                 ninstr += gcstats.instr[i];
2136                         }
2137                         runtime_printf("\ttotal:\t%D\n", ninstr);
2138
2139                         runtime_printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull);
2140                 }
2141         }
2142
2143         runtime_MProf_GC();
2144         runtime_semrelease(&runtime_worldsema);
2145         runtime_starttheworld();
2146
2147         // give the queued finalizers, if any, a chance to run
2148         if(finq != nil)
2149                 runtime_gosched();
2150 }
2151
2152 void runtime_ReadMemStats(MStats *)
2153   __asm__ (GOSYM_PREFIX "runtime.ReadMemStats");
2154
2155 void
2156 runtime_ReadMemStats(MStats *stats)
2157 {
2158         M *m;
2159
2160         // Have to acquire worldsema to stop the world,
2161         // because stoptheworld can only be used by
2162         // one goroutine at a time, and there might be
2163         // a pending garbage collection already calling it.
2164         runtime_semacquire(&runtime_worldsema);
2165         m = runtime_m();
2166         m->gcing = 1;
2167         runtime_stoptheworld();
2168         cachestats(nil);
2169         *stats = mstats;
2170         m->gcing = 0;
2171         runtime_semrelease(&runtime_worldsema);
2172         runtime_starttheworld();
2173 }
2174
2175 void runtime_debug_readGCStats(Slice*)
2176   __asm__ (GOSYM_PREFIX "runtime_debug.readGCStats");
2177
2178 void
2179 runtime_debug_readGCStats(Slice *pauses)
2180 {
2181         uint64 *p;
2182         uint32 i, n;
2183
2184         // Calling code in runtime/debug should make the slice large enough.
2185         if((size_t)pauses->cap < nelem(mstats.pause_ns)+3)
2186                 runtime_throw("runtime: short slice passed to readGCStats");
2187
2188         // Pass back: pauses, last gc (absolute time), number of gc, total pause ns.
2189         p = (uint64*)pauses->array;
2190         runtime_lock(runtime_mheap);
2191         n = mstats.numgc;
2192         if(n > nelem(mstats.pause_ns))
2193                 n = nelem(mstats.pause_ns);
2194         
2195         // The pause buffer is circular: the most recent pause is at
2196         // pause_ns[(numgc-1)%nelem(pause_ns)], and each step backward
2197         // from there reaches one GC further into the past. We deliver
2198         // the times most recent first (in p[0]).
2199         for(i=0; i<n; i++)
2200                 p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)];
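        // (Illustration with made-up numbers: if numgc were 260 and
        // nelem(pause_ns) were 256, p[0] would be pause_ns[259%256] =
        // pause_ns[3], p[1] = pause_ns[2], and so on.)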
2201
2202         p[n] = mstats.last_gc;
2203         p[n+1] = mstats.numgc;
2204         p[n+2] = mstats.pause_total_ns; 
2205         runtime_unlock(runtime_mheap);
2206         pauses->__count = n+3;
2207 }
2208
2209 intgo runtime_debug_setGCPercent(intgo)
2210   __asm__ (GOSYM_PREFIX "runtime_debug.setGCPercent");
2211
2212 intgo
2213 runtime_debug_setGCPercent(intgo in)
2214 {
2215         intgo out;
2216
2217         runtime_lock(runtime_mheap);
2218         if(gcpercent == GcpercentUnknown)
2219                 gcpercent = readgogc();
2220         out = gcpercent;
2221         if(in < 0)
2222                 in = -1;
2223         gcpercent = in;
2224         runtime_unlock(runtime_mheap);
2225         return out;
2226 }
2227
2228 static void
2229 gchelperstart(void)
2230 {
2231         M *m;
2232
2233         m = runtime_m();
2234         if(m->helpgc < 0 || m->helpgc >= MaxGcproc)
2235                 runtime_throw("gchelperstart: bad m->helpgc");
2236         if(runtime_xchg(&bufferList[m->helpgc].busy, 1))
2237                 runtime_throw("gchelperstart: already busy");
2238 }
2239
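// runfinq runs on the dedicated finalizer goroutine (fing). It drains finq,
// calling each finalizer via reflect_call, recycles the FinBlocks onto finc,
// and parks when the queue is empty; gc() readies it again when new
// finalizers are queued.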
2240 static void
2241 runfinq(void* dummy __attribute__ ((unused)))
2242 {
2243         Finalizer *f;
2244         FinBlock *fb, *next;
2245         uint32 i;
2246
2247         for(;;) {
2248                 // There's no need for a lock in this section
2249                 // because it only conflicts with the garbage
2250                 // collector, and the garbage collector only
2251                 // runs when everyone else is stopped, and
2252                 // runfinq only stops at the gosched() or
2253                 // during the calls in the for loop.
2254                 fb = finq;
2255                 finq = nil;
2256                 if(fb == nil) {
2257                         fingwait = 1;
2258                         runtime_park(nil, nil, "finalizer wait");
2259                         continue;
2260                 }
2261                 if(raceenabled)
2262                         runtime_racefingo();
2263                 for(; fb; fb=next) {
2264                         next = fb->next;
2265                         for(i=0; i<(uint32)fb->cnt; i++) {
2266                                 void *param;
2267
2268                                 f = &fb->fin[i];
2269                                 param = &f->arg;
2270                                 reflect_call(f->ft, f->fn, 0, 0, &param, nil);
2271                                 f->fn = nil;
2272                                 f->arg = nil;
2273                         }
2274                         fb->cnt = 0;
2275                         fb->next = finc;
2276                         finc = fb;
2277                 }
2278                 runtime_gc(1);  // trigger another gc to clean up the finalized objects, if possible
2279         }
2280 }
2281
2282 // mark the block at v of size n as allocated.
2283 // If noptr is true, mark it as having no pointers.
2284 void
2285 runtime_markallocated(void *v, uintptr n, bool noptr)
2286 {
2287         uintptr *b, obits, bits, off, shift;
2288
2289         if(0)
2290                 runtime_printf("markallocated %p+%p\n", v, n);
2291
2292         if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start)
2293                 runtime_throw("markallocated: bad pointer");
2294
2295         off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start;  // word offset
2296         b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2297         shift = off % wordsPerBitmapWord;
2298
2299         for(;;) {
2300                 obits = *b;
2301                 bits = (obits & ~(bitMask<<shift)) | (bitAllocated<<shift);
2302                 if(noptr)
2303                         bits |= bitNoPointers<<shift;
2304                 if(runtime_singleproc) {
2305                         *b = bits;
2306                         break;
2307                 } else {
2308                         // more than one goroutine is potentially running: use atomic op
2309                         if(runtime_casp((void**)b, (void*)obits, (void*)bits))
2310                                 break;
2311                 }
2312         }
2313 }
2314
2315 // mark the block at v of size n as freed.
2316 void
2317 runtime_markfreed(void *v, uintptr n)
2318 {
2319         uintptr *b, obits, bits, off, shift;
2320
2321         if(0)
2322                 runtime_printf("markfreed %p+%p\n", v, n);
2323
2324         if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start)
2325                 runtime_throw("markfreed: bad pointer");
2326
2327         off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start;  // word offset
2328         b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2329         shift = off % wordsPerBitmapWord;
2330
2331         for(;;) {
2332                 obits = *b;
2333                 bits = (obits & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
2334                 if(runtime_singleproc) {
2335                         *b = bits;
2336                         break;
2337                 } else {
2338                         // more than one goroutine is potentially running: use atomic op
2339                         if(runtime_casp((void**)b, (void*)obits, (void*)bits))
2340                                 break;
2341                 }
2342         }
2343 }
2344
2345 // check that the block at v of size n is marked freed.
2346 void
2347 runtime_checkfreed(void *v, uintptr n)
2348 {
2349         uintptr *b, bits, off, shift;
2350
2351         if(!runtime_checking)
2352                 return;
2353
2354         if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start)
2355                 return; // not allocated, so okay
2356
2357         off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start;  // word offset
2358         b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2359         shift = off % wordsPerBitmapWord;
2360
2361         bits = *b>>shift;
2362         if((bits & bitAllocated) != 0) {
2363                 runtime_printf("checkfreed %p+%p: off=%p have=%p\n",
2364                         v, n, off, bits & bitMask);
2365                 runtime_throw("checkfreed: not freed");
2366         }
2367 }
2368
2369 // mark the span of memory at v as having n blocks of the given size.
2370 // if leftover is true, there is left over space at the end of the span.
2371 void
2372 runtime_markspan(void *v, uintptr size, uintptr n, bool leftover)
2373 {
2374         uintptr *b, off, shift;
2375         byte *p;
2376
2377         if((byte*)v+size*n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start)
2378                 runtime_throw("markspan: bad pointer");
2379
2380         p = v;
2381         if(leftover)    // mark a boundary just past end of last block too
2382                 n++;
2383         for(; n-- > 0; p += size) {
2384                 // Okay to use non-atomic ops here, because we control
2385                 // the entire span, and each bitmap word has bits for only
2386                 // one span, so no other goroutines are changing these
2387                 // bitmap words.
2388                 off = (uintptr*)p - (uintptr*)runtime_mheap->arena_start;  // word offset
2389                 b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2390                 shift = off % wordsPerBitmapWord;
2391                 *b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
2392         }
2393 }
2394
2395 // unmark the span of memory at v of length n bytes.
2396 void
2397 runtime_unmarkspan(void *v, uintptr n)
2398 {
2399         uintptr *p, *b, off;
2400
2401         if((byte*)v+n > (byte*)runtime_mheap->arena_used || (byte*)v < runtime_mheap->arena_start)
2402                 runtime_throw("unmarkspan: bad pointer");
2403
2404         p = v;
2405         off = p - (uintptr*)runtime_mheap->arena_start;  // word offset
2406         if(off % wordsPerBitmapWord != 0)
2407                 runtime_throw("unmarkspan: unaligned pointer");
2408         b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2409         n /= PtrSize;
2410         if(n%wordsPerBitmapWord != 0)
2411                 runtime_throw("unmarkspan: unaligned length");
2412         // Okay to use non-atomic ops here, because we control
2413         // the entire span, and each bitmap word has bits for only
2414         // one span, so no other goroutines are changing these
2415         // bitmap words.
2416         n /= wordsPerBitmapWord;
2417         while(n-- > 0)
2418                 *b-- = 0;
2419 }
2420
2421 bool
2422 runtime_blockspecial(void *v)
2423 {
2424         uintptr *b, off, shift;
2425
2426         if(DebugMark)
2427                 return true;
2428
2429         off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start;
2430         b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2431         shift = off % wordsPerBitmapWord;
2432
2433         return (*b & (bitSpecial<<shift)) != 0;
2434 }
2435
2436 void
2437 runtime_setblockspecial(void *v, bool s)
2438 {
2439         uintptr *b, off, shift, bits, obits;
2440
2441         if(DebugMark)
2442                 return;
2443
2444         off = (uintptr*)v - (uintptr*)runtime_mheap->arena_start;
2445         b = (uintptr*)runtime_mheap->arena_start - off/wordsPerBitmapWord - 1;
2446         shift = off % wordsPerBitmapWord;
2447
2448         for(;;) {
2449                 obits = *b;
2450                 if(s)
2451                         bits = obits | (bitSpecial<<shift);
2452                 else
2453                         bits = obits & ~(bitSpecial<<shift);
2454                 if(runtime_singleproc) {
2455                         *b = bits;
2456                         break;
2457                 } else {
2458                         // more than one goroutine is potentially running: use atomic op
2459                         if(runtime_casp((void**)b, (void*)obits, (void*)bits))
2460                                 break;
2461                 }
2462         }
2463 }
2464
2465 void
2466 runtime_MHeap_MapBits(MHeap *h)
2467 {
2468         size_t page_size;
2469
2470         // Caller has added extra mappings to the arena.
2471         // Add extra mappings of bitmap words as needed.
2472         // We allocate extra bitmap pieces in chunks of bitmapChunk.
2473         enum {
2474                 bitmapChunk = 8192
2475         };
2476         uintptr n;
2477
2478         n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
2479         n = (n+bitmapChunk-1) & ~(bitmapChunk-1);
2480         if(h->bitmap_mapped >= n)
2481                 return;
2482
2483         page_size = getpagesize();
2484         n = (n+page_size-1) & ~(page_size-1);
2485
2486         runtime_SysMap(h->arena_start - n, n - h->bitmap_mapped);
2487         h->bitmap_mapped = n;
2488 }