From 3c50a689ca85f4fe56afbc8da9e894c4cc3af845 Mon Sep 17 00:00:00 2001
From: Ivan Maidanski <ivmai@mail.ru>
Date: Tue, 26 Jul 2011 20:30:36 +0400
Subject: [PATCH] gc7.0alpha5 tarball import

---
 Makefile                             |  27 +-
 Makefile.direct                      |  27 +-
 Makefile.in                          |   1 +
 NT_STATIC_THREADS_MAKEFILE           |   4 +-
 allchblk.c                           |  12 +-
 alloc.c                              |  10 +-
 configure                            |  36 +-
 configure.ac                         |  18 +-
 doc/README                           |   2 +-
 doc/README.changes                   |  56 +-
 doc/README.linux                     |   8 +-
 doc/doc.am                           |   1 +
 doc/gcdescr.html                     |  63 ++-
 doc/overview.html                    | 446 ++++++++++++++++
 headers.c                            |   2 +-
 include/gc.h                         |  29 +-
 include/gc_config_macros.h           |  21 +-
 include/gc_inline.h                  |  15 +-
 include/private/gc_locks.h           | 104 ++--
 include/private/gc_pmark.h           |  45 +-
 include/private/gc_priv.h            |  78 ++-
 include/private/gcconfig.h           |  22 +-
 include/private/pthread_support.h    |   5 +-
 include/private/thread_local_alloc.h |  58 ++-
 mach_dep.c                           |  80 +--
 malloc.c                             |  16 +-
 mallocx.c                            |  79 +--
 mark.c                               | 168 +++---
 mark_rts.c                           |   2 +-
 misc.c                               |  35 +-
 os_dep.c                             | 126 +----
 pthread_stop_world.c                 |  12 +-
 pthread_support.c                    |  54 +-
 reclaim.c                            |  11 +-
 setjmp_t.c                           |  26 +-
 sparc_mach_dep.S                     |   4 +
 tests/leak_test.c                    |   4 +
 tests/test.c                         | 100 +---
 tests/test_cpp.cc                    |   8 +-
 tests/thread_leak_test.c             |   1 +
 thread_local_alloc.c                 |  47 +-
 threadlibs.c                         |   6 +-
 typd_mlc.c                           |  24 +-
 version.h                            |   2 +-
 win32_threads.c                      | 954 ++++++++++++++++++++++++++---------
 45 files changed, 1851 insertions(+), 998 deletions(-)
 create mode 100644 doc/overview.html

diff --git a/Makefile b/Makefile
index 24326c0..78e8159 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ VPATH= $(srcdir)
 
 # Atomic_ops installation directory.  If this doesn't exist, we create
 # it from the included libatomic_ops distribution.
-AO_VERSION=1.0
+AO_VERSION=1.1
 AO_SRC_DIR=$(srcdir)/libatomic_ops-$(AO_VERSION)
 AO_INSTALL_DIR=$(srcdir)/libatomic_ops-install
 
@@ -349,7 +349,7 @@ SRCS= $(CSRCS) mips_sgi_mach_dep.s rs6000_mach_dep.s alpha_mach_dep.S \
 
 DOC_FILES= README.QUICK doc/README.Mac doc/README.MacOSX doc/README.OS2 \
 	doc/README.amiga doc/README.cords doc/debugging.html \
-	doc/porting.html \
+	doc/porting.html doc/overview.html \
 	doc/README.dj doc/README.hp doc/README.linux doc/README.rs6000 \
 	doc/README.sgi doc/README.solaris2 doc/README.uts \
 	doc/README.win32 doc/barrett_diagram doc/README \
@@ -450,8 +450,9 @@ $(OBJS) tests/test.o dyn_load.o dyn_load_sunos53.o: \
 mark.o typd_mlc.o finalize.o ptr_chck.o: $(srcdir)/include/gc_mark.h \
 					 $(srcdir)/include/private/gc_pmark.h
 
-specific.o pthread_support.o: $(srcdir)/include/private/specific.h \
-			      $(srcdir)/include/gc_inline.h
+specific.o pthread_support.o thread_local_alloc.o win32_threads.o: \
+	$(srcdir)/include/private/specific.h $(srcdir)/include/gc_inline.h \
+	$(srcdir)/include/private/thread_local_alloc.h
 
 dbg_mlc.o gcj_mlc.o: $(srcdir)/include/private/dbg_mlc.h
 
@@ -465,6 +466,7 @@ tests:
 base_lib gc.a: $(OBJS) dyn_load.o $(UTILS)
 	echo > base_lib
 	rm -f dont_ar_1
+	cp $(AO_INSTALL_DIR)/lib/libatomic_ops.a gc.a
 	./if_mach SPARC SUNOS5 touch dont_ar_1
 	./if_mach SPARC SUNOS5 $(AR) rus gc.a $(OBJS) dyn_load.o
 	./if_mach M68K AMIGA touch dont_ar_1
@@ -513,7 +515,7 @@ dyn_load_sunos53.o: dyn_load.c
 
 # SunOS5 shared library version of the collector
 sunos5gc.so: $(OBJS) dyn_load_sunos53.o
-	$(CC) -G -o sunos5gc.so $(OBJS) dyn_load_sunos53.o -ldl
+	$(CC) -G -o sunos5gc.so $(OBJS) dyn_load_sunos53.o $(AO_INSTALL_DIR)/lib/libatomic_ops.a -ldl
 	ln sunos5gc.so libgc.so
 
 # Alpha/OSF shared library version of the collector
@@ -556,14 +558,11 @@ mach_dep.o: $(srcdir)/mach_dep.c $(srcdir)/mips_sgi_mach_dep.s \
 	    $(srcdir)/ia64_save_regs_in_stack.s \
 	    $(srcdir)/sparc_netbsd_mach_dep.s $(UTILS)
 	rm -f mach_dep.o
-	./if_mach MIPS IRIX5 $(CC) -c -o mach_dep.o $(srcdir)/mips_sgi_mach_dep.s
-	./if_mach MIPS RISCOS $(AS) -o mach_dep.o $(srcdir)/mips_ultrix_mach_dep.s
-	./if_mach MIPS ULTRIX $(AS) -o mach_dep.o $(srcdir)/mips_ultrix_mach_dep.s
-	./if_mach POWERPC DARWIN $(AS) -o mach_dep.o $(srcdir)/powerpc_darwin_mach_dep.s
-	./if_mach ALPHA LINUX $(CC) -c -o mach_dep.o $(srcdir)/alpha_mach_dep.S
-	./if_mach SPARC SUNOS5 $(CC) -c -o mach_dep.o $(srcdir)/sparc_mach_dep.S
-	./if_mach SPARC OPENBSD $(AS) -o mach_dep.o $(srcdir)/sparc_sunos4_mach_dep.s
-	./if_mach SPARC NETBSD $(AS) -o mach_dep.o $(srcdir)/sparc_netbsd_mach_dep.s
+	./if_mach SPARC SUNOS5 $(CC) -c -o mach_dep2.o $(srcdir)/sparc_mach_dep.S
+	./if_mach SPARC OPENBSD $(AS) -o mach_dep2.o $(srcdir)/sparc_sunos4_mach_dep.s
+	./if_mach SPARC NETBSD $(AS) -o mach_dep2.o $(srcdir)/sparc_netbsd_mach_dep.s
+	./if_mach SPARC "" $(CC) -c -o mach_dep1.o $(SPECIALCFLAGS) $(srcdir)/mach_dep.c
+	./if_mach SPARC "" ld -r -o mach_dep.o mach_dep1.o mach_dep2.o
 	./if_mach IA64 "" as $(AS_ABI_FLAG) -o ia64_save_regs_in_stack.o $(srcdir)/ia64_save_regs_in_stack.s
 	./if_mach IA64 "" $(CC) -c -o mach_dep1.o $(SPECIALCFLAGS) $(srcdir)/mach_dep.c
 	./if_mach IA64 "" ld -r -o mach_dep.o mach_dep1.o ia64_save_regs_in_stack.o
@@ -638,7 +637,7 @@ gctest: tests/test.o gc.a $(UTILS)
 # If an optimized setjmp_test generates a segmentation fault,
 # odds are your compiler is broken.  Gctest may still work.
 # Try compiling setjmp_t.c unoptimized.
-setjmp_test: $(srcdir)/setjmp_t.c $(srcdir)/include/gc.h $(UTILS)
+setjmp_test: $(srcdir)/setjmp_t.c $(srcdir)/include/gc.h $(UTILS) $(AO_INSTALL_DIR)
 	$(CC) $(CFLAGS) -o setjmp_test $(srcdir)/setjmp_t.c
 
 test:  KandRtest cord/cordtest
diff --git a/Makefile.direct b/Makefile.direct
index 24326c0..78e8159 100644
--- a/Makefile.direct
+++ b/Makefile.direct
@@ -32,7 +32,7 @@ VPATH= $(srcdir)
 
 # Atomic_ops installation directory.  If this doesn't exist, we create
 # it from the included libatomic_ops distribution.
-AO_VERSION=1.0
+AO_VERSION=1.1
 AO_SRC_DIR=$(srcdir)/libatomic_ops-$(AO_VERSION)
 AO_INSTALL_DIR=$(srcdir)/libatomic_ops-install
 
@@ -349,7 +349,7 @@ SRCS= $(CSRCS) mips_sgi_mach_dep.s rs6000_mach_dep.s alpha_mach_dep.S \
 
 DOC_FILES= README.QUICK doc/README.Mac doc/README.MacOSX doc/README.OS2 \
 	doc/README.amiga doc/README.cords doc/debugging.html \
-	doc/porting.html \
+	doc/porting.html doc/overview.html \
 	doc/README.dj doc/README.hp doc/README.linux doc/README.rs6000 \
 	doc/README.sgi doc/README.solaris2 doc/README.uts \
 	doc/README.win32 doc/barrett_diagram doc/README \
@@ -450,8 +450,9 @@ $(OBJS) tests/test.o dyn_load.o dyn_load_sunos53.o: \
 mark.o typd_mlc.o finalize.o ptr_chck.o: $(srcdir)/include/gc_mark.h \
 					 $(srcdir)/include/private/gc_pmark.h
 
-specific.o pthread_support.o: $(srcdir)/include/private/specific.h \
-			      $(srcdir)/include/gc_inline.h
+specific.o pthread_support.o thread_local_alloc.o win32_threads.o: \
+	$(srcdir)/include/private/specific.h $(srcdir)/include/gc_inline.h \
+	$(srcdir)/include/private/thread_local_alloc.h
 
 dbg_mlc.o gcj_mlc.o: $(srcdir)/include/private/dbg_mlc.h
 
@@ -465,6 +466,7 @@ tests:
 base_lib gc.a: $(OBJS) dyn_load.o $(UTILS)
 	echo > base_lib
 	rm -f dont_ar_1
+	cp $(AO_INSTALL_DIR)/lib/libatomic_ops.a gc.a
 	./if_mach SPARC SUNOS5 touch dont_ar_1
 	./if_mach SPARC SUNOS5 $(AR) rus gc.a $(OBJS) dyn_load.o
 	./if_mach M68K AMIGA touch dont_ar_1
@@ -513,7 +515,7 @@ dyn_load_sunos53.o: dyn_load.c
 
 # SunOS5 shared library version of the collector
 sunos5gc.so: $(OBJS) dyn_load_sunos53.o
-	$(CC) -G -o sunos5gc.so $(OBJS) dyn_load_sunos53.o -ldl
+	$(CC) -G -o sunos5gc.so $(OBJS) dyn_load_sunos53.o $(AO_INSTALL_DIR)/lib/libatomic_ops.a -ldl
 	ln sunos5gc.so libgc.so
 
 # Alpha/OSF shared library version of the collector
@@ -556,14 +558,11 @@ mach_dep.o: $(srcdir)/mach_dep.c $(srcdir)/mips_sgi_mach_dep.s \
 	    $(srcdir)/ia64_save_regs_in_stack.s \
 	    $(srcdir)/sparc_netbsd_mach_dep.s $(UTILS)
 	rm -f mach_dep.o
-	./if_mach MIPS IRIX5 $(CC) -c -o mach_dep.o $(srcdir)/mips_sgi_mach_dep.s
-	./if_mach MIPS RISCOS $(AS) -o mach_dep.o $(srcdir)/mips_ultrix_mach_dep.s
-	./if_mach MIPS ULTRIX $(AS) -o mach_dep.o $(srcdir)/mips_ultrix_mach_dep.s
-	./if_mach POWERPC DARWIN $(AS) -o mach_dep.o $(srcdir)/powerpc_darwin_mach_dep.s
-	./if_mach ALPHA LINUX $(CC) -c -o mach_dep.o $(srcdir)/alpha_mach_dep.S
-	./if_mach SPARC SUNOS5 $(CC) -c -o mach_dep.o $(srcdir)/sparc_mach_dep.S
-	./if_mach SPARC OPENBSD $(AS) -o mach_dep.o $(srcdir)/sparc_sunos4_mach_dep.s
-	./if_mach SPARC NETBSD $(AS) -o mach_dep.o $(srcdir)/sparc_netbsd_mach_dep.s
+	./if_mach SPARC SUNOS5 $(CC) -c -o mach_dep2.o $(srcdir)/sparc_mach_dep.S
+	./if_mach SPARC OPENBSD $(AS) -o mach_dep2.o $(srcdir)/sparc_sunos4_mach_dep.s
+	./if_mach SPARC NETBSD $(AS) -o mach_dep2.o $(srcdir)/sparc_netbsd_mach_dep.s
+	./if_mach SPARC "" $(CC) -c -o mach_dep1.o $(SPECIALCFLAGS) $(srcdir)/mach_dep.c
+	./if_mach SPARC "" ld -r -o mach_dep.o mach_dep1.o mach_dep2.o
 	./if_mach IA64 "" as $(AS_ABI_FLAG) -o ia64_save_regs_in_stack.o $(srcdir)/ia64_save_regs_in_stack.s
 	./if_mach IA64 "" $(CC) -c -o mach_dep1.o $(SPECIALCFLAGS) $(srcdir)/mach_dep.c
 	./if_mach IA64 "" ld -r -o mach_dep.o mach_dep1.o ia64_save_regs_in_stack.o
@@ -638,7 +637,7 @@ gctest: tests/test.o gc.a $(UTILS)
 # If an optimized setjmp_test generates a segmentation fault,
 # odds are your compiler is broken.  Gctest may still work.
 # Try compiling setjmp_t.c unoptimized.
-setjmp_test: $(srcdir)/setjmp_t.c $(srcdir)/include/gc.h $(UTILS)
+setjmp_test: $(srcdir)/setjmp_t.c $(srcdir)/include/gc.h $(UTILS) $(AO_INSTALL_DIR)
 	$(CC) $(CFLAGS) -o setjmp_test $(srcdir)/setjmp_t.c
 
 test:  KandRtest cord/cordtest
diff --git a/Makefile.in b/Makefile.in
index 0c881db..68708ef 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -527,6 +527,7 @@ dist_pkgdata_DATA = \
 	doc/README.solaris2 \
 	doc/README.uts \
 	doc/README.win32 \
+	doc/overview.html \
 	doc/tree.html \
 	doc/leak.html \
 	doc/gcinterface.html \
diff --git a/NT_STATIC_THREADS_MAKEFILE b/NT_STATIC_THREADS_MAKEFILE
index 91fb7f6..f37d6d1 100644
--- a/NT_STATIC_THREADS_MAKEFILE
+++ b/NT_STATIC_THREADS_MAKEFILE
@@ -10,8 +10,8 @@ CPU=$(MY_CPU)
 # should do, since we only need the headers.
 # We assume this was manually unpacked, since I'm not sure there is
 # a Windows standard command line tool to do this.
-AO_VERSION=0.6
-AO_SRC_DIR=$(srcdir)/atomic_ops-$(AO_VERSION)
+AO_VERSION=1.1
+AO_SRC_DIR=libatomic_ops-$(AO_VERSION)/src
 AO_INCLUDE_DIR=$(AO_SRC_DIR)
 
 OBJS= alloc.obj reclaim.obj allchblk.obj misc.obj mach_dep.obj os_dep.obj mark_rts.obj headers.obj mark.obj obj_map.obj blacklst.obj finalize.obj new_hblk.obj dbg_mlc.obj malloc.obj stubborn.obj dyn_load.obj typd_mlc.obj ptr_chck.obj gc_cpp.obj mallocx.obj win32_threads.obj
diff --git a/allchblk.c b/allchblk.c
index ad55bed..997580a 100644
--- a/allchblk.c
+++ b/allchblk.c
@@ -560,7 +560,7 @@ int index;	/* Index of free list */
 				/* free blocks in GC_add_to_fl.		*/
 #     endif
 #   ifdef USE_MUNMAP
-      hhdr -> hb_last_reclaimed = GC_gc_no;
+      hhdr -> hb_last_reclaimed = (unsigned short)GC_gc_no;
 #   endif
     hhdr -> hb_sz = h_size;
     GC_add_to_fl(h, hhdr);
@@ -568,7 +568,7 @@ int index;	/* Index of free list */
 }
 	
 struct hblk *
-GC_allochblk_nth(word sz/* bytes */, int kind, unsigned char flags, int n);
+GC_allochblk_nth(size_t sz/* bytes */, int kind, unsigned char flags, int n);
 
 /*
  * Allocate (and return pointer to) a heap block
@@ -580,7 +580,7 @@ GC_allochblk_nth(word sz/* bytes */, int kind, unsigned char flags, int n);
  * The client is responsible for clearing the block, if necessary.
  */
 struct hblk *
-GC_allochblk(size_t sz, int kind, unsigned flags/* IGNORE_OFF_PAGE or 0 */)
+GC_allochblk(size_t sz, int kind, unsigned char flags/* IGNORE_OFF_PAGE or 0 */)
 {
     word blocks;
     int start_list;
@@ -603,7 +603,7 @@ GC_allochblk(size_t sz, int kind, unsigned flags/* IGNORE_OFF_PAGE or 0 */)
  * Unlike the above, sz is in bytes.
  */
 struct hblk *
-GC_allochblk_nth(word sz, int kind, unsigned char flags, int n)
+GC_allochblk_nth(size_t sz, int kind, unsigned char flags, int n)
 {
     struct hblk *hbp;
     hdr * hhdr;		/* Header corr. to hbp */
@@ -822,7 +822,7 @@ signed_word size;
     GC_remove_counts(hbp, (word)size);
     hhdr->hb_sz = size;
 #   ifdef USE_MUNMAP
-      hhdr -> hb_last_reclaimed = GC_gc_no;
+      hhdr -> hb_last_reclaimed = (unsigned short)GC_gc_no;
 #   endif
     
     /* Check for duplicate deallocation in the easy case */
@@ -849,7 +849,7 @@ signed_word size;
 	  GC_remove_from_fl(prevhdr, FL_UNKNOWN);
 	  prevhdr -> hb_sz += hhdr -> hb_sz;
 #	  ifdef USE_MUNMAP
-	    prevhdr -> hb_last_reclaimed = GC_gc_no;
+	    prevhdr -> hb_last_reclaimed = (unsigned short)GC_gc_no;
 #	  endif
 	  GC_remove_header(hbp);
 	  hbp = prev;
diff --git a/alloc.c b/alloc.c
index 1be4516..0e292f7 100644
--- a/alloc.c
+++ b/alloc.c
@@ -569,8 +569,16 @@ void GC_clear_fl_marks(ptr_t q)
 	}
 	bit_no = MARK_BIT_NO((ptr_t)p - (ptr_t)h, sz);
 	if (mark_bit_from_hdr(hhdr, bit_no)) {
+	  int n_marks = hhdr -> hb_n_marks - 1;
       	  clear_mark_bit_from_hdr(hhdr, bit_no);
-          --hhdr -> hb_n_marks;
+#	  ifdef PARALLEL_MARK
+	    /* Appr. count, don't decrement to zero! */
+	    if (0 != n_marks) {
+              hhdr -> hb_n_marks = n_marks;
+	    }
+#	  else
+            hhdr -> hb_n_marks = n_marks;
+#	  endif
         }
 	GC_bytes_found -= sz;
    }
diff --git a/configure b/configure
index 36c8a6f..7e58904 100755
--- a/configure
+++ b/configure
@@ -1,7 +1,7 @@
 #! /bin/sh
 # From configure.ac Revision: 1.2 .
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.59 for gc 7.0alpha4.
+# Generated by GNU Autoconf 2.59 for gc 7.0alpha5.
 #
 # Report bugs to <Hans.Boehm@hp.com>.
 #
@@ -429,8 +429,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='gc'
 PACKAGE_TARNAME='gc'
-PACKAGE_VERSION='7.0alpha4'
-PACKAGE_STRING='gc 7.0alpha4'
+PACKAGE_VERSION='7.0alpha5'
+PACKAGE_STRING='gc 7.0alpha5'
 PACKAGE_BUGREPORT='Hans.Boehm@hp.com'
 
 ac_unique_file="gcj_mlc.c"
@@ -957,7 +957,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures gc 7.0alpha4 to adapt to many kinds of systems.
+\`configure' configures gc 7.0alpha5 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1024,7 +1024,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of gc 7.0alpha4:";;
+     short | recursive ) echo "Configuration of gc 7.0alpha5:";;
    esac
   cat <<\_ACEOF
 
@@ -1174,7 +1174,7 @@ fi
 test -n "$ac_init_help" && exit 0
 if $ac_init_version; then
   cat <<\_ACEOF
-gc configure 7.0alpha4
+gc configure 7.0alpha5
 generated by GNU Autoconf 2.59
 
 Copyright (C) 2003 Free Software Foundation, Inc.
@@ -1188,7 +1188,7 @@ cat >&5 <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by gc $as_me 7.0alpha4, which was
+It was created by gc $as_me 7.0alpha5, which was
 generated by GNU Autoconf 2.59.  Invocation command line was
 
   $ $0 $@
@@ -1960,7 +1960,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='gc'
- VERSION='7.0alpha4'
+ VERSION='7.0alpha5'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -4595,14 +4595,14 @@ echo $ECHO_N "checking which machine-dependent code should be used... $ECHO_C" >
 machdep=
 case "$host" in
  alpha-*-openbsd*)
-    machdep="alpha_mach_dep.lo"
+    machdep="mach_dep.lo"
     if test x"${ac_cv_lib_dl_dlopen}" != xyes ; then
        { echo "$as_me:$LINENO: WARNING: OpenBSD/Alpha without dlopen(). Shared library support is disabled" >&5
 echo "$as_me: WARNING: OpenBSD/Alpha without dlopen(). Shared library support is disabled" >&2;}
     fi
     ;;
  alpha*-*-linux*)
-    machdep="alpha_mach_dep.lo"
+    machdep="mach_dep.lo"
     ;;
  i?86-*-solaris2.[89] | i?86-*-solaris2.1?)
     cat >>confdefs.h <<\_ACEOF
@@ -4611,7 +4611,7 @@ _ACEOF
 
     ;;
  mipstx39-*-elf*)
-    machdep="mips_ultrix_mach_dep.lo"
+    machdep="mach_dep.lo"
     cat >>confdefs.h <<\_ACEOF
 #define STACKBASE __stackbase
 _ACEOF
@@ -4622,31 +4622,31 @@ _ACEOF
 
     ;;
  mips-dec-ultrix*)
-    machdep="mips_ultrix_mach-dep.lo"
+    machdep="mach-dep.lo"
     ;;
  mips-nec-sysv*|mips-unknown-sysv*)
     ;;
  mips*-*-linux*)
     ;;
  mips-*-*)
-    machdep="mips_sgi_mach_dep.lo"
+    machdep="mach_dep.lo"
     cat >>confdefs.h <<\_ACEOF
 #define NO_EXECUTE_PERMISSION 1
 _ACEOF
 
     ;;
  sparc-*-netbsd*)
-    machdep="sparc_netbsd_mach_dep.lo"
+    machdep="mach_dep.lo sparc_netbsd_mach_dep.lo"
     ;;
  sparc-sun-solaris2.3)
-    machdep="sparc_mach_dep.lo"
+    machdep="mach_dep.lo sparc_mach_dep.lo"
     cat >>confdefs.h <<\_ACEOF
 #define SUNOS53_SHARED_LIB 1
 _ACEOF
 
     ;;
  sparc*-sun-solaris2.*)
-    machdep="sparc_mach_dep.lo"
+    machdep="mach_dep.lo sparc_mach_dep.lo"
     ;;
  ia64-*-*)
     machdep="mach_dep.lo ia64_save_regs_in_stack.lo"
@@ -10714,7 +10714,7 @@ _ASBOX
 } >&5
 cat >&5 <<_CSEOF
 
-This file was extended by gc $as_me 7.0alpha4, which was
+This file was extended by gc $as_me 7.0alpha5, which was
 generated by GNU Autoconf 2.59.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -10772,7 +10772,7 @@ _ACEOF
 
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-gc config.status 7.0alpha4
+gc config.status 7.0alpha5
 configured by $0, generated by GNU Autoconf 2.59,
   with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/configure.ac b/configure.ac
index 9becbad..3410804 100644
--- a/configure.ac
+++ b/configure.ac
@@ -17,7 +17,7 @@ dnl Process this file with autoconf to produce configure.
 # Initialization
 # ==============
 
-AC_INIT(gc,7.0alpha4,Hans.Boehm@hp.com) 
+AC_INIT(gc,7.0alpha5,Hans.Boehm@hp.com) 
     ## version must conform to [0-9]+[.][0-9]+(alpha[0-9]+)?
 AC_CONFIG_SRCDIR(gcj_mlc.c)
 AC_CANONICAL_TARGET 
@@ -276,42 +276,42 @@ AC_MSG_CHECKING(which machine-dependent code should be used)
 machdep=
 case "$host" in
  alpha-*-openbsd*)
-    machdep="alpha_mach_dep.lo"
+    machdep="mach_dep.lo"
     if test x"${ac_cv_lib_dl_dlopen}" != xyes ; then
        AC_MSG_WARN(OpenBSD/Alpha without dlopen(). Shared library support is disabled)
     fi
     ;;
  alpha*-*-linux*)
-    machdep="alpha_mach_dep.lo"
+    machdep="mach_dep.lo"
     ;;
  i?86-*-solaris2.[[89]] | i?86-*-solaris2.1?)
     AC_DEFINE(SOLARIS25_PROC_VDB_BUG_FIXED)
     ;;
  mipstx39-*-elf*)
-    machdep="mips_ultrix_mach_dep.lo"
+    machdep="mach_dep.lo"
     AC_DEFINE(STACKBASE, __stackbase)
     AC_DEFINE(DATASTART_IS_ETEXT)
     ;;
  mips-dec-ultrix*)
-    machdep="mips_ultrix_mach-dep.lo"
+    machdep="mach-dep.lo"
     ;;
  mips-nec-sysv*|mips-unknown-sysv*)
     ;;
  mips*-*-linux*) 
     ;; 
  mips-*-*)
-    machdep="mips_sgi_mach_dep.lo"
+    machdep="mach_dep.lo"
     AC_DEFINE(NO_EXECUTE_PERMISSION)
     ;;
  sparc-*-netbsd*)
-    machdep="sparc_netbsd_mach_dep.lo"
+    machdep="mach_dep.lo sparc_netbsd_mach_dep.lo"
     ;;
  sparc-sun-solaris2.3)
-    machdep="sparc_mach_dep.lo"
+    machdep="mach_dep.lo sparc_mach_dep.lo"
     AC_DEFINE(SUNOS53_SHARED_LIB)
     ;;
  sparc*-sun-solaris2.*)
-    machdep="sparc_mach_dep.lo"
+    machdep="mach_dep.lo sparc_mach_dep.lo"
     ;;
  ia64-*-*)
     machdep="mach_dep.lo ia64_save_regs_in_stack.lo"
diff --git a/doc/README b/doc/README
index 2230f41..159fa89 100644
--- a/doc/README
+++ b/doc/README
@@ -31,7 +31,7 @@ are GPL'ed, but with an exception that should cover all uses in the
 collector.  (If you are concerned about such things, I recommend you look
 at the notice in config.guess or ltmain.sh.)
 
-This is version 7.0alpha2 of a conservative garbage collector for C and C++.
+This is version 7.0alpha5 of a conservative garbage collector for C and C++.
 
 You might find a more recent version of this at
 
diff --git a/doc/README.changes b/doc/README.changes
index da79786..25b61c4 100644
--- a/doc/README.changes
+++ b/doc/README.changes
@@ -2236,8 +2236,21 @@ Since gc6.5:
    there.
  - More consistently define HBLKSIZE to 4096 on 64 bit architectures with
    4K pages.  (Thanks to Andrew Haley.)
-
-Since gc6.6:
+ - With win32 threads, GC_stop_world needs to acquire GC_write_cs.  (Thanks
+   to Ben Hutchings for the observation and patch.)
+ - Move up struct callinfo declaration to make gcc 4.0.2 happy.
+
+Since 6.6:
+ - Add "int" to Solaris "end" and "etext" declaration in gc.h.  Declared
+   the symbols with underscores and as arrays, since that's what's actually
+   used.  Perhaps this could all just be removed?  (Thanks to John Bowman.)
+ - Fixed ARM GC_test_and_set code.  (Thanks to Kazu Hirata and Paul Brook.)
+ - Added casts for assignments to hb_last_reclaimed, which truncate the
+   value.  Added a cast to GC_adj_words_allocd.  Use GetModuleHandleA
+   when retrieving a handle to kernel32.dll under win32.  (Thanks to the
+   Visual Prolog developers.)
+
+Since gc6.7:
  - Remove GC_PROTO, VOLATILE, GC_PTR, and GC_CONST.  Assume ANSI C compiler
    and use ANSI constructs unconditionally.
  - Introduce #elif and #error in some of the appropriate places.
@@ -2383,9 +2396,45 @@ Since gc7.0alpha3
  - Added GC_getattr_np-based GC_get_stack_base (untested).
  - Separated thread local allocation into a separate file and added the
    beginning of win32 support for that.
+
+Since gc7.0alpha4
+   (more 6.6, 6.7 changes)
+ - Some Solaris fixes, including some more general changes in how
+   the assembly pieces of mach_dep.c are handled.
+ - Removed a lot of SOLARIS_THREADS-specific code that was only
+   needed with the old implementation.  This included many (mostly no-op)
+   versions of GC_is_fresh.
+ - Don't use atomic_ops in gc_locks.h unless we need threads.
+ - Fixed USE_MARK_BITS, which is once again the default without PARALLEL_MARK.
+ - Removed Solaris GC_INIT hack.  It's a workaround for a long dead bug,
+   and it seemed to be wrong anyway.
+ - Changed win32_threads.c to require preprocessor-based interception
+   of thread routines by default.  A client call to GC_use_DllMain is
+   now required to get the old behavior in which DllMain is used to implicitly
+   register threads.  This was done for uniformity with other platforms, and
+   because the DllMain solution seemed to require very tricky code which,
+   at least in the past, imposed hard bounds on the number of threads.
+ - Many small changes to make thread support work again on Cygwin.
+ - Moved definition of allocator lock etc. to pthread_support.c and
+   win32_threads.c for those two cases.
+ - Got rid of the FASTLOCK() machinery.  It doesn't seem useful on modern
+   platforms.
+ - Cleaned up the uncollectable allocation routines, speeding up the
+   slower paths.  The code did enough unnecessary work off the critical path
+   that the underlying logic was getting hard to extract.
+ - No longer turn off THREAD_LOCAL_ALLOC with DBG_HDRS_ALL.  Indications
+   are it just works, and I think the reasons for it not working disappeared
+   a while ago.
+ - Fixed bugs in hb_n_marks calculation and assertion.
+ - Don't use __builtin_expect for pre-3.0 gcc.
+ - Define GWW_VDB only for recent Microsoft tool chains.
+ - Add overview.html to doc directory.
+ - Fix NT_STATIC_THREADS_MAKEFILE, various compiler warnings.
+ - Made thread local allocation sort of work with Cygwin.  The code should
+   be there to deal with other Windows variants, but non-Cygwin Windows
+   threads need more bug fixes.
   
 To do:
- - Fix USE_MARK_BITS.
  - REDIRECT_MALLOC and threads combination is getting closer, but currently
    usually fails because the DTV (dynamic thread vector) used to access
    thread-local storage is referenced only from the base of a thread stack,
@@ -2394,7 +2443,6 @@ To do:
    Typically large heap sections end up cleared.
  - Clone marker inner loop to support arch-dependent prefetching,
    and counting of objects marked for finalization.
- - function wrapping??
  - The USE_MUNMAP code should really use a separate data structure
    indexed by physical page to keep track of time since last use of
    a page.  Using hblk headers means we lose track of ages when
diff --git a/doc/README.linux b/doc/README.linux
index 1d0fd4c..99f4bbc 100644
--- a/doc/README.linux
+++ b/doc/README.linux
@@ -19,15 +19,15 @@ Linux threads.  These should not be touched by the client program.
 
 To use threads, you need to abide by the following requirements:
 
-1) You need to use LinuxThreads (which are included in libc6).
+1) You need to use LinuxThreads or NPTL (which are included in libc6).
 
    The collector relies on some implementation details of the LinuxThreads
-   package.  It is unlikely that this code will work on other
+   package.  This code may not work on other
    pthread implementations (in particular it will *not* work with
    MIT pthreads).
 
-2) You must compile the collector with -DGC_LINUX_THREADS and -D_REENTRANT
-   specified in the Makefile.
+2) You must compile the collector with -DGC_LINUX_THREADS (or
+   just -DGC_THREADS) and -D_REENTRANT specified in the Makefile.
 
 3a) Every file that makes thread calls should define GC_LINUX_THREADS and 
    _REENTRANT and then include gc.h.  Gc.h redefines some of the
diff --git a/doc/doc.am b/doc/doc.am
index a90e05d..d95fa17 100644
--- a/doc/doc.am
+++ b/doc/doc.am
@@ -43,6 +43,7 @@ dist_pkgdata_DATA = \
 	doc/README.solaris2 \
 	doc/README.uts \
 	doc/README.win32 \
+	doc/overview.html \
 	doc/tree.html \
 	doc/leak.html \
 	doc/gcinterface.html \
diff --git a/doc/gcdescr.html b/doc/gcdescr.html
index cab6bde..dc08470 100644
--- a/doc/gcdescr.html
+++ b/doc/gcdescr.html
@@ -1,7 +1,7 @@
 <HTML>
 <HEAD>
     <TITLE> Conservative GC Algorithmic Overview </TITLE>
-    <AUTHOR> Hans-J. Boehm, HP Labs (Much of this was written at SGI)</author>
+    <AUTHOR> Hans-J. Boehm, HP Labs (Some of this was written at SGI)</author>
 </HEAD>
 <BODY>
 <H1> <I>This is under construction, and may always be.</i> </h1>
@@ -549,6 +549,67 @@ by using ld's function call wrapping mechanism under Linux.
 Recent versions of the collector support several facilites to enhance
 the processor-scalability and thread performance of the collector.
 These are discussed in more detail <A HREF="scale.html">here</a>.
+We briefly outline the data approach to thread-local allocation in the
+next section.
+<H2>Thread-local allocation</h2>
+If thread-local allocation is enabled, the collector keeps separate
+arrays of free lists for each thread.  Thread-local allocation
+is currently only supported on a few platforms.
+<P>
+The free list arrays associated
+with each thread are only used to satisfy requests for objects that
+are  both very small, and belong to one of a small number of well-known
+kinds.  These currently include "normal" and pointer-free objects.
+Depending onthe configuration, "gcj" objects may also be included.
+<P>
+Thread-local free list entries contain either a pointer to the first
+element of a free list, or they contain a counter of the number of
+allocation "granules" allocated so far.  Initially they contain the
+value one, i.e. a small counter value.
+<P>
+Thread-local allocation allocates directly through the global
+allocator, if the object is of a size or kind not covered by the
+local free lists.
+<P>
+If there is an appropriate local free list, the allocator checks whether it
+contains a sufficiently small counter value.  If so, the counter is simply
+incremented by the counter value, and the global allocator is used.
+In this way, the initial few allocations of a given size bypass the local
+allocator.  A thread that only allocates a handful of objects of a given
+size will not build up its own free list for that size.  This avoids
+wasting space for unpopular objects sizes or kinds.
+<P>
+Once the counter passes a threshold, <TT>GC_malloc_many</tt> is called
+to allocate roughly <TT>HBLKSIZE</tt> space and put it on the corresponding
+local free list.  Further allocations of that size and kind then use
+this free list, and no longer need to acquire the allocation lock.
+The allocation procedure is otherwise similar to the global free lists.
+The local free lists are also linked using the first word in the object.
+In most cases this means they require considerably less time.
+<P>
+Local free lists are treated by most of the rest of the collector
+as though they were in-use reachable data.  This requires some care,
+since pointer-free objects are not normally traced, and hence a special
+tracing procedure is required to mark all objects on pointer-free and
+gcj local free lists.
+<P>
+On thread exit, any remaining thread-local free list entries are
+transferred back to the global free list.
+<P>
+Note that if the collector is configured for thread-local allocation,
+GC versions before 7 do not invoke the thread-local allocator by default.
+<TT>GC_malloc</tt> only uses thread-local allocation in version 7 and later.
+In earlier versions, <TT>GC_MALLOC</tt> (all caps) may be directed
+to use thread-local allocation by defining <TT>GC_REDIRECT_TO_LOCAL</tt>
+and then including <TT>gc_local_alloc.h</tt>.
+<P>
+For some more details see <A HREF="scale.html">here</a>, and the
+technical report entitled
+<A HREF="http://www.hpl.hp.com/techreports/2000/HPL-2000-165.html">
+``Fast Multiprocessor Memory Allocation and Garbage Collection''
+</a>.
+<P>
+<HR>
 <P>
 Comments are appreciated.  Please send mail to
 <A HREF="mailto:boehm@acm.org"><TT>boehm@acm.org</tt></a> or
diff --git a/doc/overview.html b/doc/overview.html
new file mode 100644
index 0000000..d31f937
--- /dev/null
+++ b/doc/overview.html
@@ -0,0 +1,446 @@
+<!DOCTYPE HTML>
+<html><head><title>A garbage collector for C and C++</title></head>
+<body>
+<table bgcolor="#f0f0ff" cellpadding="10%">
+  <tbody><tr>
+  <td><a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gcinterface.html">Interface Overview</a></td>
+  <td><a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/04tutorial.pdf">Tutorial Slides</a></td>
+  <td><a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/faq.html">FAQ</a></td>
+  <td><a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/simple_example.html">Example</a></td>
+  <td><a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source">Download</a></td>
+  <td><a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/license.txt">License</a></td>
+  </tr>
+</tbody></table>
+<h1>A garbage collector for C and C++</h1>
+<ul>
+<li><a href="#platforms">Platforms</a>
+</li><li><a href="#multiprocessors">Scalable multiprocessor versions</a>
+</li><li><a href="#details">Some collector details</a>
+</li><li><a href="#further">Further reading</a>
+</li><li><a href="#users">Current users</a>
+</li><li><a href="#collector">Local Links for this collector</a>
+</li><li><a href="#background">Local Background Links</a>
+</li><li><a href="#contacts">Contacts and Mailing List</a>
+</li></ul>
+[ This is an updated version of the page formerly at
+<tt>http://reality.sgi.com/boehm/gc.html</tt>
+and before that at
+<a href="ftp://parcftp.xerox.com/pub/gc/gc.html">
+<tt>ftp://parcftp.xerox.com/pub/gc/gc.html</tt></a>.]
+<p>
+The <a href="http://www.hpl.hp.com/personal/Hans_Boehm">Boehm</a>-<a href="http://www.cs.cornell.edu/annual_report/00-01/bios.htm#demers">Demers</a>-<a href="http://www-sul.stanford.edu/weiser/">Weiser</a>
+conservative garbage collector can
+be used as a garbage collecting
+replacement for C <tt>malloc</tt> or C++ <tt>new</tt>.
+It allows you to allocate memory basically as you normally would,
+without explicitly deallocating memory that is no longer useful.
+The collector automatically recycles memory when it determines
+that it can no longer be otherwise accessed.
+A simple example of such a use is given
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/simple_example.html">here</a>.
+</p><p>
+The collector is also used by a number of programming language
+implementations that either use C as intermediate code, want
+to facilitate easier interoperation with C libraries, or
+just prefer the simple collector interface.
+For a more detailed description of the interface, see
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gcinterface.html">here</a>.
+</p><p>
+Alternatively, the garbage collector  may be used as
+a <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/leak.html">leak detector</a>
+for C or C++ programs, though that is not its primary goal.
+</p><p>
+Typically several versions will be available.
+Usually you should first try to use
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/gc.tar.gz"><tt>gc_source/gc.tar.gz</tt></a>,
+which is normally an older, more stable version.
+</p><p>
+If that fails, try the latest explicitly numbered version
+in <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/">
+<tt>gc_source/</tt></a>.
+Later versions may contain additional features, platform support,
+or bug fixes, but are likely to be less well tested.
+Note that versions containing the letters <tt>alpha</tt> are even less
+well tested than others, especially on non-HP platforms.
+</p><p>
+A slightly older version of the garbage collector is now also
+included as part of the
+<a href="http://gcc.gnu.org/">GNU compiler</a>
+distribution.  The source
+code for that version is available for browsing
+<a href="http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/boehm-gc/">here</a>.
+</p><p>
+The arguments for and against conservative garbage collection
+in C and C++ are briefly
+discussed in
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/issues.html">issues.html</a>.  The beginnings of
+a frequently-asked-questions list are <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/faq.html">here</a>.
+</p><p>
+The garbage collector code is copyrighted by
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm">Hans-J. Boehm</a>,
+Alan J. Demers,
+<a href="http://www.xerox.com/">Xerox Corporation</a>,
+<a href="http://www.sgi.com/">Silicon Graphics</a>,
+and
+<a href="http://www.hp.com/">Hewlett-Packard Company</a>.
+It may be used and copied without payment of a fee under minimal restrictions.
+See the README file in the distribution  or the
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/license.txt">license</a> for more details.
+<b>IT IS PROVIDED AS IS,
+WITH ABSOLUTELY NO WARRANTY EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK</b>.
+</p><p>
+Empirically, this collector works with most unmodified C programs,
+simply by replacing
+<tt>malloc</tt> with <tt>GC_malloc</tt> calls,
+replacing <tt>realloc</tt> with <tt>GC_realloc</tt> calls, and removing
+free calls.  Exceptions are discussed
+in <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/issues.html">issues.html</a>. 
+</p><h2><a name="platforms">Platforms</a></h2>
+The collector is not completely portable, but the distribution
+includes ports to most standard PC and UNIX/Linux platforms.
+The collector should work on Linux, *BSD, recent Windows versions,
+MacOS X, HP/UX, Solaris,
+Tru64, Irix and a few other operating systems.
+Some ports are more polished than others.
+<p>
+Irix pthreads, Linux threads, Win32 threads, Solaris threads
+(old style and pthreads),
+HP/UX 11 pthreads, Tru64 pthreads, and MacOS X threads are supported
+in recent versions.
+</p><h3>Separately distributed ports</h3>
+For MacOS 9/Classic use, Patrick Beard's latest port is available from
+<a href="http://homepage.mac.com/pcbeard/gc/">
+<tt>http://homepage.mac.com/pcbeard/gc/</tt></a>.
+(Unfortunately, that's now quite dated.
+I'm not in a position to test under MacOS.  Although I try to
+incorporate changes, it is impossible for
+me to update the project file.)
+<p>
+Precompiled versions of the collector for NetBSD are available
+<a href="ftp://ftp.netbsd.org/pub/NetBSD/packages/pkgsrc/devel/boehm-gc/README.html">here</a>
+or
+<a href="http://www.netbsd.org/packages/devel/boehm-gc/README.html">here</a>.
+</p><p>
+<a href="http://www.debian.org/">Debian Linux</a> includes prepackaged
+versions of the collector.
+</p><h2><a name="multiprocessors">Scalable multiprocessor versions</a></h2>
+Kenjiro Taura, Toshio Endo, and Akinori Yonezawa have made available
+a <a href="http://www.yl.is.s.u-tokyo.ac.jp/gc/">parallel collector</a>
+based on this one.  Their collector takes advantage of multiple processors
+during a collection.  Starting with collector version 6.0alpha1
+we also do this, though with more modest processor scalability goals.
+Our approach is discussed briefly in
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/scale.html"><tt>scale.html</tt></a>.
+<h2><a name="details">Some Collector Details</a></h2>
+The collector uses a <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/complexity.html">mark-sweep</a> algorithm.
+It provides incremental and generational
+collection under operating systems which provide the right kind of
+virtual memory support.  (Currently this includes SunOS[45], IRIX,
+OSF/1, Linux, and Windows, with varying restrictions.)
+It allows <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/finalization.html"><i>finalization</i></a> code
+to be invoked when an object is collected.
+It can take advantage of type information to locate pointers if such
+information is provided, but it is usually used without such information.
+See the README and
+<tt>gc.h</tt> files in the distribution for more details.
+<p>
+For an overview of the implementation, see <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gcdescr.html">here</a>.
+</p><p>
+The garbage collector distribution includes a C string
+(<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/cordh.txt"><i>cord</i></a>) package that provides
+for fast concatenation and substring operations on long strings.
+A simple curses- and win32-based editor that represents the entire file
+as a cord is included as a
+sample application.
+</p><p>
+Performance of the nonincremental collector is typically competitive
+with malloc/free implementations.  Both space and time overhead are
+likely to be only slightly higher
+for programs written for malloc/free
+(see Detlefs, Dosser and Zorn's
+<a href="ftp://ftp.cs.colorado.edu/pub/techreports/zorn/CU-CS-665-93.ps.Z">Memory Allocation Costs in Large C and C++ Programs</a>.)
+For programs allocating primarily very small objects, the collector
+may be faster; for programs allocating primarily large objects it will
+be slower.  If the collector is used in a multithreaded environment
+and configured for thread-local allocation, it may in some cases
+significantly outperform malloc/free allocation in time.
+</p><p>
+We also expect that in many cases any additional overhead
+will be more than compensated for by decreased copying etc.
+if programs are written
+and tuned for garbage collection.
+</p><h1><a name="further">Further Reading:</a></h1>
+<b>The beginnings of a frequently asked questions list for this
+collector are <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/faq.html">here</a></b>.
+<p>
+<b>The following provide information on garbage collection in general</b>:
+</p><p>
+Paul Wilson's <a href="ftp://ftp.cs.utexas.edu/pub/garbage">garbage collection ftp archive</a> and <a href="ftp://ftp.cs.utexas.edu/pub/garbage/gcsurvey.ps">GC survey</a>.
+</p><p>
+The Ravenbrook <a href="http://www.memorymanagement.org/">
+Memory Management Reference</a>.
+</p><p>
+David Chase's
+<a href="http://www.iecc.com/gclist/GC-faq.html">GC FAQ</a>.
+</p><p>
+Richard Jones'
+<a href="http://www.ukc.ac.uk/computer_science/Html/Jones/gc.html">
+GC page</a> and
+<a href="http://www.cs.kent.ac.uk/people/staff/rej/gcbook/gcbook.html">
+his book</a>.
+</p><p>
+<b>The following papers describe the collector algorithms we use
+and the underlying design decisions at
+a higher level.</b>
+</p><p>
+(Some of the lower level details can be found
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gcdescr.html">here</a>.)
+</p><p>
+The first one is not available
+electronically due to copyright considerations.  Most of the others are
+subject to ACM copyright.
+</p><p>
+Boehm, H., "Dynamic Memory Allocation and Garbage Collection", <i>Computers in Physics
+9</i>, 3, May/June 1995, pp. 297-303.  This is directed at an otherwise sophisticated
+audience unfamiliar with memory allocation issues.  The algorithmic details differ
+from those in the implementation.  There is a related letter to the editor and a minor
+correction in the next issue.
+</p><p>
+Boehm, H., and <a href="http://www.ubiq.com/hypertext/weiser/weiser.html">M. Weiser</a>,
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/spe_gc_paper">"Garbage Collection in an Uncooperative Environment"</a>,
+<i>Software Practice &amp; Experience</i>, September 1988, pp. 807-820.
+</p><p>
+Boehm, H., A. Demers, and S. Shenker, <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/papers/pldi91.ps.Z">"Mostly Parallel Garbage Collection"</a>, Proceedings
+of the ACM SIGPLAN '91 Conference on Programming Language Design and Implementation,
+<i>SIGPLAN Notices 26</i>, 6 (June 1991), pp. 157-164.
+</p><p>
+Boehm, H., <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/papers/pldi93.ps.Z">"Space Efficient Conservative Garbage Collection"</a>, Proceedings of the ACM
+SIGPLAN '93 Conference on Programming Language Design and Implementation, <i>SIGPLAN
+Notices 28</i>, 6 (June 1993), pp. 197-206.
+</p><p>
+Boehm, H., "Reducing Garbage Collector Cache Misses",
+<i> Proceedings of the 2000 International Symposium on Memory Management </i>.
+<a href="http://portal.acm.org/citation.cfm?doid=362422.362438">
+Official version.</a>
+<a href="http://www.hpl.hp.com/techreports/2000/HPL-2000-99.html">
+Technical report version.</a>  Describes the prefetch strategy
+incorporated into the collector for some platforms.  Explains why
+the sweep phase of a "mark-sweep" collector should not really be
+a distinct phase.
+</p><p>
+M. Serrano, H. Boehm,
+"Understanding Memory Allocation of Scheme Programs",
+<i>Proceedings of the Fifth ACM SIGPLAN International Conference on
+Functional Programming</i>, 2000, Montreal, Canada, pp. 245-256.
+<a href="http://www.acm.org/pubs/citations/proceedings/fp/351240/p245-serrano/">
+Official version.</a>
+<a href="http://www.hpl.hp.com/techreports/2000/HPL-2000-62.html">
+Earlier Technical Report version.</a>  Includes some discussion of the
+collector debugging facilities for identifying causes of memory retention.
+</p><p>
+Boehm, H.,
+"Fast Multiprocessor Memory Allocation and Garbage Collection",
+<a href="http://www.hpl.hp.com/techreports/2000/HPL-2000-165.html">
+HP Labs Technical Report HPL 2000-165</a>.  Discusses the parallel
+collection algorithms, and presents some performance results.
+</p><p>
+Boehm, H., "Bounding Space Usage of Conservative Garbage Collectors",
+<i>Proceedings of the 2002 ACM SIGPLAN-SIGACT Symposium on Principles of
+Programming Languages</i>, Jan. 2002, pp. 93-100.
+<a href="http://portal.acm.org/citation.cfm?doid=503272.503282">
+Official version.</a>
+<a href="http://www.hpl.hp.com/techreports/2001/HPL-2001-251.html">
+Technical report version.</a>
+Includes a discussion of a collector facility to much more reliably test for
+the potential of unbounded heap growth.
+</p><p>
+<b>The following papers discuss language and compiler restrictions necessary to guarantee
+safety of conservative garbage collection.</b>
+</p><p>
+We thank John Levine and JCLT for allowing
+us to make the second paper available electronically, and providing PostScript for the final
+version.
+</p><p>
+Boehm, H., <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/papers/pldi96.ps.gz">``Simple
+Garbage-Collector-Safety''</a>, Proceedings
+of the ACM SIGPLAN '96 Conference on Programming Language Design
+and Implementation.
+</p><p>
+Boehm, H., and D. Chase,  <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/papers/boecha.ps.gz">
+``A Proposal for Garbage-Collector-Safe C Compilation''</a>,
+<i>Journal of C  Language Translation 4</i>, 2 (December 1992), pp. 126-141.
+</p><p>
+<b>Other related information: </b>
+</p><p>
+Detlefs, Dosser and Zorn's <a href="ftp://ftp.cs.colorado.edu/pub/techreports/zorn/CU-CS-665-93.ps.Z">Memory Allocation Costs in Large C and C++ Programs</a>.
+ This is a performance comparison of the Boehm-Demers-Weiser collector to malloc/free,
+using programs written for malloc/free.
+</p><p>
+Joel Bartlett's <a href="ftp://ftp.digital.com/pub/DEC/CCgc">mostly copying conservative garbage collector for C++</a>.
+</p><p>
+John Ellis and David Detlefs' <a href="ftp://parcftp.xerox.com/pub/ellis/gc/gc.ps">Safe Efficient Garbage Collection for C++</a> proposal.
+</p><p>
+Henry Baker's <a href="http://home.pipeline.com/%7Ehbaker1/">paper collection</a>.
+</p><p>
+Slides for Hans Boehm's <a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/myths.ps">Allocation and GC Myths</a> talk.
+</p><h1><a name="users">Current users:</a></h1>
+Known current users of some variant of this collector include:
+<p>
+The runtime system for <a href="http://gcc.gnu.org/java">GCJ</a>,
+the static GNU java compiler.
+</p><p>
+<a href="http://w3m.sourceforge.net/">W3m</a>, a text-based web browser.
+</p><p>
+Some versions of the Xerox DocuPrint printer software.
+</p><p>
+The <a href="http://www.mozilla.org/">Mozilla</a> project, as leak
+detector.
+</p><p>
+The <a href="http://www.go-mono.com/">Mono</a> project,
+an open source implementation of the .NET development framework.
+</p><p>
+The <a href="http://www.gnu.org/projects/dotgnu/">DotGNU Portable.NET
+project</a>, another open source .NET implementation.
+</p><p>
+The <a href="http://irssi.org/">Irssi IRC client</a>.
+</p><p>
+<a href="http://titanium.cs.berkeley.edu/">The Berkeley Titanium project</a>.
+</p><p>
+<a href="http://www.nag.co.uk/nagware_fortran_compilers.asp">The NAGWare f90 Fortran 90 compiler</a>.
+</p><p>
+Elwood Corporation's <a href="http://www.elwood.com/eclipse-info/index.htm">
+Eclipse</a> Common Lisp system, C library, and translator.
+</p><p>
+The <a href="http://www-sop.inria.fr/mimosa/fp/Bigloo/">Bigloo
+Scheme</a>
+and <a href="http://kaolin.unice.fr/%7Eserrano/camloo.html">Camloo ML
+compilers</a>
+written by Manuel Serrano and others.
+</p><p>
+Brent Benson's <a href="http://ftp.cs.indiana.edu/pub/scheme-repository/imp/">libscheme</a>.
+</p><p>
+The <a href="http://www.cs.rice.edu/CS/PLT/packages/mzscheme/index.html">MzScheme</a> scheme implementation.
+</p><p>
+The <a href="http://www.cs.washington.edu/research/projects/cecil/www/cecil-home.html">University of Washington Cecil Implementation</a>.
+</p><p>
+<a href="http://www.icsi.berkeley.edu/Sather/">The Berkeley Sather implementation</a>.
+</p><p>
+<a href="http://www.cs.berkeley.edu/%7Eharmonia/">The Berkeley Harmonia Project</a>.
+</p><p>
+The <a href="http://www.cs.arizona.edu/sumatra/toba/">Toba</a> Java Virtual
+Machine to C translator.
+</p><p>
+The <a href="http://www.gwydiondylan.org/">Gwydion Dylan compiler</a>.
+</p><p>
+The <a href="http://gcc.gnu.org/onlinedocs/gcc/Objective-C.html">
+GNU Objective C runtime</a>.
+</p><p>
+<a href="http://www.math.uiuc.edu/Macaulay2">Macaulay 2</a>, a system to support
+research in algebraic geometry and commutative algebra.
+</p><p>
+The <a href="http://www.vestasys.org/">Vesta</a> configuration management
+system.
+</p><p>
+<a href="http://www.visual-prolog.com/vip6">Visual Prolog 6</a>.
+</p><p>
+<a href="http://asymptote.sf.net/">Asymptote LaTeX-compatible
+vector graphics language.</a>
+
+</p><h1><a name="collector">More collector information at this site</a></h1>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/simple_example.html">A simple illustration of how to build and
+use the collector.</a>
+<p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gcinterface.html">Description of alternate interfaces to the
+garbage collector.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/04tutorial.pdf">Slides from an ISMM 2004  tutorial about the GC.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/faq.html">A FAQ (frequently asked questions) list.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/leak.html">How to use the garbage collector as a leak detector.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/debugging.html">Some hints on debugging garbage collected
+applications.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gcdescr.html">An overview of the implementation of the
+garbage collector.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/tree.html">The data structure used for fast pointer lookups.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/scale.html">Scalability of the collector to multiprocessors.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source">Directory containing garbage collector source.</a>
+
+</p><h1><a name="background">More background information at this site</a></h1>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/bounds.html">An attempt to establish a bound on space usage of
+conservative garbage collectors.</a>
+<p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/complexity.html">Mark-sweep versus copying garbage collectors
+and their complexity.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/conservative.html">Pros and cons of conservative garbage collectors,
+in comparison to other collectors.
+</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/issues.html">Issues related to garbage collection vs.
+manual memory management in C/C++.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/example.html">An example of a case in which garbage collection
+results in a much faster implementation as a result of reduced
+synchronization.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/nonmoving">Slide set discussing performance of nonmoving
+garbage collectors.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/popl03/web">
+Slide set discussing <i>Destructors, Finalizers, and Synchronization</i>
+(POPL 2003).</a>
+</p><p>
+<a href="http://portal.acm.org/citation.cfm?doid=604131.604153">
+Paper corresponding to above slide set.</a>
+(<a href="http://www.hpl.hp.com/techreports/2002/HPL-2002-335.html">
+Technical Report version</a>.)
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_bench.html">A Java/Scheme/C/C++ garbage collection benchmark.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/myths.ps">Slides for talk on memory allocation myths.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gctalk.ps">Slides for OOPSLA 98 garbage collection talk.</a>
+</p><p>
+<a href="http://www.hpl.hp.com/personal/Hans_Boehm/gc/papers">Related papers.</a>
+</p><h1><a name="contacts">Contacts and Mailing List</a><a></a></h1>
+<a>We have recently set up two mailing lists for collector announcements
+and discussions:
+</a><ul>
+<li><a href="mailto:gc-announce@linux.hpl.hp.com">gc-announce@linux.hpl.hp.com</a>
+is used for announcements of new versions.  Postings are restricted.
+We expect this to always remain a very low volume list.
+</li><li><a href="mailto:gc@linux.hpl.hp.com">gc@linux.hpl.hp.com</a> is used for
+discussions, bug reports, and the like.  Subscribers may post.
+On-topic posts by nonsubscribers will usually also be accepted, but
+it may take some time to review them.
+</li></ul>
+To subscribe to these lists, send a mail message containing the
+word "subscribe" to
+<a href="mailto:gc-announce-request@linux.hpl.hp.com?subject=subscribe">gc-announce-request@linux.hpl.hp.com</a>
+or to
+<a href="mailto:gc-request@linux.hpl.hp.com?subject=subscribe">gc-request@linux.hpl.hp.com</a>.
+(Please ignore the instructions about web-based subscription.
+The listed web site is behind the HP firewall.)
+<p>
+The archives for these lists appear
+<a href="http://www.hpl.hp.com/hosted/linux/mail-archives">here</a>.
+The gc list archive may also be read at
+<a href="http://dir.gmane.org/gmane.comp.programming.garbage-collection.boehmgc">gmane.org</a>.
+</p><p>
+Some prior discussion of the collector has taken place on the gcc
+java mailing list, whose archives appear
+<a href="http://gcc.gnu.org/ml/java/">here</a>, and also on
+<a href="http://lists.tunes.org/mailman/listinfo/gclist">gclist@iecc.com</a>.
+</p><p>
+Comments and bug reports may also be sent to
+(<a href="mailto:Hans_Boehm@hp.com">Hans.Boehm@hp.com</a>) or
+(<a href="mailto:boehm@acm.org">boehm@acm.org</a>), but the gc
+mailing list is usually preferred.
+ 
+</p></body></html>
diff --git a/headers.c b/headers.c
index 8b14b4b..1a0ce88 100644
--- a/headers.c
+++ b/headers.c
@@ -254,7 +254,7 @@ struct hblkhdr * GC_install_header(struct hblk *h)
     result = alloc_hdr();
     SET_HDR(h, result);
 #   ifdef USE_MUNMAP
-	result -> hb_last_reclaimed = GC_gc_no;
+	result -> hb_last_reclaimed = (unsigned short)GC_gc_no;
 #   endif
     return(result);
 }
diff --git a/include/gc.h b/include/gc.h
index a47dc4a..5f049c5 100644
--- a/include/gc.h
+++ b/include/gc.h
@@ -962,17 +962,7 @@ extern void GC_thr_init(void);	/* Needed for Solaris/X86	*/
   * A GC_INIT call is required if the collector is built with THREAD_LOCAL_ALLOC
   * defined and the initial allocation call is not to GC_malloc().
   */
-#if (defined(sparc) || defined(__sparc)) && defined(sun)
-    /*
-     * If you are planning on putting
-     * the collector in a SunOS 5 dynamic library, you need to call GC_INIT()
-     * from the statically loaded program section.
-     * This circumvents a Solaris 2.X (X<=4) linker bug.
-     */
-#   define GC_INIT() { extern end, etext; \
-		       GC_noop(&end, &etext); \
-		       GC_init();}
-#elif defined(__CYGWIN32__) && defined(GC_DLL) || defined (_AIX)
+#if defined(__CYGWIN32__) || defined (_AIX)
     /*
      * Similarly gnu-win32 DLLs need explicit initialization from
      * the main program, as does AIX.
@@ -984,15 +974,22 @@ extern void GC_thr_init(void);	/* Needed for Solaris/X86	*/
       extern int _bss_end__[];
 #     define GC_MAX(x,y) ((x) > (y) ? (x) : (y))
 #     define GC_MIN(x,y) ((x) < (y) ? (x) : (y))
-#     define GC_DATASTART ((GC_PTR) GC_MIN(_data_start__, _bss_start__))
-#     define GC_DATAEND	 ((GC_PTR) GC_MAX(_data_end__, _bss_end__))
+#     define GC_DATASTART ((void *) GC_MIN(_data_start__, _bss_start__))
+#     define GC_DATAEND	 ((void *) GC_MAX(_data_end__, _bss_end__))
+#     if defined(GC_DLL)
+#       define GC_INIT() { GC_add_roots(GC_DATASTART, GC_DATAEND); GC_init(); }
+#     else
+	/* Main program init not required, but other defines needed for */
+	/* uniformity.							*/
+#       define GC_INIT() { GC_init(); }
+#     endif
 #   endif
 #   if defined(_AIX)
       extern int _data[], _end[];
-#     define GC_DATASTART ((GC_PTR)((ulong)_data))
-#     define GC_DATAEND ((GC_PTR)((ulong)_end))
+#     define GC_DATASTART ((void *)((ulong)_data))
+#     define GC_DATAEND ((void *)((ulong)_end))
+#     define GC_INIT() { GC_add_roots(GC_DATASTART, GC_DATAEND); GC_init(); }
 #   endif
-#   define GC_INIT() { GC_add_roots(GC_DATASTART, GC_DATAEND); GC_init(); }
 #else
 #   define GC_INIT() { GC_init(); }
 #endif
diff --git a/include/gc_config_macros.h b/include/gc_config_macros.h
index 2cfa6c2..f3b5ef4 100644
--- a/include/gc_config_macros.h
+++ b/include/gc_config_macros.h
@@ -5,12 +5,12 @@
  * Some tests for old macros.  These violate our namespace rules and will
  * disappear shortly.  Use the GC_ names.
  */
-#if defined(SOLARIS_THREADS) || defined(_SOLARIS_THREADS)
+#if defined(SOLARIS_THREADS) || defined(_SOLARIS_THREADS) \
+    || defined(_SOLARIS_PTHREADS) || defined(GC_SOLARIS_PTHREADS)
+  /* We no longer support old style Solaris threads.		*/
+  /* GC_SOLARIS_THREADS now means pthreads.			*/
 # define GC_SOLARIS_THREADS
 #endif
-#if defined(_SOLARIS_PTHREADS)
-# define GC_SOLARIS_PTHREADS
-#endif
 #if defined(IRIX_THREADS)
 # define GC_IRIX_THREADS
 #endif
@@ -39,7 +39,6 @@
 #endif
 
 #if !defined(_REENTRANT) && (defined(GC_SOLARIS_THREADS) \
-		             || defined(GC_SOLARIS_PTHREADS) \
 			     || defined(GC_HPUX_THREADS) \
 			     || defined(GC_AIX_THREADS) \
 			     || defined(GC_LINUX_THREADS))
@@ -52,7 +51,7 @@
 # define _POSIX4A_DRAFT10_SOURCE 1
 #endif
 
-# if defined(GC_SOLARIS_PTHREADS) || defined(GC_FREEBSD_THREADS) || \
+# if defined(GC_SOLARIS_THREADS) || defined(GC_FREEBSD_THREADS) || \
 	defined(GC_IRIX_THREADS) || defined(GC_LINUX_THREADS) || \
 	defined(GC_HPUX_THREADS) || defined(GC_OSF1_THREADS) || \
 	defined(GC_DGUX386_THREADS) || defined(GC_DARWIN_THREADS) || \
@@ -79,10 +78,12 @@
 #   define GC_IRIX_THREADS
 #   define GC_PTHREADS
 # endif
-# if defined(__sparc) && !defined(__linux__)
-#   define GC_SOLARIS_PTHREADS
+# if defined(__sparc) && !defined(__linux__) \
+     || defined(sun) && (defined(i386) || defined(__i386__))
+#   define GC_SOLARIS_THREADS
 #   define GC_PTHREADS
 # endif
+
 # if defined(__APPLE__) && defined(__MACH__) && defined(__ppc__)
 #   define GC_DARWIN_THREADS
 #   define GC_PTHREADS
@@ -111,10 +112,6 @@
 # endif
 #endif
 
-#if defined(GC_SOLARIS_PTHREADS) && !defined(GC_SOLARIS_THREADS)
-#   define GC_SOLARIS_THREADS
-#endif
-
 # define __GC
 # ifndef _WIN32_WCE
 #   include <stddef.h>
diff --git a/include/gc_inline.h b/include/gc_inline.h
index d2008cf..5f6b6bb 100644
--- a/include/gc_inline.h
+++ b/include/gc_inline.h
@@ -26,9 +26,12 @@
 #include "gc.h"
 #include "gc_tiny_fl.h"
 
-#ifndef __GNUC__
-#  define __builtin_expect(x, y) (x)
-#endif
+#if __GNUC__ >= 3
+# define GC_EXPECT(expr, outcome) __builtin_expect(expr,outcome)
+  /* Equivalent to (expr), but predict that usually (expr)==outcome. */
+#else
+# define GC_EXPECT(expr, outcome) (expr)
+#endif /* __GNUC__ */
 
 /* The ultimately general inline allocation macro.  Allocate an object	*/
 /* of size bytes, putting the resulting pointer in result.  Tiny_fl is	*/
@@ -49,14 +52,14 @@
 # define GC_FAST_MALLOC_GRANS(result,granules,tiny_fl,num_direct,\
 			      kind,default_expr,init) \
 { \
-    if (__builtin_expect(granules >= GC_TINY_FREELISTS,0)) { \
+    if (GC_EXPECT(granules >= GC_TINY_FREELISTS,0)) { \
         result = default_expr; \
     } else { \
 	void **my_fl = tiny_fl + granules; \
         void *my_entry=*my_fl; \
 	void *next; \
  \
-	while (__builtin_expect((word)my_entry \
+	while (GC_EXPECT((word)my_entry \
 				<= num_direct + GC_TINY_FREELISTS + 1, 0)) { \
 	    /* Entry contains counter or NULL */ \
 	    if ((word)my_entry - 1 < num_direct) { \
@@ -81,7 +84,7 @@
 	init; \
         PREFETCH_FOR_WRITE(next); \
         GC_ASSERT(GC_size(result) >= bytes + EXTRA_BYTES); \
-        GC_ASSERT(((word *)result)[1] == 0); \
+        GC_ASSERT((kind) == PTRFREE || ((word *)result)[1] == 0); \
       out: ; \
    } \
 }
diff --git a/include/private/gc_locks.h b/include/private/gc_locks.h
index 4dcba2b..5eecc50 100644
--- a/include/private/gc_locks.h
+++ b/include/private/gc_locks.h
@@ -18,22 +18,10 @@
 #ifndef GC_LOCKS_H
 #define GC_LOCKS_H
 
-#include <atomic_ops.h>
-
 /*
  * Mutual exclusion between allocator/collector routines.
  * Needed if there is more than one allocator thread.
- * FASTLOCK() is assumed to try to acquire the lock in a cheap and
- * dirty way that is acceptable for a few instructions, e.g. by
- * inhibiting preemption.  This is assumed to have succeeded only
- * if a subsequent call to FASTLOCK_SUCCEEDED() returns TRUE.
- * FASTUNLOCK() is called whether or not FASTLOCK_SUCCEEDED().
- * If signals cannot be tolerated with the FASTLOCK held, then
- * FASTLOCK should disable signals.  The code executed under
- * FASTLOCK is otherwise immune to interruption, provided it is
- * not restarted.
- * DCL_LOCK_STATE declares any local variables needed by LOCK and UNLOCK
- * and/or FASTLOCK.
+ * DCL_LOCK_STATE declares any local variables needed by LOCK and UNLOCK.
  *
  * In the PARALLEL_MARK case, we also need to define a number of
  * other inline finctions here:
@@ -44,21 +32,9 @@
  *   
  */  
 # ifdef THREADS
+#  include <atomic_ops.h>
+
    void GC_noop1(word);
-#  ifdef PCR_OBSOLETE	/* Faster, but broken with multiple lwp's	*/
-#    include  "th/PCR_Th.h"
-#    include  "th/PCR_ThCrSec.h"
-     extern struct PCR_Th_MLRep GC_allocate_ml;
-#    define DCL_LOCK_STATE  PCR_sigset_t GC_old_sig_mask
-#    define LOCK() PCR_Th_ML_Acquire(&GC_allocate_ml) 
-#    define UNLOCK() PCR_Th_ML_Release(&GC_allocate_ml)
-#    define UNLOCK() PCR_Th_ML_Release(&GC_allocate_ml)
-#    define FASTLOCK() PCR_ThCrSec_EnterSys()
-     /* Here we cheat (a lot): */
-#        define FASTLOCK_SUCCEEDED() (*(int *)(&GC_allocate_ml) == 0)
-		/* TRUE if nobody currently holds the lock */
-#    define FASTUNLOCK() PCR_ThCrSec_ExitSys()
-#  endif
 #  ifdef PCR
 #    include <base/PCR_Base.h>
 #    include <th/PCR_Th.h>
@@ -67,18 +43,37 @@
 	 PCR_ERes GC_fastLockRes; PCR_sigset_t GC_old_sig_mask
 #    define LOCK() PCR_Th_ML_Acquire(&GC_allocate_ml)
 #    define UNLOCK() PCR_Th_ML_Release(&GC_allocate_ml)
-#    define FASTLOCK() (GC_fastLockRes = PCR_Th_ML_Try(&GC_allocate_ml))
-#    define FASTLOCK_SUCCEEDED() (GC_fastLockRes == PCR_ERes_okay)
-#    define FASTUNLOCK()  {\
-        if( FASTLOCK_SUCCEEDED() ) PCR_Th_ML_Release(&GC_allocate_ml); }
 #  endif
 
 #  if !defined(AO_have_test_and_set_acquire)
 #    define USE_PTHREAD_LOCKS
 #  endif
 
+#  if defined(GC_WIN32_THREADS) && defined(GC_PTHREADS)
+#    define USE_PTHREAD_LOCKS
+#  endif
 
-#  if defined(GC_PTHREADS) && !defined(GC_WIN32_THREADS)
+#  if defined(GC_WIN32_THREADS) && !defined(USE_PTHREAD_LOCKS)
+#    include <windows.h>
+#    define NO_THREAD (DWORD)(-1)
+     extern DWORD GC_lock_holder;
+     extern CRITICAL_SECTION GC_allocate_ml;
+#    ifdef GC_ASSERTIONS
+#        define UNCOND_LOCK() \
+		{ EnterCriticalSection(&GC_allocate_ml); \
+		  SET_LOCK_HOLDER(); }
+#        define UNCOND_UNLOCK() \
+		{ GC_ASSERT(I_HOLD_LOCK()); UNSET_LOCK_HOLDER(); \
+	          LeaveCriticalSection(&GC_allocate_ml); }
+#    else
+#      define UNCOND_LOCK() EnterCriticalSection(&GC_allocate_ml);
+#      define UNCOND_UNLOCK() LeaveCriticalSection(&GC_allocate_ml);
+#    endif /* !GC_ASSERTIONS */
+#    define SET_LOCK_HOLDER() GC_lock_holder = GetCurrentThreadId()
+#    define UNSET_LOCK_HOLDER() GC_lock_holder = NO_THREAD
+#    define I_HOLD_LOCK() (!GC_need_to_lock \
+			   || GC_lock_holder == GetCurrentThreadId())
+#  elif defined(GC_PTHREADS)
 #    define NO_THREAD (pthread_t)(-1)
 #    include <pthread.h>
 
@@ -144,29 +139,16 @@
 #    endif
 #  endif /* GC_PTHREADS with linux_threads.c implementation */
 
-#  if defined(GC_WIN32_THREADS)
-#    if defined(GC_PTHREADS)
-#      include <pthread.h>
-       extern pthread_mutex_t GC_allocate_ml;
-#      define UNCOND_LOCK()   pthread_mutex_lock(&GC_allocate_ml)
-#      define UNCOND_UNLOCK() pthread_mutex_unlock(&GC_allocate_ml)
-#    else
-#      include <windows.h>
-       GC_API CRITICAL_SECTION GC_allocate_ml;
-#      define UNCOND_LOCK() EnterCriticalSection(&GC_allocate_ml);
-#      define UNCOND_UNLOCK() LeaveCriticalSection(&GC_allocate_ml);
-#    endif
-#  endif
-#  ifndef SET_LOCK_HOLDER
-#      define SET_LOCK_HOLDER()
-#      define UNSET_LOCK_HOLDER()
-#      define I_HOLD_LOCK() FALSE
-		/* Used on platforms were locks can be reacquired,	*/
-		/* so it doesn't matter if we lie.			*/
-#  endif
+
 # else /* !THREADS */
-#    define LOCK()
-#    define UNLOCK()
+#   define LOCK()
+#   define UNLOCK()
+#   define SET_LOCK_HOLDER()
+#   define UNSET_LOCK_HOLDER()
+#   define I_HOLD_LOCK() TRUE
+       		/* Used only in positive assertions or to test whether	*/
+       		/* we still need to acquire the lock.	TRUE works in	*/
+       		/* either case.						*/
 # endif /* !THREADS */
 
 #if defined(UNCOND_LOCK) && !defined(LOCK) 
@@ -176,14 +158,6 @@
 #    define UNLOCK() if (GC_need_to_lock) { UNCOND_UNLOCK(); }
 #endif
 
-# ifndef SET_LOCK_HOLDER
-#   define SET_LOCK_HOLDER()
-#   define UNSET_LOCK_HOLDER()
-#   define I_HOLD_LOCK() FALSE
-		/* Used on platforms were locks can be reacquired,	*/
-		/* so it doesn't matter if we lie.			*/
-# endif
-
 # ifndef ENTER_GC
 #   define ENTER_GC()
 #   define EXIT_GC()
@@ -193,10 +167,4 @@
 #   define DCL_LOCK_STATE
 # endif
 
-# ifndef FASTLOCK
-#   define FASTLOCK() LOCK()
-#   define FASTLOCK_SUCCEEDED() TRUE
-#   define FASTUNLOCK() UNLOCK()
-# endif
-
 #endif /* GC_LOCKS_H */
diff --git a/include/private/gc_pmark.h b/include/private/gc_pmark.h
index 8a79b9d..1e96f18 100644
--- a/include/private/gc_pmark.h
+++ b/include/private/gc_pmark.h
@@ -167,23 +167,26 @@ exit_label: ; \
 /* Set mark bit, exit if it was already set.	*/
 
 # ifdef USE_MARK_BITS
-/* FIXME: untested */
-#   if defined(THREADS)
-      /* Introduces a benign race as in the byte case.	*/
-#     define OR_WORD_EXIT_IF_SET(addr, mask, label) \
-	if (!(*(addr) & (mask))) { \
-	  AO_or((AO_t *)(addr), (mask); \
-	} else { \
-	  goto label; \
-	}
-#   else /* !THREADS */
-#     define OR_WORD_EXIT_IF_SET(addr, mask, label) \
-	if (!(*(addr) & (mask))) { \
-	  *(addr) |= (mask); \
-	} else { \
-	  goto label; \
-	}
-#   endif
+#   ifdef PARALLEL_MARK
+      /* The following may fail to exit even if the bit was already set.    */
+      /* For our uses, that's benign:                                       */
+#     define OR_WORD_EXIT_IF_SET(addr, bits, exit_label) \
+        { \
+          if (!(*(addr) & (bits))) { \
+            AO_or((AO_t *)(addr), (AO_t)(bits)); \
+          } else { \
+            goto exit_label; \
+          } \
+        }
+#   else
+#     define OR_WORD_EXIT_IF_SET(addr, bits, exit_label) \
+        { \
+           word old = *(addr); \
+           word my_bits = (bits); \
+           if (old & my_bits) goto exit_label; \
+           *(addr) = (old | my_bits); \
+         }
+#   endif /* !PARALLEL_MARK */
 #   define SET_MARK_BIT_EXIT_IF_SET(hhdr,bit_no,exit_label) \
     { \
         word * mark_word_addr = hhdr -> hb_marks + divWORDSZ(bit_no); \
@@ -194,18 +197,19 @@ exit_label: ; \
 # endif
 
 
-#if defined(I386) && defined(__GNUC__)
+#ifdef USE_MARK_BYTES
+# if defined(I386) && defined(__GNUC__)
 #  define LONG_MULT(hprod, lprod, x, y) { \
 	asm("mull %2" : "=a"(lprod), "=d"(hprod) : "g"(y), "0"(x)); \
    }
-#else /* No in-line X86 assembly code */
+# else /* No in-line X86 assembly code */
 #  define LONG_MULT(hprod, lprod, x, y) { \
 	unsigned long long prod = (unsigned long long)x \
 				  * (unsigned long long)y; \
 	hprod = prod >> 32;  \
 	lprod = (unsigned32)prod;  \
    }
-#endif
+# endif
 
   /* There is a race here, and we may set				*/
   /* the bit twice in the concurrent case.  This can result in the	*/
@@ -218,6 +222,7 @@ exit_label: ; \
 	if (mark_byte) goto exit_label; \
 	*mark_byte_addr = 1;  \
     } 
+#endif /* USE_MARK_BYTES */
 
 #ifdef PARALLEL_MARK
 # define INCR_MARKS(hhdr) \
diff --git a/include/private/gc_priv.h b/include/private/gc_priv.h
index d65a393..b55a673 100644
--- a/include/private/gc_priv.h
+++ b/include/private/gc_priv.h
@@ -63,9 +63,6 @@ typedef char * ptr_t;	/* A generic pointer to which we can add	*/
 
 # ifndef GCCONFIG_H
 #   include "gcconfig.h"
-#   ifndef USE_MARK_BYTES
-#     define USE_MARK_BYTES
-#   endif
 # endif
 
 # ifndef HEADERS_H
@@ -74,8 +71,8 @@ typedef char * ptr_t;	/* A generic pointer to which we can add	*/
 
 #if __GNUC__ >= 3
 # define EXPECT(expr, outcome) __builtin_expect(expr,outcome)
-# define INLINE inline
   /* Equivalent to (expr), but predict that usually (expr)==outcome. */
+# define INLINE inline
 #else
 # define EXPECT(expr, outcome) (expr)
 # define INLINE
@@ -192,17 +189,6 @@ typedef char * ptr_t;	/* A generic pointer to which we can add	*/
 /*                               */
 /*********************************/
 
-#ifdef SAVE_CALL_CHAIN
-
-/* Fill in the pc and argument information for up to NFRAMES of my	*/
-/* callers.  Ignore my frame and my callers frame.			*/
-struct callinfo;
-void GC_save_callers(struct callinfo info[NFRAMES]);
-  
-void GC_print_callers(struct callinfo info[NFRAMES]);
-
-#endif
-
 #ifdef NEED_CALLINFO
     struct callinfo {
 	word ci_pc;  	/* Caller, not callee, pc	*/
@@ -216,6 +202,16 @@ void GC_print_callers(struct callinfo info[NFRAMES]);
     };
 #endif
 
+#ifdef SAVE_CALL_CHAIN
+
+/* Fill in the pc and argument information for up to NFRAMES of my	*/
+/* callers.  Ignore my frame and my callers frame.			*/
+void GC_save_callers(struct callinfo info[NFRAMES]);
+  
+void GC_print_callers(struct callinfo info[NFRAMES]);
+
+#endif
+
 
 /*********************************/
 /*                               */
@@ -331,10 +327,10 @@ void GC_print_callers(struct callinfo info[NFRAMES]);
 #   define ABORT(s) PCR_Base_Panic(s)
 # else
 #   ifdef SMALL_CONFIG
-#	define ABORT(msg) abort();
+#	define ABORT(msg) abort()
 #   else
 	GC_API void GC_abort(const char * msg);
-#       define ABORT(msg) GC_abort(msg);
+#       define ABORT(msg) GC_abort(msg)
 #   endif
 # endif
 
@@ -660,10 +656,20 @@ struct hblkhdr {
     counter_t hb_n_marks;	/* Number of set mark bits, excluding 	*/
     				/* the one always set at the end.	*/
     				/* Currently it is concurrently 	*/
-    				/* updated and hence only a lower bound.*/
-    				/* But a zero value does gurantee that	*/
+    				/* updated and hence only approximate.  */
+    				/* But a zero value does guarantee that	*/
     				/* the block contains no marked		*/
     				/* objects.				*/
+    				/* Ensuring this property means that we	*/
+    				/* never decrement it to zero during a	*/
+    				/* collection, and hence the count may 	*/
+    				/* be one too high.  Due to concurrent	*/
+    				/* updates, an arbitrary number of	*/
+    				/* increments, but not all of them (!)	*/
+    				/* may be lost, hence it may in theory	*/
+    				/* be much too low.			*/
+    				/* Without parallel marking, the count	*/
+    				/* is accurate.				*/
 #   ifdef USE_MARK_BYTES
       union {
         char _hb_marks[MARK_BITS_SZ];
@@ -676,12 +682,13 @@ struct hblkhdr {
 	word dummy;	/* Force word alignment of mark bytes. */
       } _mark_byte_union;
 #     define hb_marks _mark_byte_union._hb_marks
-#     define ANY_INDEX 23	/* Random mark bit index for assertions */
 #   else
       word hb_marks[MARK_BITS_SZ];
 #   endif /* !USE_MARK_BYTES */
 };
 
+# define ANY_INDEX 23	/* "Random" mark bit index for assertions */
+
 /*  heap block body */
 
 # define HBLK_WORDS (HBLKSIZE/sizeof(word))
@@ -1156,28 +1163,9 @@ extern long GC_large_alloc_warn_suppressed;
 /* accessed.								*/
 #ifdef PARALLEL_MARK
 # define OR_WORD(addr, bits) \
-	{ word old; \
-	  do { \
-	    old = *((volatile word *)addr); \
-	  } while (!GC_compare_and_exchange((addr), old, old | (bits))); \
-	}
-# define OR_WORD_EXIT_IF_SET(addr, bits, exit_label) \
-	{ word old; \
-	  word my_bits = (bits); \
-	  do { \
-	    old = *((volatile word *)addr); \
-	    if (old & my_bits) goto exit_label; \
-	  } while (!GC_compare_and_exchange((addr), old, old | my_bits)); \
-	}
+	{ AO_or((volatile AO_t *)(addr), (AO_t)(bits)); }
 #else
 # define OR_WORD(addr, bits) *(addr) |= (bits)
-# define OR_WORD_EXIT_IF_SET(addr, bits, exit_label) \
-	{ \
-	  word old = *(addr); \
-	  word my_bits = (bits); \
-	  if (old & my_bits) goto exit_label; \
-	  *(addr) = (old | my_bits); \
-	}
 #endif
 
 /* Mark bit operations */
@@ -1338,7 +1326,7 @@ void GC_with_callee_saves_pushed(void (*fn)(ptr_t, void *),
 # if defined(SPARC) || defined(IA64)
   /* Cause all stacked registers to be saved in memory.  Return a	*/
   /* pointer to the top of the corresponding memory stack.		*/
-  word GC_save_regs_in_stack(void);
+  ptr_t GC_save_regs_in_stack(void);
 # endif
 			/* Push register contents onto mark stack.	*/
   			/* If NURSERY is defined, the default push	*/
@@ -1504,7 +1492,8 @@ ptr_t GC_build_fl(struct hblk *h, size_t words, GC_bool clear, ptr_t list);
 				/* called by GC_new_hblk, but also	*/
 				/* called explicitly without GC lock.	*/
 
-struct hblk * GC_allochblk (size_t size_in_bytes, int kind, unsigned flags);
+struct hblk * GC_allochblk (size_t size_in_bytes, int kind,
+		            unsigned char flags);
 				/* Allocate a heap block, inform	*/
 				/* the marker that block is valid	*/
 				/* for objects of indicated size.	*/
@@ -1766,9 +1755,6 @@ GC_bool GC_page_was_dirty(struct hblk *h);
   			/* Read retrieved dirty bits.	*/
 GC_bool GC_page_was_ever_dirty(struct hblk *h);
   			/* Could the page contain valid heap pointers?	*/
-void GC_is_fresh(struct hblk *h, word n);
-  			/* Assert the region currently contains no	*/
-  			/* valid pointers.				*/
 void GC_remove_protection(struct hblk *h, word nblocks,
 			  GC_bool pointerfree);
   			/* h is about to be writteni or allocated.  Ensure  */
@@ -1896,7 +1882,7 @@ void GC_err_puts(const char *s);
 		/* some other reason.					*/
 # endif /* PARALLEL_MARK */
 
-# if defined(GC_PTHREADS) && !defined(GC_SOLARIS_THREADS)
+# if defined(GC_PTHREADS)
   /* We define the thread suspension signal here, so that we can refer	*/
   /* to it in the dirty bit implementation, if necessary.  Ideally we	*/
   /* would allocate a (real-time ?) signal using the standard mechanism.*/
diff --git a/include/private/gcconfig.h b/include/private/gcconfig.h
index 9fe0419..9b80cbe 100644
--- a/include/private/gcconfig.h
+++ b/include/private/gcconfig.h
@@ -854,6 +854,7 @@
 #     define ALIGNMENT 4	/* Required by hardware	*/
 #     define CPP_WORDSZ 32
 #   endif
+#   define USE_ASM_PUSH_REGS
 #   ifdef SUNOS5
 #	define OS_TYPE "SUNOS5"
 	extern int _etext[];
@@ -1146,7 +1147,11 @@
 #       if !defined(__WATCOMC__) && !defined(GC_WIN32_THREADS)
 #	  define MPROTECT_VDB
 #	endif
-#       define GWW_VDB
+#	if _MSC_VER >= 1300  /* .NET, i.e. > VisualStudio 6	*/
+#         define GWW_VDB
+#	else
+#	  define MPROTECT_VDB
+#	endif
 #       define DATAEND  /* not needed */
 #   endif
 #   ifdef MSWINCE
@@ -1490,13 +1495,6 @@
 
 # ifdef IA64
 #   define MACH_TYPE "IA64"
-	/* We need to get preserved registers in addition to register   */
-	/* windows.   That's easiest to do with setjmp.			*/
-#   ifdef PARALLEL_MARK
-#	define USE_MARK_BYTES
-	    /* Compare-and-exchange is too expensive to use for 	*/
-	    /* setting mark bits.					*/
-#   endif
 #   ifdef HPUX
 #	ifdef _ILP32
 #	  define CPP_WORDSZ 32
@@ -1992,6 +1990,14 @@
 #   define THREADS
 # endif
 
+# if !defined(USE_MARK_BITS) && !defined(USE_MARK_BYTES)
+#   if defined(THREADS) && defined(PARALLEL_MARK)
+#     define USE_MARK_BYTES
+#   else
+#     define USE_MARK_BITS
+#   endif
+# endif
+
 # if defined(MSWINCE)
 #   define NO_GETENV
 # endif
diff --git a/include/private/pthread_support.h b/include/private/pthread_support.h
index b2ef68e..77f1ad1 100644
--- a/include/private/pthread_support.h
+++ b/include/private/pthread_support.h
@@ -3,8 +3,7 @@
 
 # include "private/gc_priv.h"
 
-# if defined(GC_PTHREADS) && !defined(GC_SOLARIS_THREADS) \
-     && !defined(GC_WIN32_THREADS)
+# if defined(GC_PTHREADS) && !defined(GC_WIN32_THREADS)
      
 #if defined(GC_DARWIN_THREADS)
 # include "private/darwin_stop_world.h"
@@ -67,7 +66,7 @@ typedef struct GC_Thread_Rep {
 #   endif
 } * GC_thread;
 
-# define THREAD_TABLE_SZ 128	/* Must be power of 2	*/
+# define THREAD_TABLE_SZ 256	/* Must be power of 2	*/
 extern volatile GC_thread GC_threads[THREAD_TABLE_SZ];
 
 extern GC_bool GC_thr_initialized;
diff --git a/include/private/thread_local_alloc.h b/include/private/thread_local_alloc.h
index 32cbb08..3416931 100644
--- a/include/private/thread_local_alloc.h
+++ b/include/private/thread_local_alloc.h
@@ -19,6 +19,45 @@
 /* implementation also exports GC_malloc and friends, which	*/
 /* are declared in gc.h.					*/
 
+#include "private/gc_priv.h"
+
+#if defined(THREAD_LOCAL_ALLOC)
+
+#include "gc_inline.h"
+
+
+# if defined USE_HPUX_TLS
+#   error USE_HPUX_TLS macro was replaced by USE_COMPILER_TLS
+# endif
+
+# if !defined(USE_PTHREAD_SPECIFIC) && !defined(USE_WIN32_SPECIFIC) && \
+     !defined(USE_WIN32_COMPILER_TLS) && !defined(USE_COMPILER_TLS) && \
+     !defined(USE_CUSTOM_SPECIFIC)
+#   if defined(MSWIN32) || defined(MSWINCE) || defined(CYGWIN32)
+#     if defined(__GNUC__)  /* Fixed for versions past 2.95? */
+#       define USE_WIN32_SPECIFIC
+#     else
+#       define USE_WIN32_COMPILER_TLS
+#     endif /* !GNU */
+#   elif defined(LINUX) && defined(__GNUC__)
+#     define USE_COMPILER_TLS
+#   elif (defined(GC_DGUX386_THREADS) || defined(GC_OSF1_THREADS) || \
+         defined(GC_DARWIN_THREADS) || defined(GC_AIX_THREADS))
+#     define USE_PTHREAD_SPECIFIC
+#   elif defined(GC_HPUX_THREADS)
+#     ifdef __GNUC__
+#      define USE_PTHREAD_SPECIFIC
+         /* Empirically, as of gcc 3.3, USE_COMPILER_TLS doesn't work.	*/
+#     else
+#      define USE_COMPILER_TLS
+#     endif
+#   else
+#     define USE_CUSTOM_SPECIFIC  /* Use our own.	*/
+#   endif
+# endif
+
+# include <stdlib.h>
+
 /* One of these should be declared as the tlfs field in the	*/
 /* structure pointed to by a GC_thread.				*/
 typedef struct thread_local_freelists {
@@ -52,22 +91,27 @@ typedef struct thread_local_freelists {
 #   define GC_key_create pthread_key_create
 #   define GC_remove_specific()  /* No need for cleanup on exit. */
     typedef pthread_key_t GC_key_t;
-# elif defined(USE_COMPILER_TLS)
+# elif defined(USE_COMPILER_TLS) || defined(USE_WIN32_COMPILER_TLS)
 #   define GC_getspecific(x) (x)
 #   define GC_setspecific(key, v) ((key) = (v), 0)
 #   define GC_key_create(key, d) 0
 #   define GC_remove_specific()  /* No need for cleanup on exit. */
     typedef void * GC_key_t;
 # elif defined(USE_WIN32_SPECIFIC)
+#   include <windows.h>
 #   define GC_getspecific TlsGetValue
-#   define GC_setspecific TlsSetValue
+#   define GC_setspecific(key, v) !TlsSetValue(key, v)
+    	/* We assume 0 == success, msft does the opposite.	*/
 #   define GC_key_create(key, d)  \
 	((d) != 0? (ABORT("Destructor unsupported by TlsAlloc"),0) \
 	 	 : (*(key) = TlsAlloc(), 0))
 #   define GC_remove_specific()  /* No need for cleanup on thread exit. */
     	/* Need TlsFree on process exit/detach ? */
-# else
+    typedef DWORD GC_key_t;
+# elif defined(USE_CUSTOM_SPECIFIC)
 #   include "private/specific.h"
+# else
+#   error implement me
 # endif
 
 
@@ -86,14 +130,18 @@ void GC_destroy_thread_local(GC_tlfs p);
 /* we take care of an individual thread freelist structure.	*/
 void GC_mark_thread_local_fls_for(GC_tlfs p);
 
-#ifdef USE_COMPILER_TLS
+extern
+#if defined(USE_COMPILER_TLS)
   __thread
+#elif defined(USE_WIN32_COMPILER_TLS)
+  __declspec(thread)
 #endif
 GC_key_t GC_thread_key;
+
 /* This is set up by the thread_local_alloc implementation.  But the	*/
 /* thread support layer calls GC_remove_specific(GC_thread_key)		*/
 /* before a thread exits.						*/
 /* And the thread support layer makes sure that GC_thread_key is traced,*/
 /* if necessary.							*/
 
-
+#endif /* THREAD_LOCAL_ALLOC */
diff --git a/mach_dep.c b/mach_dep.c
index 50b5665..ca1ace1 100644
--- a/mach_dep.c
+++ b/mach_dep.c
@@ -65,7 +65,7 @@ asm static void PushMacRegisters()
 # if defined(SPARC) || defined(IA64)
     /* Value returned from register flushing routine; either sp (SPARC) */
     /* or ar.bsp (IA64)							*/
-    word GC_save_regs_ret_val;
+    ptr_t GC_save_regs_ret_val;
 # endif
 
 /* Routine to mark from registers that are preserved by the C compiler. */
@@ -265,88 +265,12 @@ ptr_t cold_gc_frame;
     GC_with_callee_saves_pushed(GC_push_current_stack, cold_gc_frame);
 }
 
-/* On register window machines, we need a way to force registers into 	*/
-/* the stack.	Return sp.						*/
-# ifdef SPARC
-    asm("	.seg 	\"text\"");
-#   if defined(SVR4) || defined(NETBSD) || defined(FREEBSD)
-      asm("	.globl	GC_save_regs_in_stack");
-      asm("GC_save_regs_in_stack:");
-      asm("	.type GC_save_regs_in_stack,#function");
-#   else
-      asm("	.globl	_GC_save_regs_in_stack");
-      asm("_GC_save_regs_in_stack:");
-#   endif
-#   if defined(__arch64__) || defined(__sparcv9)
-      asm("	save	%sp,-128,%sp");
-      asm("	flushw");
-      asm("	ret");
-      asm("	restore %sp,2047+128,%o0");
-#   else
-      asm("	ta	0x3   ! ST_FLUSH_WINDOWS");
-      asm("	retl");
-      asm("	mov	%sp,%o0");
-#   endif
-#   ifdef SVR4
-      asm("	.GC_save_regs_in_stack_end:");
-      asm("	.size GC_save_regs_in_stack,.GC_save_regs_in_stack_end-GC_save_regs_in_stack");
-#   endif
-#   ifdef LINT
-	word GC_save_regs_in_stack() { return(0 /* sp really */);}
-#   endif
-# endif
-
-/* GC_clear_stack_inner(arg, limit) clears stack area up to limit and	*/
-/* returns arg.  Stack clearing is crucial on SPARC, so we supply	*/
-/* an assembly version that's more careful.  Assumes limit is hotter	*/
-/* than sp, and limit is 8 byte aligned.				*/
 #if defined(ASM_CLEAR_CODE)
-#ifndef SPARC
-	--> fix it
-#endif
-  asm(".globl GC_clear_stack_inner");
-  asm("GC_clear_stack_inner:");
-  asm(".type GC_save_regs_in_stack,#function");
-#if defined(__arch64__) || defined(__sparcv9)
-  asm("mov %sp,%o2");		/* Save sp			*/
-  asm("add %sp,2047-8,%o3");	/* p = sp+bias-8		*/
-  asm("add %o1,-2047-192,%sp");	/* Move sp out of the way,	*/
-  				/* so that traps still work.	*/
-  				/* Includes some extra words	*/
-  				/* so we can be sloppy below.	*/
-  asm("loop:");
-  asm("stx %g0,[%o3]");		/* *(long *)p = 0		*/
-  asm("cmp %o3,%o1");
-  asm("bgu,pt %xcc, loop");	/* if (p > limit) goto loop	*/
-    asm("add %o3,-8,%o3");	/* p -= 8 (delay slot) */
-  asm("retl");
-    asm("mov %o2,%sp");		/* Restore sp., delay slot	*/
-#else
-  asm("mov %sp,%o2");		/* Save sp	*/
-  asm("add %sp,-8,%o3");	/* p = sp-8	*/
-  asm("clr %g1");		/* [g0,g1] = 0	*/
-  asm("add %o1,-0x60,%sp");	/* Move sp out of the way,	*/
-  				/* so that traps still work.	*/
-  				/* Includes some extra words	*/
-  				/* so we can be sloppy below.	*/
-  asm("loop:");
-  asm("std %g0,[%o3]");		/* *(long long *)p = 0	*/
-  asm("cmp %o3,%o1");
-  asm("bgu loop	");		/* if (p > limit) goto loop	*/
-    asm("add %o3,-8,%o3");	/* p -= 8 (delay slot) */
-  asm("retl");
-    asm("mov %o2,%sp");		/* Restore sp., delay slot	*/
-#endif /* old SPARC */
-  /* First argument = %o0 = return value */
-#   ifdef SVR4
-      asm("	.GC_clear_stack_inner_end:");
-      asm("	.size GC_clear_stack_inner,.GC_clear_stack_inner_end-GC_clear_stack_inner");
-#   endif
-  
 # ifdef LINT
     /*ARGSUSED*/
     ptr_t GC_clear_stack_inner(arg, limit)
     ptr_t arg; word limit;
     { return(arg); }
+    /* The real version is in a .S file */
 # endif
 #endif /* ASM_CLEAR_CODE */ 
diff --git a/malloc.c b/malloc.c
index a36956a..1513735 100644
--- a/malloc.c
+++ b/malloc.c
@@ -215,14 +215,14 @@ void * GC_generic_malloc(size_t lb, int k)
     if(SMALL_OBJ(lb)) {
 	lg = GC_size_map[lb];
 	opp = &(GC_aobjfreelist[lg]);
-	FASTLOCK();
-        if( EXPECT(!FASTLOCK_SUCCEEDED() || (op = *opp) == 0, 0) ) {
-            FASTUNLOCK();
+	LOCK();
+        if( EXPECT((op = *opp) == 0, 0) ) {
+            UNLOCK();
             return(GENERAL_MALLOC((word)lb, PTRFREE));
         }
         *opp = obj_link(op);
         GC_bytes_allocd += GRANULES_TO_BYTES(lg);
-        FASTUNLOCK();
+        UNLOCK();
         return((void *) op);
    } else {
        return(GENERAL_MALLOC((word)lb, PTRFREE));
@@ -244,9 +244,9 @@ void * GC_generic_malloc(size_t lb, int k)
     if(SMALL_OBJ(lb)) {
 	lg = GC_size_map[lb];
 	opp = (void **)&(GC_objfreelist[lg]);
-	FASTLOCK();
-        if( EXPECT(!FASTLOCK_SUCCEEDED() || (op = *opp) == 0, 0) ) {
-            FASTUNLOCK();
+	LOCK();
+        if( EXPECT((op = *opp) == 0, 0) ) {
+            UNLOCK();
             return(GENERAL_MALLOC((word)lb, NORMAL));
         }
         /* See above comment on signals.	*/
@@ -258,7 +258,7 @@ void * GC_generic_malloc(size_t lb, int k)
         *opp = obj_link(op);
         obj_link(op) = 0;
         GC_bytes_allocd += GRANULES_TO_BYTES(lg);
-        FASTUNLOCK();
+        UNLOCK();
         return op;
    } else {
        return(GENERAL_MALLOC(lb, NORMAL));
diff --git a/mallocx.c b/mallocx.c
index 761514d..91e41d5 100644
--- a/mallocx.c
+++ b/mallocx.c
@@ -451,8 +451,8 @@ void * GC_malloc_uncollectable(size_t lb)
 	    	  /* collected anyway.					*/
 	lg = GC_size_map[lb];
 	opp = &(GC_uobjfreelist[lg]);
-	FASTLOCK();
-        if( FASTLOCK_SUCCEEDED() && (op = *opp) != 0 ) {
+	LOCK();
+        if( (op = *opp) != 0 ) {
             /* See above comment on signals.	*/
             *opp = obj_link(op);
             obj_link(op) = 0;
@@ -461,28 +461,29 @@ void * GC_malloc_uncollectable(size_t lb)
 	    /* cleared only temporarily during a collection, as a 	*/
 	    /* result of the normal free list mark bit clearing.	*/
             GC_non_gc_bytes += GRANULES_TO_BYTES(lg);
-            FASTUNLOCK();
-            return((void *) op);
-        }
-        FASTUNLOCK();
-        op = (ptr_t)GC_generic_malloc((word)lb, UNCOLLECTABLE);
+            UNLOCK();
+        } else {
+            UNLOCK();
+            op = (ptr_t)GC_generic_malloc((word)lb, UNCOLLECTABLE);
+	    /* For small objects, the free lists are completely marked. */
+	}
+	GC_ASSERT(0 == op || GC_is_marked(op));
+        return((void *) op);
     } else {
-	op = (ptr_t)GC_generic_malloc((word)lb, UNCOLLECTABLE);
-    }
-    if (0 == op) return(0);
-    /* We don't need the lock here, since we have an undisguised 	*/
-    /* pointer.  We do need to hold the lock while we adjust		*/
-    /* mark bits.							*/
-    {
-	register struct hblk * h;
-	size_t lb;
+	hdr * hhdr;
 	
-	h = HBLKPTR(op);
-	lb = HDR(h) -> hb_sz;
+	op = (ptr_t)GC_generic_malloc((word)lb, UNCOLLECTABLE);
+        if (0 == op) return(0);
 	
+	GC_ASSERT(((word)op & (HBLKSIZE - 1)) == 0); /* large block */
+	hhdr = HDR((struct hblk *)op);
+	/* We don't need the lock here, since we have an undisguised 	*/
+	/* pointer.  We do need to hold the lock while we adjust	*/
+	/* mark bits.							*/
 	LOCK();
-	GC_set_mark_bit(op);
-	GC_non_gc_bytes += lb;
+	set_mark_bit_from_hdr(hhdr, 0);	/* Only object.	*/
+	GC_ASSERT(hhdr -> hb_n_marks == 0);
+	hhdr -> hb_n_marks = 1;
 	UNLOCK();
 	return((void *) op);
     }
@@ -538,36 +539,34 @@ void * GC_malloc_atomic_uncollectable(size_t lb)
 	    	  /* collected anyway.					*/
-	lg = GC_size_map[lg];
+	lg = GC_size_map[lb];
 	opp = &(GC_auobjfreelist[lg]);
-	FASTLOCK();
-        if( FASTLOCK_SUCCEEDED() && (op = *opp) != 0 ) {
+	LOCK();
+        if( (op = *opp) != 0 ) {
             /* See above comment on signals.	*/
             *opp = obj_link(op);
             obj_link(op) = 0;
             GC_bytes_allocd += GRANULES_TO_BYTES(lg);
 	    /* Mark bit was already set while object was on free list. */
             GC_non_gc_bytes += GRANULES_TO_BYTES(lg);
-            FASTUNLOCK();
-            return((void *) op);
-        }
-        FASTUNLOCK();
-        op = (ptr_t)GC_generic_malloc(lb, AUNCOLLECTABLE);
+            UNLOCK();
+        } else {
+            UNLOCK();
+            op = (ptr_t)GC_generic_malloc(lb, AUNCOLLECTABLE);
+	}
+	GC_ASSERT(0 == op || GC_is_marked(op));
+        return((void *) op);
     } else {
-	op = (ptr_t)GC_generic_malloc(lb, AUNCOLLECTABLE);
-    }
-    if (0 == op) return(0);
-    /* We don't need the lock here, since we have an undisguised 	*/
-    /* pointer.  We do need to hold the lock while we adjust		*/
-    /* mark bits.							*/
-    {
-	struct hblk * h;
-	size_t lb;
+	hdr * hhdr;
 	
-	h = HBLKPTR(op);
-	lb = HDR(h) -> hb_sz;
+	op = (ptr_t)GC_generic_malloc(lb, AUNCOLLECTABLE);
+        if (0 == op) return(0);
+
+	GC_ASSERT(((word)op & (HBLKSIZE - 1)) == 0);
+	hhdr = HDR((struct hblk *)op);
 	
 	LOCK();
-	GC_set_mark_bit(op);
-	GC_non_gc_bytes += lb;
+	set_mark_bit_from_hdr(hhdr, 0);	/* Only object.	*/
+	GC_ASSERT(hhdr -> hb_n_marks == 0);
+	hhdr -> hb_n_marks = 1;
 	UNLOCK();
 	return((void *) op);
     }
diff --git a/mark.c b/mark.c
index 641f0d6..d46c1ac 100644
--- a/mark.c
+++ b/mark.c
@@ -156,7 +156,7 @@ void GC_clear_hdr_marks(hdr *hhdr)
 /* Set all mark bits in the header.  Used for uncollectable blocks. */
 void GC_set_hdr_marks(hdr *hhdr)
 {
-    int i;
+    unsigned i;
     size_t sz = hhdr -> hb_sz;
     int n_marks = FINAL_MARK_BIT(sz);
 
@@ -214,7 +214,7 @@ void GC_clear_mark_bit(ptr_t p)
       int n_marks;
       clear_mark_bit_from_hdr(hhdr, bit_no);
       n_marks = hhdr -> hb_n_marks - 1;
-#     ifdef THREADS
+#     ifdef PARALLEL_MARK
         if (n_marks != 0)
           hhdr -> hb_n_marks = n_marks; 
         /* Don't decrement to zero.  The counts are approximate due to	*/
@@ -1473,7 +1473,7 @@ void GC_push_all_eager(ptr_t bottom, ptr_t top)
     word * b = (word *)(((word) bottom + ALIGNMENT-1) & ~(ALIGNMENT-1));
     word * t = (word *)(((word) top) & ~(ALIGNMENT-1));
     register word *p;
-    register word q;
+    register ptr_t q;
     register word *lim;
     register ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
     register ptr_t least_ha = GC_least_plausible_heap_addr;
@@ -1485,7 +1485,7 @@ void GC_push_all_eager(ptr_t bottom, ptr_t top)
     /* to be valid.						*/
       lim = t - 1 /* longword */;
       for (p = b; p <= lim; p = (word *)(((ptr_t)p) + ALIGNMENT)) {
-	q = *p;
+	q = (ptr_t)(*p);
 	GC_PUSH_ONE_STACK((ptr_t)q, p);
       }
 #   undef GC_greatest_plausible_heap_addr
@@ -1508,7 +1508,6 @@ void GC_push_all_stack_partially_eager(ptr_t bottom, ptr_t top,
 				       ptr_t cold_gc_frame)
 {
   if (!NEED_FIXUP_POINTER && GC_all_interior_pointers) {
-#   define EAGER_BYTES 1024
     /* Push the hot end of the stack eagerly, so that register values   */
     /* saved inside GC frames are marked before they disappear.		*/
     /* The rest of the marking can be deferred until later.		*/
@@ -1546,21 +1545,52 @@ void GC_push_all_stack(ptr_t bottom, ptr_t top)
 # endif
 }
 
-#if !defined(SMALL_CONFIG) && !defined(USE_MARK_BYTES)
+#if !defined(SMALL_CONFIG) && !defined(USE_MARK_BYTES) && \
+    defined(MARK_BIT_PER_GRANULE)
+# if GC_GRANULE_WORDS == 1
+#   define USE_PUSH_MARKED_ACCELERATORS
+#   define PUSH_GRANULE(q) \
+		{ ptr_t qcontents = (ptr_t)((q)[0]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)); }
+# elif GC_GRANULE_WORDS == 2
+#   define USE_PUSH_MARKED_ACCELERATORS
+#   define PUSH_GRANULE(q) \
+		{ ptr_t qcontents = (ptr_t)((q)[0]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)); \
+		  qcontents = (ptr_t)((q)[1]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)+1); }
+# elif GC_GRANULE_WORDS == 4
+#   define USE_PUSH_MARKED_ACCELERATORS
+#   define PUSH_GRANULE(q) \
+		{ ptr_t qcontents = (ptr_t)((q)[0]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)); \
+		  qcontents = (ptr_t)((q)[1]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)+1); \
+		  qcontents = (ptr_t)((q)[2]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)+2); \
+		  qcontents = (ptr_t)((q)[3]); \
+	          GC_PUSH_ONE_HEAP(qcontents, (q)+3); }
+# endif
+#endif
+
+#ifdef USE_PUSH_MARKED_ACCELERATORS
 /* Push all objects reachable from marked objects in the given block */
-/* of size 1 objects.						     */
+/* containing objects of size 1 granule.			     */
 void GC_push_marked1(struct hblk *h, hdr *hhdr)
 {
     word * mark_word_addr = &(hhdr->hb_marks[0]);
-    register word *p;
+    word *p;
     word *plim;
-    register int i;
-    register word q;
-    register word mark_word;
-    register ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
-    register ptr_t least_ha = GC_least_plausible_heap_addr;
-    register mse * mark_stack_top = GC_mark_stack_top;
-    register mse * mark_stack_limit = GC_mark_stack_limit;
+    word *q;
+    word mark_word;
+
+    /* Allow registers to be used for some frequently accessed	*/
+    /* global variables.  Otherwise aliasing issues are likely	*/
+    /* to prevent that.						*/
+    ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
+    ptr_t least_ha = GC_least_plausible_heap_addr;
+    mse * mark_stack_top = GC_mark_stack_top;
+    mse * mark_stack_limit = GC_mark_stack_limit;
 #   define GC_mark_stack_top mark_stack_top
 #   define GC_mark_stack_limit mark_stack_limit
 #   define GC_greatest_plausible_heap_addr greatest_ha
@@ -1572,21 +1602,22 @@ void GC_push_marked1(struct hblk *h, hdr *hhdr)
     /* go through all words in block */
 	while( p < plim )  {
 	    mark_word = *mark_word_addr++;
-	    i = 0;
+	    q = p;
 	    while(mark_word != 0) {
 	      if (mark_word & 1) {
-	          q = p[i];
-	          GC_PUSH_ONE_HEAP(q, p + i);
+		  PUSH_GRANULE(q);
 	      }
-	      i++;
+	      q += GC_GRANULE_WORDS;
 	      mark_word >>= 1;
 	    }
-	    p += WORDSZ;
+	    p += WORDSZ*GC_GRANULE_WORDS;
 	}
+
 #   undef GC_greatest_plausible_heap_addr
 #   undef GC_least_plausible_heap_addr        
 #   undef GC_mark_stack_top
 #   undef GC_mark_stack_limit
+
     GC_mark_stack_top = mark_stack_top;
 }
 
@@ -1594,19 +1625,20 @@ void GC_push_marked1(struct hblk *h, hdr *hhdr)
 #ifndef UNALIGNED
 
 /* Push all objects reachable from marked objects in the given block */
-/* of size 2 objects.						     */
+/* containing objects of size 2 granules.			     */
 void GC_push_marked2(struct hblk *h, hdr *hhdr)
 {
     word * mark_word_addr = &(hhdr->hb_marks[0]);
-    register word *p;
+    word *p;
     word *plim;
-    register int i;
-    register word q;
-    register word mark_word;
-    register ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
-    register ptr_t least_ha = GC_least_plausible_heap_addr;
-    register mse * mark_stack_top = GC_mark_stack_top;
-    register mse * mark_stack_limit = GC_mark_stack_limit;
+    word *q;
+    word mark_word;
+
+    ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
+    ptr_t least_ha = GC_least_plausible_heap_addr;
+    mse * mark_stack_top = GC_mark_stack_top;
+    mse * mark_stack_limit = GC_mark_stack_limit;
+
 #   define GC_mark_stack_top mark_stack_top
 #   define GC_mark_stack_limit mark_stack_limit
 #   define GC_greatest_plausible_heap_addr greatest_ha
@@ -1618,42 +1650,43 @@ void GC_push_marked2(struct hblk *h, hdr *hhdr)
     /* go through all words in block */
 	while( p < plim )  {
 	    mark_word = *mark_word_addr++;
-	    i = 0;
+	    q = p;
 	    while(mark_word != 0) {
 	      if (mark_word & 1) {
-	          q = p[i];
-	          GC_PUSH_ONE_HEAP(q, p + i);
-	          q = p[i+1];
-	          GC_PUSH_ONE_HEAP(q, p + i);
+		  PUSH_GRANULE(q);
+		  PUSH_GRANULE(q + GC_GRANULE_WORDS);
 	      }
-	      i += 2;
+	      q += 2 * GC_GRANULE_WORDS;
 	      mark_word >>= 2;
 	    }
-	    p += WORDSZ;
+	    p += WORDSZ*GC_GRANULE_WORDS;
 	}
+
 #   undef GC_greatest_plausible_heap_addr
 #   undef GC_least_plausible_heap_addr        
 #   undef GC_mark_stack_top
 #   undef GC_mark_stack_limit
+
     GC_mark_stack_top = mark_stack_top;
 }
 
+# if GC_GRANULE_WORDS < 4
 /* Push all objects reachable from marked objects in the given block */
-/* of size 4 objects.						     */
+/* containing objects of size 4 granules.			     */
 /* There is a risk of mark stack overflow here.  But we handle that. */
 /* And only unmarked objects get pushed, so it's not very likely.    */
 void GC_push_marked4(struct hblk *h, hdr *hhdr)
 {
     word * mark_word_addr = &(hhdr->hb_marks[0]);
-    register word *p;
+    word *p;
     word *plim;
-    register int i;
-    register word q;
-    register word mark_word;
-    register ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
-    register ptr_t least_ha = GC_least_plausible_heap_addr;
-    register mse * mark_stack_top = GC_mark_stack_top;
-    register mse * mark_stack_limit = GC_mark_stack_limit;
+    word *q;
+    word mark_word;
+
+    ptr_t greatest_ha = GC_greatest_plausible_heap_addr;
+    ptr_t least_ha = GC_least_plausible_heap_addr;
+    mse * mark_stack_top = GC_mark_stack_top;
+    mse * mark_stack_limit = GC_mark_stack_limit;
 #   define GC_mark_stack_top mark_stack_top
 #   define GC_mark_stack_limit mark_stack_limit
 #   define GC_greatest_plausible_heap_addr greatest_ha
@@ -1665,22 +1698,18 @@ void GC_push_marked4(struct hblk *h, hdr *hhdr)
     /* go through all words in block */
 	while( p < plim )  {
 	    mark_word = *mark_word_addr++;
-	    i = 0;
+	    q = p;
 	    while(mark_word != 0) {
 	      if (mark_word & 1) {
-	          q = p[i];
-	          GC_PUSH_ONE_HEAP(q, p + i);
-	          q = p[i+1];
-	          GC_PUSH_ONE_HEAP(q, p + i + 1);
-	          q = p[i+2];
-	          GC_PUSH_ONE_HEAP(q, p + i + 2);
-	          q = p[i+3];
-	          GC_PUSH_ONE_HEAP(q, p + i + 3);
+		  PUSH_GRANULE(q);
+		  PUSH_GRANULE(q + GC_GRANULE_WORDS);
+		  PUSH_GRANULE(q + 2*GC_GRANULE_WORDS);
+		  PUSH_GRANULE(q + 3*GC_GRANULE_WORDS);
 	      }
-	      i += 4;
+	      q += 4 * GC_GRANULE_WORDS;
 	      mark_word >>= 4;
 	    }
-	    p += WORDSZ;
+	    p += WORDSZ*GC_GRANULE_WORDS;
 	}
 #   undef GC_greatest_plausible_heap_addr
 #   undef GC_least_plausible_heap_addr        
@@ -1689,9 +1718,11 @@ void GC_push_marked4(struct hblk *h, hdr *hhdr)
     GC_mark_stack_top = mark_stack_top;
 }
 
+#endif /* GC_GRANULE_WORDS < 4 */
+
 #endif /* UNALIGNED */
 
-#endif /* SMALL_CONFIG */
+#endif /* USE_PUSH_MARKED_ACCELERATORS */
 
 /* Push all objects reachable from marked objects in the given block */
 void GC_push_marked(struct hblk *h, hdr *hhdr)
@@ -1715,20 +1746,21 @@ void GC_push_marked(struct hblk *h, hdr *hhdr)
         lim = (h + 1)->hb_body - sz;
     }
     
-    switch(BYTES_TO_WORDS(sz)) {
-#   if !defined(SMALL_CONFIG) && !defined(USE_MARK_BYTES)   
+    switch(BYTES_TO_GRANULES(sz)) {
+#   if defined(USE_PUSH_MARKED_ACCELERATORS)
      case 1:
        GC_push_marked1(h, hhdr);
        break;
-#   endif
-#   if !defined(SMALL_CONFIG) && !defined(UNALIGNED) && \
-       !defined(USE_MARK_BYTES)
-     case 2:
-       GC_push_marked2(h, hhdr);
-       break;
-     case 4:
-       GC_push_marked4(h, hhdr);
-       break;
+#    if !defined(UNALIGNED)
+       case 2:
+         GC_push_marked2(h, hhdr);
+         break;
+#     if GC_GRANULE_WORDS < 4
+       case 4:
+         GC_push_marked4(h, hhdr);
+         break;
+#     endif
+#    endif
 #   endif       
      default:
       GC_mark_stack_top_reg = GC_mark_stack_top;
diff --git a/mark_rts.c b/mark_rts.c
index 19ea80a..bd97c6e 100644
--- a/mark_rts.c
+++ b/mark_rts.c
@@ -593,7 +593,7 @@ void GC_push_roots(GC_bool all, ptr_t cold_gc_frame)
      /* If the world is not stopped, this is unsafe.  It is	*/
      /* also unnecessary, since we will do this again with the	*/
      /* world stopped.						*/
-#      if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+#      if defined(THREAD_LOCAL_ALLOC)
          if (GC_world_stopped) GC_mark_thread_local_free_lists();
 #      endif
 
diff --git a/misc.c b/misc.c
index 70e37fb..10bf512 100644
--- a/misc.c
+++ b/misc.c
@@ -43,31 +43,12 @@
   int GC_log;  /* Forward decl, so we can set it.	*/
 #endif
 
-# ifdef THREADS
-#   ifdef PCR
-#     include "il/PCR_IL.h"
-      PCR_Th_ML GC_allocate_ml;
-#   elif defined(GC_WIN32_THREADS) 
-#     if defined(GC_PTHREADS)
-	pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
-#     elif defined(GC_DLL)
-	 __declspec(dllexport) CRITICAL_SECTION GC_allocate_ml;
-#     else
-	 CRITICAL_SECTION GC_allocate_ml;
-#     endif
-#   elif defined(GC_PTHREADS)
-#     if defined(USE_SPIN_LOCK)
-        pthread_t GC_lock_holder = NO_THREAD;
-#     else
-	pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
-	pthread_t GC_lock_holder = NO_THREAD;
-		/* Used only for assertions, and to prevent	 */
-		/* recursive reentry in the system call wrapper. */
-#     endif 
-#   else
-       --> declare allocator lock here
-#   endif
-# endif
+#if defined(THREADS) && defined(PCR)
+# include "il/PCR_IL.h"
+  PCR_Th_ML GC_allocate_ml;
+#endif
+/* For other platforms with threads, the lock and possibly		*/
+/* GC_lock_holder variables are defined in the thread support code.	*/
 
 #if defined(NOSYS) || defined(ECOS)
 #undef STACKBASE
@@ -157,7 +138,7 @@ void * GC_project2(void *arg1, void *arg2)
 /* quantization alogrithm (but we precompute it).			*/ 
 void GC_init_size_map(void)
 {
-    register unsigned i;
+    int i;
 
     /* Map size 0 to something bigger.			*/
     /* This avoids problems at lower levels.		*/
@@ -423,7 +404,7 @@ void GC_init(void)
 #if defined(GC_WIN32_THREADS) && !defined(GC_PTHREADS)
     if (!GC_is_initialized) {
       BOOL (WINAPI *pfn) (LPCRITICAL_SECTION, DWORD) = NULL;
-      HMODULE hK32 = GetModuleHandle("kernel32.dll");
+      HMODULE hK32 = GetModuleHandleA("kernel32.dll");
       if (hK32)
           (FARPROC) pfn = GetProcAddress(hK32,
 			  "InitializeCriticalSectionAndSpinCount");
diff --git a/os_dep.c b/os_dep.c
index d78f8e9..e43062f 100644
--- a/os_dep.c
+++ b/os_dep.c
@@ -80,10 +80,12 @@
 # undef GC_AMIGA_DEF
 #endif
 
-#if defined(MSWIN32) || defined(MSWINCE)
+#if defined(MSWIN32) || defined(MSWINCE) || defined(CYGWIN32)
 # define WIN32_LEAN_AND_MEAN
 # define NOSERVICE
 # include <windows.h>
+  /* It's not clear this is completely kosher under Cygwin.  But it	*/
+  /* allows us to get a working GC_get_stack_base.			*/
 #endif
 
 #ifdef MACOS
@@ -468,7 +470,7 @@ void GC_enable_signals(void)
       && !defined(MACOS) && !defined(DJGPP) && !defined(DOS4GW) \
       && !defined(NOSYS) && !defined(ECOS)
 
-#   if defined(sigmask) && !defined(UTS4) && !defined(HURD)
+#   if 0
 	/* Use the traditional BSD interface */
 #	define SIGSET_T int
 #	define SIG_DEL(set, signal) (set) &= ~(sigmask(signal))
@@ -477,14 +479,15 @@ void GC_enable_signals(void)
     	  /* longjmp implementations.  Most systems appear not to have	*/
     	  /* a signal 32.						*/
 #	define SIGSETMASK(old, new) (old) = sigsetmask(new)
-#   else
-	/* Use POSIX/SYSV interface	*/
-#	define SIGSET_T sigset_t
-#	define SIG_DEL(set, signal) sigdelset(&(set), (signal))
-#	define SIG_FILL(set) sigfillset(&set)
-#	define SIGSETMASK(old, new) sigprocmask(SIG_SETMASK, &(new), &(old))
 #   endif
 
+    /* Use POSIX/SYSV interface	*/
+#   define SIGSET_T sigset_t
+#   define SIG_DEL(set, signal) sigdelset(&(set), (signal))
+#   define SIG_FILL(set) sigfillset(&set)
+#   define SIGSETMASK(old, new) sigprocmask(SIG_SETMASK, &(new), &(old))
+
+
 static GC_bool mask_initialized = FALSE;
 
 static SIGSET_T new_mask;
@@ -578,7 +581,7 @@ word GC_page_size;
  * With threads, GC_mark_roots needs to know how to do this.
  * Called with allocator lock held.
  */
-# if defined(MSWIN32) || defined(MSWINCE)
+# if defined(MSWIN32) || defined(MSWINCE) || defined(CYGWIN32)
 # define is_writable(prot) ((prot) == PAGE_READWRITE \
 			    || (prot) == PAGE_WRITECOPY \
 			    || (prot) == PAGE_EXECUTE_READWRITE \
@@ -970,7 +973,8 @@ ptr_t GC_get_main_stack_base(void)
 #endif /* FREEBSD_STACKBOTTOM */
 
 #if !defined(BEOS) && !defined(AMIGA) && !defined(MSWIN32) \
-    && !defined(MSWINCE) && !defined(OS2) && !defined(NOSYS) && !defined(ECOS)
+    && !defined(MSWINCE) && !defined(OS2) && !defined(NOSYS) && !defined(ECOS) \
+    && !defined(CYGWIN32)
 
 ptr_t GC_get_main_stack_base(void)
 {
@@ -2021,8 +2025,7 @@ void GC_default_push_other_roots(void)
 # endif /* PCR */
 
 
-# if defined(GC_SOLARIS_THREADS) || defined(GC_PTHREADS) || \
-     defined(GC_WIN32_THREADS)
+# if defined(GC_PTHREADS) || defined(GC_WIN32_THREADS)
 
 extern void GC_push_all_stacks(void);
 
@@ -2031,7 +2034,7 @@ void GC_default_push_other_roots(void)
     GC_push_all_stacks();
 }
 
-# endif /* GC_SOLARIS_THREADS || GC_PTHREADS */
+# endif /* GC_WIN32_THREADS || GC_PTHREADS */
 
 void (*GC_push_other_roots)(void) = GC_default_push_other_roots;
 
@@ -2182,8 +2185,6 @@ void GC_or_pages(page_hash_table pht1, page_hash_table pht2)
   }
 
 # ifndef MPROTECT_VDB
-    void GC_is_fresh(struct hblk *h, word n)
-    {}
     void GC_remove_protection(struct hblk *h, word nblocks, GC_bool is_ptrfree)
     {}
 # endif
@@ -2235,11 +2236,6 @@ GC_bool GC_page_was_ever_dirty(struct hblk *h)
     return(TRUE);
 }
 
-/* Reset the n pages starting at h to "was never dirty" status.	*/
-void GC_is_fresh(struct hblk *h, word n)
-{
-}
-
 /* A call that:						*/
 /* I) hints that [h, h+nblocks) is about to be written.	*/
 /* II) guarantees that protection is removed.		*/
@@ -2302,11 +2298,6 @@ void GC_dirty(ptr_t p)
     async_set_pht_entry_from_index(GC_dirty_pages, index);
 }
 
-/* Reset the n pages starting at h to "was never dirty" status.	*/
-void GC_is_fresh(struct hblk *h, word n)
-{
-}
-
 /*ARGSUSED*/
 void GC_remove_protection(struct hblk *h, word nblocks, GC_bool is_ptrfree)
 {
@@ -2980,12 +2971,6 @@ GC_bool GC_page_was_ever_dirty(struct hblk *h)
     return(TRUE);
 }
 
-/* Reset the n pages starting at h to "was never dirty" status.	*/
-/*ARGSUSED*/
-void GC_is_fresh(struct hblk *h, word n)
-{
-}
-
 # endif /* MPROTECT_VDB */
 
 # ifdef PROC_VDB
@@ -3013,23 +2998,6 @@ void GC_is_fresh(struct hblk *h, word n)
 word GC_proc_buf_size = INITIAL_BUF_SZ;
 char *GC_proc_buf;
 
-#ifdef GC_SOLARIS_THREADS
-/* We don't have exact sp values for threads.  So we count on	*/
-/* occasionally declaring stack pages to be fresh.  Thus we 	*/
-/* need a real implementation of GC_is_fresh.  We can't clear	*/
-/* entries in GC_written_pages, since that would declare all	*/
-/* pages with the given hash address to be fresh.		*/
-#   define MAX_FRESH_PAGES 8*1024	/* Must be power of 2 */
-    struct hblk ** GC_fresh_pages;	/* A direct mapped cache.	*/
-    					/* Collisions are dropped.	*/
-
-#   define FRESH_PAGE_SLOT(h) (divHBLKSZ((word)(h)) & (MAX_FRESH_PAGES-1))
-#   define ADD_FRESH_PAGE(h) \
-	GC_fresh_pages[FRESH_PAGE_SLOT(h)] = (h)
-#   define PAGE_IS_FRESH(h) \
-	(GC_fresh_pages[FRESH_PAGE_SLOT(h)] == (h) && (h) != 0)
-#endif
-
 int GC_proc_fd;
 
 void GC_dirty_init(void)
@@ -3060,15 +3028,6 @@ void GC_dirty_init(void)
     	ABORT("/proc ioctl failed");
     }
     GC_proc_buf = GC_scratch_alloc(GC_proc_buf_size);
-#   ifdef GC_SOLARIS_THREADS
-	GC_fresh_pages = (struct hblk **)
-	  GC_scratch_alloc(MAX_FRESH_PAGES * sizeof (struct hblk *));
-	if (GC_fresh_pages == 0) {
-	    GC_err_printf("No space for fresh pages\n");
-	    EXIT();
-	}
-	BZERO(GC_fresh_pages, MAX_FRESH_PAGES * sizeof (struct hblk *));
-#   endif
 }
 
 /* Ignore write hints. They don't help us here.	*/
@@ -3080,11 +3039,7 @@ GC_bool is_ptrfree;
 {
 }
 
-#ifdef GC_SOLARIS_THREADS
-#   define READ(fd,buf,nbytes) syscall(SYS_read, fd, buf, nbytes)
-#else
-#   define READ(fd,buf,nbytes) read(fd, buf, nbytes)
-#endif
+# define READ(fd,buf,nbytes) read(fd, buf, nbytes)
 
 void GC_read_dirty(void)
 {
@@ -3117,10 +3072,6 @@ void GC_read_dirty(void)
                 /* Punt:	*/
         	memset(GC_grungy_pages, 0xff, sizeof (page_hash_table));
 		memset(GC_written_pages, 0xff, sizeof(page_hash_table));
-#		ifdef GC_SOLARIS_THREADS
-		    BZERO(GC_fresh_pages,
-		    	  MAX_FRESH_PAGES * sizeof (struct hblk *)); 
-#		endif
 		return;
             }
         }
@@ -3147,15 +3098,6 @@ void GC_read_dirty(void)
 	                register word index = PHT_HASH(h);
 	                
 	                set_pht_entry_from_index(GC_grungy_pages, index);
-#			ifdef GC_SOLARIS_THREADS
-			  {
-			    register int slot = FRESH_PAGE_SLOT(h);
-			    
-			    if (GC_fresh_pages[slot] == h) {
-			        GC_fresh_pages[slot] = 0;
-			    }
-			  }
-#			endif
 	                h++;
 	            }
 	        }
@@ -3165,30 +3107,16 @@ void GC_read_dirty(void)
 	}
     /* Update GC_written_pages. */
         GC_or_pages(GC_written_pages, GC_grungy_pages);
-#   ifdef GC_SOLARIS_THREADS
-      /* Make sure that old stacks are considered completely clean	*/
-      /* unless written again.						*/
-	GC_old_stacks_are_fresh();
-#   endif
 }
 
 #undef READ
 
 GC_bool GC_page_was_dirty(struct hblk *h)
-struct hblk *h;
 {
     register word index = PHT_HASH(h);
     register GC_bool result;
     
     result = get_pht_entry_from_index(GC_grungy_pages, index);
-#   ifdef GC_SOLARIS_THREADS
-	if (result && PAGE_IS_FRESH(h)) result = FALSE;
-	/* This happens only if page was declared fresh since	*/
-	/* the read_dirty call, e.g. because it's in an unused  */
-	/* thread stack.  It's OK to treat it as clean, in	*/
-	/* that case.  And it's consistent with 		*/
-	/* GC_page_was_ever_dirty.				*/
-#   endif
     return(result);
 }
 
@@ -3198,29 +3126,9 @@ GC_bool GC_page_was_ever_dirty(struct hblk *h)
     register GC_bool result;
     
     result = get_pht_entry_from_index(GC_written_pages, index);
-#   ifdef GC_SOLARIS_THREADS
-	if (result && PAGE_IS_FRESH(h)) result = FALSE;
-#   endif
     return(result);
 }
 
-/* Caller holds allocation lock.	*/
-void GC_is_fresh(struct hblk *h, word n)
-{
-
-    register word index;
-    
-#   ifdef GC_SOLARIS_THREADS
-      register word i;
-      
-      if (GC_fresh_pages != 0) {
-        for (i = 0; i < n; i++) {
-          ADD_FRESH_PAGE(h + i);
-        }
-      }
-#   endif
-}
-
 # endif /* PROC_VDB */
 
 
diff --git a/pthread_stop_world.c b/pthread_stop_world.c
index bd1f67e..33cc9e0 100644
--- a/pthread_stop_world.c
+++ b/pthread_stop_world.c
@@ -1,7 +1,7 @@
 #include "private/pthread_support.h"
 
-#if defined(GC_PTHREADS) && !defined(GC_SOLARIS_THREADS) \
-     && !defined(GC_WIN32_THREADS) && !defined(GC_DARWIN_THREADS)
+#if defined(GC_PTHREADS) && !defined(GC_WIN32_THREADS) && \
+    !defined(GC_DARWIN_THREADS)
 
 #include <signal.h>
 #include <semaphore.h>
@@ -160,12 +160,12 @@ void GC_suspend_handler_inner(ptr_t sig_arg, void *context)
 	return;
     }
 #   ifdef SPARC
-	me -> stop_info.stack_ptr = (ptr_t)GC_save_regs_in_stack();
+	me -> stop_info.stack_ptr = GC_save_regs_in_stack();
 #   else
 	me -> stop_info.stack_ptr = (ptr_t)(&dummy);
 #   endif
 #   ifdef IA64
-	me -> backing_store_ptr = (ptr_t)GC_save_regs_in_stack();
+	me -> backing_store_ptr = GC_save_regs_in_stack();
 #   endif
 
     /* Tell the thread that wants to stop the world that this   */
@@ -282,6 +282,8 @@ void GC_push_all_stacks()
     	              (unsigned)p -> id, bs_lo, bs_hi);
 #	  endif
           if (pthread_equal(p -> id, me)) {
+	    /* FIXME:  This may add an unbounded number of entries,	*/
+	    /* and hence overflow the mark stack, which is bad.		*/
 	    GC_push_all_eager(bs_lo, bs_hi);
 	  } else {
 	    GC_push_all_stack(bs_lo, bs_hi);
@@ -340,13 +342,13 @@ int GC_suspend_all()
     return n_live_threads;
 }
 
-/* Caller holds allocation lock.	*/
 void GC_stop_world()
 {
     int i;
     int n_live_threads;
     int code;
 
+    GC_ASSERT(I_HOLD_LOCK());
     #if DEBUG_THREADS
       GC_printf("Stopping the world from 0x%x\n", (unsigned)pthread_self());
     #endif
diff --git a/pthread_support.c b/pthread_support.c
index 5b25525..a8c3c6b 100644
--- a/pthread_support.c
+++ b/pthread_support.c
@@ -80,7 +80,15 @@
 # include <sys/sysctl.h>
 #endif /* GC_DARWIN_THREADS */
 
-
+/* Allocator lock definitions.		*/
+#if defined(USE_SPIN_LOCK)
+  pthread_t GC_lock_holder = NO_THREAD;
+#else
+  pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
+  pthread_t GC_lock_holder = NO_THREAD;
+		/* Used only for assertions, and to prevent	 */
+		/* recursive reentry in the system call wrapper. */
+#endif
 
 #if defined(GC_DGUX386_THREADS)
 # include <sys/dg_sys_info.h>
@@ -241,7 +249,7 @@ void GC_mark_thread_local_free_lists(void)
 	    GC_check_tls_for(&(p->tlfs));
 	  }
 	}
-#       if !defined(USE_COMPILER_TLS) && !defined(USE_PTHREAD_SPECIFIC)
+#       if defined(USE_CUSTOM_SPECIFIC)
 	  if (GC_thread_key != 0)
 	    GC_check_tsd_marks(GC_thread_key);
 #	endif 
@@ -346,16 +354,15 @@ volatile GC_thread GC_threads[THREAD_TABLE_SZ];
 
 void GC_push_thread_structures(void)
 {
+    GC_ASSERT(I_HOLD_LOCK());
     GC_push_all((ptr_t)(GC_threads), (ptr_t)(GC_threads)+sizeof(GC_threads));
-#   if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+#   if defined(THREAD_LOCAL_ALLOC)
       GC_push_all((ptr_t)(&GC_thread_key),
 	  (ptr_t)(&GC_thread_key)+sizeof(&GC_thread_key));
 #   endif
 }
 
-#if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
-#endif /* THREAD_LOCAL_ALLOC */
-
+/* It may not be safe to allocate when we register the first thread.	*/
 static struct GC_Thread_Rep first_thread;
 
 /* Add a thread to GC_threads.  We assume it wasn't already there.	*/
@@ -385,13 +392,13 @@ GC_thread GC_new_thread(pthread_t id)
 
 /* Delete a thread from GC_threads.  We assume it is there.	*/
 /* (The code intentionally traps if it wasn't.)			*/
-/* Caller holds allocation lock.				*/
 void GC_delete_thread(pthread_t id)
 {
     int hv = ((word)id) % THREAD_TABLE_SZ;
     register GC_thread p = GC_threads[hv];
     register GC_thread prev = 0;
     
+    GC_ASSERT(I_HOLD_LOCK());
     while (!pthread_equal(p -> id, id)) {
         prev = p;
         p = p -> next;
@@ -408,12 +415,14 @@ void GC_delete_thread(pthread_t id)
 /* been notified, then there may be more than one thread 	*/
 /* in the table with the same pthread id.			*/
 /* This is OK, but we need a way to delete a specific one.	*/
-void GC_delete_gc_thread(pthread_t id, GC_thread gc_id)
+void GC_delete_gc_thread(GC_thread gc_id)
 {
+    pthread_t id = gc_id -> id;
     int hv = ((word)id) % THREAD_TABLE_SZ;
     register GC_thread p = GC_threads[hv];
     register GC_thread prev = 0;
 
+    GC_ASSERT(I_HOLD_LOCK());
     while (p != gc_id) {
         prev = p;
         p = p -> next;
@@ -680,7 +689,8 @@ void GC_thr_init(void)
 #       if defined(GC_HPUX_THREADS)
 	  GC_nprocs = pthread_num_processors_np();
 #       endif
-#	if defined(GC_OSF1_THREADS) || defined(GC_AIX_THREADS)
+#	if defined(GC_OSF1_THREADS) || defined(GC_AIX_THREADS) \
+	   || defined(GC_SOLARIS_THREADS)
 	  GC_nprocs = sysconf(_SC_NPROCESSORS_ONLN);
 	  if (GC_nprocs <= 0) GC_nprocs = 1;
 #	endif
@@ -742,7 +752,7 @@ void GC_thr_init(void)
 /* may require allocation.				*/
 /* Called without allocation lock.			*/
 /* Must be called before a second thread is created.	*/
-/* Called without allocation lock.			*/
+/* Did we say it's called without the allocation lock?	*/
 void GC_init_parallel(void)
 {
     if (parallel_initialized) return;
@@ -751,7 +761,7 @@ void GC_init_parallel(void)
     /* GC_init() calls us back, so set flag first.	*/
     if (!GC_is_initialized) GC_init();
     /* Initialize thread local free lists if used.	*/
-#   if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+#   if defined(THREAD_LOCAL_ALLOC)
       LOCK();
       GC_init_thread_local(&(GC_lookup_thread(pthread_self())->tlfs));
       UNLOCK();
@@ -789,12 +799,12 @@ static void GC_do_blocking_inner(ptr_t data, void * context) {
     me = GC_lookup_thread(pthread_self());
     GC_ASSERT(!(me -> thread_blocked));
 #   ifdef SPARC
-	me -> stop_info.stack_ptr = (ptr_t)GC_save_regs_in_stack();
+	me -> stop_info.stack_ptr = GC_save_regs_in_stack();
 #   elif !defined(GC_DARWIN_THREADS)
-	me -> stop_info.stack_ptr = (ptr_t)GC_approx_sp();
+	me -> stop_info.stack_ptr = GC_approx_sp();
 #   endif
 #   ifdef IA64
-	me -> backing_store_ptr = (ptr_t)GC_save_regs_in_stack();
+	me -> backing_store_ptr = GC_save_regs_in_stack();
 #   endif
     me -> thread_blocked = TRUE;
     /* Save context here if we want to support precise stack marking */
@@ -830,13 +840,17 @@ int GC_unregister_my_thread(void)
     /* complete before we remove this thread.			*/
     GC_wait_for_gc_completion(FALSE);
     me = GC_lookup_thread(pthread_self());
-    GC_destroy_thread_local(&(me->tlfs));
+#   if defined(THREAD_LOCAL_ALLOC)
+      GC_destroy_thread_local(&(me->tlfs));
+#   endif
     if (me -> flags & DETACHED) {
     	GC_delete_thread(pthread_self());
     } else {
 	me -> flags |= FINISHED;
     }
-    GC_remove_specific(GC_thread_key);
+#   if defined(THREAD_LOCAL_ALLOC)
+      GC_remove_specific();
+#   endif
     UNLOCK();
     return GC_SUCCESS;
 }
@@ -877,7 +891,7 @@ int WRAP_FUNC(pthread_join)(pthread_t thread, void **retval)
     if (result == 0) {
         LOCK();
         /* Here the pthread thread id may have been recycled. */
-        GC_delete_gc_thread(thread, thread_gc_id);
+        GC_delete_gc_thread(thread_gc_id);
         UNLOCK();
     }
     return result;
@@ -899,7 +913,7 @@ WRAP_FUNC(pthread_detach)(pthread_t thread)
       thread_gc_id -> flags |= DETACHED;
       /* Here the pthread thread id may have been recycled. */
       if (thread_gc_id -> flags & FINISHED) {
-        GC_delete_gc_thread(thread, thread_gc_id);
+        GC_delete_gc_thread(thread_gc_id);
       }
       UNLOCK();
     }
@@ -913,7 +927,7 @@ GC_thread GC_register_my_thread_inner(struct GC_stack_base *sb,
 {
     GC_thread me;
 
-    GC_in_thread_creation = TRUE; /* OK to collect from unknow thread. */
+    GC_in_thread_creation = TRUE; /* OK to collect from unknown thread. */
     me = GC_new_thread(my_pthread);
     GC_in_thread_creation = FALSE;
 #   ifdef GC_DARWIN_THREADS
@@ -975,7 +989,7 @@ void * GC_inner_start_routine(struct GC_stack_base *sb, void * arg)
     sem_post(&(si -> registered));	/* Last action on si.	*/
     					/* OK to deallocate.	*/
     pthread_cleanup_push(GC_thread_exit_proc, 0);
-#   if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+#   if defined(THREAD_LOCAL_ALLOC)
  	LOCK();
         GC_init_thread_local(&(me->tlfs));
 	UNLOCK();
diff --git a/reclaim.c b/reclaim.c
index 6cb8b47..1149f49 100644
--- a/reclaim.c
+++ b/reclaim.c
@@ -289,7 +289,12 @@ void GC_reclaim_block(struct hblk *hbp, word report_if_found)
 	}
     } else {
         GC_bool empty = GC_block_empty(hhdr);
-	GC_ASSERT(sz * hhdr -> hb_n_marks <= HBLKSIZE);
+#	ifdef PARALLEL_MARK
+	  /* Count can be low or one too high.	*/
+	  GC_ASSERT(hhdr -> hb_n_marks <= HBLKSIZE/sz + 1);
+#	else
+	  GC_ASSERT(sz * hhdr -> hb_n_marks <= HBLKSIZE);
+#	endif
 	if (hhdr -> hb_descr != 0) {
 	  GC_composite_in_use += sz * hhdr -> hb_n_marks;
 	} else {
@@ -387,7 +392,7 @@ int GC_n_set_marks(hdr *hhdr)
 #endif /* !USE_MARK_BYTES  */
 
 /*ARGSUSED*/
-void GC_print_block_descr(struct hblk *h, word dummy)
+void GC_print_block_descr(struct hblk *h, word /* struct Print_stats * */ raw_ps)
 {
     hdr * hhdr = HDR(h);
     unsigned bytes = hhdr -> hb_sz;
@@ -405,7 +410,7 @@ void GC_print_block_descr(struct hblk *h, word dummy)
     bytes += HBLKSIZE-1;
     bytes &= ~(HBLKSIZE-1);
 
-    ps = (struct Print_stats *)dummy;
+    ps = (struct Print_stats *)raw_ps;
     ps->total_bytes += bytes;
     ps->number_of_blocks++;
 }
diff --git a/setjmp_t.c b/setjmp_t.c
index 648c712..9dc6bfc 100644
--- a/setjmp_t.c
+++ b/setjmp_t.c
@@ -24,7 +24,7 @@
 #include <stdio.h>
 #include <setjmp.h>
 #include <string.h>
-#include "private/gcconfig.h"
+#include "private/gc_priv.h"
 
 #ifdef OS2
 /* GETPAGESIZE() is set to getpagesize() by default, but that	*/
@@ -82,6 +82,9 @@ int main()
 	printf("A good guess for ALIGNMENT on this machine is %ld.\n",
 	       (unsigned long)(&(a.a_b))-(unsigned long)(&a));
 	
+	printf("The following is a very dubious test of one root marking"
+	       " strategy.\n");
+	printf("Results may not be accurate/useful:\n");
 	/* Encourage the compiler to keep x in a callee-save register */
 	x = 2*x-1;
 	printf("");
@@ -107,6 +110,27 @@ int main()
 	y++;
 	x = 2;
 	if (y == 1) longjmp(b,1);
+	printf("Some GC internal configuration stuff: \n");
+	printf("\tWORDSZ = %d, ALIGNMENT = %d, GC_GRANULE_BYTES = %d\n",
+	       WORDSZ, ALIGNMENT, GC_GRANULE_BYTES);
+	printf("\tUsing one mark ");
+#       if defined(USE_MARK_BYTES)
+	  printf("byte");
+#	elif defined(USE_MARK_BITS)
+	  printf("bit");
+#       endif
+	printf(" per ");
+#       if defined(MARK_BIT_PER_OBJ)
+	  printf("object.\n");
+#	elif defined(MARK_BIT_PER_GRANULE)
+	  printf("granule.\n");
+#	endif
+# 	ifdef THREAD_LOCAL_ALLOC
+	  printf("Thread local allocation enabled.\n");
+#	endif
+#	ifdef PARALLEL_MARK
+	  printf("Parallel marking enabled.\n");
+#	endif
 	return(0);
 }
 
diff --git a/sparc_mach_dep.S b/sparc_mach_dep.S
index 06a0f3b..6997fa1 100644
--- a/sparc_mach_dep.S
+++ b/sparc_mach_dep.S
@@ -24,6 +24,10 @@ GC_push_regs:
 	.size GC_save_regs_in_stack,.GC_save_regs_in_stack_end-GC_save_regs_in_stack
 	
 
+! GC_clear_stack_inner(arg, limit) clears stack area up to limit and
+! returns arg.  Stack clearing is crucial on SPARC, so we supply
+! an assembly version that's more careful.  Assumes limit is hotter
+! than sp, and limit is 8 byte aligned.	
 	.globl	GC_clear_stack_inner
 GC_clear_stack_inner:
 #if defined(__arch64__) || defined(__sparcv9)
diff --git a/tests/leak_test.c b/tests/leak_test.c
index 421d0c6..d6a60d4 100644
--- a/tests/leak_test.c
+++ b/tests/leak_test.c
@@ -5,6 +5,9 @@ main() {
     int i;
     GC_find_leak = 1; /* for new collect versions not compiled  */
     /* with -DFIND_LEAK.                                        */
+
+    GC_INIT();	/* Needed if thread-local allocation is enabled.	*/
+    		/* FIXME: This is not ideal.				*/
     for (i = 0; i < 10; ++i) {
         p[i] = malloc(sizeof(int)+i);
     }
@@ -18,4 +21,5 @@ main() {
     CHECK_LEAKS();
     CHECK_LEAKS();
     CHECK_LEAKS();
+    return 0;
 }       
diff --git a/tests/test.c b/tests/test.c
index 076ce4b..afe62f2 100644
--- a/tests/test.c
+++ b/tests/test.c
@@ -36,12 +36,10 @@
 # else
 #   include <assert.h>        /* Not normally used, but handy for debugging. */
 # endif
-# include <assert.h>	/* Not normally used, but handy for debugging. */
 # include "gc.h"
 # include "gc_typed.h"
 # include "private/gc_priv.h"	/* For output, locking, MIN_WORDS, 	*/
-				/* and some statistics.			*/
-# include "private/gcconfig.h"
+				/* and some statistics, and gcconfig.h.	*/
 
 # if defined(MSWIN32) || defined(MSWINCE)
 #   include <windows.h>
@@ -53,11 +51,6 @@
 #   define GC_printf printf
 # endif
 
-# if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-#   include <thread.h>
-#   include <synch.h>
-# endif
-
 # if defined(GC_PTHREADS)
 #   include <pthread.h>
 # endif
@@ -507,8 +500,6 @@ void check_marks_int_list(sexpr x)
     	}
     }
 
-/* # elif defined(GC_SOLARIS_THREADS) */
-
 # else
 
 #   define fork_a_thread()
@@ -674,17 +665,11 @@ volatile int dropped_something = 0;
 # ifdef PCR
      PCR_ThCrSec_EnterSys();
 # endif
-# if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-    static mutex_t incr_lock;
-    mutex_lock(&incr_lock);
-# endif
-# if  defined(GC_PTHREADS)
+# if defined(GC_PTHREADS)
     static pthread_mutex_t incr_lock = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_lock(&incr_lock);
-# else
-#   ifdef GC_WIN32_THREADS
-      EnterCriticalSection(&incr_cs);
-#   endif
+# elif defined(GC_WIN32_THREADS)
+    EnterCriticalSection(&incr_cs);
 # endif
   if ((int)(GC_word)client_data != t -> level) {
      (void)GC_printf("Wrong finalization data - collector is broken\n");
@@ -695,15 +680,10 @@ volatile int dropped_something = 0;
 # ifdef PCR
     PCR_ThCrSec_ExitSys();
 # endif
-# if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-    mutex_unlock(&incr_lock);
-# endif
 # if defined(GC_PTHREADS)
     pthread_mutex_unlock(&incr_lock);
-# else
-#   ifdef GC_WIN32_THREADS
-      LeaveCriticalSection(&incr_cs);
-#   endif
+# elif defined(GC_WIN32_THREADS)
+    LeaveCriticalSection(&incr_cs);
 # endif
 }
 
@@ -757,17 +737,11 @@ int n;
 #	  ifdef PCR
  	    PCR_ThCrSec_EnterSys();
 #	  endif
-#	  if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-	    static mutex_t incr_lock;
-	    mutex_lock(&incr_lock);
-#	  endif
 #         if defined(GC_PTHREADS)
             static pthread_mutex_t incr_lock = PTHREAD_MUTEX_INITIALIZER;
             pthread_mutex_lock(&incr_lock);
-#         else
-#           ifdef GC_WIN32_THREADS
-              EnterCriticalSection(&incr_cs);
-#           endif
+#         elif defined(GC_WIN32_THREADS)
+            EnterCriticalSection(&incr_cs);
 #         endif
 		/* Losing a count here causes erroneous report of failure. */
           finalizable_count++;
@@ -775,15 +749,10 @@ int n;
 #	  ifdef PCR
  	    PCR_ThCrSec_ExitSys();
 #	  endif
-#	  if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-	    mutex_unlock(&incr_lock);
-#	  endif
 #	  if defined(GC_PTHREADS)
 	    pthread_mutex_unlock(&incr_lock);
-#	  else
-#           ifdef GC_WIN32_THREADS
-              LeaveCriticalSection(&incr_cs);
-#           endif
+#	  elif defined(GC_WIN32_THREADS)
+            LeaveCriticalSection(&incr_cs);
 #         endif
 	}
 
@@ -841,48 +810,8 @@ int n;
     chktree(t -> rchild, n-1);
 }
 
-# if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-thread_key_t fl_key;
 
-void * alloc8bytes()
-{
-# if defined(SMALL_CONFIG) || defined(GC_DEBUG)
-    collectable_count++;
-    return(GC_MALLOC(8));
-# else
-    void ** my_free_list_ptr;
-    void * my_free_list;
-    
-    if (thr_getspecific(fl_key, (void **)(&my_free_list_ptr)) != 0) {
-    	(void)GC_printf("thr_getspecific failed\n");
-    	FAIL;
-    }
-    if (my_free_list_ptr == 0) {
-        uncollectable_count++;
-        my_free_list_ptr = GC_NEW_UNCOLLECTABLE(void *);
-        if (thr_setspecific(fl_key, my_free_list_ptr) != 0) {
-    	    (void)GC_printf("thr_setspecific failed\n");
-    	    FAIL;
-        }
-    }
-    my_free_list = *my_free_list_ptr;
-    if (my_free_list == 0) {
-        collectable_count++;
-        my_free_list = GC_malloc_many(8);
-        if (my_free_list == 0) {
-            (void)GC_printf("alloc8bytes out of memory\n");
-    	    FAIL;
-        }
-    }
-    *my_free_list_ptr = GC_NEXT(my_free_list);
-    GC_NEXT(my_free_list) = 0;
-    return(my_free_list);
-# endif
-}
-
-#else
-
-# if defined(GC_PTHREADS)
+#if defined(GC_PTHREADS)
 pthread_key_t fl_key;
 
 void * alloc8bytes()
@@ -918,9 +847,8 @@ void * alloc8bytes()
 # endif
 }
 
-# else
+#else
 #   define alloc8bytes() GC_MALLOC_ATOMIC(8)
-# endif
 #endif
 
 void alloc_small(n)
@@ -1099,7 +1027,7 @@ static void uniq(void *p, ...) {
     for (j=0; j<i; j++)
       if (q[i] == q[j]) {
         GC_printf(
-              "Apparently failed to mark form some function arguments.\n"
+              "Apparently failed to mark from some function arguments.\n"
               "Perhaps GC_push_regs was configured incorrectly?\n"
         );
 	FAIL;
@@ -1389,7 +1317,7 @@ void SetMinimumStack(long minSize)
 }
 
 
-#if !defined(PCR) && !defined(GC_SOLARIS_THREADS) \
+#if !defined(PCR) \
     && !defined(GC_WIN32_THREADS) && !defined(GC_PTHREADS) \
     || defined(LINT)
 #if defined(MSWIN32) && !defined(__MINGW32__)
diff --git a/tests/test_cpp.cc b/tests/test_cpp.cc
index 6661e41..9a7af1c 100644
--- a/tests/test_cpp.cc
+++ b/tests/test_cpp.cc
@@ -52,7 +52,7 @@ extern "C" {
 
 #define my_assert( e ) \
     if (! (e)) { \
-        GC_printf1( "Assertion failure in " __FILE__ ", line %d: " #e "\n", \
+        GC_printf( "Assertion failure in " __FILE__ ", line %d: " #e "\n", \
                     __LINE__ ); \
         exit( 1 ); }
 
@@ -216,11 +216,11 @@ int APIENTRY WinMain(
       x = 0;
 #   endif
     if (argc != 2 || (0 >= (n = atoi( argv[ 1 ] )))) {
-        GC_printf0( "usage: test_cpp number-of-iterations\nAssuming 10 iters\n" );
+        GC_printf( "usage: test_cpp number-of-iterations\nAssuming 10 iters\n" );
         n = 10;}
         
     for (iters = 1; iters <= n; iters++) {
-        GC_printf1( "Starting iteration %d\n", iters );
+        GC_printf( "Starting iteration %d\n", iters );
 
             /* Allocate some uncollectable As and disguise their pointers.
             Later we'll check to see if the objects are still there.  We're
@@ -282,7 +282,7 @@ int APIENTRY WinMain(
       x = *xptr;
 #   endif
     my_assert (29 == x[0]);
-    GC_printf0( "The test appears to have succeeded.\n" );
+    GC_printf( "The test appears to have succeeded.\n" );
     return( 0 );}
     
 
diff --git a/tests/thread_leak_test.c b/tests/thread_leak_test.c
index 1174705..5f183cf 100644
--- a/tests/thread_leak_test.c
+++ b/tests/thread_leak_test.c
@@ -37,4 +37,5 @@ main() {
     CHECK_LEAKS();
     CHECK_LEAKS();
     CHECK_LEAKS();
+    return 0;
 }
diff --git a/thread_local_alloc.c b/thread_local_alloc.c
index 0961794..b3fe28c 100644
--- a/thread_local_alloc.c
+++ b/thread_local_alloc.c
@@ -12,37 +12,17 @@
  */
 #include "private/gc_priv.h"
 
-# if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+# if defined(THREAD_LOCAL_ALLOC)
 
 #include "private/thread_local_alloc.h"
 #include "gc_inline.h"
 
-# if defined(GC_HPUX_THREADS) && !defined(USE_PTHREAD_SPECIFIC) \
-     && !defined(USE_COMPILER_TLS)
-#   ifdef __GNUC__
-#     define USE_PTHREAD_SPECIFIC
-      /* Empirically, as of gcc 3.3, USE_COMPILER_TLS doesn't work.	*/
-#   else
-#     define USE_COMPILER_TLS
-#   endif
-# endif
-
-# if defined USE_HPUX_TLS
-#   error USE_HPUX_TLS macro was replaced by USE_COMPILER_TLS
-# endif
-
-# if (defined(GC_DGUX386_THREADS) || defined(GC_OSF1_THREADS) || \
-      defined(GC_DARWIN_THREADS) || defined(GC_AIX_THREADS)) \
-      && !defined(USE_PTHREAD_SPECIFIC)
-#   define USE_PTHREAD_SPECIFIC
-# endif
-
 # include <stdlib.h>
 
-/* We don't really support thread-local allocation with DBG_HDRS_ALL */
-
-#ifdef USE_COMPILER_TLS
+#if defined(USE_COMPILER_TLS)
   __thread
+#elif defined(USE_WIN32_COMPILER_TLS)
+  __declspec(thread)	/* MS compiler spelling of thread-local storage. */
 #endif
 GC_key_t GC_thread_key;
 
@@ -56,7 +36,6 @@ static void return_freelists(void **fl, void **gfl)
     void *q, **qptr;
 
     for (i = 1; i < TINY_FREELISTS; ++i) {
-#if 0
 	if ((word)(fl[i]) >= HBLKSIZE) {
 	  if (gfl[i] == 0) {
 	    gfl[i] = fl[i];
@@ -71,7 +50,6 @@ static void return_freelists(void **fl, void **gfl)
 	      gfl[i] = fl[i];
 	  }
 	}
-#endif
 	/* Clear fl[i], since the thread structure may hang around.	*/
 	/* Do it in a way that is likely to trap if we access it.	*/
 	fl[i] = (ptr_t)HBLKSIZE;
@@ -134,11 +112,16 @@ void GC_destroy_thread_local(GC_tlfs p)
 #   endif
 }
 
-#if defined(GC_ASSERTIONS) && defined(GC_LINUX_THREADS)
+#if defined(GC_ASSERTIONS) && defined(GC_PTHREADS) && !defined(CYGWIN32)
 # include <pthread.h>
   extern char * GC_lookup_thread(pthread_t id);
 #endif
 
+#if defined(GC_ASSERTIONS) && defined(GC_WIN32_THREADS)
+# include <pthread.h>
+  extern char * GC_lookup_thread_inner(int id); /* In win32_threads.c.	*/
+#endif
+
 void * GC_malloc(size_t bytes)
 {
     size_t granules = ROUNDED_UP_GRANULES(bytes);
@@ -164,10 +147,14 @@ void * GC_malloc(size_t bytes)
 #   endif
 #   ifdef GC_ASSERTIONS
       /* We can't check tsd correctly, since we don't have access to 	*/
-      /* the right declarations.  But we cna check that it's close.	*/
+      /* the right declarations.  But we can check that it's close.	*/
       LOCK();
       {
-	char * me = GC_lookup_thread(pthread_self());
+#	if defined(GC_WIN32_THREADS)
+	  char * me = (char *)GC_lookup_thread_inner(GetCurrentThreadId());
+#       else
+	  char * me = GC_lookup_thread(pthread_self());
+#	endif
         GC_ASSERT((char *)tsd > me && (char *)tsd < me + 1000);
       }
       UNLOCK();
@@ -273,7 +260,7 @@ void GC_mark_thread_local_fls_for(GC_tlfs p)
     }
 #endif /* GC_ASSERTIONS */
 
-# else  /* !THREAD_LOCAL_ALLOC  && !DBG_HDRS_ALL */
+# else  /* !THREAD_LOCAL_ALLOC  */
 
 #   define GC_destroy_thread_local(t)
 
diff --git a/threadlibs.c b/threadlibs.c
index 178a7ec..1309694 100644
--- a/threadlibs.c
+++ b/threadlibs.c
@@ -11,7 +11,6 @@ int main()
 	       "-Wl,--wrap -Wl,pthread_sigmask -Wl,--wrap -Wl,sleep\n");
 #   endif
 #   if defined(GC_LINUX_THREADS) || defined(GC_IRIX_THREADS) \
-	|| defined(GC_SOLARIS_PTHREADS) \
 	|| defined(GC_DARWIN_THREADS) || defined(GC_AIX_THREADS)
 #       ifdef GC_USE_DLOPEN_WRAP
 	  printf("-ldl ");
@@ -31,8 +30,9 @@ int main()
 #   if defined(GC_HPUX_THREADS) || defined(GC_OSF1_THREADS)
 	printf("-lpthread -lrt\n");
 #   endif
-#   if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-        printf("-lthread -ldl\n");
+#   if defined(GC_SOLARIS_THREADS) || defined(GC_SOLARIS_PTHREADS)
+        printf("-lthread -lposix4\n");
+		/* Is this right for recent versions? */
 #   endif
 #   if defined(GC_WIN32_THREADS) && defined(CYGWIN32)
         printf("-lpthread\n");
diff --git a/typd_mlc.c b/typd_mlc.c
index cdedf46..1124ff9 100644
--- a/typd_mlc.c
+++ b/typd_mlc.c
@@ -587,9 +587,9 @@ void * GC_malloc_explicitly_typed(size_t lb, GC_descr d)
     if(SMALL_OBJ(lb)) {
 	lg = GC_size_map[lb];
 	opp = &(GC_eobjfreelist[lg]);
-	FASTLOCK();
-        if( !FASTLOCK_SUCCEEDED() || (op = *opp) == 0 ) {
-            FASTUNLOCK();
+	LOCK();
+        if( (op = *opp) == 0 ) {
+            UNLOCK();
             op = (ptr_t)GENERAL_MALLOC((word)lb, GC_explicit_kind);
 	    if (0 == op) return 0;
 	    lg = GC_size_map[lb];	/* May have been uninitialized.	*/
@@ -597,7 +597,7 @@ void * GC_malloc_explicitly_typed(size_t lb, GC_descr d)
             *opp = obj_link(op);
 	    obj_link(op) = 0;
             GC_bytes_allocd += GRANULES_TO_BYTES(lg);
-            FASTUNLOCK();
+            UNLOCK();
         }
    } else {
        op = (ptr_t)GENERAL_MALLOC((word)lb, GC_explicit_kind);
@@ -620,16 +620,16 @@ DCL_LOCK_STATE;
     if( SMALL_OBJ(lb) ) {
 	lg = GC_size_map[lb];
 	opp = &(GC_eobjfreelist[lg]);
-	FASTLOCK();
-        if( !FASTLOCK_SUCCEEDED() || (op = *opp) == 0 ) {
-            FASTUNLOCK();
+	LOCK();
+        if( (op = *opp) == 0 ) {
+            UNLOCK();
             op = (ptr_t)GENERAL_MALLOC_IOP(lb, GC_explicit_kind);
 	    lg = GC_size_map[lb];	/* May have been uninitialized.	*/
         } else {
             *opp = obj_link(op);
 	    obj_link(op) = 0;
             GC_bytes_allocd += GRANULES_TO_BYTES(lg);
-            FASTUNLOCK();
+            UNLOCK();
         }
    } else {
        op = (ptr_t)GENERAL_MALLOC_IOP(lb, GC_explicit_kind);
@@ -669,9 +669,9 @@ DCL_LOCK_STATE;
     if( SMALL_OBJ(lb) ) {
 	lg = GC_size_map[lb];
 	opp = &(GC_arobjfreelist[lg]);
-	FASTLOCK();
-        if( !FASTLOCK_SUCCEEDED() || (op = *opp) == 0 ) {
-            FASTUNLOCK();
+	LOCK();
+        if( (op = *opp) == 0 ) {
+            UNLOCK();
             op = (ptr_t)GENERAL_MALLOC((word)lb, GC_array_kind);
 	    if (0 == op) return(0);
 	    lg = GC_size_map[lb];	/* May have been uninitialized.	*/            
@@ -679,7 +679,7 @@ DCL_LOCK_STATE;
             *opp = obj_link(op);
 	    obj_link(op) = 0;
             GC_bytes_allocd += GRANULES_TO_BYTES(lg);
-            FASTUNLOCK();
+            UNLOCK();
         }
    } else {
        op = (ptr_t)GENERAL_MALLOC((word)lb, GC_array_kind);
diff --git a/version.h b/version.h
index 9c34ccc..b71dd6b 100644
--- a/version.h
+++ b/version.h
@@ -3,7 +3,7 @@
 /* it to keep the old-style build process working.		*/
 #define GC_TMP_VERSION_MAJOR 7
 #define GC_TMP_VERSION_MINOR 0
-#define GC_TMP_ALPHA_VERSION 4
+#define GC_TMP_ALPHA_VERSION 5
 
 #ifndef GC_NOT_ALPHA
 #   define GC_NOT_ALPHA 0xff
diff --git a/win32_threads.c b/win32_threads.c
index 8609900..214d5c1 100755
--- a/win32_threads.c
+++ b/win32_threads.c
@@ -4,6 +4,24 @@
 
 #include <windows.h>
 
+#ifdef THREAD_LOCAL_ALLOC
+# include "private/thread_local_alloc.h"
+#endif /* THREAD_LOCAL_ALLOC */
+
+/* Allocation lock declarations.	*/
+#if !defined(USE_PTHREAD_LOCKS)
+# if defined(GC_DLL)
+    __declspec(dllexport) CRITICAL_SECTION GC_allocate_ml;
+# else
+    CRITICAL_SECTION GC_allocate_ml;
+# endif
+  DWORD GC_lock_holder = NO_THREAD;
+  	/* Thread id for current holder of allocation lock */
+#else
+  pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
+  pthread_t GC_lock_holder = NO_THREAD;
+#endif
+
 #ifdef CYGWIN32
 # include <errno.h>
 
@@ -16,41 +34,102 @@
 
 # define DEBUG_CYGWIN_THREADS 0
 
-  void * GC_start_routine(void * arg);
+  void * GC_pthread_start(void * arg);
   void GC_thread_exit_proc(void *arg);
 
 # include <pthread.h>
 
 #endif
 
+#if defined(GC_DLL) && !defined(MSWINCE)
+  static GC_bool GC_win32_dll_threads = FALSE;
+  /* This code operates in two distinct modes, depending on	*/
+  /* the setting of GC_win32_dll_threads.  If			*/
+  /* GC_win32_dll_threads is set, all threads in the process	*/
+  /* are implicitly registered with the GC by DllMain. 		*/
+  /* No explicit registration is required, and attempts at	*/
+  /* explicit registration are ignored.  This mode is		*/
+  /* very different from the Posix operation of the collector.	*/
+  /* In this mode access to the thread table is lock-free.	*/
+  /* Hence there is a static limit on the number of threads.	*/
+  
+  /* If GC_win32_dll_threads is FALSE, or the collector is	*/
+  /* built without GC_DLL defined, things operate in a way	*/
+  /* that is very similar to Posix platforms, and new threads	*/
+  /* must be registered with the collector, e.g. by using	*/
+  /* preprocessor-based interception of the thread primitives.	*/
+  /* In this case, we use a real data structure for the thread	*/
+  /* table.  Note that there is no equivalent of linker-based	*/
+  /* call interception, since we don't have ELF-like 		*/
+  /* facilities.  The Windows analog appears to be "API		*/
+  /* hooking", which really seems to be a standard way to 	*/
+  /* do minor binary rewriting (?).  I'd prefer not to have	*/
+  /* the basic collector rely on such facilities, but an	*/
+  /* optional package that intercepts thread calls this way	*/
+  /* would probably be nice.					*/
+
+  /* GC_win32_dll_threads must be set at initialization time,	*/
+  /* i.e. before any collector or thread calls.  We make it a	*/
+  /* "dynamic" option only to avoid multiple library versions.	*/
+#else
+# define GC_win32_dll_threads FALSE
+#endif
+
 /* The type of the first argument to InterlockedExchange.	*/
 /* Documented to be LONG volatile *, but at least gcc likes 	*/
 /* this better.							*/
 typedef LONG * IE_t;
 
-#ifndef MAX_THREADS
-# define MAX_THREADS 256
-    /* FIXME:							*/
-    /* Things may get quite slow for large numbers of threads,	*/
-    /* since we look them up with sequential search.		*/
-#endif
-
 GC_bool GC_thr_initialized = FALSE;
 
+GC_bool GC_need_to_lock = FALSE;
+
+static GC_bool parallel_initialized = FALSE;
+
+void GC_init_parallel(void);
+
 #ifdef GC_DLL
-  GC_API GC_bool GC_need_to_lock = TRUE;
+  /* Turn on GC_win32_dll_threads	*/
+  GC_API void GC_use_DllMain(void)
+  {
+#     ifdef THREAD_LOCAL_ALLOC
+	  ABORT("Cannot use thread local allocation with DllMain-based "
+		"thread registration.");
+	  /* Thread-local allocation really wants to lock at thread	*/
+	  /* entry and exit.						*/
+#     endif
+      GC_need_to_lock = TRUE;
   	/* Cannot intercept thread creation.	*/
+      GC_ASSERT(GC_gc_no == 0);
+      GC_win32_dll_threads = TRUE;
+  }
 #else
-  GC_bool GC_need_to_lock = FALSE;
+  GC_API void GC_use_DllMain(void)
+  {
+      ABORT("GC not configured as DLL");
+  }
 #endif
 
 DWORD GC_main_thread = 0;
 
-struct GC_thread_Rep {
-  AO_t in_use; 		/* Updated without lock.	*/
-  			/* We assert that unused 	*/
-  			/* entries have invalid ids of	*/
-  			/* zero and zero stack fields.  */
+struct GC_Thread_Rep {
+  union {
+    AO_t tm_in_use; 	/* Updated without lock.		*/
+  			/* We assert that unused 		*/
+  			/* entries have invalid ids of		*/
+  			/* zero and zero stack fields.  	*/
+    			/* Used only with GC_win32_dll_threads. */
+    struct GC_Thread_Rep * tm_next;
+    			/* Hash table link without 		*/
+    			/* GC_win32_dll_threads.		*/
+    			/* More recently allocated threads	*/
+			/* with a given pthread id come 	*/
+			/* first.  (All but the first are	*/
+			/* guaranteed to be dead, but we may    */
+			/* not yet have registered the join.)   */
+  } table_management;
+# define in_use table_management.tm_in_use
+# define next table_management.tm_next
   DWORD id;
   HANDLE handle;
   ptr_t stack_base;	/* The cold end of the stack.   */
@@ -65,9 +144,13 @@ struct GC_thread_Rep {
 #	define FINISHED 1   	/* Thread has exited.	*/
 #	define DETACHED 2	/* Thread is intended to be detached.	*/
 # endif
+# ifdef THREAD_LOCAL_ALLOC
+    struct thread_local_freelists tlfs;
+# endif
 };
 
-typedef volatile struct GC_thread_Rep * GC_thread;
+typedef struct GC_Thread_Rep * GC_thread;
+
 
 /*
  * We assumed that volatile ==> memory ordering, at least among
@@ -76,36 +159,78 @@ typedef volatile struct GC_thread_Rep * GC_thread;
 
 volatile GC_bool GC_please_stop = FALSE;
 
-/*
- * FIXME: At initialization time we should perhaps chose
- * between two different thread table representations.  This simple
- * linear representation may be the best we can reliably do if we use
- * DllMain.  By default we should probably rely on thread registration
- * as with the other platforms, and use a hash table or other real
- * data structure.
- */
-volatile struct GC_thread_Rep thread_table[MAX_THREADS];
+/* We have two versions of the thread table.  Which one	*/
+/* we use depends on whether or not GC_win32_dll_threads */
+/* is set.  The one complication is that at process	*/
+/* startup, we use both, since the client hasn't yet	*/
+/* had a chance to tell us which one (s)he wants.	*/
+static GC_bool client_has_run = FALSE;
+
+/* Thread table used if GC_win32_dll_threads is set.	*/
+/* This is a fixed size array.				*/
+/* Since we use runtime conditionals, both versions	*/
+/* are always defined.					*/
+# ifndef MAX_THREADS
+#   define MAX_THREADS 512
+#  endif
+  /* Things may get quite slow for large numbers of threads,	*/
+  /* since we look them up with sequential search.		*/
+
+  volatile struct GC_Thread_Rep dll_thread_table[MAX_THREADS];
+
+  volatile LONG GC_max_thread_index = 0;
+  			/* Largest index in dll_thread_table	*/
+		        /* that was ever used.			*/
+
+/* And now the version used if GC_win32_dll_threads is not set.	*/
+/* This is a chained hash table, with much of the code borrowed	*/
+/* from the Posix implementation.				*/
+# define THREAD_TABLE_SZ 256	/* Must be power of 2	*/
+  volatile GC_thread GC_threads[THREAD_TABLE_SZ];
+  
 
-volatile LONG GC_max_thread_index = 0; /* Largest index in thread_table	*/
-				       /* that was ever used.		*/
+/* Add a thread to GC_threads.  We assume it wasn't already there.	*/
+/* Caller holds allocation lock.  Returns 0 if allocation fails.	*/
+/* Unlike the pthreads version, the id field is set by the caller.	*/
+GC_thread GC_new_thread(DWORD id)
+{
+    int hv = ((word)id) % THREAD_TABLE_SZ;
+    GC_thread result;
+    /* It may not be safe to allocate when we register the first thread. */
+    static struct GC_Thread_Rep first_thread;
+    static GC_bool first_thread_used = FALSE;
+    
+    GC_ASSERT(I_HOLD_LOCK());
+    if (!first_thread_used) {
+    	result = &first_thread;
+    	first_thread_used = TRUE;
+    } else {
+        GC_ASSERT(!GC_win32_dll_threads);
+        result = (struct GC_Thread_Rep *)
+        	 GC_INTERNAL_MALLOC(sizeof(struct GC_Thread_Rep), NORMAL);
+	/* result may be 0 here; it is only examined after the test below. */
+    }
+    if (result == 0) return(0);
+    /* result -> id = id; Done by caller.	*/
+    result -> next = GC_threads[hv];
+    GC_threads[hv] = result;
+    GC_ASSERT(result -> flags == 0 /* && result -> thread_blocked == 0 */);
+    return(result);
+}
 
 extern LONG WINAPI GC_write_fault_handler(struct _EXCEPTION_POINTERS *exc_info);
 
 /*
  * This may be called from DllMain, and hence operates under unusual
- * constraints.  In particular, it must be lock-free.
- * Always called from the thread being added.
+ * constraints.  In particular, it must be lock-free if GC_win32_dll_threads
+ * is set.  Always called from the thread being added.
+ * If GC_win32_dll_threads is not set, we already hold the allocation lock,
+ * except possibly during single-threaded start-up code.
  */
 static GC_thread GC_register_my_thread_inner(struct GC_stack_base *sb,
 					     DWORD thread_id)
 {
-  int i;
-  /* It appears to be unsafe to acquire a lock here, since this	*/
-  /* code is apparently not preeemptible on some systems.	*/
-  /* (This is based on complaints, not on Microsoft's official	*/
-  /* documentation, which says this should perform "only simple	*/
-  /* initialization tasks".)					*/
-  /* Hence we make do with nonblocking synchronization.		*/
+  volatile struct GC_Thread_Rep * me;
 
   /* The following should be a noop according to the win32	*/
   /* documentation.  There is empirical evidence that it	*/
@@ -113,39 +238,60 @@ static GC_thread GC_register_my_thread_inner(struct GC_stack_base *sb,
 # if defined(MPROTECT_VDB)
    if (GC_incremental) SetUnhandledExceptionFilter(GC_write_fault_handler);
 # endif
+
+  if (GC_win32_dll_threads || !client_has_run) {
+    int i;
+    /* It appears to be unsafe to acquire a lock here, since this	*/
+    /* code is apparently not preemptible on some systems.		*/
+    /* (This is based on complaints, not on Microsoft's official	*/
+    /* documentation, which says this should perform "only simple	*/
+    /* initialization tasks".)						*/
+    /* Hence we make do with nonblocking synchronization.		*/
+    /* It has been claimed that DllMain is really only executed with	*/
+    /* a particular system lock held, and thus careful use of locking	*/
+    /* around code that doesn't call back into the system libraries	*/
+    /* might be OK.  But this hasn't been tested across all win32	*/
+    /* variants.							*/
                 /* cast away volatile qualifier */
-  for (i = 0; InterlockedExchange((IE_t)&thread_table[i].in_use,1) != 0; i++) {
-    /* Compare-and-swap would make this cleaner, but that's not 	*/
-    /* supported before Windows 98 and NT 4.0.  In Windows 2000,	*/
-    /* InterlockedExchange is supposed to be replaced by		*/
-    /* InterlockedExchangePointer, but that's not really what I		*/
-    /* want here.							*/
-    /* FIXME: We should eventually declare Win95 dead and use AO_	*/
-    /* primitives here.							*/
-    if (i == MAX_THREADS - 1)
-      ABORT("too many threads");
-  }
-  /* Update GC_max_thread_index if necessary.  The following is safe,	*/
-  /* and unlike CompareExchange-based solutions seems to work on all	*/
-  /* Windows95 and later platforms.					*/
-  /* Unfortunately, GC_max_thread_index may be temporarily out of 	*/
-  /* bounds, so readers have to compensate.				*/
-  while (i > GC_max_thread_index) {
-    InterlockedIncrement((IE_t)&GC_max_thread_index);
+    for (i = 0; InterlockedExchange((IE_t)&dll_thread_table[i].in_use,1) != 0;
+	 i++) {
+      /* Compare-and-swap would make this cleaner, but that's not 	*/
+      /* supported before Windows 98 and NT 4.0.  In Windows 2000,	*/
+      /* InterlockedExchange is supposed to be replaced by		*/
+      /* InterlockedExchangePointer, but that's not really what I	*/
+      /* want here.							*/
+      /* FIXME: We should eventually declare Win95 dead and use AO_	*/
+      /* primitives here.						*/
+      if (i == MAX_THREADS - 1)
+        ABORT("too many threads");
+    }
+    /* Update GC_max_thread_index if necessary.  The following is safe,	*/
+    /* and unlike CompareExchange-based solutions seems to work on all	*/
+    /* Windows95 and later platforms.					*/
+    /* Unfortunately, GC_max_thread_index may be temporarily out of 	*/
+    /* bounds, so readers have to compensate.				*/
+    while (i > GC_max_thread_index) {
+      InterlockedIncrement((IE_t)&GC_max_thread_index);
+    }
+    if (GC_max_thread_index >= MAX_THREADS) {
+      /* We overshot due to simultaneous increments.	*/
+      /* Setting it to MAX_THREADS-1 is always safe.	*/
+      GC_max_thread_index = MAX_THREADS - 1;
+    }
+    me = dll_thread_table + i;
   }
-  if (GC_max_thread_index >= MAX_THREADS) {
-    /* We overshot due to simultaneous increments.	*/
-    /* Setting it to MAX_THREADS-1 is always safe.	*/
-    GC_max_thread_index = MAX_THREADS - 1;
+  if (!GC_win32_dll_threads || !client_has_run) {
+    GC_ASSERT(I_HOLD_LOCK() || !client_has_run);
+    me = GC_new_thread(thread_id);
   }
   
 # ifdef CYGWIN32
-    thread_table[i].pthread_id = pthread_self();
+    me -> pthread_id = pthread_self();
 # endif
   if (!DuplicateHandle(GetCurrentProcess(),
 	               GetCurrentThread(),
 		       GetCurrentProcess(),
-		       (HANDLE*)&thread_table[i].handle,
+		       (HANDLE*)&(me -> handle),
 		       0,
 		       0,
 		       DUPLICATE_SAME_ACCESS)) {
@@ -153,19 +299,25 @@ static GC_thread GC_register_my_thread_inner(struct GC_stack_base *sb,
 	GC_err_printf("Last error code: %d\n", last_error);
 	ABORT("DuplicateHandle failed");
   }
-  thread_table[i].stack_base = sb -> mem_base;
+  me -> stack_base = sb -> mem_base;
   /* Up until this point, GC_push_all_stacks considers this thread	*/
   /* invalid.								*/
-  if (thread_table[i].stack_base == NULL) 
-    ABORT("Bad stack base in GC_register_my_thread");
+  if (me -> stack_base == NULL) 
+    ABORT("Bad stack base in GC_register_my_thread_inner");
   /* Up until this point, this entry is viewed as reserved but invalid	*/
   /* by GC_delete_thread.						*/
-  thread_table[i].id = thread_id;
+  me -> id = thread_id;
+# if defined(THREAD_LOCAL_ALLOC)
+    GC_init_thread_local((GC_tlfs)(&(me->tlfs)));
+# endif
+  GC_ASSERT(!GC_please_stop || GC_win32_dll_threads);
+  	/* Otherwise both we and the thread stopping code would be	*/
+  	/* holding the allocation lock.					*/
   /* If this thread is being created while we are trying to stop	*/
   /* the world, wait here.  Hopefully this can't happen on any	*/
   /* systems that don't allow us to block here.			*/
   while (GC_please_stop) Sleep(20);
-  return thread_table + i;
+  return (GC_thread)(me);
 }
 
 /*
@@ -187,149 +339,326 @@ LONG GC_get_max_thread_index()
 /* without a lock, but should be called in contexts in which the	*/
 /* requested thread cannot be asynchronously deleted, e.g. from the	*/
 /* thread itself.							*/
-static GC_thread GC_lookup_thread(DWORD thread_id) {
-  int i;
-  LONG my_max = GC_get_max_thread_index();
-
-  for (i = 0;
+/* This version assumes that either GC_win32_dll_threads is set, or	*/
+/* we hold the allocator lock.						*/
+/* Also used (for assertion checking only) from thread_local_alloc.c.	*/
+GC_thread GC_lookup_thread_inner(DWORD thread_id) {
+  if (GC_win32_dll_threads) {
+    int i;
+    LONG my_max = GC_get_max_thread_index();
+    for (i = 0;
        i <= my_max &&
-       (!AO_load_acquire(&(thread_table[i].in_use))
-	|| thread_table[i].id != thread_id);
+       (!AO_load_acquire(&(dll_thread_table[i].in_use))
+	|| dll_thread_table[i].id != thread_id);
        /* Must still be in_use, since nobody else can store our thread_id. */
        i++) {}
-  if (i > my_max) {
-    return 0;
+    if (i > my_max) {
+      return 0;
+    } else {
+      return (GC_thread)(dll_thread_table + i);
+    }
   } else {
-    return thread_table + i;
+    int hv = ((word)thread_id) % THREAD_TABLE_SZ;
+    register GC_thread p = GC_threads[hv];
+    
+    GC_ASSERT(I_HOLD_LOCK());
+    while (p != 0 && p -> id != thread_id) p = p -> next;
+    return(p);
   }
 }
 
-int GC_register_my_thread(struct GC_stack_base *sb) {
-  DWORD t = GetCurrentThreadId();
-
-  if (0 == GC_lookup_thread(t)) {
-    /* We lock here, since we want to wait for an ongoing GC.	*/
+/* A version of the above that acquires the lock if necessary.  Note	*/
+/* that the identically named function for pthreads is different, and	*/
+/* just assumes we hold the lock.					*/
+/* Also used (for assertion checking only) from thread_local_alloc.c.	*/
+static GC_thread GC_lookup_thread(DWORD thread_id)
+{
+  if (GC_win32_dll_threads) {
+    return GC_lookup_thread_inner(thread_id);
+  } else {
+    GC_thread result;
     LOCK();
-    GC_register_my_thread_inner(sb, t);
+    result = GC_lookup_thread_inner(thread_id);
     UNLOCK();
-    return GC_SUCCESS;
-  } else {
-    return GC_DUPLICATE;
+    return result;
   }
 }
 
-/* This is intended to be lock-free.					*/
-/* It is either called synchronously from the thread being deleted,	*/
-/* or by the joining thread.						*/
-static void GC_delete_gc_thread(GC_thread thr)
+/* If a thread has been joined, but we have not yet		*/
+/* been notified, then there may be more than one thread 	*/
+/* in the table with the same win32 id.				*/
+/* This is OK, but we need a way to delete a specific one.	*/
+/* Assumes we hold the allocation lock unless			*/
+/* GC_win32_dll_threads is set.					*/
+/* If GC_win32_dll_threads is set it should be called from the	*/
+/* thread being deleted.					*/
+void GC_delete_gc_thread(GC_thread gc_id)
 {
-    CloseHandle(thr->handle);
+  if (GC_win32_dll_threads) {
+    /* This is intended to be lock-free.				*/
+    /* It is either called synchronously from the thread being deleted,	*/
+    /* or by the joining thread.					*/
+    CloseHandle(gc_id->handle);
       /* cast away volatile qualifier */
-    thr->stack_base = 0;
-    thr->id = 0;
+    gc_id -> stack_base = 0;
+    gc_id -> id = 0;
 #   ifdef CYGWIN32
-      thr->pthread_id = 0;
+      gc_id -> pthread_id = 0;
 #   endif /* CYGWIN32 */
-    AO_store_release(&(thr->in_use), FALSE);
+    AO_store_release(&(gc_id->in_use), FALSE);
+  } else {
+    DWORD id = gc_id -> id;
+    int hv = ((word)id) % THREAD_TABLE_SZ;
+    register GC_thread p = GC_threads[hv];
+    register GC_thread prev = 0;
+
+    GC_ASSERT(I_HOLD_LOCK());
+    while (p != gc_id) {
+        prev = p;
+        p = p -> next;
+    }
+    if (prev == 0) {
+        GC_threads[hv] = p -> next;
+    } else {
+        prev -> next = p -> next;
+    }
+    GC_INTERNAL_FREE(p);
+  }
 }
 
+/* Delete a thread from GC_threads.  We assume it is there.	*/
+/* (The code intentionally traps if it wasn't.)			*/
+/* Assumes we hold the allocation lock unless			*/
+/* GC_win32_dll_threads is set.					*/
+/* If GC_win32_dll_threads is set it should be called from the	*/
+/* thread being deleted.					*/
+void GC_delete_thread(DWORD id)
+{
+  if (GC_win32_dll_threads) {
+    GC_thread t = GC_lookup_thread_inner(id);
 
-static void GC_delete_thread(DWORD thread_id) {
-  GC_thread t = GC_lookup_thread(thread_id);
+    if (0 == t) {
+      WARN("Removing nonexistent thread %ld\n", (GC_word)id);
+    } else {
+      GC_delete_gc_thread(t);
+    }
+  } else {
+    int hv = ((word)id) % THREAD_TABLE_SZ;
+    register GC_thread p = GC_threads[hv];
+    register GC_thread prev = 0;
+    
+    GC_ASSERT(I_HOLD_LOCK());
+    while (p -> id != id) {
+        prev = p;
+        p = p -> next;
+    }
+    if (prev == 0) {
+        GC_threads[hv] = p -> next;
+    } else {
+        prev -> next = p -> next;
+    }
+    GC_INTERNAL_FREE(p);
+  }
+}
+
+int GC_register_my_thread(struct GC_stack_base *sb) {
+  DWORD t = GetCurrentThreadId();
 
-  if (0 == t) {
-    WARN("Removing nonexistent thread %ld\n", (GC_word)thread_id);
+  if (0 == GC_lookup_thread(t)) {
+    /* We lock here, since we want to wait for an ongoing GC.	*/
+    LOCK();
+    GC_register_my_thread_inner(sb, t);
+    UNLOCK();
+    return GC_SUCCESS;
   } else {
-    GC_delete_gc_thread(t);
+    return GC_DUPLICATE;
   }
 }
 
 int GC_unregister_my_thread(void)
 {
-    GC_delete_thread(GetCurrentThreadId());
+    if (GC_win32_dll_threads) {
+      /* Should we just ignore this? */
+      GC_delete_thread(GetCurrentThreadId());
+    } else {
+      LOCK();
+      GC_delete_thread(GetCurrentThreadId());
+      UNLOCK();
+    }
+#   if defined(THREAD_LOCAL_ALLOC)
+      LOCK();
+      {
+	GC_thread me = GC_lookup_thread_inner(GetCurrentThreadId());
+        GC_destroy_thread_local(&(me->tlfs));
+      }
+      UNLOCK();
+#   endif
     return GC_SUCCESS;
 }
 
 
 #ifdef CYGWIN32
 
+/* A quick-and-dirty cache of the mapping between pthread_t	*/
+/* and win32 thread id.						*/
+#define PTHREAD_MAP_SIZE 512
+DWORD GC_pthread_map_cache[PTHREAD_MAP_SIZE];
+#define HASH(pthread_id) ((((word)(pthread_id) >> 5)) % PTHREAD_MAP_SIZE)
+	/* It appears pthread_t is really a pointer type ... */
+#define SET_PTHREAD_MAP_CACHE(pthread_id, win32_id) \
+	GC_pthread_map_cache[HASH(pthread_id)] = (win32_id);
+#define GET_PTHREAD_MAP_CACHE(pthread_id) \
+	GC_pthread_map_cache[HASH(pthread_id)]
+
 /* Return a GC_thread corresponding to a given pthread_t.	*/
 /* Returns 0 if it's not there.					*/
 /* We assume that this is only called for pthread ids that	*/
 /* have not yet terminated or are still joinable, and		*/
 /* cannot be concurrently terminated.				*/
+/* Assumes we do NOT hold the allocation lock.			*/
 static GC_thread GC_lookup_pthread(pthread_t id)
 {
-  int i;
-  LONG my_max = GC_get_max_thread_index();
+  if (GC_win32_dll_threads) {
+    int i;
+    LONG my_max = GC_get_max_thread_index();
 
-  for (i = 0;
-       i <= my_max &&
-       (!AO_load_acquire(&(thread_table[i].in_use))
-	|| thread_table[i].pthread_id != id);
+    for (i = 0;
+         i <= my_max &&
+         (!AO_load_acquire(&(dll_thread_table[i].in_use))
+	  || dll_thread_table[i].pthread_id != id);
        /* Must still be in_use, since nobody else can store our thread_id. */
        i++);
-  if (i > my_max) return 0;
-  return thread_table + i;
+    if (i > my_max) return 0;
+    return (GC_thread)(dll_thread_table + i);
+  } else {
+    /* We first try the cache.  If that fails, we use a very slow	*/
+    /* approach.							*/
+    int hv_guess = GET_PTHREAD_MAP_CACHE(id) % THREAD_TABLE_SZ;
+    int hv;
+    GC_thread p;
+
+    LOCK();
+    for (p = GC_threads[hv_guess]; 0 != p; p = p -> next) {
+      if (pthread_equal(p -> pthread_id, id))
+	goto foundit; 
+    }
+    for (hv = 0; hv < THREAD_TABLE_SZ; ++hv) {
+      for (p = GC_threads[hv]; 0 != p; p = p -> next) {
+        if (pthread_equal(p -> pthread_id, id))
+	  goto foundit; 
+      }
+    }
+    p = 0;
+   foundit:
+    UNLOCK();
+    return p;
+  }
 }
 
 #endif /* CYGWIN32 */
 
 void GC_push_thread_structures(void)
 {
+  GC_ASSERT(I_HOLD_LOCK());
+  if (GC_win32_dll_threads) {
     /* Unlike the other threads implementations, the thread table here	*/
     /* contains no pointers to the collectable heap.  Thus we have	*/
     /* no private structures we need to preserve.			*/
-# ifdef CYGWIN32
-  { int i; /* pthreads may keep a pointer in the thread exit value */
-    LONG my_max = GC_get_max_thread_index();
+#   ifdef CYGWIN32
+    { int i; /* pthreads may keep a pointer in the thread exit value */
+      LONG my_max = GC_get_max_thread_index();
 
-    for (i = 0; i <= my_max; i++)
-      if (thread_table[i].in_use)
-	GC_push_all((ptr_t)&(thread_table[i].status),
-                    (ptr_t)(&(thread_table[i].status)+1));
+      for (i = 0; i <= my_max; i++)
+        if (dll_thread_table[i].in_use)
+	  GC_push_all((ptr_t)&(dll_thread_table[i].status),
+                      (ptr_t)(&(dll_thread_table[i].status)+1));
+    }
+#   endif
+  } else {
+    GC_push_all((ptr_t)(GC_threads), (ptr_t)(GC_threads)+sizeof(GC_threads));
   }
+# if defined(THREAD_LOCAL_ALLOC)
+    GC_push_all((ptr_t)(&GC_thread_key),
+      (ptr_t)(&GC_thread_key)+sizeof(GC_thread_key));
+    /* Push the key object itself, in case we ever use our own TLS impl. */
 # endif
 }
 
+/* Suspend t unless it already exited; sets t -> suspended on success.	*/
+void GC_suspend(GC_thread t)
+{
+# ifdef MSWINCE
+    /* SuspendThread will fail if thread is running kernel code */
+      while (SuspendThread(t -> handle) == (DWORD)-1)
+	Sleep(10);
+# else
+    /* Apparently the Windows 95 GetOpenFileName call creates	*/
+    /* a thread that does not properly get cleaned up, and		*/
+    /* SuspendThread on its descriptor may provoke a crash.		*/
+    /* This reduces the probability of that event, though it still	*/
+    /* appears there's a race here.					*/
+    DWORD exitCode; 
+    if (GetExitCodeThread(t -> handle, &exitCode) &&
+        exitCode != STILL_ACTIVE) {
+      t -> stack_base = 0; /* prevent stack from being pushed */
+#     ifndef CYGWIN32
+        /* this breaks pthread_join on Cygwin, which is guaranteed to  */
+        /* only see user pthreads 	 			       */
+        AO_store(&(t -> in_use), FALSE);
+        CloseHandle(t -> handle);
+#     endif
+      return;	/* thread is dead: leave t -> suspended untouched */
+    }
+    if (SuspendThread(t -> handle) == (DWORD)-1)
+      ABORT("SuspendThread failed");
+# endif
+   t -> suspended = TRUE;
+}
+
+/* Defined in misc.c */
+#ifndef CYGWIN32
+  extern CRITICAL_SECTION GC_write_cs;
+#endif
+
 void GC_stop_world(void)
 {
   DWORD thread_id = GetCurrentThreadId();
   int i;
 
   if (!GC_thr_initialized) ABORT("GC_stop_world() called before GC_thr_init()");
+  GC_ASSERT(I_HOLD_LOCK());
 
   GC_please_stop = TRUE;
-  for (i = 0; i <= GC_get_max_thread_index(); i++)
-    if (thread_table[i].stack_base != 0
-	&& thread_table[i].id != thread_id) {
-#     ifdef MSWINCE
-        /* SuspendThread will fail if thread is running kernel code */
-	while (SuspendThread(thread_table[i].handle) == (DWORD)-1)
-	  Sleep(10);
-#     else
-	/* Apparently the Windows 95 GetOpenFileName call creates	*/
-	/* a thread that does not properly get cleaned up, and		*/
-	/* SuspendThread on its descriptor may provoke a crash.		*/
-	/* This reduces the probability of that event, though it still	*/
-	/* appears there's a race here.					*/
-	DWORD exitCode; 
-	if (GetExitCodeThread(thread_table[i].handle,&exitCode) &&
-            exitCode != STILL_ACTIVE) {
-          thread_table[i].stack_base = 0; /* prevent stack from being pushed */
-#         ifndef CYGWIN32
-            /* this breaks pthread_join on Cygwin, which is guaranteed to  */
-	    /* only see user pthreads 					   */
-	    AO_store(&(thread_table[i].in_use), FALSE);
-	    CloseHandle(thread_table[i].handle);
-#         endif
-	  continue;
-	}
-	if (SuspendThread(thread_table[i].handle) == (DWORD)-1)
-	  ABORT("SuspendThread failed");
-#     endif
-      thread_table[i].suspended = TRUE;
+# ifndef CYGWIN32
+    EnterCriticalSection(&GC_write_cs);
+# endif
+  if (GC_win32_dll_threads) {
+    /* Any threads being created during this loop will end up sleeping	*/
+    /* in the thread registration code until GC_please_stop becomes	*/
+    /* false.  This is not ideal, but hopefully correct.		*/
+    for (i = 0; i <= GC_get_max_thread_index(); i++) {
+      volatile struct GC_Thread_Rep * t = dll_thread_table + i;
+      if (t -> stack_base != 0
+	  && t -> id != thread_id) {
+	  GC_suspend((GC_thread)t);
+      }
     }
+  } else {
+      GC_thread t;
+      int i;
+
+      for (i = 0; i < THREAD_TABLE_SZ; i++) {
+        for (t = GC_threads[i]; t != 0; t = t -> next) {
+	  if (t -> stack_base != 0
+	  && t -> id != thread_id) {
+	    GC_suspend(t);
+	  }
+	}
+      }
+  }
+# ifndef CYGWIN32
+    LeaveCriticalSection(&GC_write_cs);
+# endif    
 }
 
 void GC_start_world(void)
@@ -338,33 +667,34 @@ void GC_start_world(void)
   int i;
   LONG my_max = GC_get_max_thread_index();
 
-  for (i = 0; i <= my_max; i++)
-    if (thread_table[i].stack_base != 0 && thread_table[i].suspended
-	&& thread_table[i].id != thread_id) {
-      if (ResumeThread(thread_table[i].handle) == (DWORD)-1)
-	ABORT("ResumeThread failed");
-      thread_table[i].suspended = FALSE;
+  GC_ASSERT(I_HOLD_LOCK());
+  if (GC_win32_dll_threads) {
+    for (i = 0; i <= my_max; i++) {
+      GC_thread t = (GC_thread)(dll_thread_table + i);
+      if (t -> stack_base != 0 && t -> suspended
+	  && t -> id != thread_id) {
+        if (ResumeThread(t -> handle) == (DWORD)-1)
+	  ABORT("ResumeThread failed");
+        t -> suspended = FALSE;
+      }
     }
-  GC_please_stop = FALSE;
-}
-
-# ifdef _MSC_VER
-#   pragma warning(disable:4715)
-# endif
-ptr_t GC_current_stackbottom(void)
-{
-  DWORD thread_id = GetCurrentThreadId();
-  int i;
-  LONG my_max = GC_get_max_thread_index();
+  } else {
+    GC_thread t;
+    int i;
 
-  for (i = 0; i <= my_max; i++)
-    if (thread_table[i].stack_base && thread_table[i].id == thread_id)
-      return thread_table[i].stack_base;
-  ABORT("no thread table entry for current thread");
+    for (i = 0; i < THREAD_TABLE_SZ; i++) {
+      for (t = GC_threads[i]; t != 0; t = t -> next) {
+        if (t -> stack_base != 0 && t -> suspended
+	    && t -> id != thread_id) {
+          if (ResumeThread(t -> handle) == (DWORD)-1)
+	    ABORT("ResumeThread failed");
+          t -> suspended = FALSE;
+        }
+      }
+    }
+  }
+  GC_please_stop = FALSE;
 }
-# ifdef _MSC_VER
-#   pragma warning(default:4715)
-# endif
 
 # ifdef MSWINCE
     /* The VirtualQuery calls below won't work properly on WinCE, but	*/
@@ -387,26 +717,19 @@ ptr_t GC_current_stackbottom(void)
     }
 # endif
 
-void GC_push_all_stacks(void)
+void GC_push_stack_for(GC_thread thread)
 {
-  DWORD thread_id = GetCurrentThreadId();
-  GC_bool found_me = FALSE;
-  int i;
-  int dummy;
-  ptr_t sp, stack_min;
-  GC_thread thread;
-  LONG my_max = GC_get_max_thread_index();
-  
-  for (i = 0; i <= my_max; i++) {
-    thread = thread_table + i;
-    if (thread -> in_use && thread -> stack_base) {
-      if (thread -> id == thread_id) {
+    int dummy;
+    ptr_t sp, stack_min;
+    DWORD me = GetCurrentThreadId();
+
+    if (thread -> stack_base) {
+      if (thread -> id == me) {
 	sp = (ptr_t) &dummy;
-	found_me = TRUE;
       } else {
         CONTEXT context;
         context.ContextFlags = CONTEXT_INTEGER|CONTEXT_CONTROL;
-        if (!GetThreadContext(thread_table[i].handle, &context))
+        if (!GetThreadContext(thread -> handle, &context))
 	  ABORT("GetThreadContext failed");
 
         /* Push all registers that might point into the heap.  Frame	*/
@@ -446,17 +769,49 @@ void GC_push_all_stacks(void)
 #       else
 #         error "architecture is not supported"
 #       endif
-      }
+      } /* ! current thread */
 
       stack_min = GC_get_stack_min(thread->stack_base);
 
-      if (sp >= stack_min && sp < thread->stack_base)
+      if (sp >= stack_min && sp < thread->stack_base) {
+#       if DEBUG_CYGWIN_THREADS
+	  GC_printf("Pushing thread from %p to %p for %d from %d\n",
+		    sp, thread -> stack_base, thread -> id, me);
+#       endif
         GC_push_all_stack(sp, thread->stack_base);
-      else {
+      } else {
         WARN("Thread stack pointer 0x%lx out of range, pushing everything\n",
 	     (unsigned long)sp);
         GC_push_all_stack(stack_min, thread->stack_base);
       }
+    } /* thread looks live */
+}
+
+void GC_push_all_stacks(void)
+{
+  DWORD me = GetCurrentThreadId();
+  GC_bool found_me = FALSE;
+  
+  if (GC_win32_dll_threads) {
+    int i;
+    LONG my_max = GC_get_max_thread_index();
+
+    for (i = 0; i <= my_max; i++) {
+      GC_thread t = (GC_thread)(dll_thread_table + i);
+      if (t -> in_use) {
+        GC_push_stack_for(t);
+        if (t -> id == me) found_me = TRUE;
+      }
+    }
+  } else {
+    GC_thread t;
+    int i;
+
+    for (i = 0; i < THREAD_TABLE_SZ; i++) {
+      for (t = GC_threads[i]; t != 0; t = t -> next) {
+        GC_push_stack_for(t);
+        if (t -> id == me) found_me = TRUE;
+      }
     }
   }
   if (!found_me) ABORT("Collecting from unknown thread.");
@@ -467,14 +822,29 @@ void GC_get_next_stack(char *start, char **lo, char **hi)
     int i;
 #   define ADDR_LIMIT (char *)(-1L)
     char * current_min = ADDR_LIMIT;
-    LONG my_max = GC_get_max_thread_index();
+
+    if (GC_win32_dll_threads) {
+      LONG my_max = GC_get_max_thread_index();
   
-    for (i = 0; i <= my_max; i++) {
-    	char * s = (char *)thread_table[i].stack_base;
+      for (i = 0; i <= my_max; i++) {
+    	ptr_t s = (ptr_t)(dll_thread_table[i].stack_base);
 
 	if (0 != s && s > start && s < current_min) {
 	    current_min = s;
 	}
+      }
+    } else {
+      for (i = 0; i < THREAD_TABLE_SZ; i++) {
+	GC_thread t;
+
+        for (t = GC_threads[i]; t != 0; t = t -> next) {
+	  ptr_t s = (ptr_t)(t -> stack_base);
+
+	  if (0 != s && s > start && s < current_min) {
+	    current_min = s;
+	  }
+        }
+      }
     }
     *hi = current_min;
     if (current_min == ADDR_LIMIT) {
@@ -487,21 +857,6 @@ void GC_get_next_stack(char *start, char **lo, char **hi)
 
 #if !defined(CYGWIN32)
 
-#if !defined(MSWINCE) && defined(GC_DLL)
-
-/* We register threads from DllMain */
-
-GC_API HANDLE WINAPI GC_CreateThread(
-    LPSECURITY_ATTRIBUTES lpThreadAttributes, 
-    DWORD dwStackSize, LPTHREAD_START_ROUTINE lpStartAddress, 
-    LPVOID lpParameter, DWORD dwCreationFlags, LPDWORD lpThreadId )
-{
-    return CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress,
-                        lpParameter, dwCreationFlags, lpThreadId);
-}
-
-#else /* defined(MSWINCE) || !defined(GC_DLL))  */
-
 /* We have no DllMain to take care of new threads.  Thus we	*/
 /* must properly intercept thread creation.			*/
 
@@ -521,37 +876,42 @@ GC_API HANDLE WINAPI GC_CreateThread(
 
     thread_args *args;
 
-    if (!GC_is_initialized) GC_init();
-    		/* make sure GC is initialized (i.e. main thread is attached) */
-    
-    args = GC_malloc_uncollectable(sizeof(thread_args)); 
+    if (!parallel_initialized) GC_init_parallel();
+    		/* make sure GC is initialized (i.e. main thread is attached,
+		   tls initialized) */
+
+    client_has_run = TRUE;
+    if (GC_win32_dll_threads) {
+      return CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress,
+                        lpParameter, dwCreationFlags, lpThreadId);
+    } else {
+      args = GC_malloc_uncollectable(sizeof(thread_args)); 
 	/* Handed off to and deallocated by child thread.	*/
-    if (0 == args) {
+      if (0 == args) {
 	SetLastError(ERROR_NOT_ENOUGH_MEMORY);
         return NULL;
-    }
+      }
 
-    /* set up thread arguments */
+      /* set up thread arguments */
     	args -> start = lpStartAddress;
     	args -> param = lpParameter;
 
-    GC_need_to_lock = TRUE;
-    thread_h = CreateThread(lpThreadAttributes,
-    			    dwStackSize, thread_start,
-    			    args, dwCreationFlags,
-    			    lpThreadId);
+      GC_need_to_lock = TRUE;
+      thread_h = CreateThread(lpThreadAttributes,
+    			      dwStackSize, GC_win32_start,
+    			      args, dwCreationFlags,
+    			      lpThreadId);
 
-    return thread_h;
+      return thread_h;
+    }
 }
 
-static DWORD WINAPI thread_start(LPVOID arg)
+void * GC_win32_start_inner(struct GC_stack_base *sb, LPVOID arg)
 {
-    DWORD ret = 0;
+    void * ret;
     thread_args *args = (thread_args *)arg;
-    struct GC_stack_base *sb;
 
-    GC_get_stack_base(&sb);
-    GC_register_my_thread(&sb); /* This waits for an in-progress GC. */
+    GC_register_my_thread(sb); /* This waits for an in-progress GC. */
 
     /* Clear the thread entry even if we exit with an exception.	*/
     /* This is probably pointless, since an uncaught exception is	*/
@@ -559,10 +919,15 @@ static DWORD WINAPI thread_start(LPVOID arg)
 #ifndef __GNUC__
     __try {
 #endif /* __GNUC__ */
-	ret = args->start (args->param);
+	ret = (void *)args->start (args->param);
 #ifndef __GNUC__
     } __finally {
 #endif /* __GNUC__ */
+#       if defined(THREAD_LOCAL_ALLOC)
+          LOCK();	/* no GC_thread variable is visible here: look ours up */
+          GC_destroy_thread_local(&(GC_lookup_thread_inner(GetCurrentThreadId())->tlfs));
+          UNLOCK();
+#       endif
 	GC_free(args);
 	GC_delete_thread(GetCurrentThreadId());
 #ifndef __GNUC__
@@ -571,8 +936,11 @@ static DWORD WINAPI thread_start(LPVOID arg)
 
     return ret;
 }
-#endif /* !defined(MSWINCE) && !(defined(__MINGW32__) && !defined(_DLL))  */
 
+DWORD WINAPI GC_win32_start(LPVOID arg)	/* start routine for CreateThread */
+{	/* Runs GC_win32_start_inner(sb, arg); its pointer result is the exit code. */
+    return (DWORD)(size_t)GC_call_with_stack_base(GC_win32_start_inner, arg);
+}
 #endif /* !CYGWIN32 */
 
 #ifdef MSWINCE
@@ -630,13 +998,16 @@ DWORD WINAPI main_thread_start(LPVOID arg)
 /* Called by GC_init() - we hold the allocation lock.	*/
 void GC_thr_init(void) {
     struct GC_stack_base sb;
+    int sb_result;
 
+    GC_ASSERT(I_HOLD_LOCK());
     if (GC_thr_initialized) return;
     GC_main_thread = GetCurrentThreadId();
     GC_thr_initialized = TRUE;
 
     /* Add the initial thread, so we can stop it.	*/
-    GC_get_stack_base(&sb);
+    sb_result = GC_get_stack_base(&sb);
+    GC_ASSERT(sb_result == GC_SUCCESS);
     GC_register_my_thread(&sb);
 }
 
@@ -658,6 +1029,7 @@ int GC_pthread_join(pthread_t pthread_id, void **retval) {
 		(int)pthread_self(), GetCurrentThreadId(), (int)pthread_id);
 #   endif
 
+    client_has_run = TRUE;
     /* Thread being joined might not have registered itself yet. */
     /* After the join,thread id may have been recycled.		 */
     /* FIXME: It would be better if this worked more like	 */
@@ -667,9 +1039,11 @@ int GC_pthread_join(pthread_t pthread_id, void **retval) {
 
     result = pthread_join(pthread_id, retval);
 
-    /* FIXME:  This is an asynchronous deletion, which we said can't	*/
-    /* happen?								*/
-    GC_delete_gc_thread(joinee);
+    if (!GC_win32_dll_threads) {
+      LOCK();
+      GC_delete_gc_thread(joinee);
+      UNLOCK();
+    } /* otherwise dllmain handles it.	*/
 
 #   if DEBUG_CYGWIN_THREADS
       GC_printf("thread 0x%x(0x%x) completed join with thread 0x%x.\n",
@@ -690,8 +1064,12 @@ GC_pthread_create(pthread_t *new_thread,
     int result;
     struct start_info * si;
 
-    if (!GC_is_initialized) GC_init();
+    if (!parallel_initialized) GC_init_parallel();
     		/* make sure GC is initialized (i.e. main thread is attached) */
+    client_has_run = TRUE;
+    if (GC_win32_dll_threads) {
+      return pthread_create(new_thread, attr, start_routine, arg);
+    }
     
     /* This is otherwise saved only in an area mmapped by the thread */
     /* library, which isn't visible to the collector.		 */
@@ -711,7 +1089,7 @@ GC_pthread_create(pthread_t *new_thread,
 		(int)pthread_self(), GetCurrentThreadId);
 #   endif
     GC_need_to_lock = TRUE;
-    result = pthread_create(new_thread, attr, GC_start_routine, si); 
+    result = pthread_create(new_thread, attr, GC_pthread_start, si); 
 
     if (result) { /* failure */
       	GC_free(si);
@@ -720,24 +1098,24 @@ GC_pthread_create(pthread_t *new_thread,
     return(result);
 }
 
-void * GC_start_routine(void * arg)
+void * GC_pthread_start_inner(struct GC_stack_base *sb, void * arg)
 {
     struct start_info * si = arg;
     void * result;
     void *(*start)(void *);
     void *start_arg;
-    pthread_t pthread_id;
     DWORD thread_id = GetCurrentThreadId();
+    pthread_t pthread_id = pthread_self();
     GC_thread me;
     GC_bool detached;
     int i;
-    struct GC_stack_base sb;
 
 #   if DEBUG_CYGWIN_THREADS
-      GC_printf("thread 0x%x(0x%x) starting...\n",(int)pthread_self(),
+      GC_printf("thread 0x%x(0x%x) starting...\n",(int)pthread_id,
 		      				  thread_id);
 #   endif
 
+    GC_ASSERT(!GC_win32_dll_threads);
     /* If a GC occurs before the thread is registered, that GC will	*/
     /* ignore this thread.  That's fine, since it will block trying to  */
     /* acquire the allocation lock, and won't yet hold interesting 	*/
@@ -745,14 +1123,14 @@ void * GC_start_routine(void * arg)
     LOCK();
     /* We register the thread here instead of in the parent, so that	*/
     /* we don't need to hold the allocation lock during pthread_create. */
-    GC_get_stack_base(&sb);
-    me = GC_register_my_thread_inner(&sb, thread_id);
+    me = GC_register_my_thread_inner(sb, thread_id);
+    SET_PTHREAD_MAP_CACHE(pthread_id, thread_id);
     UNLOCK();
 
     start = si -> start_routine;
     start_arg = si -> arg;
     if (si-> detached) me -> flags |= DETACHED;
-    me -> pthread_id = pthread_id = pthread_self();
+    me -> pthread_id = pthread_id;
 
     GC_free(si); /* was allocated uncollectable */
 
@@ -769,17 +1147,26 @@ void * GC_start_routine(void * arg)
     return(result);
 }
 
+void * GC_pthread_start(void * arg)	/* start routine given to pthread_create */
+{	/* Returns whatever GC_pthread_start_inner returns for arg.	*/
+    return GC_call_with_stack_base(GC_pthread_start_inner, arg);
+}
+
 void GC_thread_exit_proc(void *arg)
 {
     GC_thread me = (GC_thread)arg;
     int i;
 
+    GC_ASSERT(!GC_win32_dll_threads);
 #   if DEBUG_CYGWIN_THREADS
       GC_printf("thread 0x%x(0x%x) called pthread_exit().\n",
 		(int)pthread_self(),GetCurrentThreadId());
 #   endif
 
     LOCK();
+#   if defined(THREAD_LOCAL_ALLOC)
+      GC_destroy_thread_local(&(me->tlfs));
+#   endif
     if (me -> flags & DETACHED) {
       GC_delete_thread(GetCurrentThreadId());
     } else {
@@ -791,6 +1178,7 @@ void GC_thread_exit_proc(void *arg)
 
 /* nothing required here... */
 int GC_pthread_sigmask(int how, const sigset_t *set, sigset_t *oset) {
+  client_has_run = TRUE;
   return pthread_sigmask(how, set, oset);
 }
 
@@ -799,6 +1187,7 @@ int GC_pthread_detach(pthread_t thread)
     int result;
     GC_thread thread_gc_id;
     
+    client_has_run = TRUE;
     LOCK();
     thread_gc_id = GC_lookup_pthread(thread);
     UNLOCK();
@@ -827,6 +1216,9 @@ BOOL WINAPI DllMain(HINSTANCE inst, ULONG reason, LPVOID reserved)
 {
   struct GC_stack_base sb;
   DWORD thread_id;
+  int sb_result;
+
+  if (client_has_run && !GC_win32_dll_threads) return TRUE;
 
   switch (reason) {
   case DLL_PROCESS_ATTACH:
@@ -837,28 +1229,32 @@ BOOL WINAPI DllMain(HINSTANCE inst, ULONG reason, LPVOID reserved)
     thread_id = GetCurrentThreadId();
     if (GC_main_thread != thread_id) {
 	/* Don't lock here.	*/
-	GC_get_stack_base(&sb);
+        sb_result = GC_get_stack_base(&sb);
+        GC_ASSERT(sb_result == GC_SUCCESS);
+#       ifdef THREAD_LOCAL_ALLOC
+	  ABORT("Cannot initialize thread local cache from DllMain");
+#       endif
 	GC_register_my_thread_inner(&sb, thread_id);
     } /* o.w. we already did it during GC_thr_init(), called by GC_init() */
     break;
 
   case DLL_THREAD_DETACH:
-    LOCK();	/* Safe? DllMain description is ambiguous.	*/
+    /* We are hopefully running in the context of the exiting thread.	*/
+    client_has_run = TRUE;
+    if (!GC_win32_dll_threads) return TRUE;
     GC_delete_thread(GetCurrentThreadId());
-    UNLOCK();
     break;
 
   case DLL_PROCESS_DETACH:
     {
       int i;
 
-      LOCK();
+      if (!GC_win32_dll_threads) return TRUE;
       for (i = 0; i <= GC_get_max_thread_index(); ++i)
       {
-          if (AO_load(&(thread_table[i].in_use)))
-	    GC_delete_gc_thread(thread_table + i);
+          if (AO_load(&(dll_thread_table[i].in_use)))
+	    GC_delete_gc_thread(dll_thread_table + i);
       }
-      UNLOCK();
 
       GC_deinit();
       DeleteCriticalSection(&GC_allocate_ml);
@@ -873,12 +1269,80 @@ BOOL WINAPI DllMain(HINSTANCE inst, ULONG reason, LPVOID reserved)
 
 # endif /* !MSWINCE */
 
-# if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+/* Perform all initializations, including those that	*/
+/* may require allocation.				*/
+/* Called without allocation lock.			*/
+/* Must be called before a second thread is created.	*/
+void GC_init_parallel(void)
+{
+    if (parallel_initialized) return;	/* later calls are no-ops */
+    parallel_initialized = TRUE;
+
+    /* GC_init() calls us back; the flag set above ends that call early. */
+    if (!GC_is_initialized) GC_init();
+    /* Set up the calling thread's thread-local free lists, if used.	*/
+#   if defined(THREAD_LOCAL_ALLOC)
+      LOCK();	/* NOTE(review): GC_lookup_thread() may lock again - confirm */
+      GC_init_thread_local(&(GC_lookup_thread(GetCurrentThreadId())->tlfs));
+      UNLOCK();
+#   endif
+}
+
+#if defined(USE_PTHREAD_LOCKS)
+  /* Support for pthread locking code.		*/
+  /* pthread_mutex_trylock may not win here,	*/
+  /* due to built-in support for spinning first? */
+
+volatile GC_bool GC_collecting = 0;
+			/* A hint that we're in the collector and       */
+                        /* holding the allocation lock for an           */
+                        /* extended period.                             */
+
+void GC_lock(void)	/* Acquire GC_allocate_ml via pthread_mutex_lock. */
+{
+    pthread_mutex_lock(&GC_allocate_ml);
+}
+#endif /* USE_PTHREAD_LOCKS */
 
-/* We don't really support thread-local allocation with DBG_HDRS_ALL */
+# if defined(THREAD_LOCAL_ALLOC)
 
 /* Add thread-local allocation support.  Microsoft uses __declspec(thread) */
 
+/* The links of the ptrfree and gcj free lists would not otherwise be	*/
+/* found, so we mark them explicitly for each thread in GC_threads.	*/
+/* We also set them in the normal free lists, since that touches	*/
+/* less memory than scanning them normally would.			*/
+void GC_mark_thread_local_free_lists(void)
+{
+    int bucket = 0;
+
+    while (bucket < THREAD_TABLE_SZ) {
+      GC_thread entry = GC_threads[bucket++];
+
+      for (; entry != 0; entry = entry -> next)
+	GC_mark_thread_local_fls_for(&(entry -> tlfs));
+    }
+}
+
+#if defined(GC_ASSERTIONS)
+    /* Run GC_check_tls_for() on the tlfs of every thread in GC_threads; */
+    /* with USE_CUSTOM_SPECIFIC, also check the marks of GC_thread_key.	*/
+    void GC_check_tls(void) {
+	int bucket = 0;
+
+	while (bucket < THREAD_TABLE_SZ) {
+	  GC_thread entry = GC_threads[bucket++];
+	  for (; entry != 0; entry = entry -> next)
+	    GC_check_tls_for(&(entry -> tlfs));
+	}
+#       if defined(USE_CUSTOM_SPECIFIC)
+	  if (0 != GC_thread_key) {
+	    GC_check_tsd_marks(GC_thread_key);
+	  }
+#	endif
+    }
+#endif /* GC_ASSERTIONS */
+
 #endif /* THREAD_LOCAL_ALLOC ... */
 
 #endif /* GC_WIN32_THREADS */
-- 
2.7.4