From a52b89ebb6d4499be38780db8d176c5d3a6fbc17 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr@hp.com>
Date: Sun, 12 Jan 2014 15:31:23 -0800
Subject: [PATCH] futexes: Increase hash table size for better performance
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Currently, the futex global hash table suffers from its fixed,
smallish (for today's standards) size of 256 entries, as well as
its lack of NUMA awareness. Large systems, using many futexes,
can be prone to high amounts of collisions; where these futexes
hash to the same bucket and lead to extra contention on the same
hb->lock. Furthermore, cacheline bouncing is a reality when we
have multiple hb->locks residing on the same cacheline and
different futexes hash to adjacent buckets.

This patch keeps the current static size of 16 entries for small
systems, or otherwise, 256 * ncpus (or larger as we need to
round the number to a power of 2). Note that this number of CPUs
accounts for all CPUs that can ever be available in the system,
taking into consideration things like hotpluging. While we do
impose extra overhead at bootup by making the hash table larger,
this is a one time thing, and does not shadow the benefits of
this patch.

Furthermore, as suggested by tglx, by cache aligning the hash
buckets we can avoid access across cacheline boundaries and also
avoid massive cache line bouncing if multiple cpus are hammering
away at different hash buckets which happen to reside in the
same cache line.

Also, similar to other core kernel components (pid, dcache,
tcp), by using alloc_large_system_hash() we benefit from its
NUMA awareness and thus the table is distributed among the nodes
instead of in a single one.

For a custom microbenchmark that pounds on the uaddr hashing --
making the wait path fail at futex_wait_setup() returning
-EWOULDBLOCK for large amounts of futexes, we can see the
following benefits on a 80-core, 8-socket 1Tb server:

 +---------+--------------------+------------------------+-----------------------+-------------------------------+
 | threads | baseline (ops/sec) | aligned-only (ops/sec) | large table (ops/sec) | large table+aligned (ops/sec) |
 +---------+--------------------+------------------------+-----------------------+-------------------------------+
 |Â Â Â Â  512 |Â Â Â Â Â Â Â Â Â Â Â Â Â  32426 | 50531Â  (+55.8%)Â Â Â Â Â Â Â Â | 255274Â  (+687.2%)Â Â Â Â Â | 292553Â  (+802.2%)Â Â Â Â Â Â Â Â Â Â Â Â Â |
 |Â Â Â Â  256 |Â Â Â Â Â Â Â Â Â Â Â Â Â  65360 | 99588Â  (+52.3%)Â Â Â Â Â Â Â Â | 443563Â  (+578.6%)Â Â Â Â Â | 508088Â  (+677.3%)Â Â Â Â Â Â Â Â Â Â Â Â Â |
 |Â Â Â Â  128 |Â Â Â Â Â Â Â Â Â Â Â Â Â 125635 | 200075 (+59.2%)Â Â Â Â Â Â Â Â | 742613Â  (+491.1%)Â Â Â Â Â | 835452Â  (+564.9%)Â Â Â Â Â Â Â Â Â Â Â Â Â |
 |Â Â Â Â Â  80 |Â Â Â Â Â Â Â Â Â Â Â Â Â 193559 | 323425 (+67.1%)Â Â Â Â Â Â Â Â | 1028147 (+431.1%)Â Â Â Â Â | 1130304 (+483.9%)Â Â Â Â Â Â Â Â Â Â Â Â Â |
 |Â Â Â Â Â  64 |Â Â Â Â Â Â Â Â Â Â Â Â Â 247667 | 443740 (+79.1%)Â Â Â Â Â Â Â Â | 997300Â  (+302.6%)Â Â Â Â Â | 1145494 (+362.5%)Â Â Â Â Â Â Â Â Â Â Â Â Â |
 |Â Â Â Â Â  32 |Â Â Â Â Â Â Â Â Â Â Â Â Â 628412 | 721401 (+14.7%)Â Â Â Â Â Â Â Â | 965996Â  (+53.7%)Â Â Â Â Â Â | 1122115 (+78.5%)Â Â Â Â Â Â Â Â Â Â Â Â Â Â |
 +---------+--------------------+------------------------+-----------------------+-------------------------------+

Reviewed-by: Darren Hart <dvhart@linux.intel.com>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Waiman Long <Waiman.Long@hp.com>
Reviewed-and-tested-by: Jason Low <jason.low2@hp.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: Tom Vaden <tom.vaden@hp.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Link: http://lkml.kernel.org/r/1389569486-25487-3-git-send-email-davidlohr@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/futex.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 085f5fa..577481d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,6 +63,7 @@
 #include <linux/sched/rt.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
+#include <linux/bootmem.h>
 
 #include <asm/futex.h>
 
@@ -70,8 +71,6 @@
 
 int __read_mostly futex_cmpxchg_enabled;
 
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
-
 /*
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
@@ -149,9 +148,11 @@ static const struct futex_q futex_q_init = {
 struct futex_hash_bucket {
 	spinlock_t lock;
 	struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +162,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 			  key->both.offset);
-	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
@@ -2719,7 +2720,18 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 static int __init futex_init(void)
 {
 	u32 curval;
-	int i;
+	unsigned long i;
+
+#if CONFIG_BASE_SMALL
+	futex_hashsize = 16;
+#else
+	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+					       futex_hashsize, 0,
+					       futex_hashsize < 256 ? HASH_SMALL : 0,
+					       NULL, NULL, futex_hashsize, futex_hashsize);
 
 	/*
 	 * This will fail and we want it. Some arch implementations do
@@ -2734,7 +2746,7 @@ static int __init futex_init(void)
 	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
 		futex_cmpxchg_enabled = 1;
 
-	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+	for (i = 0; i < futex_hashsize; i++) {
 		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
-- 
2.7.4