Jiri Slaby reported regression of bind() with a simple repro. [0]
The repro creates a TIME_WAIT socket and tries to bind() a new socket
with the same local address and port. Before commit
28044fc1d495 ("net:
Add a bhash2 table hashed by port and address"), the bind() failed with
-EADDRINUSE, but now it succeeds.
The cited commit should have put TIME_WAIT sockets into bhash2; otherwise,
inet_bhash2_conflict() misses TIME_WAIT sockets when validating bind()
requests if the address is not a wildcard one.
The straight option is to move sk_bind2_node from struct sock to struct
sock_common to add twsk to bhash2 as implemented as RFC. [1] However, the
binary layout change in the struct sock could affect performances moving
hot fields on different cachelines.
To avoid that, we add another TIME_WAIT list in inet_bind2_bucket and check
it while validating bind().
[0]: https://lore.kernel.org/netdev/
6b971a4e-c7d8-411e-1f92-
fda29b5b2fb9@kernel.org/
[1]: https://lore.kernel.org/netdev/
20221221151258.25748-2-kuniyu@amazon.com/
Fixes:
28044fc1d495 ("net: Add a bhash2 table hashed by port and address")
Reported-by: Jiri Slaby <jirislaby@kernel.org>
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
struct hlist_node node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
+ /* bhash has twsk in owners, but bhash2 has twsk in
+ * deathrow not to add a member in struct sock_common.
+ */
+ struct hlist_head deathrow;
};
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
u32 tw_priority;
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
+ struct inet_bind2_bucket *tw_tb2;
+ struct hlist_node tw_bind2_node;
};
#define tw_tclass tw_tos
+#define twsk_for_each_bound_bhash2(__tw, list) \
+ hlist_for_each_entry(__tw, list, tw_bind2_node)
+
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
return (struct inet_timewait_sock *)sk;
return false;
}
+static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
+ kuid_t sk_uid, bool relax,
+ bool reuseport_cb_ok, bool reuseport_ok)
+{
+ if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
+ return false;
+
+ return inet_bind_conflict(sk, sk2, sk_uid, relax,
+ reuseport_cb_ok, reuseport_ok);
+}
+
static bool inet_bhash2_conflict(const struct sock *sk,
const struct inet_bind2_bucket *tb2,
kuid_t sk_uid,
bool relax, bool reuseport_cb_ok,
bool reuseport_ok)
{
+ struct inet_timewait_sock *tw2;
struct sock *sk2;
sk_for_each_bound_bhash2(sk2, &tb2->owners) {
- if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
- continue;
+ if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
+ reuseport_cb_ok, reuseport_ok))
+ return true;
+ }
- if (inet_bind_conflict(sk, sk2, sk_uid, relax,
- reuseport_cb_ok, reuseport_ok))
+ twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) {
+ sk2 = (struct sock *)tw2;
+
+ if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
+ reuseport_cb_ok, reuseport_ok))
return true;
}
+
return false;
}
#endif
tb->rcv_saddr = sk->sk_rcv_saddr;
INIT_HLIST_HEAD(&tb->owners);
+ INIT_HLIST_HEAD(&tb->deathrow);
hlist_add_head(&tb->node, &head->chain);
}
/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
- if (hlist_empty(&tb->owners)) {
+ if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
__hlist_del(&tb->node);
kmem_cache_free(cachep, tb);
}
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
- spin_unlock(&head2->lock);
-
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
}
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
+
+ spin_unlock(&head2->lock);
spin_unlock(&head->lock);
+
if (tw)
inet_twsk_deschedule_put(tw);
local_bh_enable();
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
+ struct inet_bind2_bucket *tb2 = tw->tw_tb2;
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+
+ __hlist_del(&tw->tw_bind2_node);
+ tw->tw_tb2 = NULL;
+ inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+
__sock_put((struct sock *)tw);
}
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
- struct inet_bind_hashbucket *bhead;
+ struct inet_bind_hashbucket *bhead, *bhead2;
spin_lock(lock);
sk_nulls_del_node_init_rcu((struct sock *)tw);
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
+ twsk_net(tw), tw->tw_num);
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
inet_twsk_bind_unhash(tw, hashinfo);
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
refcount_dec(&tw->tw_dr->tw_refcount);
hlist_add_head(&tw->tw_bind_node, list);
}
+static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind2_node, list);
+}
+
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- struct inet_bind_hashbucket *bhead;
+ struct inet_bind_hashbucket *bhead, *bhead2;
+
/* Step 1: Put TW into bind hash. Original socket stays there too.
Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed.
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
+
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+
+ tw->tw_tb2 = icsk->icsk_bind2_hash;
+ WARN_ON(!icsk->icsk_bind2_hash);
+ inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);
+
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
spin_lock(lock);