inet: Add a 2nd listener hashtable (port+addr)
author Martin KaFai Lau <kafai@fb.com>
Fri, 1 Dec 2017 20:52:31 +0000 (12:52 -0800)
committer David S. Miller <davem@davemloft.net>
Sun, 3 Dec 2017 15:18:28 +0000 (10:18 -0500)
The current listener hashtable is hashed by port only.
When a process is listening at many IP addresses with the same port (e.g.
[IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener()
performance degrades to that of a linked-list walk.  This makes it
prone to SYN attacks.

UDP had a similar issue and a second hashtable was added to resolve it.

This patch adds a second hashtable for the listener's sockets.
The second hashtable is hashed by port and address.

It cannot reuse the existing skc_portaddr_node, which is shared
with skc_bind_node, because a TCP listener needs to use skc_bind_node.
Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to
the inet_connection_sock, which the listener (like TCP) also belongs to.

The new portaddr hashtable may need two lookups (first by IP:PORT,
then by INADDR_ANY:PORT if IP:PORT is not found).  Hence, it
implements a cut-off similar to UDP's, such that it will only consult
the new portaddr hashtable if the current port-only hashtable has >10
sk in the linked list.

lhash2 and lhash2_mask are added to 'struct inet_hashinfo'.  I take
this chance to plug a 4-byte hole.  It is done by first moving
the existing bind_bucket_cachep up and then adding the new
(int lhash2_mask, *lhash2) after the existing bhash_size.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/inet_connection_sock.h
include/net/inet_hashtables.h
net/ipv4/inet_hashtables.c
net/ipv6/inet6_hashtables.c

index 0358745..8e1bf9a 100644 (file)
@@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_af_ops                   Operations which are AF_INET{4,6} specific
  * @icsk_ulp_ops          Pluggable ULP control hook
  * @icsk_ulp_data         ULP private data
+ * @icsk_listen_portaddr_node  hash to the portaddr listener hashtable
  * @icsk_ca_state:        Congestion control state
  * @icsk_retransmits:     Number of unrecovered [RTO] timeouts
  * @icsk_pending:         Scheduled timer event
@@ -101,6 +102,7 @@ struct inet_connection_sock {
        const struct inet_connection_sock_af_ops *icsk_af_ops;
        const struct tcp_ulp_ops  *icsk_ulp_ops;
        void                      *icsk_ulp_data;
+       struct hlist_node         icsk_listen_portaddr_node;
        unsigned int              (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
        __u8                      icsk_ca_state:6,
                                  icsk_ca_setsockopt:1,
index 4cce516..9141e95 100644 (file)
@@ -133,12 +133,13 @@ struct inet_hashinfo {
        /* Ok, let's try this, I give up, we do need a local binding
         * TCP hash as well as the others for fast bind/connect.
         */
+       struct kmem_cache               *bind_bucket_cachep;
        struct inet_bind_hashbucket     *bhash;
-
        unsigned int                    bhash_size;
-       /* 4 bytes hole on 64 bit */
 
-       struct kmem_cache               *bind_bucket_cachep;
+       /* The 2nd listener table hashed by local port and address */
+       unsigned int                    lhash2_mask;
+       struct inet_listen_hashbucket   *lhash2;
 
        /* All the above members are written once at bootup and
         * never written again _or_ are predominantly read-access.
@@ -146,14 +147,25 @@ struct inet_hashinfo {
         * Now align to a new cache line as all the following members
         * might be often dirty.
         */
-       /* All sockets in TCP_LISTEN state will be in here.  This is the only
-        * table where wildcard'd TCP sockets can exist.  Hash function here
-        * is just local port number.
+       /* All sockets in TCP_LISTEN state will be in listening_hash.
+        * This is the only table where wildcard'd TCP sockets can
+        * exist.  listening_hash is only hashed by local port number.
+        * If lhash2 is initialized, the same socket will also be hashed
+        * to lhash2 by port and address.
         */
        struct inet_listen_hashbucket   listening_hash[INET_LHTABLE_SIZE]
                                        ____cacheline_aligned_in_smp;
 };
 
+#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
+       hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
+
+static inline struct inet_listen_hashbucket *
+inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
+{
+       return &h->lhash2[hash & h->lhash2_mask];
+}
+
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
        struct inet_hashinfo *hashinfo,
        unsigned int hash)
@@ -209,6 +221,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child);
 void inet_put_port(struct sock *sk);
 
 void inet_hashinfo_init(struct inet_hashinfo *h);
+void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+                        unsigned long numentries, int scale,
+                        unsigned long low_limit,
+                        unsigned long high_limit);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
index 80cfd3f..f6f5810 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/vmalloc.h>
+#include <linux/bootmem.h>
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+       u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+       if (sk->sk_family == AF_INET6)
+               hash = ipv6_portaddr_hash(sock_net(sk),
+                                         &sk->sk_v6_rcv_saddr,
+                                         inet_sk(sk)->inet_num);
+       else
+#endif
+               hash = ipv4_portaddr_hash(sock_net(sk),
+                                         inet_sk(sk)->inet_rcv_saddr,
+                                         inet_sk(sk)->inet_num);
+       return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+       struct inet_listen_hashbucket *ilb2;
+
+       if (!h->lhash2)
+               return;
+
+       ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+       spin_lock(&ilb2->lock);
+       if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+               hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+                                  &ilb2->head);
+       else
+               hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+                                  &ilb2->head);
+       ilb2->count++;
+       spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+       struct inet_listen_hashbucket *ilb2;
+
+       if (!h->lhash2 ||
+           WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+               return;
+
+       ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+       spin_lock(&ilb2->lock);
+       hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+       ilb2->count--;
+       spin_unlock(&ilb2->lock);
+}
+
 static inline int compute_score(struct sock *sk, struct net *net,
                                const unsigned short hnum, const __be32 daddr,
                                const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
  */
 
 /* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+                               struct inet_listen_hashbucket *ilb2,
+                               struct sk_buff *skb, int doff,
+                               const __be32 saddr, __be16 sport,
+                               const __be32 daddr, const unsigned short hnum,
+                               const int dif, const int sdif)
+{
+       bool exact_dif = inet_exact_dif_match(net, skb);
+       struct inet_connection_sock *icsk;
+       struct sock *sk, *result = NULL;
+       int score, hiscore = 0;
+       u32 phash = 0;
+
+       inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+               sk = (struct sock *)icsk;
+               score = compute_score(sk, net, hnum, daddr,
+                                     dif, sdif, exact_dif);
+               if (score > hiscore) {
+                       if (sk->sk_reuseport) {
+                               phash = inet_ehashfn(net, daddr, hnum,
+                                                    saddr, sport);
+                               result = reuseport_select_sock(sk, phash,
+                                                              skb, doff);
+                               if (result)
+                                       return result;
+                       }
+                       result = sk;
+                       hiscore = score;
+               }
+       }
+
+       return result;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
@@ -217,10 +306,42 @@ struct sock *__inet_lookup_listener(struct net *net,
        unsigned int hash = inet_lhashfn(net, hnum);
        struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
        bool exact_dif = inet_exact_dif_match(net, skb);
+       struct inet_listen_hashbucket *ilb2;
        struct sock *sk, *result = NULL;
        int score, hiscore = 0;
+       unsigned int hash2;
        u32 phash = 0;
 
+       if (ilb->count <= 10 || !hashinfo->lhash2)
+               goto port_lookup;
+
+       /* Too many sk in the ilb bucket (which is hashed by port alone).
+        * Try lhash2 (which is hashed by port and addr) instead.
+        */
+
+       hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+       ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+       if (ilb2->count > ilb->count)
+               goto port_lookup;
+
+       result = inet_lhash2_lookup(net, ilb2, skb, doff,
+                                   saddr, sport, daddr, hnum,
+                                   dif, sdif);
+       if (result)
+               return result;
+
+       /* Lookup lhash2 with INADDR_ANY */
+
+       hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+       ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+       if (ilb2->count > ilb->count)
+               goto port_lookup;
+
+       return inet_lhash2_lookup(net, ilb2, skb, doff,
+                                 saddr, sport, daddr, hnum,
+                                 dif, sdif);
+
+port_lookup:
        sk_for_each_rcu(sk, &ilb->head) {
                score = compute_score(sk, net, hnum, daddr,
                                      dif, sdif, exact_dif);
@@ -476,6 +597,7 @@ int __inet_hash(struct sock *sk, struct sock *osk)
                hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
        else
                hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+       inet_hash2(hashinfo, sk);
        ilb->count++;
        sock_set_flag(sk, SOCK_RCU_FREE);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -506,7 +628,6 @@ void inet_unhash(struct sock *sk)
        struct inet_listen_hashbucket *ilb;
        spinlock_t *lock;
        bool listener = false;
-       int done;
 
        if (sk_unhashed(sk))
                return;
@@ -519,17 +640,20 @@ void inet_unhash(struct sock *sk)
                lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
        }
        spin_lock_bh(lock);
+       if (sk_unhashed(sk))
+               goto unlock;
+
        if (rcu_access_pointer(sk->sk_reuseport_cb))
                reuseport_detach_sock(sk);
-       if (listener)
-               done = __sk_del_node_init(sk);
-       else
-               done = __sk_nulls_del_node_init_rcu(sk);
-       if (done) {
-               if (listener)
-                       ilb->count--;
-               sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+       if (listener) {
+               inet_unhash2(hashinfo, sk);
+                __sk_del_node_init(sk);
+                ilb->count--;
+       } else {
+               __sk_nulls_del_node_init_rcu(sk);
        }
+       sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
        spin_unlock_bh(lock);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
@@ -666,9 +790,35 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
                INIT_HLIST_HEAD(&h->listening_hash[i].head);
                h->listening_hash[i].count = 0;
        }
+
+       h->lhash2 = NULL;
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
 
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+                               unsigned long numentries, int scale,
+                               unsigned long low_limit,
+                               unsigned long high_limit)
+{
+       unsigned int i;
+
+       h->lhash2 = alloc_large_system_hash(name,
+                                           sizeof(*h->lhash2),
+                                           numentries,
+                                           scale,
+                                           0,
+                                           NULL,
+                                           &h->lhash2_mask,
+                                           low_limit,
+                                           high_limit);
+
+       for (i = 0; i <= h->lhash2_mask; i++) {
+               spin_lock_init(&h->lhash2[i].lock);
+               INIT_HLIST_HEAD(&h->lhash2[i].head);
+               h->lhash2[i].count = 0;
+       }
+}
+
 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
 {
        unsigned int locksz = sizeof(spinlock_t);
index 0d14513..2febe26 100644 (file)
@@ -125,6 +125,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
 }
 
 /* called with rcu_read_lock() */
+static struct sock *inet6_lhash2_lookup(struct net *net,
+               struct inet_listen_hashbucket *ilb2,
+               struct sk_buff *skb, int doff,
+               const struct in6_addr *saddr,
+               const __be16 sport, const struct in6_addr *daddr,
+               const unsigned short hnum, const int dif, const int sdif)
+{
+       bool exact_dif = inet6_exact_dif_match(net, skb);
+       struct inet_connection_sock *icsk;
+       struct sock *sk, *result = NULL;
+       int score, hiscore = 0;
+       u32 phash = 0;
+
+       inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+               sk = (struct sock *)icsk;
+               score = compute_score(sk, net, hnum, daddr, dif, sdif,
+                                     exact_dif);
+               if (score > hiscore) {
+                       if (sk->sk_reuseport) {
+                               phash = inet6_ehashfn(net, daddr, hnum,
+                                                     saddr, sport);
+                               result = reuseport_select_sock(sk, phash,
+                                                              skb, doff);
+                               if (result)
+                                       return result;
+                       }
+                       result = sk;
+                       hiscore = score;
+               }
+       }
+
+       return result;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
                struct inet_hashinfo *hashinfo,
                struct sk_buff *skb, int doff,
@@ -135,10 +169,42 @@ struct sock *inet6_lookup_listener(struct net *net,
        unsigned int hash = inet_lhashfn(net, hnum);
        struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
        bool exact_dif = inet6_exact_dif_match(net, skb);
+       struct inet_listen_hashbucket *ilb2;
        struct sock *sk, *result = NULL;
        int score, hiscore = 0;
+       unsigned int hash2;
        u32 phash = 0;
 
+       if (ilb->count <= 10 || !hashinfo->lhash2)
+               goto port_lookup;
+
+       /* Too many sk in the ilb bucket (which is hashed by port alone).
+        * Try lhash2 (which is hashed by port and addr) instead.
+        */
+
+       hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+       ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+       if (ilb2->count > ilb->count)
+               goto port_lookup;
+
+       result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+                                    saddr, sport, daddr, hnum,
+                                    dif, sdif);
+       if (result)
+               return result;
+
+       /* Lookup lhash2 with in6addr_any */
+
+       hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+       ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+       if (ilb2->count > ilb->count)
+               goto port_lookup;
+
+       return inet6_lhash2_lookup(net, ilb2, skb, doff,
+                                  saddr, sport, daddr, hnum,
+                                  dif, sdif);
+
+port_lookup:
        sk_for_each(sk, &ilb->head) {
                score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
                if (score > hiscore) {