net/netfilter/nf_conntrack_core.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /* Connection state tracking for netfilter.  This is separated from,
   3    but required by, the NAT layer; it can also be used by an iptables
   4    extension. */
   5
   6 /* (C) 1999-2001 Paul `Rusty' Russell
   7  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
   8  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
   9  * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
  10  */
  11
  12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14 #include <linux/types.h>
  15 #include <linux/netfilter.h>
  16 #include <linux/module.h>
  17 #include <linux/sched.h>
  18 #include <linux/skbuff.h>
  19 #include <linux/proc_fs.h>
  20 #include <linux/vmalloc.h>
  21 #include <linux/stddef.h>
  22 #include <linux/slab.h>
  23 #include <linux/random.h>
  24 #include <linux/siphash.h>
  25 #include <linux/err.h>
  26 #include <linux/percpu.h>
  27 #include <linux/moduleparam.h>
  28 #include <linux/notifier.h>
  29 #include <linux/kernel.h>
  30 #include <linux/netdevice.h>
  31 #include <linux/socket.h>
  32 #include <linux/mm.h>
  33 #include <linux/nsproxy.h>
  34 #include <linux/rculist_nulls.h>
  35
  36 #include <net/netfilter/nf_conntrack.h>
  37 #include <net/netfilter/nf_conntrack_bpf.h>
  38 #include <net/netfilter/nf_conntrack_l4proto.h>
  39 #include <net/netfilter/nf_conntrack_expect.h>
  40 #include <net/netfilter/nf_conntrack_helper.h>
  41 #include <net/netfilter/nf_conntrack_core.h>
  42 #include <net/netfilter/nf_conntrack_extend.h>
  43 #include <net/netfilter/nf_conntrack_acct.h>
  44 #include <net/netfilter/nf_conntrack_ecache.h>
  45 #include <net/netfilter/nf_conntrack_zones.h>
  46 #include <net/netfilter/nf_conntrack_timestamp.h>
  47 #include <net/netfilter/nf_conntrack_timeout.h>
  48 #include <net/netfilter/nf_conntrack_labels.h>
  49 #include <net/netfilter/nf_conntrack_synproxy.h>
  50 #include <net/netfilter/nf_nat.h>
  51 #include <net/netfilter/nf_nat_helper.h>
  52 #include <net/netns/hash.h>
  53 #include <net/ip.h>
  54
  55 #include "nf_internals.h"
  56
  57 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
  58 EXPORT_SYMBOL_GPL(nf_conntrack_locks);
  59
  60 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
  61 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
  62
  63 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
  64 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
  65
  66 struct conntrack_gc_work {
  67         struct delayed_work     dwork;
  68         u32                     next_bucket;
  69         u32                     avg_timeout;
  70         u32                     count;
  71         u32                     start_time;
  72         bool                    exiting;
  73         bool                    early_drop;
  74 };
  75
  76 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
  77 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
  78 static __read_mostly bool nf_conntrack_locks_all;
  79
  80 /* serialize hash resizes and nf_ct_iterate_cleanup */
  81 static DEFINE_MUTEX(nf_conntrack_mutex);
  82
  83 #define GC_SCAN_INTERVAL_MAX    (60ul * HZ)
  84 #define GC_SCAN_INTERVAL_MIN    (1ul * HZ)
  85
  86 /* clamp timeouts to this value (TCP unacked) */
  87 #define GC_SCAN_INTERVAL_CLAMP  (300ul * HZ)
  88
  89 /* Initial bias pretending we have 100 entries at the upper bound so we don't
  90  * wakeup often just because we have three entries with a 1s timeout while still
  91  * allowing non-idle machines to wakeup more often when needed.
  92  */
  93 #define GC_SCAN_INITIAL_COUNT   100
  94 #define GC_SCAN_INTERVAL_INIT   GC_SCAN_INTERVAL_MAX
  95
  96 #define GC_SCAN_MAX_DURATION    msecs_to_jiffies(10)
  97 #define GC_SCAN_EXPIRED_MAX     (64000u / HZ)
  98
  99 #define MIN_CHAINLEN    8u
 100 #define MAX_CHAINLEN    (32u - MIN_CHAINLEN)
 101
 102 static struct conntrack_gc_work conntrack_gc_work;
 103
 104 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 105 {
 106         /* 1) Acquire the lock */
 107         spin_lock(lock);
 108
 109         /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
 110          * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
 111          */
 112         if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
 113                 return;
 114
 115         /* fast path failed, unlock */
 116         spin_unlock(lock);
 117
 118         /* Slow path 1) get global lock */
 119         spin_lock(&nf_conntrack_locks_all_lock);
 120
 121         /* Slow path 2) get the lock we want */
 122         spin_lock(lock);
 123
 124         /* Slow path 3) release the global lock */
 125         spin_unlock(&nf_conntrack_locks_all_lock);
 126 }
 127 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 128
 129 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 130 {
 131         h1 %= CONNTRACK_LOCKS;
 132         h2 %= CONNTRACK_LOCKS;
 133         spin_unlock(&nf_conntrack_locks[h1]);
 134         if (h1 != h2)
 135                 spin_unlock(&nf_conntrack_locks[h2]);
 136 }
 137
 138 /* return true if we need to recompute hashes (in case hash table was resized) */
 139 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
 140                                      unsigned int h2, unsigned int sequence)
 141 {
 142         h1 %= CONNTRACK_LOCKS;
 143         h2 %= CONNTRACK_LOCKS;
 144         if (h1 <= h2) {
 145                 nf_conntrack_lock(&nf_conntrack_locks[h1]);
 146                 if (h1 != h2)
 147                         spin_lock_nested(&nf_conntrack_locks[h2],
 148                                          SINGLE_DEPTH_NESTING);
 149         } else {
 150                 nf_conntrack_lock(&nf_conntrack_locks[h2]);
 151                 spin_lock_nested(&nf_conntrack_locks[h1],
 152                                  SINGLE_DEPTH_NESTING);
 153         }
 154         if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
 155                 nf_conntrack_double_unlock(h1, h2);
 156                 return true;
 157         }
 158         return false;
 159 }
 160
 161 static void nf_conntrack_all_lock(void)
 162         __acquires(&nf_conntrack_locks_all_lock)
 163 {
 164         int i;
 165
 166         spin_lock(&nf_conntrack_locks_all_lock);
 167
 168         /* For nf_contrack_locks_all, only the latest time when another
 169          * CPU will see an update is controlled, by the "release" of the
 170          * spin_lock below.
 171          * The earliest time is not controlled, an thus KCSAN could detect
 172          * a race when nf_conntract_lock() reads the variable.
 173          * WRITE_ONCE() is used to ensure the compiler will not
 174          * optimize the write.
 175          */
 176         WRITE_ONCE(nf_conntrack_locks_all, true);
 177
 178         for (i = 0; i < CONNTRACK_LOCKS; i++) {
 179                 spin_lock(&nf_conntrack_locks[i]);
 180
 181                 /* This spin_unlock provides the "release" to ensure that
 182                  * nf_conntrack_locks_all==true is visible to everyone that
 183                  * acquired spin_lock(&nf_conntrack_locks[]).
 184                  */
 185                 spin_unlock(&nf_conntrack_locks[i]);
 186         }
 187 }
 188
 189 static void nf_conntrack_all_unlock(void)
 190         __releases(&nf_conntrack_locks_all_lock)
 191 {
 192         /* All prior stores must be complete before we clear
 193          * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
 194          * might observe the false value but not the entire
 195          * critical section.
 196          * It pairs with the smp_load_acquire() in nf_conntrack_lock()
 197          */
 198         smp_store_release(&nf_conntrack_locks_all, false);
 199         spin_unlock(&nf_conntrack_locks_all_lock);
 200 }
 201
 202 unsigned int nf_conntrack_htable_size __read_mostly;
 203 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 204
 205 unsigned int nf_conntrack_max __read_mostly;
 206 EXPORT_SYMBOL_GPL(nf_conntrack_max);
 207 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 208 static siphash_aligned_key_t nf_conntrack_hash_rnd;
 209
 210 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
 211                               unsigned int zoneid,
 212                               const struct net *net)
 213 {
 214         struct {
 215                 struct nf_conntrack_man src;
 216                 union nf_inet_addr dst_addr;
 217                 unsigned int zone;
 218                 u32 net_mix;
 219                 u16 dport;
 220                 u16 proto;
 221         } __aligned(SIPHASH_ALIGNMENT) combined;
 222
 223         get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
 224
 225         memset(&combined, 0, sizeof(combined));
 226
 227         /* The direction must be ignored, so handle usable members manually. */
 228         combined.src = tuple->src;
 229         combined.dst_addr = tuple->dst.u3;
 230         combined.zone = zoneid;
 231         combined.net_mix = net_hash_mix(net);
 232         combined.dport = (__force __u16)tuple->dst.u.all;
 233         combined.proto = tuple->dst.protonum;
 234
 235         return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
 236 }
 237
 238 static u32 scale_hash(u32 hash)
 239 {
 240         return reciprocal_scale(hash, nf_conntrack_htable_size);
 241 }
 242
 243 static u32 __hash_conntrack(const struct net *net,
 244                             const struct nf_conntrack_tuple *tuple,
 245                             unsigned int zoneid,
 246                             unsigned int size)
 247 {
 248         return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
 249 }
 250
 251 static u32 hash_conntrack(const struct net *net,
 252                           const struct nf_conntrack_tuple *tuple,
 253                           unsigned int zoneid)
 254 {
 255         return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
 256 }
 257
 258 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
 259                                   unsigned int dataoff,
 260                                   struct nf_conntrack_tuple *tuple)
 261 {       struct {
 262                 __be16 sport;
 263                 __be16 dport;
 264         } _inet_hdr, *inet_hdr;
 265
 266         /* Actually only need first 4 bytes to get ports. */
 267         inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
 268         if (!inet_hdr)
 269                 return false;
 270
 271         tuple->src.u.udp.port = inet_hdr->sport;
 272         tuple->dst.u.udp.port = inet_hdr->dport;
 273         return true;
 274 }
 275
 276 static bool
 277 nf_ct_get_tuple(const struct sk_buff *skb,
 278                 unsigned int nhoff,
 279                 unsigned int dataoff,
 280                 u_int16_t l3num,
 281                 u_int8_t protonum,
 282                 struct net *net,
 283                 struct nf_conntrack_tuple *tuple)
 284 {
 285         unsigned int size;
 286         const __be32 *ap;
 287         __be32 _addrs[8];
 288
 289         memset(tuple, 0, sizeof(*tuple));
 290
 291         tuple->src.l3num = l3num;
 292         switch (l3num) {
 293         case NFPROTO_IPV4:
 294                 nhoff += offsetof(struct iphdr, saddr);
 295                 size = 2 * sizeof(__be32);
 296                 break;
 297         case NFPROTO_IPV6:
 298                 nhoff += offsetof(struct ipv6hdr, saddr);
 299                 size = sizeof(_addrs);
 300                 break;
 301         default:
 302                 return true;
 303         }
 304
 305         ap = skb_header_pointer(skb, nhoff, size, _addrs);
 306         if (!ap)
 307                 return false;
 308
 309         switch (l3num) {
 310         case NFPROTO_IPV4:
 311                 tuple->src.u3.ip = ap[0];
 312                 tuple->dst.u3.ip = ap[1];
 313                 break;
 314         case NFPROTO_IPV6:
 315                 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
 316                 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
 317                 break;
 318         }
 319
 320         tuple->dst.protonum = protonum;
 321         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 322
 323         switch (protonum) {
 324 #if IS_ENABLED(CONFIG_IPV6)
 325         case IPPROTO_ICMPV6:
 326                 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
 327 #endif
 328         case IPPROTO_ICMP:
 329                 return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
 330 #ifdef CONFIG_NF_CT_PROTO_GRE
 331         case IPPROTO_GRE:
 332                 return gre_pkt_to_tuple(skb, dataoff, net, tuple);
 333 #endif
 334         case IPPROTO_TCP:
 335         case IPPROTO_UDP:
 336 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
 337         case IPPROTO_UDPLITE:
 338 #endif
 339 #ifdef CONFIG_NF_CT_PROTO_SCTP
 340         case IPPROTO_SCTP:
 341 #endif
 342 #ifdef CONFIG_NF_CT_PROTO_DCCP
 343         case IPPROTO_DCCP:
 344 #endif
 345                 /* fallthrough */
 346                 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
 347         default:
 348                 break;
 349         }
 350
 351         return true;
 352 }
 353
 354 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 355                             u_int8_t *protonum)
 356 {
 357         int dataoff = -1;
 358         const struct iphdr *iph;
 359         struct iphdr _iph;
 360
 361         iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
 362         if (!iph)
 363                 return -1;
 364
 365         /* Conntrack defragments packets, we might still see fragments
 366          * inside ICMP packets though.
 367          */
 368         if (iph->frag_off & htons(IP_OFFSET))
 369                 return -1;
 370
 371         dataoff = nhoff + (iph->ihl << 2);
 372         *protonum = iph->protocol;
 373
 374         /* Check bogus IP headers */
 375         if (dataoff > skb->len) {
 376                 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
 377                          nhoff, iph->ihl << 2, skb->len);
 378                 return -1;
 379         }
 380         return dataoff;
 381 }
 382
 383 #if IS_ENABLED(CONFIG_IPV6)
 384 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 385                             u8 *protonum)
 386 {
 387         int protoff = -1;
 388         unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
 389         __be16 frag_off;
 390         u8 nexthdr;
 391
 392         if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
 393                           &nexthdr, sizeof(nexthdr)) != 0) {
 394                 pr_debug("can't get nexthdr\n");
 395                 return -1;
 396         }
 397         protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
 398         /*
 399          * (protoff == skb->len) means the packet has not data, just
 400          * IPv6 and possibly extensions headers, but it is tracked anyway
 401          */
 402         if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
 403                 pr_debug("can't find proto in pkt\n");
 404                 return -1;
 405         }
 406
 407         *protonum = nexthdr;
 408         return protoff;
 409 }
 410 #endif
 411
 412 static int get_l4proto(const struct sk_buff *skb,
 413                        unsigned int nhoff, u8 pf, u8 *l4num)
 414 {
 415         switch (pf) {
 416         case NFPROTO_IPV4:
 417                 return ipv4_get_l4proto(skb, nhoff, l4num);
 418 #if IS_ENABLED(CONFIG_IPV6)
 419         case NFPROTO_IPV6:
 420                 return ipv6_get_l4proto(skb, nhoff, l4num);
 421 #endif
 422         default:
 423                 *l4num = 0;
 424                 break;
 425         }
 426         return -1;
 427 }
 428
 429 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 430                        u_int16_t l3num,
 431                        struct net *net, struct nf_conntrack_tuple *tuple)
 432 {
 433         u8 protonum;
 434         int protoff;
 435
 436         protoff = get_l4proto(skb, nhoff, l3num, &protonum);
 437         if (protoff <= 0)
 438                 return false;
 439
 440         return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
 441 }
 442 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
 443
 444 bool
 445 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 446                    const struct nf_conntrack_tuple *orig)
 447 {
 448         memset(inverse, 0, sizeof(*inverse));
 449
 450         inverse->src.l3num = orig->src.l3num;
 451
 452         switch (orig->src.l3num) {
 453         case NFPROTO_IPV4:
 454                 inverse->src.u3.ip = orig->dst.u3.ip;
 455                 inverse->dst.u3.ip = orig->src.u3.ip;
 456                 break;
 457         case NFPROTO_IPV6:
 458                 inverse->src.u3.in6 = orig->dst.u3.in6;
 459                 inverse->dst.u3.in6 = orig->src.u3.in6;
 460                 break;
 461         default:
 462                 break;
 463         }
 464
 465         inverse->dst.dir = !orig->dst.dir;
 466
 467         inverse->dst.protonum = orig->dst.protonum;
 468
 469         switch (orig->dst.protonum) {
 470         case IPPROTO_ICMP:
 471                 return nf_conntrack_invert_icmp_tuple(inverse, orig);
 472 #if IS_ENABLED(CONFIG_IPV6)
 473         case IPPROTO_ICMPV6:
 474                 return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
 475 #endif
 476         }
 477
 478         inverse->src.u.all = orig->dst.u.all;
 479         inverse->dst.u.all = orig->src.u.all;
 480         return true;
 481 }
 482 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 483
 484 /* Generate a almost-unique pseudo-id for a given conntrack.
 485  *
 486  * intentionally doesn't re-use any of the seeds used for hash
 487  * table location, we assume id gets exposed to userspace.
 488  *
 489  * Following nf_conn items do not change throughout lifetime
 490  * of the nf_conn:
 491  *
 492  * 1. nf_conn address
 493  * 2. nf_conn->master address (normally NULL)
 494  * 3. the associated net namespace
 495  * 4. the original direction tuple
 496  */
 497 u32 nf_ct_get_id(const struct nf_conn *ct)
 498 {
 499         static siphash_aligned_key_t ct_id_seed;
 500         unsigned long a, b, c, d;
 501
 502         net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
 503
 504         a = (unsigned long)ct;
 505         b = (unsigned long)ct->master;
 506         c = (unsigned long)nf_ct_net(ct);
 507         d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 508                                    sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
 509                                    &ct_id_seed);
 510 #ifdef CONFIG_64BIT
 511         return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
 512 #else
 513         return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
 514 #endif
 515 }
 516 EXPORT_SYMBOL_GPL(nf_ct_get_id);
 517
 518 static void
 519 clean_from_lists(struct nf_conn *ct)
 520 {
 521         pr_debug("clean_from_lists(%p)\n", ct);
 522         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 523         hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
 524
 525         /* Destroy all pending expectations */
 526         nf_ct_remove_expectations(ct);
 527 }
 528
 529 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
 530
 531 /* Released via nf_ct_destroy() */
 532 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
 533                                  const struct nf_conntrack_zone *zone,
 534                                  gfp_t flags)
 535 {
 536         struct nf_conn *tmpl, *p;
 537
 538         if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
 539                 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
 540                 if (!tmpl)
 541                         return NULL;
 542
 543                 p = tmpl;
 544                 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 545                 if (tmpl != p) {
 546                         tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
 547                         tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
 548                 }
 549         } else {
 550                 tmpl = kzalloc(sizeof(*tmpl), flags);
 551                 if (!tmpl)
 552                         return NULL;
 553         }
 554
 555         tmpl->status = IPS_TEMPLATE;
 556         write_pnet(&tmpl->ct_net, net);
 557         nf_ct_zone_add(tmpl, zone);
 558         refcount_set(&tmpl->ct_general.use, 1);
 559
 560         return tmpl;
 561 }
 562 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 563
 564 void nf_ct_tmpl_free(struct nf_conn *tmpl)
 565 {
 566         kfree(tmpl->ext);
 567
 568         if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
 569                 kfree((char *)tmpl - tmpl->proto.tmpl_padto);
 570         else
 571                 kfree(tmpl);
 572 }
 573 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 574
 575 static void destroy_gre_conntrack(struct nf_conn *ct)
 576 {
 577 #ifdef CONFIG_NF_CT_PROTO_GRE
 578         struct nf_conn *master = ct->master;
 579
 580         if (master)
 581                 nf_ct_gre_keymap_destroy(master);
 582 #endif
 583 }
 584
 585 void nf_ct_destroy(struct nf_conntrack *nfct)
 586 {
 587         struct nf_conn *ct = (struct nf_conn *)nfct;
 588
 589         pr_debug("%s(%p)\n", __func__, ct);
 590         WARN_ON(refcount_read(&nfct->use) != 0);
 591
 592         if (unlikely(nf_ct_is_template(ct))) {
 593                 nf_ct_tmpl_free(ct);
 594                 return;
 595         }
 596
 597         if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
 598                 destroy_gre_conntrack(ct);
 599
 600         /* Expectations will have been removed in clean_from_lists,
 601          * except TFTP can create an expectation on the first packet,
 602          * before connection is in the list, so we need to clean here,
 603          * too.
 604          */
 605         nf_ct_remove_expectations(ct);
 606
 607         if (ct->master)
 608                 nf_ct_put(ct->master);
 609
 610         pr_debug("%s: returning ct=%p to slab\n", __func__, ct);
 611         nf_conntrack_free(ct);
 612 }
 613 EXPORT_SYMBOL(nf_ct_destroy);
 614
 615 static void __nf_ct_delete_from_lists(struct nf_conn *ct)
 616 {
 617         struct net *net = nf_ct_net(ct);
 618         unsigned int hash, reply_hash;
 619         unsigned int sequence;
 620
 621         do {
 622                 sequence = read_seqcount_begin(&nf_conntrack_generation);
 623                 hash = hash_conntrack(net,
 624                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 625                                       nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
 626                 reply_hash = hash_conntrack(net,
 627                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 628                                            nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
 629         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 630
 631         clean_from_lists(ct);
 632         nf_conntrack_double_unlock(hash, reply_hash);
 633 }
 634
 635 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 636 {
 637         nf_ct_helper_destroy(ct);
 638         local_bh_disable();
 639
 640         __nf_ct_delete_from_lists(ct);
 641
 642         local_bh_enable();
 643 }
 644
 645 static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
 646 {
 647 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 648         struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
 649
 650         spin_lock(&cnet->ecache.dying_lock);
 651         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 652                                  &cnet->ecache.dying_list);
 653         spin_unlock(&cnet->ecache.dying_lock);
 654 #endif
 655 }
 656
 657 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 658 {
 659         struct nf_conn_tstamp *tstamp;
 660         struct net *net;
 661
 662         if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
 663                 return false;
 664
 665         tstamp = nf_conn_tstamp_find(ct);
 666         if (tstamp) {
 667                 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
 668
 669                 tstamp->stop = ktime_get_real_ns();
 670                 if (timeout < 0)
 671                         tstamp->stop -= jiffies_to_nsecs(-timeout);
 672         }
 673
 674         if (nf_conntrack_event_report(IPCT_DESTROY, ct,
 675                                     portid, report) < 0) {
 676                 /* destroy event was not delivered. nf_ct_put will
 677                  * be done by event cache worker on redelivery.
 678                  */
 679                 nf_ct_helper_destroy(ct);
 680                 local_bh_disable();
 681                 __nf_ct_delete_from_lists(ct);
 682                 nf_ct_add_to_ecache_list(ct);
 683                 local_bh_enable();
 684
 685                 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
 686                 return false;
 687         }
 688
 689         net = nf_ct_net(ct);
 690         if (nf_conntrack_ecache_dwork_pending(net))
 691                 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
 692         nf_ct_delete_from_lists(ct);
 693         nf_ct_put(ct);
 694         return true;
 695 }
 696 EXPORT_SYMBOL_GPL(nf_ct_delete);
 697
 698 static inline bool
 699 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 700                 const struct nf_conntrack_tuple *tuple,
 701                 const struct nf_conntrack_zone *zone,
 702                 const struct net *net)
 703 {
 704         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 705
 706         /* A conntrack can be recreated with the equal tuple,
 707          * so we need to check that the conntrack is confirmed
 708          */
 709         return nf_ct_tuple_equal(tuple, &h->tuple) &&
 710                nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
 711                nf_ct_is_confirmed(ct) &&
 712                net_eq(net, nf_ct_net(ct));
 713 }
 714
 715 static inline bool
 716 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
 717 {
 718         return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 719                                  &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
 720                nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
 721                                  &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
 722                nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
 723                nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
 724                net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
 725 }
 726
 727 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
 728 static void nf_ct_gc_expired(struct nf_conn *ct)
 729 {
 730         if (!refcount_inc_not_zero(&ct->ct_general.use))
 731                 return;
 732
 733         /* load ->status after refcount increase */
 734         smp_acquire__after_ctrl_dep();
 735
 736         if (nf_ct_should_gc(ct))
 737                 nf_ct_kill(ct);
 738
 739         nf_ct_put(ct);
 740 }
 741
 742 /*
 743  * Warning :
 744  * - Caller must take a reference on returned object
 745  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 746  */
 747 static struct nf_conntrack_tuple_hash *
 748 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 749                       const struct nf_conntrack_tuple *tuple, u32 hash)
 750 {
 751         struct nf_conntrack_tuple_hash *h;
 752         struct hlist_nulls_head *ct_hash;
 753         struct hlist_nulls_node *n;
 754         unsigned int bucket, hsize;
 755
 756 begin:
 757         nf_conntrack_get_ht(&ct_hash, &hsize);
 758         bucket = reciprocal_scale(hash, hsize);
 759
 760         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
 761                 struct nf_conn *ct;
 762
 763                 ct = nf_ct_tuplehash_to_ctrack(h);
 764                 if (nf_ct_is_expired(ct)) {
 765                         nf_ct_gc_expired(ct);
 766                         continue;
 767                 }
 768
 769                 if (nf_ct_key_equal(h, tuple, zone, net))
 770                         return h;
 771         }
 772         /*
 773          * if the nulls value we got at the end of this lookup is
 774          * not the expected one, we must restart lookup.
 775          * We probably met an item that was moved to another chain.
 776          */
 777         if (get_nulls_value(n) != bucket) {
 778                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
 779                 goto begin;
 780         }
 781
 782         return NULL;
 783 }
 784
 785 /* Find a connection corresponding to a tuple. */
 786 static struct nf_conntrack_tuple_hash *
 787 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 788                         const struct nf_conntrack_tuple *tuple, u32 hash)
 789 {
 790         struct nf_conntrack_tuple_hash *h;
 791         struct nf_conn *ct;
 792
 793         rcu_read_lock();
 794
 795         h = ____nf_conntrack_find(net, zone, tuple, hash);
 796         if (h) {
 797                 /* We have a candidate that matches the tuple we're interested
 798                  * in, try to obtain a reference and re-check tuple
 799                  */
 800                 ct = nf_ct_tuplehash_to_ctrack(h);
 801                 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
 802                         /* re-check key after refcount */
 803                         smp_acquire__after_ctrl_dep();
 804
 805                         if (likely(nf_ct_key_equal(h, tuple, zone, net)))
 806                                 goto found;
 807
 808                         /* TYPESAFE_BY_RCU recycled the candidate */
 809                         nf_ct_put(ct);
 810                 }
 811
 812                 h = NULL;
 813         }
 814 found:
 815         rcu_read_unlock();
 816
 817         return h;
 818 }
 819
 820 struct nf_conntrack_tuple_hash *
 821 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
 822                       const struct nf_conntrack_tuple *tuple)
 823 {
 824         unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
 825         struct nf_conntrack_tuple_hash *thash;
 826
 827         thash = __nf_conntrack_find_get(net, zone, tuple,
 828                                         hash_conntrack_raw(tuple, zone_id, net));
 829
 830         if (thash)
 831                 return thash;
 832
 833         rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
 834         if (rid != zone_id)
 835                 return __nf_conntrack_find_get(net, zone, tuple,
 836                                                hash_conntrack_raw(tuple, rid, net));
 837         return thash;
 838 }
 839 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 840
 841 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 842                                        unsigned int hash,
 843                                        unsigned int reply_hash)
 844 {
 845         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 846                            &nf_conntrack_hash[hash]);
 847         hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
 848                            &nf_conntrack_hash[reply_hash]);
 849 }
 850
 851 static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
 852 {
 853         /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
 854          * may contain stale pointers to e.g. helper that has been removed.
 855          *
 856          * The helper can't clear this because the nf_conn object isn't in
 857          * any hash and synchronize_rcu() isn't enough because associated skb
 858          * might sit in a queue.
 859          */
 860         return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
 861 }
 862
 863 static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
 864 {
 865         if (!ext)
 866                 return true;
 867
 868         if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
 869                 return false;
 870
 871         /* inserted into conntrack table, nf_ct_iterate_cleanup()
 872          * will find it.  Disable nf_ct_ext_find() id check.
 873          */
 874         WRITE_ONCE(ext->gen_id, 0);
 875         return true;
 876 }
 877
 878 int
 879 nf_conntrack_hash_check_insert(struct nf_conn *ct)
 880 {
 881         const struct nf_conntrack_zone *zone;
 882         struct net *net = nf_ct_net(ct);
 883         unsigned int hash, reply_hash;
 884         struct nf_conntrack_tuple_hash *h;
 885         struct hlist_nulls_node *n;
 886         unsigned int max_chainlen;
 887         unsigned int chainlen = 0;
 888         unsigned int sequence;
 889         int err = -EEXIST;
 890
 891         zone = nf_ct_zone(ct);
 892
 893         if (!nf_ct_ext_valid_pre(ct->ext)) {
 894                 NF_CT_STAT_INC(net, insert_failed);
 895                 return -ETIMEDOUT;
 896         }
 897
 898         local_bh_disable();
 899         do {
 900                 sequence = read_seqcount_begin(&nf_conntrack_generation);
 901                 hash = hash_conntrack(net,
 902                                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 903                                       nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
 904                 reply_hash = hash_conntrack(net,
 905                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 906                                            nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
 907         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 908
 909         max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
 910
 911         /* See if there's one in the list already, including reverse */
 912         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
 913                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
 914                                     zone, net))
 915                         goto out;
 916
 917                 if (chainlen++ > max_chainlen)
 918                         goto chaintoolong;
 919         }
 920
 921         chainlen = 0;
 922
 923         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
 924                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 925                                     zone, net))
 926                         goto out;
 927                 if (chainlen++ > max_chainlen)
 928                         goto chaintoolong;
 929         }
 930
 931         smp_wmb();
 932         /* The caller holds a reference to this object */
 933         refcount_set(&ct->ct_general.use, 2);
 934         __nf_conntrack_hash_insert(ct, hash, reply_hash);
 935         nf_conntrack_double_unlock(hash, reply_hash);
 936         NF_CT_STAT_INC(net, insert);
 937         local_bh_enable();
 938
 939         if (!nf_ct_ext_valid_post(ct->ext)) {
 940                 nf_ct_kill(ct);
 941                 NF_CT_STAT_INC(net, drop);
 942                 return -ETIMEDOUT;
 943         }
 944
 945         return 0;
 946 chaintoolong:
 947         NF_CT_STAT_INC(net, chaintoolong);
 948         err = -ENOSPC;
 949 out:
 950         nf_conntrack_double_unlock(hash, reply_hash);
 951         local_bh_enable();
 952         return err;
 953 }
 954 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 955
 956 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
 957                     unsigned int bytes)
 958 {
 959         struct nf_conn_acct *acct;
 960
 961         acct = nf_conn_acct_find(ct);
 962         if (acct) {
 963                 struct nf_conn_counter *counter = acct->counter;
 964
 965                 atomic64_add(packets, &counter[dir].packets);
 966                 atomic64_add(bytes, &counter[dir].bytes);
 967         }
 968 }
 969 EXPORT_SYMBOL_GPL(nf_ct_acct_add);
 970
 971 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 972                              const struct nf_conn *loser_ct)
 973 {
 974         struct nf_conn_acct *acct;
 975
 976         acct = nf_conn_acct_find(loser_ct);
 977         if (acct) {
 978                 struct nf_conn_counter *counter = acct->counter;
 979                 unsigned int bytes;
 980
 981                 /* u32 should be fine since we must have seen one packet. */
 982                 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
 983                 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
 984         }
 985 }
 986
 987 static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
 988 {
 989         struct nf_conn_tstamp *tstamp;
 990
 991         refcount_inc(&ct->ct_general.use);
 992
 993         /* set conntrack timestamp, if enabled. */
 994         tstamp = nf_conn_tstamp_find(ct);
 995         if (tstamp)
 996                 tstamp->start = ktime_get_real_ns();
 997 }
 998
 999 /* caller must hold locks to prevent concurrent changes */
1000 static int __nf_ct_resolve_clash(struct sk_buff *skb,
1001                                  struct nf_conntrack_tuple_hash *h)
1002 {
1003         /* This is the conntrack entry already in hashes that won race. */
1004         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1005         enum ip_conntrack_info ctinfo;
1006         struct nf_conn *loser_ct;
1007
1008         loser_ct = nf_ct_get(skb, &ctinfo);
1009
1010         if (nf_ct_is_dying(ct))
1011                 return NF_DROP;
1012
1013         if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
1014             nf_ct_match(ct, loser_ct)) {
1015                 struct net *net = nf_ct_net(ct);
1016
1017                 nf_conntrack_get(&ct->ct_general);
1018
1019                 nf_ct_acct_merge(ct, ctinfo, loser_ct);
1020                 nf_ct_put(loser_ct);
1021                 nf_ct_set(skb, ct, ctinfo);
1022
1023                 NF_CT_STAT_INC(net, clash_resolve);
1024                 return NF_ACCEPT;
1025         }
1026
1027         return NF_DROP;
1028 }
1029
1030 /**
1031  * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
1032  *
1033  * @skb: skb that causes the collision
1034  * @repl_idx: hash slot for reply direction
1035  *
1036  * Called when origin or reply direction had a clash.
1037  * The skb can be handled without packet drop provided the reply direction
1038  * is unique or there the existing entry has the identical tuple in both
1039  * directions.
1040  *
1041  * Caller must hold conntrack table locks to prevent concurrent updates.
1042  *
1043  * Returns NF_DROP if the clash could not be handled.
1044  */
1045 static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
1046 {
1047         struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
1048         const struct nf_conntrack_zone *zone;
1049         struct nf_conntrack_tuple_hash *h;
1050         struct hlist_nulls_node *n;
1051         struct net *net;
1052
1053         zone = nf_ct_zone(loser_ct);
1054         net = nf_ct_net(loser_ct);
1055
1056         /* Reply direction must never result in a clash, unless both origin
1057          * and reply tuples are identical.
1058          */
1059         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
1060                 if (nf_ct_key_equal(h,
1061                                     &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1062                                     zone, net))
1063                         return __nf_ct_resolve_clash(skb, h);
1064         }
1065
1066         /* We want the clashing entry to go away real soon: 1 second timeout. */
1067         WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
1068
1069         /* IPS_NAT_CLASH removes the entry automatically on the first
1070          * reply.  Also prevents UDP tracker from moving the entry to
1071          * ASSURED state, i.e. the entry can always be evicted under
1072          * pressure.
1073          */
1074         loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
1075
1076         __nf_conntrack_insert_prepare(loser_ct);
1077
1078         /* fake add for ORIGINAL dir: we want lookups to only find the entry
1079          * already in the table.  This also hides the clashing entry from
1080          * ctnetlink iteration, i.e. conntrack -L won't show them.
1081          */
1082         hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
1083
1084         hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
1085                                  &nf_conntrack_hash[repl_idx]);
1086
1087         NF_CT_STAT_INC(net, clash_resolve);
1088         return NF_ACCEPT;
1089 }
1090
1091 /**
1092  * nf_ct_resolve_clash - attempt to handle clash without packet drop
1093  *
1094  * @skb: skb that causes the clash
1095  * @h: tuplehash of the clashing entry already in table
1096  * @reply_hash: hash slot for reply direction
1097  *
1098  * A conntrack entry can be inserted to the connection tracking table
1099  * if there is no existing entry with an identical tuple.
1100  *
1101  * If there is one, @skb (and the assocated, unconfirmed conntrack) has
1102  * to be dropped.  In case @skb is retransmitted, next conntrack lookup
1103  * will find the already-existing entry.
1104  *
1105  * The major problem with such packet drop is the extra delay added by
1106  * the packet loss -- it will take some time for a retransmit to occur
1107  * (or the sender to time out when waiting for a reply).
1108  *
1109  * This function attempts to handle the situation without packet drop.
1110  *
1111  * If @skb has no NAT transformation or if the colliding entries are
1112  * exactly the same, only the to-be-confirmed conntrack entry is discarded
1113  * and @skb is associated with the conntrack entry already in the table.
1114  *
1115  * Failing that, the new, unconfirmed conntrack is still added to the table
1116  * provided that the collision only occurs in the ORIGINAL direction.
1117  * The new entry will be added only in the non-clashing REPLY direction,
1118  * so packets in the ORIGINAL direction will continue to match the existing
1119  * entry.  The new entry will also have a fixed timeout so it expires --
1120  * due to the collision, it will only see reply traffic.
1121  *
1122  * Returns NF_DROP if the clash could not be resolved.
1123  */
1124 static __cold noinline int
1125 nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
1126                     u32 reply_hash)
1127 {
1128         /* This is the conntrack entry already in hashes that won race. */
1129         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1130         const struct nf_conntrack_l4proto *l4proto;
1131         enum ip_conntrack_info ctinfo;
1132         struct nf_conn *loser_ct;
1133         struct net *net;
1134         int ret;
1135
1136         loser_ct = nf_ct_get(skb, &ctinfo);
1137         net = nf_ct_net(loser_ct);
1138
1139         l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1140         if (!l4proto->allow_clash)
1141                 goto drop;
1142
1143         ret = __nf_ct_resolve_clash(skb, h);
1144         if (ret == NF_ACCEPT)
1145                 return ret;
1146
1147         ret = nf_ct_resolve_clash_harder(skb, reply_hash);
1148         if (ret == NF_ACCEPT)
1149                 return ret;
1150
1151 drop:
1152         NF_CT_STAT_INC(net, drop);
1153         NF_CT_STAT_INC(net, insert_failed);
1154         return NF_DROP;
1155 }
1156
1157 /* Confirm a connection given skb; places it in hash table */
1158 int
1159 __nf_conntrack_confirm(struct sk_buff *skb)
1160 {
1161         unsigned int chainlen = 0, sequence, max_chainlen;
1162         const struct nf_conntrack_zone *zone;
1163         unsigned int hash, reply_hash;
1164         struct nf_conntrack_tuple_hash *h;
1165         struct nf_conn *ct;
1166         struct nf_conn_help *help;
1167         struct hlist_nulls_node *n;
1168         enum ip_conntrack_info ctinfo;
1169         struct net *net;
1170         int ret = NF_DROP;
1171
1172         ct = nf_ct_get(skb, &ctinfo);
1173         net = nf_ct_net(ct);
1174
1175         /* ipt_REJECT uses nf_conntrack_attach to attach related
1176            ICMP/TCP RST packets in other direction.  Actual packet
1177            which created connection will be IP_CT_NEW or for an
1178            expected connection, IP_CT_RELATED. */
1179         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
1180                 return NF_ACCEPT;
1181
1182         zone = nf_ct_zone(ct);
1183         local_bh_disable();
1184
1185         do {
1186                 sequence = read_seqcount_begin(&nf_conntrack_generation);
1187                 /* reuse the hash saved before */
1188                 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
1189                 hash = scale_hash(hash);
1190                 reply_hash = hash_conntrack(net,
1191                                            &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1192                                            nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
1193         } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
1194
1195         /* We're not in hash table, and we refuse to set up related
1196          * connections for unconfirmed conns.  But packet copies and
1197          * REJECT will give spurious warnings here.
1198          */
1199
1200         /* Another skb with the same unconfirmed conntrack may
1201          * win the race. This may happen for bridge(br_flood)
1202          * or broadcast/multicast packets do skb_clone with
1203          * unconfirmed conntrack.
1204          */
1205         if (unlikely(nf_ct_is_confirmed(ct))) {
1206                 WARN_ON_ONCE(1);
1207                 nf_conntrack_double_unlock(hash, reply_hash);
1208                 local_bh_enable();
1209                 return NF_DROP;
1210         }
1211
1212         if (!nf_ct_ext_valid_pre(ct->ext)) {
1213                 NF_CT_STAT_INC(net, insert_failed);
1214                 goto dying;
1215         }
1216
1217         pr_debug("Confirming conntrack %p\n", ct);
1218         /* We have to check the DYING flag after unlink to prevent
1219          * a race against nf_ct_get_next_corpse() possibly called from
1220          * user context, else we insert an already 'dead' hash, blocking
1221          * further use of that particular connection -JM.
1222          */
1223         ct->status |= IPS_CONFIRMED;
1224
1225         if (unlikely(nf_ct_is_dying(ct))) {
1226                 NF_CT_STAT_INC(net, insert_failed);
1227                 goto dying;
1228         }
1229
1230         max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
1231         /* See if there's one in the list already, including reverse:
1232            NAT could have grabbed it without realizing, since we're
1233            not in the hash.  If there is, we lost race. */
1234         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
1235                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1236                                     zone, net))
1237                         goto out;
1238                 if (chainlen++ > max_chainlen)
1239                         goto chaintoolong;
1240         }
1241
1242         chainlen = 0;
1243         hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
1244                 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1245                                     zone, net))
1246                         goto out;
1247                 if (chainlen++ > max_chainlen) {
1248 chaintoolong:
1249                         NF_CT_STAT_INC(net, chaintoolong);
1250                         NF_CT_STAT_INC(net, insert_failed);
1251                         ret = NF_DROP;
1252                         goto dying;
1253                 }
1254         }
1255
1256         /* Timer relative to confirmation time, not original
1257            setting time, otherwise we'd get timer wrap in
1258            weird delay cases. */
1259         ct->timeout += nfct_time_stamp;
1260
1261         __nf_conntrack_insert_prepare(ct);
1262
1263         /* Since the lookup is lockless, hash insertion must be done after
1264          * starting the timer and setting the CONFIRMED bit. The RCU barriers
1265          * guarantee that no other CPU can find the conntrack before the above
1266          * stores are visible.
1267          */
1268         __nf_conntrack_hash_insert(ct, hash, reply_hash);
1269         nf_conntrack_double_unlock(hash, reply_hash);
1270         local_bh_enable();
1271
1272         /* ext area is still valid (rcu read lock is held,
1273          * but will go out of scope soon, we need to remove
1274          * this conntrack again.
1275          */
1276         if (!nf_ct_ext_valid_post(ct->ext)) {
1277                 nf_ct_kill(ct);
1278                 NF_CT_STAT_INC(net, drop);
1279                 return NF_DROP;
1280         }
1281
1282         help = nfct_help(ct);
1283         if (help && help->helper)
1284                 nf_conntrack_event_cache(IPCT_HELPER, ct);
1285
1286         nf_conntrack_event_cache(master_ct(ct) ?
1287                                  IPCT_RELATED : IPCT_NEW, ct);
1288         return NF_ACCEPT;
1289
1290 out:
1291         ret = nf_ct_resolve_clash(skb, h, reply_hash);
1292 dying:
1293         nf_conntrack_double_unlock(hash, reply_hash);
1294         local_bh_enable();
1295         return ret;
1296 }
1297 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
1298
1299 /* Returns true if a connection correspondings to the tuple (required
1300    for NAT). */
1301 int
1302 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
1303                          const struct nf_conn *ignored_conntrack)
1304 {
1305         struct net *net = nf_ct_net(ignored_conntrack);
1306         const struct nf_conntrack_zone *zone;
1307         struct nf_conntrack_tuple_hash *h;
1308         struct hlist_nulls_head *ct_hash;
1309         unsigned int hash, hsize;
1310         struct hlist_nulls_node *n;
1311         struct nf_conn *ct;
1312
1313         zone = nf_ct_zone(ignored_conntrack);
1314
1315         rcu_read_lock();
1316  begin:
1317         nf_conntrack_get_ht(&ct_hash, &hsize);
1318         hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);
1319
1320         hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
1321                 ct = nf_ct_tuplehash_to_ctrack(h);
1322
1323                 if (ct == ignored_conntrack)
1324                         continue;
1325
1326                 if (nf_ct_is_expired(ct)) {
1327                         nf_ct_gc_expired(ct);
1328                         continue;
1329                 }
1330
1331                 if (nf_ct_key_equal(h, tuple, zone, net)) {
1332                         /* Tuple is taken already, so caller will need to find
1333                          * a new source port to use.
1334                          *
1335                          * Only exception:
1336                          * If the *original tuples* are identical, then both
1337                          * conntracks refer to the same flow.
1338                          * This is a rare situation, it can occur e.g. when
1339                          * more than one UDP packet is sent from same socket
1340                          * in different threads.
1341                          *
1342                          * Let nf_ct_resolve_clash() deal with this later.
1343                          */
1344                         if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1345                                               &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
1346                                               nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
1347                                 continue;
1348
1349                         NF_CT_STAT_INC_ATOMIC(net, found);
1350                         rcu_read_unlock();
1351                         return 1;
1352                 }
1353         }
1354
1355         if (get_nulls_value(n) != hash) {
1356                 NF_CT_STAT_INC_ATOMIC(net, search_restart);
1357                 goto begin;
1358         }
1359
1360         rcu_read_unlock();
1361
1362         return 0;
1363 }
1364 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
1365
1366 #define NF_CT_EVICTION_RANGE    8
1367
1368 /* There's a small race here where we may free a just-assured
1369    connection.  Too bad: we're in trouble anyway. */
1370 static unsigned int early_drop_list(struct net *net,
1371                                     struct hlist_nulls_head *head)
1372 {
1373         struct nf_conntrack_tuple_hash *h;
1374         struct hlist_nulls_node *n;
1375         unsigned int drops = 0;
1376         struct nf_conn *tmp;
1377
1378         hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
1379                 tmp = nf_ct_tuplehash_to_ctrack(h);
1380
1381                 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
1382                         continue;
1383
1384                 if (nf_ct_is_expired(tmp)) {
1385                         nf_ct_gc_expired(tmp);
1386                         continue;
1387                 }
1388
1389                 if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
1390                     !net_eq(nf_ct_net(tmp), net) ||
1391                     nf_ct_is_dying(tmp))
1392                         continue;
1393
1394                 if (!refcount_inc_not_zero(&tmp->ct_general.use))
1395                         continue;
1396
1397                 /* load ->ct_net and ->status after refcount increase */
1398                 smp_acquire__after_ctrl_dep();
1399
1400                 /* kill only if still in same netns -- might have moved due to
1401                  * SLAB_TYPESAFE_BY_RCU rules.
1402                  *
1403                  * We steal the timer reference.  If that fails timer has
1404                  * already fired or someone else deleted it. Just drop ref
1405                  * and move to next entry.
1406                  */
1407                 if (net_eq(nf_ct_net(tmp), net) &&
1408                     nf_ct_is_confirmed(tmp) &&
1409                     nf_ct_delete(tmp, 0, 0))
1410                         drops++;
1411
1412                 nf_ct_put(tmp);
1413         }
1414
1415         return drops;
1416 }
1417
1418 static noinline int early_drop(struct net *net, unsigned int hash)
1419 {
1420         unsigned int i, bucket;
1421
1422         for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
1423                 struct hlist_nulls_head *ct_hash;
1424                 unsigned int hsize, drops;
1425
1426                 rcu_read_lock();
1427                 nf_conntrack_get_ht(&ct_hash, &hsize);
1428                 if (!i)
1429                         bucket = reciprocal_scale(hash, hsize);
1430                 else
1431                         bucket = (bucket + 1) % hsize;
1432
1433                 drops = early_drop_list(net, &ct_hash[bucket]);
1434                 rcu_read_unlock();
1435
1436                 if (drops) {
1437                         NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
1438                         return true;
1439                 }
1440         }
1441
1442         return false;
1443 }
1444
1445 static bool gc_worker_skip_ct(const struct nf_conn *ct)
1446 {
1447         return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1448 }
1449
1450 static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1451 {
1452         const struct nf_conntrack_l4proto *l4proto;
1453
1454         if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1455                 return true;
1456
1457         l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1458         if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1459                 return true;
1460
1461         return false;
1462 }
1463
1464 static void gc_worker(struct work_struct *work)
1465 {
1466         unsigned int i, hashsz, nf_conntrack_max95 = 0;
1467         u32 end_time, start_time = nfct_time_stamp;
1468         struct conntrack_gc_work *gc_work;
1469         unsigned int expired_count = 0;
1470         unsigned long next_run;
1471         s32 delta_time;
1472         long count;
1473
1474         gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1475
1476         i = gc_work->next_bucket;
1477         if (gc_work->early_drop)
1478                 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1479
1480         if (i == 0) {
1481                 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
1482                 gc_work->count = GC_SCAN_INITIAL_COUNT;
1483                 gc_work->start_time = start_time;
1484         }
1485
1486         next_run = gc_work->avg_timeout;
1487         count = gc_work->count;
1488
1489         end_time = start_time + GC_SCAN_MAX_DURATION;
1490
1491         do {
1492                 struct nf_conntrack_tuple_hash *h;
1493                 struct hlist_nulls_head *ct_hash;
1494                 struct hlist_nulls_node *n;
1495                 struct nf_conn *tmp;
1496
1497                 rcu_read_lock();
1498
1499                 nf_conntrack_get_ht(&ct_hash, &hashsz);
1500                 if (i >= hashsz) {
1501                         rcu_read_unlock();
1502                         break;
1503                 }
1504
1505                 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1506                         struct nf_conntrack_net *cnet;
1507                         struct net *net;
1508                         long expires;
1509
1510                         tmp = nf_ct_tuplehash_to_ctrack(h);
1511
1512                         if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
1513                                 nf_ct_offload_timeout(tmp);
1514                                 continue;
1515                         }
1516
1517                         if (expired_count > GC_SCAN_EXPIRED_MAX) {
1518                                 rcu_read_unlock();
1519
1520                                 gc_work->next_bucket = i;
1521                                 gc_work->avg_timeout = next_run;
1522                                 gc_work->count = count;
1523
1524                                 delta_time = nfct_time_stamp - gc_work->start_time;
1525
1526                                 /* re-sched immediately if total cycle time is exceeded */
1527                                 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
1528                                 goto early_exit;
1529                         }
1530
1531                         if (nf_ct_is_expired(tmp)) {
1532                                 nf_ct_gc_expired(tmp);
1533                                 expired_count++;
1534                                 continue;
1535                         }
1536
1537                         expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
1538                         expires = (expires - (long)next_run) / ++count;
1539                         next_run += expires;
1540
1541                         if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
1542                                 continue;
1543
1544                         net = nf_ct_net(tmp);
1545                         cnet = nf_ct_pernet(net);
1546                         if (atomic_read(&cnet->count) < nf_conntrack_max95)
1547                                 continue;
1548
1549                         /* need to take reference to avoid possible races */
1550                         if (!refcount_inc_not_zero(&tmp->ct_general.use))
1551                                 continue;
1552
1553                         /* load ->status after refcount increase */
1554                         smp_acquire__after_ctrl_dep();
1555
1556                         if (gc_worker_skip_ct(tmp)) {
1557                                 nf_ct_put(tmp);
1558                                 continue;
1559                         }
1560
1561                         if (gc_worker_can_early_drop(tmp)) {
1562                                 nf_ct_kill(tmp);
1563                                 expired_count++;
1564                         }
1565
1566                         nf_ct_put(tmp);
1567                 }
1568
1569                 /* could check get_nulls_value() here and restart if ct
1570                  * was moved to another chain.  But given gc is best-effort
1571                  * we will just continue with next hash slot.
1572                  */
1573                 rcu_read_unlock();
1574                 cond_resched();
1575                 i++;
1576
1577                 delta_time = nfct_time_stamp - end_time;
1578                 if (delta_time > 0 && i < hashsz) {
1579                         gc_work->avg_timeout = next_run;
1580                         gc_work->count = count;
1581                         gc_work->next_bucket = i;
1582                         next_run = 0;
1583                         goto early_exit;
1584                 }
1585         } while (i < hashsz);
1586
1587         gc_work->next_bucket = 0;
1588
1589         next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
1590
1591         delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
1592         if (next_run > (unsigned long)delta_time)
1593                 next_run -= delta_time;
1594         else
1595                 next_run = 1;
1596
1597 early_exit:
1598         if (gc_work->exiting)
1599                 return;
1600
1601         if (next_run)
1602                 gc_work->early_drop = false;
1603
1604         queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
1605 }
1606
1607 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1608 {
1609         INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
1610         gc_work->exiting = false;
1611 }
1612
1613 static struct nf_conn *
1614 __nf_conntrack_alloc(struct net *net,
1615                      const struct nf_conntrack_zone *zone,
1616                      const struct nf_conntrack_tuple *orig,
1617                      const struct nf_conntrack_tuple *repl,
1618                      gfp_t gfp, u32 hash)
1619 {
1620         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
1621         unsigned int ct_count;
1622         struct nf_conn *ct;
1623
1624         /* We don't want any race condition at early drop stage */
1625         ct_count = atomic_inc_return(&cnet->count);
1626
1627         if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
1628                 if (!early_drop(net, hash)) {
1629                         if (!conntrack_gc_work.early_drop)
1630                                 conntrack_gc_work.early_drop = true;
1631                         atomic_dec(&cnet->count);
1632                         net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
1633                         return ERR_PTR(-ENOMEM);
1634                 }
1635         }
1636
1637         /*
1638          * Do not use kmem_cache_zalloc(), as this cache uses
1639          * SLAB_TYPESAFE_BY_RCU.
1640          */
1641         ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1642         if (ct == NULL)
1643                 goto out;
1644
1645         spin_lock_init(&ct->lock);
1646         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1647         ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1648         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1649         /* save hash for reusing when confirming */
1650         *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1651         ct->status = 0;
1652         WRITE_ONCE(ct->timeout, 0);
1653         write_pnet(&ct->ct_net, net);
1654         memset_after(ct, 0, __nfct_init_offset);
1655
1656         nf_ct_zone_add(ct, zone);
1657
1658         /* Because we use RCU lookups, we set ct_general.use to zero before
1659          * this is inserted in any list.
1660          */
1661         refcount_set(&ct->ct_general.use, 0);
1662         return ct;
1663 out:
1664         atomic_dec(&cnet->count);
1665         return ERR_PTR(-ENOMEM);
1666 }
1667
1668 struct nf_conn *nf_conntrack_alloc(struct net *net,
1669                                    const struct nf_conntrack_zone *zone,
1670                                    const struct nf_conntrack_tuple *orig,
1671                                    const struct nf_conntrack_tuple *repl,
1672                                    gfp_t gfp)
1673 {
1674         return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
1675 }
1676 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1677
1678 void nf_conntrack_free(struct nf_conn *ct)
1679 {
1680         struct net *net = nf_ct_net(ct);
1681         struct nf_conntrack_net *cnet;
1682
1683         /* A freed object has refcnt == 0, that's
1684          * the golden rule for SLAB_TYPESAFE_BY_RCU
1685          */
1686         WARN_ON(refcount_read(&ct->ct_general.use) != 0);
1687
1688         if (ct->status & IPS_SRC_NAT_DONE) {
1689                 const struct nf_nat_hook *nat_hook;
1690
1691                 rcu_read_lock();
1692                 nat_hook = rcu_dereference(nf_nat_hook);
1693                 if (nat_hook)
1694                         nat_hook->remove_nat_bysrc(ct);
1695                 rcu_read_unlock();
1696         }
1697
1698         kfree(ct->ext);
1699         kmem_cache_free(nf_conntrack_cachep, ct);
1700         cnet = nf_ct_pernet(net);
1701
1702         smp_mb__before_atomic();
1703         atomic_dec(&cnet->count);
1704 }
1705 EXPORT_SYMBOL_GPL(nf_conntrack_free);
1706
1707
1708 /* Allocate a new conntrack: we return -ENOMEM if classification
1709    failed due to stress.  Otherwise it really is unclassifiable. */
1710 static noinline struct nf_conntrack_tuple_hash *
1711 init_conntrack(struct net *net, struct nf_conn *tmpl,
1712                const struct nf_conntrack_tuple *tuple,
1713                struct sk_buff *skb,
1714                unsigned int dataoff, u32 hash)
1715 {
1716         struct nf_conn *ct;
1717         struct nf_conn_help *help;
1718         struct nf_conntrack_tuple repl_tuple;
1719 #ifdef CONFIG_NF_CONNTRACK_EVENTS
1720         struct nf_conntrack_ecache *ecache;
1721 #endif
1722         struct nf_conntrack_expect *exp = NULL;
1723         const struct nf_conntrack_zone *zone;
1724         struct nf_conn_timeout *timeout_ext;
1725         struct nf_conntrack_zone tmp;
1726         struct nf_conntrack_net *cnet;
1727
1728         if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
1729                 pr_debug("Can't invert tuple.\n");
1730                 return NULL;
1731         }
1732
1733         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1734         ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1735                                   hash);
1736         if (IS_ERR(ct))
1737                 return (struct nf_conntrack_tuple_hash *)ct;
1738
1739         if (!nf_ct_add_synproxy(ct, tmpl)) {
1740                 nf_conntrack_free(ct);
1741                 return ERR_PTR(-ENOMEM);
1742         }
1743
1744         timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1745
1746         if (timeout_ext)
1747                 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1748                                       GFP_ATOMIC);
1749
1750         nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1751         nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1752         nf_ct_labels_ext_add(ct);
1753
1754 #ifdef CONFIG_NF_CONNTRACK_EVENTS
1755         ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1756
1757         if ((ecache || net->ct.sysctl_events) &&
1758             !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1759                                   ecache ? ecache->expmask : 0,
1760                                   GFP_ATOMIC)) {
1761                 nf_conntrack_free(ct);
1762                 return ERR_PTR(-ENOMEM);
1763         }
1764 #endif
1765
1766         cnet = nf_ct_pernet(net);
1767         if (cnet->expect_count) {
1768                 spin_lock_bh(&nf_conntrack_expect_lock);
1769                 exp = nf_ct_find_expectation(net, zone, tuple);
1770                 if (exp) {
1771                         pr_debug("expectation arrives ct=%p exp=%p\n",
1772                                  ct, exp);
1773                         /* Welcome, Mr. Bond.  We've been expecting you... */
1774                         __set_bit(IPS_EXPECTED_BIT, &ct->status);
1775                         /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1776                         ct->master = exp->master;
1777                         if (exp->helper) {
1778                                 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1779                                 if (help)
1780                                         rcu_assign_pointer(help->helper, exp->helper);
1781                         }
1782
1783 #ifdef CONFIG_NF_CONNTRACK_MARK
1784                         ct->mark = exp->master->mark;
1785 #endif
1786 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1787                         ct->secmark = exp->master->secmark;
1788 #endif
1789                         NF_CT_STAT_INC(net, expect_new);
1790                 }
1791                 spin_unlock_bh(&nf_conntrack_expect_lock);
1792         }
1793         if (!exp && tmpl)
1794                 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1795
1796         /* Other CPU might have obtained a pointer to this object before it was
1797          * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
1798          *
1799          * After refcount_set(1) it will succeed; ensure that zeroing of
1800          * ct->status and the correct ct->net pointer are visible; else other
1801          * core might observe CONFIRMED bit which means the entry is valid and
1802          * in the hash table, but its not (anymore).
1803          */
1804         smp_wmb();
1805
1806         /* Now it is going to be associated with an sk_buff, set refcount to 1. */
1807         refcount_set(&ct->ct_general.use, 1);
1808
1809         if (exp) {
1810                 if (exp->expectfn)
1811                         exp->expectfn(ct, exp);
1812                 nf_ct_expect_put(exp);
1813         }
1814
1815         return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1816 }
1817
1818 /* On success, returns 0, sets skb->_nfct | ctinfo */
1819 static int
1820 resolve_normal_ct(struct nf_conn *tmpl,
1821                   struct sk_buff *skb,
1822                   unsigned int dataoff,
1823                   u_int8_t protonum,
1824                   const struct nf_hook_state *state)
1825 {
1826         const struct nf_conntrack_zone *zone;
1827         struct nf_conntrack_tuple tuple;
1828         struct nf_conntrack_tuple_hash *h;
1829         enum ip_conntrack_info ctinfo;
1830         struct nf_conntrack_zone tmp;
1831         u32 hash, zone_id, rid;
1832         struct nf_conn *ct;
1833
1834         if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1835                              dataoff, state->pf, protonum, state->net,
1836                              &tuple)) {
1837                 pr_debug("Can't get tuple\n");
1838                 return 0;
1839         }
1840
1841         /* look for tuple match */
1842         zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1843
1844         zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
1845         hash = hash_conntrack_raw(&tuple, zone_id, state->net);
1846         h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
1847
1848         if (!h) {
1849                 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
1850                 if (zone_id != rid) {
1851                         u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);
1852
1853                         h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
1854                 }
1855         }
1856
1857         if (!h) {
1858                 h = init_conntrack(state->net, tmpl, &tuple,
1859                                    skb, dataoff, hash);
1860                 if (!h)
1861                         return 0;
1862                 if (IS_ERR(h))
1863                         return PTR_ERR(h);
1864         }
1865         ct = nf_ct_tuplehash_to_ctrack(h);
1866
1867         /* It exists; we have (non-exclusive) reference. */
1868         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1869                 ctinfo = IP_CT_ESTABLISHED_REPLY;
1870         } else {
1871                 /* Once we've had two way comms, always ESTABLISHED. */
1872                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1873                         pr_debug("normal packet for %p\n", ct);
1874                         ctinfo = IP_CT_ESTABLISHED;
1875                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1876                         pr_debug("related packet for %p\n", ct);
1877                         ctinfo = IP_CT_RELATED;
1878                 } else {
1879                         pr_debug("new packet for %p\n", ct);
1880                         ctinfo = IP_CT_NEW;
1881                 }
1882         }
1883         nf_ct_set(skb, ct, ctinfo);
1884         return 0;
1885 }
1886
1887 /*
1888  * icmp packets need special treatment to handle error messages that are
1889  * related to a connection.
1890  *
1891  * Callers need to check if skb has a conntrack assigned when this
1892  * helper returns; in such case skb belongs to an already known connection.
1893  */
1894 static unsigned int __cold
1895 nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1896                          struct sk_buff *skb,
1897                          unsigned int dataoff,
1898                          u8 protonum,
1899                          const struct nf_hook_state *state)
1900 {
1901         int ret;
1902
1903         if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1904                 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1905 #if IS_ENABLED(CONFIG_IPV6)
1906         else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1907                 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1908 #endif
1909         else
1910                 return NF_ACCEPT;
1911
1912         if (ret <= 0)
1913                 NF_CT_STAT_INC_ATOMIC(state->net, error);
1914
1915         return ret;
1916 }
1917
1918 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
1919                           enum ip_conntrack_info ctinfo)
1920 {
1921         const unsigned int *timeout = nf_ct_timeout_lookup(ct);
1922
1923         if (!timeout)
1924                 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
1925
1926         nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
1927         return NF_ACCEPT;
1928 }
1929
1930 /* Returns verdict for packet, or -1 for invalid. */
1931 static int nf_conntrack_handle_packet(struct nf_conn *ct,
1932                                       struct sk_buff *skb,
1933                                       unsigned int dataoff,
1934                                       enum ip_conntrack_info ctinfo,
1935                                       const struct nf_hook_state *state)
1936 {
1937         switch (nf_ct_protonum(ct)) {
1938         case IPPROTO_TCP:
1939                 return nf_conntrack_tcp_packet(ct, skb, dataoff,
1940                                                ctinfo, state);
1941         case IPPROTO_UDP:
1942                 return nf_conntrack_udp_packet(ct, skb, dataoff,
1943                                                ctinfo, state);
1944         case IPPROTO_ICMP:
1945                 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
1946 #if IS_ENABLED(CONFIG_IPV6)
1947         case IPPROTO_ICMPV6:
1948                 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
1949 #endif
1950 #ifdef CONFIG_NF_CT_PROTO_UDPLITE
1951         case IPPROTO_UDPLITE:
1952                 return nf_conntrack_udplite_packet(ct, skb, dataoff,
1953                                                    ctinfo, state);
1954 #endif
1955 #ifdef CONFIG_NF_CT_PROTO_SCTP
1956         case IPPROTO_SCTP:
1957                 return nf_conntrack_sctp_packet(ct, skb, dataoff,
1958                                                 ctinfo, state);
1959 #endif
1960 #ifdef CONFIG_NF_CT_PROTO_DCCP
1961         case IPPROTO_DCCP:
1962                 return nf_conntrack_dccp_packet(ct, skb, dataoff,
1963                                                 ctinfo, state);
1964 #endif
1965 #ifdef CONFIG_NF_CT_PROTO_GRE
1966         case IPPROTO_GRE:
1967                 return nf_conntrack_gre_packet(ct, skb, dataoff,
1968                                                ctinfo, state);
1969 #endif
1970         }
1971
1972         return generic_packet(ct, skb, ctinfo);
1973 }
1974
1975 unsigned int
1976 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1977 {
1978         enum ip_conntrack_info ctinfo;
1979         struct nf_conn *ct, *tmpl;
1980         u_int8_t protonum;
1981         int dataoff, ret;
1982
1983         tmpl = nf_ct_get(skb, &ctinfo);
1984         if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1985                 /* Previously seen (loopback or untracked)?  Ignore. */
1986                 if ((tmpl && !nf_ct_is_template(tmpl)) ||
1987                      ctinfo == IP_CT_UNTRACKED)
1988                         return NF_ACCEPT;
1989                 skb->_nfct = 0;
1990         }
1991
1992         /* rcu_read_lock()ed by nf_hook_thresh */
1993         dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1994         if (dataoff <= 0) {
1995                 pr_debug("not prepared to track yet or error occurred\n");
1996                 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1997                 ret = NF_ACCEPT;
1998                 goto out;
1999         }
2000
2001         if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
2002                 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
2003                                                protonum, state);
2004                 if (ret <= 0) {
2005                         ret = -ret;
2006                         goto out;
2007                 }
2008                 /* ICMP[v6] protocol trackers may assign one conntrack. */
2009                 if (skb->_nfct)
2010                         goto out;
2011         }
2012 repeat:
2013         ret = resolve_normal_ct(tmpl, skb, dataoff,
2014                                 protonum, state);
2015         if (ret < 0) {
2016                 /* Too stressed to deal. */
2017                 NF_CT_STAT_INC_ATOMIC(state->net, drop);
2018                 ret = NF_DROP;
2019                 goto out;
2020         }
2021
2022         ct = nf_ct_get(skb, &ctinfo);
2023         if (!ct) {
2024                 /* Not valid part of a connection */
2025                 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
2026                 ret = NF_ACCEPT;
2027                 goto out;
2028         }
2029
2030         ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
2031         if (ret <= 0) {
2032                 /* Invalid: inverse of the return code tells
2033                  * the netfilter core what to do */
2034                 pr_debug("nf_conntrack_in: Can't track with proto module\n");
2035                 nf_ct_put(ct);
2036                 skb->_nfct = 0;
2037                 /* Special case: TCP tracker reports an attempt to reopen a
2038                  * closed/aborted connection. We have to go back and create a
2039                  * fresh conntrack.
2040                  */
2041                 if (ret == -NF_REPEAT)
2042                         goto repeat;
2043
2044                 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
2045                 if (ret == -NF_DROP)
2046                         NF_CT_STAT_INC_ATOMIC(state->net, drop);
2047
2048                 ret = -ret;
2049                 goto out;
2050         }
2051
2052         if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
2053             !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
2054                 nf_conntrack_event_cache(IPCT_REPLY, ct);
2055 out:
2056         if (tmpl)
2057                 nf_ct_put(tmpl);
2058
2059         return ret;
2060 }
2061 EXPORT_SYMBOL_GPL(nf_conntrack_in);
2062
2063 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
2064    implicitly racy: see __nf_conntrack_confirm */
2065 void nf_conntrack_alter_reply(struct nf_conn *ct,
2066                               const struct nf_conntrack_tuple *newreply)
2067 {
2068         struct nf_conn_help *help = nfct_help(ct);
2069
2070         /* Should be unconfirmed, so not in hash table yet */
2071         WARN_ON(nf_ct_is_confirmed(ct));
2072
2073         pr_debug("Altering reply tuple of %p to ", ct);
2074         nf_ct_dump_tuple(newreply);
2075
2076         ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
2077         if (ct->master || (help && !hlist_empty(&help->expectations)))
2078                 return;
2079 }
2080 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
2081
2082 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
2083 void __nf_ct_refresh_acct(struct nf_conn *ct,
2084                           enum ip_conntrack_info ctinfo,
2085                           const struct sk_buff *skb,
2086                           u32 extra_jiffies,
2087                           bool do_acct)
2088 {
2089         /* Only update if this is not a fixed timeout */
2090         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2091                 goto acct;
2092
2093         /* If not in hash table, timer will not be active yet */
2094         if (nf_ct_is_confirmed(ct))
2095                 extra_jiffies += nfct_time_stamp;
2096
2097         if (READ_ONCE(ct->timeout) != extra_jiffies)
2098                 WRITE_ONCE(ct->timeout, extra_jiffies);
2099 acct:
2100         if (do_acct)
2101                 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
2102 }
2103 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
2104
2105 bool nf_ct_kill_acct(struct nf_conn *ct,
2106                      enum ip_conntrack_info ctinfo,
2107                      const struct sk_buff *skb)
2108 {
2109         nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
2110
2111         return nf_ct_delete(ct, 0, 0);
2112 }
2113 EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
2114
2115 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
2116
2117 #include <linux/netfilter/nfnetlink.h>
2118 #include <linux/netfilter/nfnetlink_conntrack.h>
2119 #include <linux/mutex.h>
2120
2121 /* Generic function for tcp/udp/sctp/dccp and alike. */
2122 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
2123                                const struct nf_conntrack_tuple *tuple)
2124 {
2125         if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
2126             nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
2127                 goto nla_put_failure;
2128         return 0;
2129
2130 nla_put_failure:
2131         return -1;
2132 }
2133 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
2134
2135 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
2136         [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
2137         [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
2138 };
2139 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
2140
2141 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
2142                                struct nf_conntrack_tuple *t,
2143                                u_int32_t flags)
2144 {
2145         if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
2146                 if (!tb[CTA_PROTO_SRC_PORT])
2147                         return -EINVAL;
2148
2149                 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
2150         }
2151
2152         if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
2153                 if (!tb[CTA_PROTO_DST_PORT])
2154                         return -EINVAL;
2155
2156                 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
2157         }
2158
2159         return 0;
2160 }
2161 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
2162
2163 unsigned int nf_ct_port_nlattr_tuple_size(void)
2164 {
2165         static unsigned int size __read_mostly;
2166
2167         if (!size)
2168                 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
2169
2170         return size;
2171 }
2172 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
2173 #endif
2174
2175 /* Used by ipt_REJECT and ip6t_REJECT. */
2176 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
2177 {
2178         struct nf_conn *ct;
2179         enum ip_conntrack_info ctinfo;
2180
2181         /* This ICMP is in reverse direction to the packet which caused it */
2182         ct = nf_ct_get(skb, &ctinfo);
2183         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
2184                 ctinfo = IP_CT_RELATED_REPLY;
2185         else
2186                 ctinfo = IP_CT_RELATED;
2187
2188         /* Attach to new skbuff, and increment count */
2189         nf_ct_set(nskb, ct, ctinfo);
2190         nf_conntrack_get(skb_nfct(nskb));
2191 }
2192
2193 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
2194                                  struct nf_conn *ct,
2195                                  enum ip_conntrack_info ctinfo)
2196 {
2197         const struct nf_nat_hook *nat_hook;
2198         struct nf_conntrack_tuple_hash *h;
2199         struct nf_conntrack_tuple tuple;
2200         unsigned int status;
2201         int dataoff;
2202         u16 l3num;
2203         u8 l4num;
2204
2205         l3num = nf_ct_l3num(ct);
2206
2207         dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
2208         if (dataoff <= 0)
2209                 return -1;
2210
2211         if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
2212                              l4num, net, &tuple))
2213                 return -1;
2214
2215         if (ct->status & IPS_SRC_NAT) {
2216                 memcpy(tuple.src.u3.all,
2217                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
2218                        sizeof(tuple.src.u3.all));
2219                 tuple.src.u.all =
2220                         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
2221         }
2222
2223         if (ct->status & IPS_DST_NAT) {
2224                 memcpy(tuple.dst.u3.all,
2225                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
2226                        sizeof(tuple.dst.u3.all));
2227                 tuple.dst.u.all =
2228                         ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
2229         }
2230
2231         h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
2232         if (!h)
2233                 return 0;
2234
2235         /* Store status bits of the conntrack that is clashing to re-do NAT
2236          * mangling according to what it has been done already to this packet.
2237          */
2238         status = ct->status;
2239
2240         nf_ct_put(ct);
2241         ct = nf_ct_tuplehash_to_ctrack(h);
2242         nf_ct_set(skb, ct, ctinfo);
2243
2244         nat_hook = rcu_dereference(nf_nat_hook);
2245         if (!nat_hook)
2246                 return 0;
2247
2248         if (status & IPS_SRC_NAT &&
2249             nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
2250                                 IP_CT_DIR_ORIGINAL) == NF_DROP)
2251                 return -1;
2252
2253         if (status & IPS_DST_NAT &&
2254             nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
2255                                 IP_CT_DIR_ORIGINAL) == NF_DROP)
2256                 return -1;
2257
2258         return 0;
2259 }
2260
2261 /* This packet is coming from userspace via nf_queue, complete the packet
2262  * processing after the helper invocation in nf_confirm().
2263  */
2264 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
2265                                enum ip_conntrack_info ctinfo)
2266 {
2267         const struct nf_conntrack_helper *helper;
2268         const struct nf_conn_help *help;
2269         int protoff;
2270
2271         help = nfct_help(ct);
2272         if (!help)
2273                 return 0;
2274
2275         helper = rcu_dereference(help->helper);
2276         if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
2277                 return 0;
2278
2279         switch (nf_ct_l3num(ct)) {
2280         case NFPROTO_IPV4:
2281                 protoff = skb_network_offset(skb) + ip_hdrlen(skb);
2282                 break;
2283 #if IS_ENABLED(CONFIG_IPV6)
2284         case NFPROTO_IPV6: {
2285                 __be16 frag_off;
2286                 u8 pnum;
2287
2288                 pnum = ipv6_hdr(skb)->nexthdr;
2289                 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
2290                                            &frag_off);
2291                 if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
2292                         return 0;
2293                 break;
2294         }
2295 #endif
2296         default:
2297                 return 0;
2298         }
2299
2300         if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
2301             !nf_is_loopback_packet(skb)) {
2302                 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
2303                         NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
2304                         return -1;
2305                 }
2306         }
2307
2308         /* We've seen it coming out the other side: confirm it */
2309         return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0;
2310 }
2311
2312 static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
2313 {
2314         enum ip_conntrack_info ctinfo;
2315         struct nf_conn *ct;
2316         int err;
2317
2318         ct = nf_ct_get(skb, &ctinfo);
2319         if (!ct)
2320                 return 0;
2321
2322         if (!nf_ct_is_confirmed(ct)) {
2323                 err = __nf_conntrack_update(net, skb, ct, ctinfo);
2324                 if (err < 0)
2325                         return err;
2326
2327                 ct = nf_ct_get(skb, &ctinfo);
2328         }
2329
2330         return nf_confirm_cthelper(skb, ct, ctinfo);
2331 }
2332
2333 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
2334                                        const struct sk_buff *skb)
2335 {
2336         const struct nf_conntrack_tuple *src_tuple;
2337         const struct nf_conntrack_tuple_hash *hash;
2338         struct nf_conntrack_tuple srctuple;
2339         enum ip_conntrack_info ctinfo;
2340         struct nf_conn *ct;
2341
2342         ct = nf_ct_get(skb, &ctinfo);
2343         if (ct) {
2344                 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
2345                 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2346                 return true;
2347         }
2348
2349         if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
2350                                NFPROTO_IPV4, dev_net(skb->dev),
2351                                &srctuple))
2352                 return false;
2353
2354         hash = nf_conntrack_find_get(dev_net(skb->dev),
2355                                      &nf_ct_zone_dflt,
2356                                      &srctuple);
2357         if (!hash)
2358                 return false;
2359
2360         ct = nf_ct_tuplehash_to_ctrack(hash);
2361         src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
2362         memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2363         nf_ct_put(ct);
2364
2365         return true;
2366 }
2367
2368 /* Bring out ya dead! */
2369 static struct nf_conn *
2370 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2371                 const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
2372 {
2373         struct nf_conntrack_tuple_hash *h;
2374         struct nf_conn *ct;
2375         struct hlist_nulls_node *n;
2376         spinlock_t *lockp;
2377
2378         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2379                 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2380
2381                 if (hlist_nulls_empty(hslot))
2382                         continue;
2383
2384                 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2385                 local_bh_disable();
2386                 nf_conntrack_lock(lockp);
2387                 hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2388                         if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2389                                 continue;
2390                         /* All nf_conn objects are added to hash table twice, one
2391                          * for original direction tuple, once for the reply tuple.
2392                          *
2393                          * Exception: In the IPS_NAT_CLASH case, only the reply
2394                          * tuple is added (the original tuple already existed for
2395                          * a different object).
2396                          *
2397                          * We only need to call the iterator once for each
2398                          * conntrack, so we just use the 'reply' direction
2399                          * tuple while iterating.
2400                          */
2401                         ct = nf_ct_tuplehash_to_ctrack(h);
2402
2403                         if (iter_data->net &&
2404                             !net_eq(iter_data->net, nf_ct_net(ct)))
2405                                 continue;
2406
2407                         if (iter(ct, iter_data->data))
2408                                 goto found;
2409                 }
2410                 spin_unlock(lockp);
2411                 local_bh_enable();
2412                 cond_resched();
2413         }
2414
2415         return NULL;
2416 found:
2417         refcount_inc(&ct->ct_general.use);
2418         spin_unlock(lockp);
2419         local_bh_enable();
2420         return ct;
2421 }
2422
2423 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2424                                   const struct nf_ct_iter_data *iter_data)
2425 {
2426         unsigned int bucket = 0;
2427         struct nf_conn *ct;
2428
2429         might_sleep();
2430
2431         mutex_lock(&nf_conntrack_mutex);
2432         while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
2433                 /* Time to push up daises... */
2434
2435                 nf_ct_delete(ct, iter_data->portid, iter_data->report);
2436                 nf_ct_put(ct);
2437                 cond_resched();
2438         }
2439         mutex_unlock(&nf_conntrack_mutex);
2440 }
2441
2442 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
2443                                const struct nf_ct_iter_data *iter_data)
2444 {
2445         struct net *net = iter_data->net;
2446         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2447
2448         might_sleep();
2449
2450         if (atomic_read(&cnet->count) == 0)
2451                 return;
2452
2453         nf_ct_iterate_cleanup(iter, iter_data);
2454 }
2455 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
2456
2457 /**
2458  * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2459  * @iter: callback to invoke for each conntrack
2460  * @data: data to pass to @iter
2461  *
2462  * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2463  * unconfirmed list as dying (so they will not be inserted into
2464  * main table).
2465  *
2466  * Can only be called in module exit path.
2467  */
2468 void
2469 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2470 {
2471         struct nf_ct_iter_data iter_data = {};
2472         struct net *net;
2473
2474         down_read(&net_rwsem);
2475         for_each_net(net) {
2476                 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2477
2478                 if (atomic_read(&cnet->count) == 0)
2479                         continue;
2480                 nf_queue_nf_hook_drop(net);
2481         }
2482         up_read(&net_rwsem);
2483
2484         /* Need to wait for netns cleanup worker to finish, if its
2485          * running -- it might have deleted a net namespace from
2486          * the global list, so hook drop above might not have
2487          * affected all namespaces.
2488          */
2489         net_ns_barrier();
2490
2491         /* a skb w. unconfirmed conntrack could have been reinjected just
2492          * before we called nf_queue_nf_hook_drop().
2493          *
2494          * This makes sure its inserted into conntrack table.
2495          */
2496         synchronize_net();
2497
2498         nf_ct_ext_bump_genid();
2499         iter_data.data = data;
2500         nf_ct_iterate_cleanup(iter, &iter_data);
2501
2502         /* Another cpu might be in a rcu read section with
2503          * rcu protected pointer cleared in iter callback
2504          * or hidden via nf_ct_ext_bump_genid() above.
2505          *
2506          * Wait until those are done.
2507          */
2508         synchronize_rcu();
2509 }
2510 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2511
2512 static int kill_all(struct nf_conn *i, void *data)
2513 {
2514         return 1;
2515 }
2516
2517 void nf_conntrack_cleanup_start(void)
2518 {
2519         conntrack_gc_work.exiting = true;
2520 }
2521
2522 void nf_conntrack_cleanup_end(void)
2523 {
2524         RCU_INIT_POINTER(nf_ct_hook, NULL);
2525         cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2526         kvfree(nf_conntrack_hash);
2527
2528         nf_conntrack_proto_fini();
2529         nf_conntrack_helper_fini();
2530         nf_conntrack_expect_fini();
2531
2532         kmem_cache_destroy(nf_conntrack_cachep);
2533 }
2534
2535 /*
2536  * Mishearing the voices in his head, our hero wonders how he's
2537  * supposed to kill the mall.
2538  */
2539 void nf_conntrack_cleanup_net(struct net *net)
2540 {
2541         LIST_HEAD(single);
2542
2543         list_add(&net->exit_list, &single);
2544         nf_conntrack_cleanup_net_list(&single);
2545 }
2546
2547 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2548 {
2549         struct nf_ct_iter_data iter_data = {};
2550         struct net *net;
2551         int busy;
2552
2553         /*
2554          * This makes sure all current packets have passed through
2555          *  netfilter framework.  Roll on, two-stage module
2556          *  delete...
2557          */
2558         synchronize_net();
2559 i_see_dead_people:
2560         busy = 0;
2561         list_for_each_entry(net, net_exit_list, exit_list) {
2562                 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2563
2564                 iter_data.net = net;
2565                 nf_ct_iterate_cleanup_net(kill_all, &iter_data);
2566                 if (atomic_read(&cnet->count) != 0)
2567                         busy = 1;
2568         }
2569         if (busy) {
2570                 schedule();
2571                 goto i_see_dead_people;
2572         }
2573
2574         list_for_each_entry(net, net_exit_list, exit_list) {
2575                 nf_conntrack_ecache_pernet_fini(net);
2576                 nf_conntrack_expect_pernet_fini(net);
2577                 free_percpu(net->ct.stat);
2578         }
2579 }
2580
2581 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
2582 {
2583         struct hlist_nulls_head *hash;
2584         unsigned int nr_slots, i;
2585
2586         if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
2587                 return NULL;
2588
2589         BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
2590         nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
2591
2592         hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
2593
2594         if (hash && nulls)
2595                 for (i = 0; i < nr_slots; i++)
2596                         INIT_HLIST_NULLS_HEAD(&hash[i], i);
2597
2598         return hash;
2599 }
2600 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
2601
2602 int nf_conntrack_hash_resize(unsigned int hashsize)
2603 {
2604         int i, bucket;
2605         unsigned int old_size;
2606         struct hlist_nulls_head *hash, *old_hash;
2607         struct nf_conntrack_tuple_hash *h;
2608         struct nf_conn *ct;
2609
2610         if (!hashsize)
2611                 return -EINVAL;
2612
2613         hash = nf_ct_alloc_hashtable(&hashsize, 1);
2614         if (!hash)
2615                 return -ENOMEM;
2616
2617         mutex_lock(&nf_conntrack_mutex);
2618         old_size = nf_conntrack_htable_size;
2619         if (old_size == hashsize) {
2620                 mutex_unlock(&nf_conntrack_mutex);
2621                 kvfree(hash);
2622                 return 0;
2623         }
2624
2625         local_bh_disable();
2626         nf_conntrack_all_lock();
2627         write_seqcount_begin(&nf_conntrack_generation);
2628
2629         /* Lookups in the old hash might happen in parallel, which means we
2630          * might get false negatives during connection lookup. New connections
2631          * created because of a false negative won't make it into the hash
2632          * though since that required taking the locks.
2633          */
2634
2635         for (i = 0; i < nf_conntrack_htable_size; i++) {
2636                 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
2637                         unsigned int zone_id;
2638
2639                         h = hlist_nulls_entry(nf_conntrack_hash[i].first,
2640                                               struct nf_conntrack_tuple_hash, hnnode);
2641                         ct = nf_ct_tuplehash_to_ctrack(h);
2642                         hlist_nulls_del_rcu(&h->hnnode);
2643
2644                         zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
2645                         bucket = __hash_conntrack(nf_ct_net(ct),
2646                                                   &h->tuple, zone_id, hashsize);
2647                         hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
2648                 }
2649         }
2650         old_hash = nf_conntrack_hash;
2651
2652         nf_conntrack_hash = hash;
2653         nf_conntrack_htable_size = hashsize;
2654
2655         write_seqcount_end(&nf_conntrack_generation);
2656         nf_conntrack_all_unlock();
2657         local_bh_enable();
2658
2659         mutex_unlock(&nf_conntrack_mutex);
2660
2661         synchronize_net();
2662         kvfree(old_hash);
2663         return 0;
2664 }
2665
2666 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2667 {
2668         unsigned int hashsize;
2669         int rc;
2670
2671         if (current->nsproxy->net_ns != &init_net)
2672                 return -EOPNOTSUPP;
2673
2674         /* On boot, we can set this without any fancy locking. */
2675         if (!nf_conntrack_hash)
2676                 return param_set_uint(val, kp);
2677
2678         rc = kstrtouint(val, 0, &hashsize);
2679         if (rc)
2680                 return rc;
2681
2682         return nf_conntrack_hash_resize(hashsize);
2683 }
2684
2685 int nf_conntrack_init_start(void)
2686 {
2687         unsigned long nr_pages = totalram_pages();
2688         int max_factor = 8;
2689         int ret = -ENOMEM;
2690         int i;
2691
2692         seqcount_spinlock_init(&nf_conntrack_generation,
2693                                &nf_conntrack_locks_all_lock);
2694
2695         for (i = 0; i < CONNTRACK_LOCKS; i++)
2696                 spin_lock_init(&nf_conntrack_locks[i]);
2697
2698         if (!nf_conntrack_htable_size) {
2699                 nf_conntrack_htable_size
2700                         = (((nr_pages << PAGE_SHIFT) / 16384)
2701                            / sizeof(struct hlist_head));
2702                 if (BITS_PER_LONG >= 64 &&
2703                     nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2704                         nf_conntrack_htable_size = 262144;
2705                 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
2706                         nf_conntrack_htable_size = 65536;
2707
2708                 if (nf_conntrack_htable_size < 1024)
2709                         nf_conntrack_htable_size = 1024;
2710                 /* Use a max. factor of one by default to keep the average
2711                  * hash chain length at 2 entries.  Each entry has to be added
2712                  * twice (once for original direction, once for reply).
2713                  * When a table size is given we use the old value of 8 to
2714                  * avoid implicit reduction of the max entries setting.
2715                  */
2716                 max_factor = 1;
2717         }
2718
2719         nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2720         if (!nf_conntrack_hash)
2721                 return -ENOMEM;
2722
2723         nf_conntrack_max = max_factor * nf_conntrack_htable_size;
2724
2725         nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
2726                                                 sizeof(struct nf_conn),
2727                                                 NFCT_INFOMASK + 1,
2728                                                 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
2729         if (!nf_conntrack_cachep)
2730                 goto err_cachep;
2731
2732         ret = nf_conntrack_expect_init();
2733         if (ret < 0)
2734                 goto err_expect;
2735
2736         ret = nf_conntrack_helper_init();
2737         if (ret < 0)
2738                 goto err_helper;
2739
2740         ret = nf_conntrack_proto_init();
2741         if (ret < 0)
2742                 goto err_proto;
2743
2744         conntrack_gc_work_init(&conntrack_gc_work);
2745         queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
2746
2747         ret = register_nf_conntrack_bpf();
2748         if (ret < 0)
2749                 goto err_kfunc;
2750
2751         return 0;
2752
2753 err_kfunc:
2754         cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2755         nf_conntrack_proto_fini();
2756 err_proto:
2757         nf_conntrack_helper_fini();
2758 err_helper:
2759         nf_conntrack_expect_fini();
2760 err_expect:
2761         kmem_cache_destroy(nf_conntrack_cachep);
2762 err_cachep:
2763         kvfree(nf_conntrack_hash);
2764         return ret;
2765 }
2766
2767 static const struct nf_ct_hook nf_conntrack_hook = {
2768         .update         = nf_conntrack_update,
2769         .destroy        = nf_ct_destroy,
2770         .get_tuple_skb  = nf_conntrack_get_tuple_skb,
2771         .attach         = nf_conntrack_attach,
2772 };
2773
2774 void nf_conntrack_init_end(void)
2775 {
2776         RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
2777 }
2778
2779 /*
2780  * We need to use special "null" values, not used in hash table
2781  */
2782 #define UNCONFIRMED_NULLS_VAL   ((1<<30)+0)
2783
2784 int nf_conntrack_init_net(struct net *net)
2785 {
2786         struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2787         int ret = -ENOMEM;
2788
2789         BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2790         BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
2791         atomic_set(&cnet->count, 0);
2792
2793         net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2794         if (!net->ct.stat)
2795                 return ret;
2796
2797         ret = nf_conntrack_expect_pernet_init(net);
2798         if (ret < 0)
2799                 goto err_expect;
2800
2801         nf_conntrack_acct_pernet_init(net);
2802         nf_conntrack_tstamp_pernet_init(net);
2803         nf_conntrack_ecache_pernet_init(net);
2804         nf_conntrack_proto_pernet_init(net);
2805
2806         return 0;
2807
2808 err_expect:
2809         free_percpu(net->ct.stat);
2810         return ret;
2811 }
2812
2813 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
2814
2815 int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
2816 {
2817         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2818                 return -EPERM;
2819
2820         __nf_ct_set_timeout(ct, timeout);
2821
2822         if (test_bit(IPS_DYING_BIT, &ct->status))
2823                 return -ETIME;
2824
2825         return 0;
2826 }
2827 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
2828
2829 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
2830 {
2831         unsigned int bit;
2832
2833         /* Ignore these unchangable bits */
2834         on &= ~IPS_UNCHANGEABLE_MASK;
2835         off &= ~IPS_UNCHANGEABLE_MASK;
2836
2837         for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
2838                 if (on & (1 << bit))
2839                         set_bit(bit, &ct->status);
2840                 else if (off & (1 << bit))
2841                         clear_bit(bit, &ct->status);
2842         }
2843 }
2844 EXPORT_SYMBOL_GPL(__nf_ct_change_status);
2845
2846 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
2847 {
2848         unsigned long d;
2849
2850         d = ct->status ^ status;
2851
2852         if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
2853                 /* unchangeable */
2854                 return -EBUSY;
2855
2856         if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
2857                 /* SEEN_REPLY bit can only be set */
2858                 return -EBUSY;
2859
2860         if (d & IPS_ASSURED && !(status & IPS_ASSURED))
2861                 /* ASSURED bit can only be set */
2862                 return -EBUSY;
2863
2864         __nf_ct_change_status(ct, status, 0);
2865         return 0;
2866 }
2867 EXPORT_SYMBOL_GPL(nf_ct_change_status_common);