diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 770a631..31399c5 100644
@@ -66,6 +66,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 struct conntrack_gc_work {
        struct delayed_work     dwork;
        u32                     next_bucket;
+       u32                     avg_timeout;
+       u32                     start_time;
        bool                    exiting;
        bool                    early_drop;
 };
@@ -77,8 +79,19 @@ static __read_mostly bool nf_conntrack_locks_all;
 /* serialize hash resizes and nf_ct_iterate_cleanup */
 static DEFINE_MUTEX(nf_conntrack_mutex);
 
-#define GC_SCAN_INTERVAL       (120u * HZ)
+#define GC_SCAN_INTERVAL_MAX   (60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN   (1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ)
+
+/* large initial bias so that we don't scan often just because we have
+ * three entries with a 1s timeout.
+ */
+#define GC_SCAN_INTERVAL_INIT  INT_MAX
+
 #define GC_SCAN_MAX_DURATION   msecs_to_jiffies(10)
+#define GC_SCAN_EXPIRED_MAX    (64000u / HZ)
 
 #define MIN_CHAINLEN   8u
 #define MAX_CHAINLEN   (32u - MIN_CHAINLEN)
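
Taken together, these constants drive a simple feedback loop: every entry the worker scans contributes its remaining lifetime, clamped to GC_SCAN_INTERVAL_CLAMP, to a running average, and the final reschedule is clamped to [GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX]. A minimal sketch of the folding step, assuming the same jiffies-based units as the code below (illustrative only; the real logic is inlined in gc_worker()):

/* Sketch, not part of the patch: fold one entry's remaining
 * lifetime into the rescheduling estimate.
 */
static unsigned long gc_fold_expiry(unsigned long avg, unsigned long expires)
{
	/* ignore pathological timeouts, e.g. TCP with unacked data */
	expires = clamp(expires, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);

	/* exponential moving average with weight 1/2 */
	return (avg + expires) / 2u;
}

Seeding the average with GC_SCAN_INTERVAL_INIT (INT_MAX) means a handful of short-lived entries cannot drag the interval down on their own; the final clamp to GC_SCAN_INTERVAL_MAX turns the oversized seed into a 60s scan period.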
@@ -558,7 +571,7 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 
 #define NFCT_ALIGN(len)        (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
 
-/* Released via destroy_conntrack() */
+/* Released via nf_ct_destroy() */
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags)
@@ -585,7 +598,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
        tmpl->status = IPS_TEMPLATE;
        write_pnet(&tmpl->ct_net, net);
        nf_ct_zone_add(tmpl, zone);
-       atomic_set(&tmpl->ct_general.use, 0);
+       refcount_set(&tmpl->ct_general.use, 1);
 
        return tmpl;
 }
@@ -612,13 +625,12 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
 #endif
 }
 
-static void
-destroy_conntrack(struct nf_conntrack *nfct)
+void nf_ct_destroy(struct nf_conntrack *nfct)
 {
        struct nf_conn *ct = (struct nf_conn *)nfct;
 
-       pr_debug("destroy_conntrack(%p)\n", ct);
-       WARN_ON(atomic_read(&nfct->use) != 0);
+       pr_debug("%s(%p)\n", __func__, ct);
+       WARN_ON(refcount_read(&nfct->use) != 0);
 
        if (unlikely(nf_ct_is_template(ct))) {
                nf_ct_tmpl_free(ct);
@@ -643,9 +655,10 @@ destroy_conntrack(struct nf_conntrack *nfct)
        if (ct->master)
                nf_ct_put(ct->master);
 
-       pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+       pr_debug("%s: returning ct=%p to slab\n", __func__, ct);
        nf_conntrack_free(ct);
 }
+EXPORT_SYMBOL(nf_ct_destroy);
 
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
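
With the atomic_t use counter converted to refcount_t, over- and underflows now warn and saturate instead of wrapping silently. The matching put path (a simplified sketch of the helper in include/net/netfilter/nf_conntrack.h) drops the last reference straight into the newly exported nf_ct_destroy():

/* Simplified sketch of the put side after the conversion: */
static inline void nf_ct_put(struct nf_conn *ct)
{
	if (ct && refcount_dec_and_test(&ct->ct_general.use))
		nf_ct_destroy(&ct->ct_general);
}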
@@ -684,7 +697,7 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp) {
-               s32 timeout = ct->timeout - nfct_time_stamp;
+               s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
 
                tstamp->stop = ktime_get_real_ns();
                if (timeout < 0)
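
ct->timeout can be rewritten while other CPUs read it locklessly; the load here pairs with the WRITE_ONCE() in nf_ct_resolve_clash_harder() further down (and in __nf_conntrack_alloc), so both sides are annotated. A sketch of the pattern:

/* Sketch: annotated lockless read/write pair for ct->timeout.
 * Writer (clash resolution):
 *	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
 * Reader (anywhere outside the ct lock):
 *	s32 left = READ_ONCE(ct->timeout) - nfct_time_stamp;
 * A negative 'left' means the entry has already expired; the
 * annotations prevent load/store tearing and data-race warnings.
 */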
@@ -742,7 +755,7 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
 static void nf_ct_gc_expired(struct nf_conn *ct)
 {
-       if (!atomic_inc_not_zero(&ct->ct_general.use))
+       if (!refcount_inc_not_zero(&ct->ct_general.use))
                return;
 
        if (nf_ct_should_gc(ct))
@@ -810,7 +823,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
                 * in, try to obtain a reference and re-check tuple
                 */
                ct = nf_ct_tuplehash_to_ctrack(h);
-               if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+               if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
                        if (likely(nf_ct_key_equal(h, tuple, zone, net)))
                                goto found;
 
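refcount_inc_not_zero() keeps the lookup discipline that SLAB_TYPESAFE_BY_RCU requires: an entry can be freed and recycled for a different tuple while the chain is walked, so a reference may only be taken while the count is non-zero, and the key must be re-checked afterwards. A sketch with illustrative helper names (chain_lookup/tuple_equal are hypothetical, not kernel API):

/* Sketch of the SLAB_TYPESAFE_BY_RCU lookup pattern. */
rcu_read_lock();
ct = chain_lookup(tuple);			/* may return a recycled object */
if (ct && refcount_inc_not_zero(&ct->ct_general.use)) {
	if (!tuple_equal(ct, tuple)) {		/* reused for another flow? */
		nf_ct_put(ct);
		ct = NULL;			/* caller retries the lookup */
	}
}
rcu_read_unlock();
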
@@ -907,7 +920,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 
        smp_wmb();
        /* The caller holds a reference to this object */
-       atomic_set(&ct->ct_general.use, 2);
+       refcount_set(&ct->ct_general.use, 2);
        __nf_conntrack_hash_insert(ct, hash, reply_hash);
        nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert);
@@ -958,7 +971,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
 {
        struct nf_conn_tstamp *tstamp;
 
-       atomic_inc(&ct->ct_general.use);
+       refcount_inc(&ct->ct_general.use);
        ct->status |= IPS_CONFIRMED;
 
        /* set conntrack timestamp, if enabled. */
@@ -989,7 +1002,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
 
                nf_ct_acct_merge(ct, ctinfo, loser_ct);
                nf_ct_add_to_dying_list(loser_ct);
-               nf_conntrack_put(&loser_ct->ct_general);
+               nf_ct_put(loser_ct);
                nf_ct_set(skb, ct, ctinfo);
 
                NF_CT_STAT_INC(net, clash_resolve);
@@ -1036,7 +1049,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
        }
 
        /* We want the clashing entry to go away real soon: 1 second timeout. */
-       loser_ct->timeout = nfct_time_stamp + HZ;
+       WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
 
        /* IPS_NAT_CLASH removes the entry automatically on the first
         * reply.  Also prevents UDP tracker from moving the entry to
@@ -1351,7 +1364,7 @@ static unsigned int early_drop_list(struct net *net,
                    nf_ct_is_dying(tmp))
                        continue;
 
-               if (!atomic_inc_not_zero(&tmp->ct_general.use))
+               if (!refcount_inc_not_zero(&tmp->ct_general.use))
                        continue;
 
                /* kill only if still in same netns -- might have moved due to
@@ -1420,16 +1433,28 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-       unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
        unsigned int i, hashsz, nf_conntrack_max95 = 0;
-       unsigned long next_run = GC_SCAN_INTERVAL;
+       u32 end_time, start_time = nfct_time_stamp;
        struct conntrack_gc_work *gc_work;
+       unsigned int expired_count = 0;
+       unsigned long next_run;
+       s32 delta_time;
+
        gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
        i = gc_work->next_bucket;
        if (gc_work->early_drop)
                nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
+       if (i == 0) {
+               gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+               gc_work->start_time = start_time;
+       }
+
+       next_run = gc_work->avg_timeout;
+
+       end_time = start_time + GC_SCAN_MAX_DURATION;
+
        do {
                struct nf_conntrack_tuple_hash *h;
                struct hlist_nulls_head *ct_hash;
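
end_time and start_time are now 32-bit nfct_time_stamp values rather than jiffies in an unsigned long, so the scan-budget check later in the loop uses wrap-safe signed arithmetic instead of time_after(). A sketch of the idiom:

/* Sketch: u32 timestamp comparison that survives wraparound. The
 * subtraction interpreted as s32 is the signed distance between the
 * two times.
 */
s32 delta = (s32)(nfct_time_stamp - end_time);
bool budget_spent = delta > 0;	/* GC_SCAN_MAX_DURATION exceeded */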
@@ -1446,6 +1471,7 @@ static void gc_worker(struct work_struct *work)
 
                hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
                        struct nf_conntrack_net *cnet;
+                       unsigned long expires;
                        struct net *net;
 
                        tmp = nf_ct_tuplehash_to_ctrack(h);
@@ -1455,11 +1481,29 @@ static void gc_worker(struct work_struct *work)
                                continue;
                        }
 
+                       if (expired_count > GC_SCAN_EXPIRED_MAX) {
+                               rcu_read_unlock();
+
+                               gc_work->next_bucket = i;
+                               gc_work->avg_timeout = next_run;
+
+                               delta_time = nfct_time_stamp - gc_work->start_time;
+
+                               /* re-sched immediately if total cycle time is exceeded */
+                               next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
+                               goto early_exit;
+                       }
+
                        if (nf_ct_is_expired(tmp)) {
                                nf_ct_gc_expired(tmp);
+                               expired_count++;
                                continue;
                        }
 
+                       expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+                       next_run += expires;
+                       next_run /= 2u;
+
                        if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
                                continue;
 
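A worked example of the averaging: the seed is INT_MAX (roughly 2^31 jiffies), and each scanned entry halves the distance between the average and its own clamped expiry. Three entries with a 1s timeout leave the average near INT_MAX / 8, which the final clamp reduces to GC_SCAN_INTERVAL_MAX, i.e. one scan per minute on a near-idle table. A table full of short-lived entries converges within a few dozen entries (2^31 shrinks below one jiffy after ~31 halvings), driving the worker toward GC_SCAN_INTERVAL_MIN. The GC_SCAN_EXPIRED_MAX bail-out above caps how much eviction work a single invocation does once the table turns out to be full of expired entries.
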
@@ -1469,7 +1513,7 @@ static void gc_worker(struct work_struct *work)
                                continue;
 
                        /* need to take reference to avoid possible races */
-                       if (!atomic_inc_not_zero(&tmp->ct_general.use))
+                       if (!refcount_inc_not_zero(&tmp->ct_general.use))
                                continue;
 
                        if (gc_worker_skip_ct(tmp)) {
@@ -1477,8 +1521,10 @@ static void gc_worker(struct work_struct *work)
                                continue;
                        }
 
-                       if (gc_worker_can_early_drop(tmp))
+                       if (gc_worker_can_early_drop(tmp)) {
                                nf_ct_kill(tmp);
+                               expired_count++;
+                       }
 
                        nf_ct_put(tmp);
                }
@@ -1491,33 +1537,38 @@ static void gc_worker(struct work_struct *work)
                cond_resched();
                i++;
 
-               if (time_after(jiffies, end_time) && i < hashsz) {
+       delta_time = nfct_time_stamp - end_time;
+               if (delta_time > 0 && i < hashsz) {
+                       gc_work->avg_timeout = next_run;
                        gc_work->next_bucket = i;
                        next_run = 0;
-                       break;
+                       goto early_exit;
                }
        } while (i < hashsz);
 
+       gc_work->next_bucket = 0;
+
+       next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
+
+       delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+       if (next_run > (unsigned long)delta_time)
+               next_run -= delta_time;
+       else
+               next_run = 1;
+
+early_exit:
        if (gc_work->exiting)
                return;
 
-       /*
-        * Eviction will normally happen from the packet path, and not
-        * from this gc worker.
-        *
-        * This worker is only here to reap expired entries when system went
-        * idle after a busy period.
-        */
-       if (next_run) {
+       if (next_run)
                gc_work->early_drop = false;
-               gc_work->next_bucket = 0;
-       }
+
        queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
-       INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
+       INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
        gc_work->exiting = false;
 }
 
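The change from INIT_DEFERRABLE_WORK to INIT_DELAYED_WORK is what makes the computed interval trustworthy: a deferrable timer on an idle CPU is allowed to sleep until the CPU wakes up for another reason, so a 1s next_run could be stretched arbitrarily, while a regular delayed work item fires close to the requested time. Sketch of the contrast (both are existing workqueue APIs):

/* Deferrable: the timer may be deferred while the CPU idles --
 * fine for the old fixed two-minute sweep, wrong for a tuned one.
 *	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
 * Delayed: fires near the requested expiry even on an idle system.
 *	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
 */
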
@@ -1560,7 +1611,7 @@ __nf_conntrack_alloc(struct net *net,
        /* save hash for reusing when confirming */
        *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
        ct->status = 0;
-       ct->timeout = 0;
+       WRITE_ONCE(ct->timeout, 0);
        write_pnet(&ct->ct_net, net);
        memset(&ct->__nfct_init_offset, 0,
               offsetof(struct nf_conn, proto) -
@@ -1571,7 +1622,7 @@ __nf_conntrack_alloc(struct net *net,
        /* Because we use RCU lookups, we set ct_general.use to zero before
         * this is inserted in any list.
         */
-       atomic_set(&ct->ct_general.use, 0);
+       refcount_set(&ct->ct_general.use, 0);
        return ct;
 out:
        atomic_dec(&cnet->count);
@@ -1596,7 +1647,7 @@ void nf_conntrack_free(struct nf_conn *ct)
        /* A freed object has refcnt == 0, that's
         * the golden rule for SLAB_TYPESAFE_BY_RCU
         */
-       WARN_ON(atomic_read(&ct->ct_general.use) != 0);
+       WARN_ON(refcount_read(&ct->ct_general.use) != 0);
 
        nf_ct_ext_destroy(ct);
        kmem_cache_free(nf_conntrack_cachep, ct);
@@ -1688,8 +1739,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
        if (!exp)
                __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
 
-       /* Now it is inserted into the unconfirmed list, bump refcount */
-       nf_conntrack_get(&ct->ct_general);
+       /* Now it is inserted into the unconfirmed list, set refcount to 1. */
+       refcount_set(&ct->ct_general.use, 1);
        nf_ct_add_to_unconfirmed_list(ct);
 
        local_bh_enable();
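
refcount_t forbids incrementing from zero (refcount_inc() on 0 warns and saturates, since that pattern usually indicates a use-after-free), so the old atomic_inc() on the zero-initialized counter is replaced by establishing the first reference explicitly:

/* Sketch: the 0 -> 1 transition must use refcount_set(), not
 * refcount_inc(); this first reference is owned by the
 * unconfirmed list.
 */
refcount_set(&ct->ct_general.use, 1);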
@@ -1920,17 +1971,19 @@ repeat:
                /* Invalid: inverse of the return code tells
                 * the netfilter core what to do */
                pr_debug("nf_conntrack_in: Can't track with proto module\n");
-               nf_conntrack_put(&ct->ct_general);
+               nf_ct_put(ct);
                skb->_nfct = 0;
-               NF_CT_STAT_INC_ATOMIC(state->net, invalid);
-               if (ret == -NF_DROP)
-                       NF_CT_STAT_INC_ATOMIC(state->net, drop);
                /* Special case: TCP tracker reports an attempt to reopen a
                 * closed/aborted connection. We have to go back and create a
                 * fresh conntrack.
                 */
                if (ret == -NF_REPEAT)
                        goto repeat;
+
+               NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+               if (ret == -NF_DROP)
+                       NF_CT_STAT_INC_ATOMIC(state->net, drop);
+
                ret = -ret;
                goto out;
        }
@@ -2299,7 +2352,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
 
        return NULL;
 found:
-       atomic_inc(&ct->ct_general.use);
+       refcount_inc(&ct->ct_general.use);
        spin_unlock(lockp);
        local_bh_enable();
        return ct;
@@ -2772,7 +2825,7 @@ err_cachep:
 
 static struct nf_ct_hook nf_conntrack_hook = {
        .update         = nf_conntrack_update,
-       .destroy        = destroy_conntrack,
+       .destroy        = nf_ct_destroy,
        .get_tuple_skb  = nf_conntrack_get_tuple_skb,
 };