net/sched/sch_fq_pie.c
// SPDX-License-Identifier: GPL-2.0-only
/* Flow Queue PIE discipline
 *
 * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
 * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
 * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
 * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
 * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
 * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
 */

#include <linux/jhash.h>
#include <linux/sizes.h>
#include <linux/vmalloc.h>
#include <net/pkt_cls.h>
#include <net/pie.h>

/* Flow Queue PIE
 *
 * Principles:
 *   - Packets are classified into flows.
 *   - This is a stochastic model (as we use a hash, several flows might
 *                                 be hashed to the same slot).
 *   - Each flow has a PIE managed queue.
 *   - Flows are linked onto two (Round Robin) lists,
 *     so that new flows have priority over old ones.
 *   - For a given flow, packets are not reordered.
 *   - Drops happen during enqueue only.
 *   - ECN capability is off by default.
 *   - The ECN threshold (if ECN is enabled) is at 10% by default.
 *   - Uses timestamps to calculate queue delay by default.
 */
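
/* Example configuration from user space (an illustration only; the device
 * name and parameter values are placeholders, not values enforced here):
 *
 *   tc qdisc add dev eth0 root fq_pie flows 1024 limit 10240 \
 *           target 15ms tupdate 15ms memory_limit 32Mb ecn ecn_prob 10
 *
 * Each keyword corresponds to one of the TCA_FQ_PIE_* netlink attributes
 * parsed in fq_pie_change() below.
 */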

/**
 * struct fq_pie_flow - contains data for each flow
 * @vars:	pie vars associated with the flow
 * @deficit:	number of remaining byte credits
 * @backlog:	size of data in the flow
 * @qlen:	number of packets in the flow
 * @flowchain:	flowchain for the flow
 * @head:	first packet in the flow
 * @tail:	last packet in the flow
 */
struct fq_pie_flow {
	struct pie_vars vars;
	s32 deficit;
	u32 backlog;
	u32 qlen;
	struct list_head flowchain;
	struct sk_buff *head;
	struct sk_buff *tail;
};

struct fq_pie_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct fq_pie_flow *flows;
	struct Qdisc *sch;
	struct list_head old_flows;
	struct list_head new_flows;
	struct pie_params p_params;
	u32 ecn_prob;
	u32 flows_cnt;
	u32 quantum;
	u32 memory_limit;
	u32 new_flow_count;
	u32 memory_usage;
	u32 overmemory;
	struct pie_stats stats;
	struct timer_list adapt_timer;
};

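/* Map the skb's flow hash uniformly onto a slot index in [0, flows_cnt) */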
static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
				struct sk_buff *skb)
{
	return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
}

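/* Pick a flow for the packet: returns a 1-based index in [1, flows_cnt],
 * taken from skb->priority, an attached classifier verdict, or the flow
 * hash.  A return value of 0 means the packet could not be classified and
 * is dropped by the caller.
 */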
static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
				    int *qerr)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->flows_cnt)
		return TC_H_MIN(skb->priority);

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		return fq_pie_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, filter, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->flows_cnt)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_pie_flow *flow,
				  struct sk_buff *skb)
{
	if (!flow->head)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct fq_pie_flow *sel_flow;
	int ret;
	u8 memory_limited = false;
	u8 enqueue = false;
	u32 pkt_len;
	u32 idx;

	/* Classifies packet into corresponding flow */
	idx = fq_pie_classify(skb, sch, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	idx--;

	sel_flow = &q->flows[idx];
	/* Checks whether adding a new packet would exceed memory limit */
	get_pie_cb(skb)->mem_usage = skb->truesize;
	memory_limited = q->memory_usage > q->memory_limit + skb->truesize;

	/* Checks if the qdisc is full */
	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
		q->stats.overlimit++;
		goto out;
	} else if (unlikely(memory_limited)) {
		q->overmemory++;
	}

	if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
			    sel_flow->backlog, skb->len)) {
		enqueue = true;
	} else if (q->p_params.ecn &&
		   sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
		   INET_ECN_set_ce(skb)) {
		/* If packet is ecn capable, mark it if drop probability
		 * is lower than the parameter ecn_prob, else drop it.
		 */
		q->stats.ecn_mark++;
		enqueue = true;
	}
	if (enqueue) {
		/* Set enqueue time only when dq_rate_estimator is disabled. */
		if (!q->p_params.dq_rate_estimator)
			pie_set_enqueue_time(skb);

		pkt_len = qdisc_pkt_len(skb);
		q->stats.packets_in++;
		q->memory_usage += skb->truesize;
		sch->qstats.backlog += pkt_len;
		sch->q.qlen++;
		flow_queue_add(sel_flow, skb);
		if (list_empty(&sel_flow->flowchain)) {
			list_add_tail(&sel_flow->flowchain, &q->new_flows);
			q->new_flow_count++;
			sel_flow->deficit = q->quantum;
			sel_flow->qlen = 0;
			sel_flow->backlog = 0;
		}
		sel_flow->qlen++;
		sel_flow->backlog += pkt_len;
		return NET_XMIT_SUCCESS;
	}
out:
	q->stats.dropped++;
	sel_flow->vars.accu_prob = 0;
	__qdisc_drop(skb, to_free);
	qdisc_qstats_drop(sch);
	return NET_XMIT_CN;
}

static struct netlink_range_validation fq_pie_q_range = {
	.min = 1,
	.max = 1 << 20,
};

static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
	[TCA_FQ_PIE_LIMIT]		= {.type = NLA_U32},
	[TCA_FQ_PIE_FLOWS]		= {.type = NLA_U32},
	[TCA_FQ_PIE_TARGET]		= {.type = NLA_U32},
	[TCA_FQ_PIE_TUPDATE]		= {.type = NLA_U32},
	[TCA_FQ_PIE_ALPHA]		= {.type = NLA_U32},
	[TCA_FQ_PIE_BETA]		= {.type = NLA_U32},
	[TCA_FQ_PIE_QUANTUM]		=
			NLA_POLICY_FULL_RANGE(NLA_U32, &fq_pie_q_range),
	[TCA_FQ_PIE_MEMORY_LIMIT]	= {.type = NLA_U32},
	[TCA_FQ_PIE_ECN_PROB]		= {.type = NLA_U32},
	[TCA_FQ_PIE_ECN]		= {.type = NLA_U32},
	[TCA_FQ_PIE_BYTEMODE]		= {.type = NLA_U32},
	[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]	= {.type = NLA_U32},
};

static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
{
	struct sk_buff *skb = flow->head;

	flow->head = skb->next;
	skb->next = NULL;
	return skb;
}

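/* Deficit round robin dequeue: flows on the new_flows list are serviced
 * before those on old_flows.  A flow that has used up its byte credits is
 * given another 'quantum' and moved to the tail of old_flows; an empty
 * flow is either rotated onto old_flows (forcing a pass through the old
 * flows to prevent starvation) or unlinked from the round robin.
 */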
static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = NULL;
	struct fq_pie_flow *flow;
	struct list_head *head;
	u32 pkt_len;

begin:
	head = &q->new_flows;
	if (list_empty(head)) {
		head = &q->old_flows;
		if (list_empty(head))
			return NULL;
	}

	flow = list_first_entry(head, struct fq_pie_flow, flowchain);
	/* Flow has exhausted all its credits */
	if (flow->deficit <= 0) {
		flow->deficit += q->quantum;
		list_move_tail(&flow->flowchain, &q->old_flows);
		goto begin;
	}

	if (flow->head) {
		skb = dequeue_head(flow);
		pkt_len = qdisc_pkt_len(skb);
		sch->qstats.backlog -= pkt_len;
		sch->q.qlen--;
		qdisc_bstats_update(sch, skb);
	}

	if (!skb) {
		/* force a pass through old_flows to prevent starvation */
		if (head == &q->new_flows && !list_empty(&q->old_flows))
			list_move_tail(&flow->flowchain, &q->old_flows);
		else
			list_del_init(&flow->flowchain);
		goto begin;
	}

	flow->qlen--;
	flow->deficit -= pkt_len;
	flow->backlog -= pkt_len;
	q->memory_usage -= get_pie_cb(skb)->mem_usage;
	pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
	return skb;
}

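/* Handles both the initial configuration (called from fq_pie_init()) and
 * later "tc qdisc change" requests; attributes that are not supplied keep
 * their current values.
 */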
static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
	unsigned int len_dropped = 0;
	unsigned int num_dropped = 0;
	int err;

	err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
	if (err < 0)
		return err;

	sch_tree_lock(sch);
	if (tb[TCA_FQ_PIE_LIMIT]) {
		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);

		q->p_params.limit = limit;
		sch->limit = limit;
	}
	if (tb[TCA_FQ_PIE_FLOWS]) {
		if (q->flows) {
			NL_SET_ERR_MSG_MOD(extack,
					   "Number of flows cannot be changed");
			goto flow_error;
		}
		q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
		if (!q->flows_cnt || q->flows_cnt > 65536) {
			NL_SET_ERR_MSG_MOD(extack,
					   "Number of flows must range in [1..65536]");
			goto flow_error;
		}
	}

	/* convert from microseconds to pschedtime */
	if (tb[TCA_FQ_PIE_TARGET]) {
		/* target is in us */
		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);

		/* convert to pschedtime */
		q->p_params.target =
			PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
	}

	/* tupdate is in jiffies */
	if (tb[TCA_FQ_PIE_TUPDATE])
		q->p_params.tupdate =
			usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));

	if (tb[TCA_FQ_PIE_ALPHA])
		q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);

	if (tb[TCA_FQ_PIE_BETA])
		q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);

	if (tb[TCA_FQ_PIE_QUANTUM])
		q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);

	if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
		q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);

	if (tb[TCA_FQ_PIE_ECN_PROB])
		q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);

	if (tb[TCA_FQ_PIE_ECN])
		q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);

	if (tb[TCA_FQ_PIE_BYTEMODE])
		q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);

	if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
		q->p_params.dq_rate_estimator =
			nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);

	/* Drop excess packets if new limit is lower */
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);

		len_dropped += qdisc_pkt_len(skb);
		num_dropped += 1;
		rtnl_kfree_skbs(skb, skb);
	}
	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);

	sch_tree_unlock(sch);
	return 0;

flow_error:
	sch_tree_unlock(sch);
	return -EINVAL;
}

static void fq_pie_timer(struct timer_list *t)
{
	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock; /* to lock qdisc for probability calculations */
	u32 idx;

	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);

	for (idx = 0; idx < q->flows_cnt; idx++)
		pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
					  q->flows[idx].backlog);

	/* reset the timer to fire after 'tupdate' jiffies. */
	if (q->p_params.tupdate)
		mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);

	spin_unlock(root_lock);
}

static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	int err;
	u32 idx;

	pie_params_init(&q->p_params);
	sch->limit = 10 * 1024;
	q->p_params.limit = sch->limit;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->sch = sch;
	q->ecn_prob = 10;
	q->flows_cnt = 1024;
	q->memory_limit = SZ_32M;

	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	timer_setup(&q->adapt_timer, fq_pie_timer, 0);

	if (opt) {
		err = fq_pie_change(sch, opt, extack);

		if (err)
			return err;
	}

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		goto init_failure;

	q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
			    GFP_KERNEL);
	if (!q->flows) {
		err = -ENOMEM;
		goto init_failure;
	}
	for (idx = 0; idx < q->flows_cnt; idx++) {
		struct fq_pie_flow *flow = q->flows + idx;

		INIT_LIST_HEAD(&flow->flowchain);
		pie_vars_init(&flow->vars);
	}

	mod_timer(&q->adapt_timer, jiffies + HZ / 2);

	return 0;

init_failure:
	q->flows_cnt = 0;

	return err;
}

static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (!opts)
		return -EMSGSIZE;

	/* convert target from pschedtime to us */
	if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
	    nla_put_u32(skb, TCA_FQ_PIE_TARGET,
			((u32)PSCHED_TICKS2NS(q->p_params.target)) /
			NSEC_PER_USEC) ||
	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
			jiffies_to_usecs(q->p_params.tupdate)) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
	    nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
	    nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
	    nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
			q->p_params.dq_rate_estimator))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	struct tc_fq_pie_xstats st = {
		.packets_in	= q->stats.packets_in,
		.overlimit	= q->stats.overlimit,
		.overmemory	= q->overmemory,
		.dropped	= q->stats.dropped,
		.ecn_mark	= q->stats.ecn_mark,
		.new_flow_count = q->new_flow_count,
		.memory_usage	= q->memory_usage,
	};
	struct list_head *pos;

	sch_tree_lock(sch);
	list_for_each(pos, &q->new_flows)
		st.new_flows_len++;

	list_for_each(pos, &q->old_flows)
		st.old_flows_len++;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void fq_pie_reset(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);
	u32 idx;

	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	for (idx = 0; idx < q->flows_cnt; idx++) {
		struct fq_pie_flow *flow = q->flows + idx;

		/* Removes all packets from flow */
		rtnl_kfree_skbs(flow->head, flow->tail);
		flow->head = NULL;

		INIT_LIST_HEAD(&flow->flowchain);
		pie_vars_init(&flow->vars);
	}
}

static void fq_pie_destroy(struct Qdisc *sch)
{
	struct fq_pie_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	q->p_params.tupdate = 0;
	del_timer_sync(&q->adapt_timer);
	kvfree(q->flows);
}

static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
	.id		= "fq_pie",
	.priv_size	= sizeof(struct fq_pie_sched_data),
	.enqueue	= fq_pie_qdisc_enqueue,
	.dequeue	= fq_pie_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_pie_init,
	.destroy	= fq_pie_destroy,
	.reset		= fq_pie_reset,
	.change		= fq_pie_change,
	.dump		= fq_pie_dump,
	.dump_stats	= fq_pie_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init fq_pie_module_init(void)
{
	return register_qdisc(&fq_pie_qdisc_ops);
}

static void __exit fq_pie_module_exit(void)
{
	unregister_qdisc(&fq_pie_qdisc_ops);
}

module_init(fq_pie_module_init);
module_exit(fq_pie_module_exit);

MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
MODULE_AUTHOR("Mohit P. Tahiliani");
MODULE_LICENSE("GPL");