net/ipv4/fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *              This program is free software; you can redistribute it and/or
11  *              modify it under the terms of the GNU General Public License
12  *              as published by the Free Software Foundation; either version
13  *              2 of the License, or (at your option) any later version.
14  */
15
16 #include <linux/uaccess.h>
17 #include <linux/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/jiffies.h>
21 #include <linux/mm.h>
22 #include <linux/string.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/errno.h>
26 #include <linux/in.h>
27 #include <linux/inet.h>
28 #include <linux/inetdevice.h>
29 #include <linux/netdevice.h>
30 #include <linux/if_arp.h>
31 #include <linux/proc_fs.h>
32 #include <linux/skbuff.h>
33 #include <linux/init.h>
34 #include <linux/slab.h>
35
36 #include <net/arp.h>
37 #include <net/ip.h>
38 #include <net/protocol.h>
39 #include <net/route.h>
40 #include <net/tcp.h>
41 #include <net/sock.h>
42 #include <net/ip_fib.h>
43 #include <net/netlink.h>
44 #include <net/nexthop.h>
45 #include <net/lwtunnel.h>
46
47 #include "fib_lookup.h"
48
49 static DEFINE_SPINLOCK(fib_info_lock);
50 static struct hlist_head *fib_info_hash;
51 static struct hlist_head *fib_info_laddrhash;
52 static unsigned int fib_info_hash_size;
53 static unsigned int fib_info_cnt;
54
55 #define DEVINDEX_HASHBITS 8
56 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
57 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
58
59 #ifdef CONFIG_IP_ROUTE_MULTIPATH
60 u32 fib_multipath_secret __read_mostly;
61
62 #define for_nexthops(fi) {                                              \
63         int nhsel; const struct fib_nh *nh;                             \
64         for (nhsel = 0, nh = (fi)->fib_nh;                              \
65              nhsel < (fi)->fib_nhs;                                     \
66              nh++, nhsel++)
67
68 #define change_nexthops(fi) {                                           \
69         int nhsel; struct fib_nh *nexthop_nh;                           \
70         for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);   \
71              nhsel < (fi)->fib_nhs;                                     \
72              nexthop_nh++, nhsel++)
73
74 #else /* CONFIG_IP_ROUTE_MULTIPATH */
75
76 /* Hope that gcc will optimize out the dummy loop */
77
78 #define for_nexthops(fi) {                                              \
79         int nhsel; const struct fib_nh *nh = (fi)->fib_nh;              \
80         for (nhsel = 0; nhsel < 1; nhsel++)
81
82 #define change_nexthops(fi) {                                           \
83         int nhsel;                                                      \
84         struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);    \
85         for (nhsel = 0; nhsel < 1; nhsel++)
86
87 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
88
89 #define endfor_nexthops(fi) }
90
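/*
 * A minimal usage sketch of these iterators, mirroring how they are
 * used throughout this file: for_nexthops() walks the nexthops
 * read-only through "nh", change_nexthops() exposes a writable
 * "nexthop_nh", and both provide the index "nhsel".  The opening brace
 * hidden in the macro must be closed with endfor_nexthops():
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_flags & RTNH_F_DEAD)
 *			continue;
 *		total += nh->nh_weight;
 *	} endfor_nexthops(fi);
 */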
91
92 const struct fib_prop fib_props[RTN_MAX + 1] = {
93         [RTN_UNSPEC] = {
94                 .error  = 0,
95                 .scope  = RT_SCOPE_NOWHERE,
96         },
97         [RTN_UNICAST] = {
98                 .error  = 0,
99                 .scope  = RT_SCOPE_UNIVERSE,
100         },
101         [RTN_LOCAL] = {
102                 .error  = 0,
103                 .scope  = RT_SCOPE_HOST,
104         },
105         [RTN_BROADCAST] = {
106                 .error  = 0,
107                 .scope  = RT_SCOPE_LINK,
108         },
109         [RTN_ANYCAST] = {
110                 .error  = 0,
111                 .scope  = RT_SCOPE_LINK,
112         },
113         [RTN_MULTICAST] = {
114                 .error  = 0,
115                 .scope  = RT_SCOPE_UNIVERSE,
116         },
117         [RTN_BLACKHOLE] = {
118                 .error  = -EINVAL,
119                 .scope  = RT_SCOPE_UNIVERSE,
120         },
121         [RTN_UNREACHABLE] = {
122                 .error  = -EHOSTUNREACH,
123                 .scope  = RT_SCOPE_UNIVERSE,
124         },
125         [RTN_PROHIBIT] = {
126                 .error  = -EACCES,
127                 .scope  = RT_SCOPE_UNIVERSE,
128         },
129         [RTN_THROW] = {
130                 .error  = -EAGAIN,
131                 .scope  = RT_SCOPE_UNIVERSE,
132         },
133         [RTN_NAT] = {
134                 .error  = -EINVAL,
135                 .scope  = RT_SCOPE_NOWHERE,
136         },
137         [RTN_XRESOLVE] = {
138                 .error  = -EINVAL,
139                 .scope  = RT_SCOPE_NOWHERE,
140         },
141 };
142
143 static void rt_fibinfo_free(struct rtable __rcu **rtp)
144 {
145         struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146
147         if (!rt)
148                 return;
149
150         /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
151          * because we waited an RCU grace period before calling
152          * free_fib_info_rcu()
153          */
154
155         dst_free(&rt->dst);
156 }
157
158 static void free_nh_exceptions(struct fib_nh *nh)
159 {
160         struct fnhe_hash_bucket *hash;
161         int i;
162
163         hash = rcu_dereference_protected(nh->nh_exceptions, 1);
164         if (!hash)
165                 return;
166         for (i = 0; i < FNHE_HASH_SIZE; i++) {
167                 struct fib_nh_exception *fnhe;
168
169                 fnhe = rcu_dereference_protected(hash[i].chain, 1);
170                 while (fnhe) {
171                         struct fib_nh_exception *next;
172
173                         next = rcu_dereference_protected(fnhe->fnhe_next, 1);
174
175                         rt_fibinfo_free(&fnhe->fnhe_rth_input);
176                         rt_fibinfo_free(&fnhe->fnhe_rth_output);
177
178                         kfree(fnhe);
179
180                         fnhe = next;
181                 }
182         }
183         kfree(hash);
184 }
185
186 static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
187 {
188         int cpu;
189
190         if (!rtp)
191                 return;
192
193         for_each_possible_cpu(cpu) {
194                 struct rtable *rt;
195
196                 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
197                 if (rt)
198                         dst_free(&rt->dst);
199         }
200         free_percpu(rtp);
201 }
202
203 /* Release a nexthop info record */
204 static void free_fib_info_rcu(struct rcu_head *head)
205 {
206         struct fib_info *fi = container_of(head, struct fib_info, rcu);
207
208         change_nexthops(fi) {
209                 if (nexthop_nh->nh_dev)
210                         dev_put(nexthop_nh->nh_dev);
211                 lwtstate_put(nexthop_nh->nh_lwtstate);
212                 free_nh_exceptions(nexthop_nh);
213                 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
214                 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
215         } endfor_nexthops(fi);
216
217         if (fi->fib_metrics != (u32 *) dst_default_metrics)
218                 kfree(fi->fib_metrics);
219         kfree(fi);
220 }
221
222 void free_fib_info(struct fib_info *fi)
223 {
224         if (fi->fib_dead == 0) {
225                 pr_warn("Freeing alive fib_info %p\n", fi);
226                 return;
227         }
228         fib_info_cnt--;
229 #ifdef CONFIG_IP_ROUTE_CLASSID
230         change_nexthops(fi) {
231                 if (nexthop_nh->nh_tclassid)
232                         fi->fib_net->ipv4.fib_num_tclassid_users--;
233         } endfor_nexthops(fi);
234 #endif
235         call_rcu(&fi->rcu, free_fib_info_rcu);
236 }
237 EXPORT_SYMBOL_GPL(free_fib_info);
238
239 void fib_release_info(struct fib_info *fi)
240 {
241         spin_lock_bh(&fib_info_lock);
242         if (fi && --fi->fib_treeref == 0) {
243                 hlist_del(&fi->fib_hash);
244                 if (fi->fib_prefsrc)
245                         hlist_del(&fi->fib_lhash);
246                 change_nexthops(fi) {
247                         if (!nexthop_nh->nh_dev)
248                                 continue;
249                         hlist_del(&nexthop_nh->nh_hash);
250                 } endfor_nexthops(fi)
251                 fi->fib_dead = 1;
252                 fib_info_put(fi);
253         }
254         spin_unlock_bh(&fib_info_lock);
255 }
256
257 static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
258 {
259         const struct fib_nh *onh = ofi->fib_nh;
260
261         for_nexthops(fi) {
262                 if (nh->nh_oif != onh->nh_oif ||
263                     nh->nh_gw  != onh->nh_gw ||
264                     nh->nh_scope != onh->nh_scope ||
265 #ifdef CONFIG_IP_ROUTE_MULTIPATH
266                     nh->nh_weight != onh->nh_weight ||
267 #endif
268 #ifdef CONFIG_IP_ROUTE_CLASSID
269                     nh->nh_tclassid != onh->nh_tclassid ||
270 #endif
271                     lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
272                     ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
273                         return -1;
274                 onh++;
275         } endfor_nexthops(fi);
276         return 0;
277 }
278
279 static inline unsigned int fib_devindex_hashfn(unsigned int val)
280 {
281         unsigned int mask = DEVINDEX_HASHSIZE - 1;
282
283         return (val ^
284                 (val >> DEVINDEX_HASHBITS) ^
285                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
286 }
287
288 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
289 {
290         unsigned int mask = (fib_info_hash_size - 1);
291         unsigned int val = fi->fib_nhs;
292
293         val ^= (fi->fib_protocol << 8) | fi->fib_scope;
294         val ^= (__force u32)fi->fib_prefsrc;
295         val ^= fi->fib_priority;
296         for_nexthops(fi) {
297                 val ^= fib_devindex_hashfn(nh->nh_oif);
298         } endfor_nexthops(fi)
299
300         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
301 }
302
303 static struct fib_info *fib_find_info(const struct fib_info *nfi)
304 {
305         struct hlist_head *head;
306         struct fib_info *fi;
307         unsigned int hash;
308
309         hash = fib_info_hashfn(nfi);
310         head = &fib_info_hash[hash];
311
312         hlist_for_each_entry(fi, head, fib_hash) {
313                 if (!net_eq(fi->fib_net, nfi->fib_net))
314                         continue;
315                 if (fi->fib_nhs != nfi->fib_nhs)
316                         continue;
317                 if (nfi->fib_protocol == fi->fib_protocol &&
318                     nfi->fib_scope == fi->fib_scope &&
319                     nfi->fib_prefsrc == fi->fib_prefsrc &&
320                     nfi->fib_priority == fi->fib_priority &&
321                     nfi->fib_type == fi->fib_type &&
322                     memcmp(nfi->fib_metrics, fi->fib_metrics,
323                            sizeof(u32) * RTAX_MAX) == 0 &&
324                     !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
325                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
326                         return fi;
327         }
328
329         return NULL;
330 }
331
332 /* Check that the gateway is already configured.
333  * Used only by the redirect accept routine.
334  */
335 int ip_fib_check_default(__be32 gw, struct net_device *dev)
336 {
337         struct hlist_head *head;
338         struct fib_nh *nh;
339         unsigned int hash;
340
341         spin_lock(&fib_info_lock);
342
343         hash = fib_devindex_hashfn(dev->ifindex);
344         head = &fib_info_devhash[hash];
345         hlist_for_each_entry(nh, head, nh_hash) {
346                 if (nh->nh_dev == dev &&
347                     nh->nh_gw == gw &&
348                     !(nh->nh_flags & RTNH_F_DEAD)) {
349                         spin_unlock(&fib_info_lock);
350                         return 0;
351                 }
352         }
353
354         spin_unlock(&fib_info_lock);
355
356         return -1;
357 }
358
359 static inline size_t fib_nlmsg_size(struct fib_info *fi)
360 {
361         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
362                          + nla_total_size(4) /* RTA_TABLE */
363                          + nla_total_size(4) /* RTA_DST */
364                          + nla_total_size(4) /* RTA_PRIORITY */
365                          + nla_total_size(4) /* RTA_PREFSRC */
366                          + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
367
368         /* space for nested metrics */
369         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
370
371         if (fi->fib_nhs) {
372                 size_t nh_encapsize = 0;
373                 /* Also handles the special case fib_nhs == 1 */
374
375                 /* each nexthop is packed in an attribute */
376                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
377
378                 /* may contain flow and gateway attribute */
379                 nhsize += 2 * nla_total_size(4);
380
381                 /* grab encap info */
382                 for_nexthops(fi) {
383                         if (nh->nh_lwtstate) {
384                                 /* RTA_ENCAP_TYPE */
385                                 nh_encapsize += lwtunnel_get_encap_size(
386                                                 nh->nh_lwtstate);
387                                 /* RTA_ENCAP */
388                                 nh_encapsize +=  nla_total_size(2);
389                         }
390                 } endfor_nexthops(fi);
391
392                 /* all nexthops are packed in a nested attribute */
393                 payload += nla_total_size((fi->fib_nhs * nhsize) +
394                                           nh_encapsize);
395
396         }
397
398         return payload;
399 }
400
401 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
402                int dst_len, u32 tb_id, const struct nl_info *info,
403                unsigned int nlm_flags)
404 {
405         struct sk_buff *skb;
406         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
407         int err = -ENOBUFS;
408
409         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
410         if (!skb)
411                 goto errout;
412
413         err = fib_dump_info(skb, info->portid, seq, event, tb_id,
414                             fa->fa_type, key, dst_len,
415                             fa->fa_tos, fa->fa_info, nlm_flags);
416         if (err < 0) {
417                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
418                 WARN_ON(err == -EMSGSIZE);
419                 kfree_skb(skb);
420                 goto errout;
421         }
422         rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
423                     info->nlh, GFP_KERNEL);
424         return;
425 errout:
426         if (err < 0)
427                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
428 }
429
430 static int fib_detect_death(struct fib_info *fi, int order,
431                             struct fib_info **last_resort, int *last_idx,
432                             int dflt)
433 {
434         struct neighbour *n;
435         int state = NUD_NONE;
436
437         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
438         if (n) {
439                 state = n->nud_state;
440                 neigh_release(n);
441         } else {
442                 return 0;
443         }
444         if (state == NUD_REACHABLE)
445                 return 0;
446         if ((state & NUD_VALID) && order != dflt)
447                 return 0;
448         if ((state & NUD_VALID) ||
449             (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
450                 *last_resort = fi;
451                 *last_idx = order;
452         }
453         return 1;
454 }
455
456 #ifdef CONFIG_IP_ROUTE_MULTIPATH
457
458 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
459 {
460         int nhs = 0;
461
462         while (rtnh_ok(rtnh, remaining)) {
463                 nhs++;
464                 rtnh = rtnh_next(rtnh, &remaining);
465         }
466
467         /* leftover implies invalid nexthop configuration, discard it */
468         return remaining > 0 ? 0 : nhs;
469 }
470
471 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
472                        int remaining, struct fib_config *cfg)
473 {
474         int ret;
475
476         change_nexthops(fi) {
477                 int attrlen;
478
479                 if (!rtnh_ok(rtnh, remaining))
480                         return -EINVAL;
481
482                 if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
483                         return -EINVAL;
484
485                 nexthop_nh->nh_flags =
486                         (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
487                 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
488                 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
489
490                 attrlen = rtnh_attrlen(rtnh);
491                 if (attrlen > 0) {
492                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
493
494                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
495                         nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
496 #ifdef CONFIG_IP_ROUTE_CLASSID
497                         nla = nla_find(attrs, attrlen, RTA_FLOW);
498                         nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
499                         if (nexthop_nh->nh_tclassid)
500                                 fi->fib_net->ipv4.fib_num_tclassid_users++;
501 #endif
502                         nla = nla_find(attrs, attrlen, RTA_ENCAP);
503                         if (nla) {
504                                 struct lwtunnel_state *lwtstate;
505                                 struct nlattr *nla_entype;
506
507                                 nla_entype = nla_find(attrs, attrlen,
508                                                       RTA_ENCAP_TYPE);
509                                 if (!nla_entype)
510                                         goto err_inval;
511
512                                 ret = lwtunnel_build_state(nla_get_u16(
513                                                            nla_entype),
514                                                            nla,  AF_INET, cfg,
515                                                            &lwtstate);
516                                 if (ret)
517                                         goto errout;
518                                 nexthop_nh->nh_lwtstate =
519                                         lwtstate_get(lwtstate);
520                         }
521                 }
522
523                 rtnh = rtnh_next(rtnh, &remaining);
524         } endfor_nexthops(fi);
525
526         return 0;
527
528 err_inval:
529         ret = -EINVAL;
530
531 errout:
532         return ret;
533 }
534
535 static void fib_rebalance(struct fib_info *fi)
536 {
537         int total;
538         int w;
539         struct in_device *in_dev;
540
541         if (fi->fib_nhs < 2)
542                 return;
543
544         total = 0;
545         for_nexthops(fi) {
546                 if (nh->nh_flags & RTNH_F_DEAD)
547                         continue;
548
549                 in_dev = __in_dev_get_rtnl(nh->nh_dev);
550
551                 if (in_dev &&
552                     IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
553                     nh->nh_flags & RTNH_F_LINKDOWN)
554                         continue;
555
556                 total += nh->nh_weight;
557         } endfor_nexthops(fi);
558
559         w = 0;
560         change_nexthops(fi) {
561                 int upper_bound;
562
563                 in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
564
565                 if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
566                         upper_bound = -1;
567                 } else if (in_dev &&
568                            IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
569                            nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
570                         upper_bound = -1;
571                 } else {
572                         w += nexthop_nh->nh_weight;
573                         upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
574                                                             total) - 1;
575                 }
576
577                 atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
578         } endfor_nexthops(fi);
579
580         net_get_random_once(&fib_multipath_secret,
581                             sizeof(fib_multipath_secret));
582 }
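/*
 * Worked example for the upper-bound computation above (illustrative
 * numbers, not from the original source): with two usable nexthops of
 * weight 1 and 3, total = 4 and the running sum w yields
 *
 *	nh0: DIV_ROUND_CLOSEST_ULL(1ULL << 31, 4) - 1 = 0x1fffffff
 *	nh1: DIV_ROUND_CLOSEST_ULL(4ULL << 31, 4) - 1 = 0x7fffffff
 *
 * so 31-bit flow hashes up to 0x1fffffff (one quarter of the space)
 * select nh0 and the rest select nh1, matching the 1:3 weight ratio.
 * Dead or ignored nexthops are given upper_bound = -1 and are never
 * chosen by fib_select_multipath().
 */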
583
584 static inline void fib_add_weight(struct fib_info *fi,
585                                   const struct fib_nh *nh)
586 {
587         fi->fib_weight += nh->nh_weight;
588 }
589
590 #else /* CONFIG_IP_ROUTE_MULTIPATH */
591
592 #define fib_rebalance(fi) do { } while (0)
593 #define fib_add_weight(fi, nh) do { } while (0)
594
595 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
596
597 static int fib_encap_match(u16 encap_type,
598                            struct nlattr *encap,
599                            const struct fib_nh *nh,
600                            const struct fib_config *cfg)
601 {
602         struct lwtunnel_state *lwtstate;
603         int ret, result = 0;
604
605         if (encap_type == LWTUNNEL_ENCAP_NONE)
606                 return 0;
607
608         ret = lwtunnel_build_state(encap_type, encap,
609                                    AF_INET, cfg, &lwtstate);
610         if (!ret) {
611                 result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
612                 lwtstate_free(lwtstate);
613         }
614
615         return result;
616 }
617
618 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
619 {
620 #ifdef CONFIG_IP_ROUTE_MULTIPATH
621         struct rtnexthop *rtnh;
622         int remaining;
623 #endif
624
625         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
626                 return 1;
627
628         if (cfg->fc_oif || cfg->fc_gw) {
629                 if (cfg->fc_encap) {
630                         if (fib_encap_match(cfg->fc_encap_type,
631                                             cfg->fc_encap, fi->fib_nh, cfg))
632                             return 1;
633                 }
634                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
635                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
636                         return 0;
637                 return 1;
638         }
639
640 #ifdef CONFIG_IP_ROUTE_MULTIPATH
641         if (!cfg->fc_mp)
642                 return 0;
643
644         rtnh = cfg->fc_mp;
645         remaining = cfg->fc_mp_len;
646
647         for_nexthops(fi) {
648                 int attrlen;
649
650                 if (!rtnh_ok(rtnh, remaining))
651                         return -EINVAL;
652
653                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
654                         return 1;
655
656                 attrlen = rtnh_attrlen(rtnh);
657                 if (attrlen > 0) {
658                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
659
660                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
661                         if (nla && nla_get_in_addr(nla) != nh->nh_gw)
662                                 return 1;
663 #ifdef CONFIG_IP_ROUTE_CLASSID
664                         nla = nla_find(attrs, attrlen, RTA_FLOW);
665                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
666                                 return 1;
667 #endif
668                 }
669
670                 rtnh = rtnh_next(rtnh, &remaining);
671         } endfor_nexthops(fi);
672 #endif
673         return 0;
674 }
675
676
677 /*
678  * Picture
679  * -------
680  *
681  * The semantics of nexthops are very messy for historical reasons.
682  * We have to take into account that:
683  * a) the gateway can actually be a local interface address,
684  *    so that a gatewayed route is direct.
685  * b) the gateway must be an on-link address, possibly
686  *    described not by an ifaddr but by a direct route.
687  * c) if both a gateway and an interface are specified, they must not
688  *    contradict each other.
689  * d) if we use tunnel routes, the gateway may not be on-link.
690  *
691  * Attempting to reconcile all of these (alas, self-contradictory)
692  * conditions results in pretty ugly and hairy code with obscure logic.
693  *
694  * I chose to generalize it instead, so that the size
695  * of the code barely increases, but it becomes
696  * much more general.
697  * Every prefix is assigned a "scope" value: "host" is a local address,
698  * "link" is a direct route,
699  * [ ... "site" ... "interior" ... ]
700  * and "universe" is a true gateway route with global meaning.
701  *
702  * Every prefix refers to a set of "nexthop"s (gw, oif),
703  * where gw must have a narrower scope. This recursion stops
704  * when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
705  * which means that gw is forced to be on-link.
706  *
707  * The code is still hairy, but now it is apparently logically
708  * consistent and very flexible. For example, as a by-product it allows
709  * independent exterior and interior routing processes to
710  * coexist in peace.
711  *
712  * Normally it looks like the following:
713  *
714  * {universe prefix}  -> (gw, oif) [scope link]
715  *                |
716  *                |-> {link prefix} -> (gw, oif) [scope local]
717  *                                      |
718  *                                      |-> {local prefix} (terminal node)
719  */
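/*
 * A concrete illustration of the picture above (example addresses,
 * not from the original source): adding
 *
 *	198.51.100.0/24 via 192.0.2.1 dev eth0    [scope universe]
 *
 * makes fib_check_nh() below look up the gateway 192.0.2.1 with a
 * scope of at least RT_SCOPE_LINK.  That lookup must hit a
 * narrower-scope prefix, typically the connected route
 *
 *	192.0.2.0/24 dev eth0                     [scope link]
 *
 * whose nexthop has no gateway and thus terminates the recursion.
 * With RTNH_F_ONLINK the lookup is skipped and the gateway is simply
 * trusted to be reachable on the given device.
 */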
720 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
721                         struct fib_nh *nh)
722 {
723         int err = 0;
724         struct net *net;
725         struct net_device *dev;
726
727         net = cfg->fc_nlinfo.nl_net;
728         if (nh->nh_gw) {
729                 struct fib_result res;
730
731                 if (nh->nh_flags & RTNH_F_ONLINK) {
732                         unsigned int addr_type;
733
734                         if (cfg->fc_scope >= RT_SCOPE_LINK)
735                                 return -EINVAL;
736                         dev = __dev_get_by_index(net, nh->nh_oif);
737                         if (!dev)
738                                 return -ENODEV;
739                         if (!(dev->flags & IFF_UP))
740                                 return -ENETDOWN;
741                         addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
742                         if (addr_type != RTN_UNICAST)
743                                 return -EINVAL;
744                         if (!netif_carrier_ok(dev))
745                                 nh->nh_flags |= RTNH_F_LINKDOWN;
746                         nh->nh_dev = dev;
747                         dev_hold(dev);
748                         nh->nh_scope = RT_SCOPE_LINK;
749                         return 0;
750                 }
751                 rcu_read_lock();
752                 {
753                         struct fib_table *tbl = NULL;
754                         struct flowi4 fl4 = {
755                                 .daddr = nh->nh_gw,
756                                 .flowi4_scope = cfg->fc_scope + 1,
757                                 .flowi4_oif = nh->nh_oif,
758                                 .flowi4_iif = LOOPBACK_IFINDEX,
759                         };
760
761                         /* It is not necessary, but requires a bit of thinking */
762                         if (fl4.flowi4_scope < RT_SCOPE_LINK)
763                                 fl4.flowi4_scope = RT_SCOPE_LINK;
764
765                         if (cfg->fc_table)
766                                 tbl = fib_get_table(net, cfg->fc_table);
767
768                         if (tbl)
769                                 err = fib_table_lookup(tbl, &fl4, &res,
770                                                        FIB_LOOKUP_IGNORE_LINKSTATE |
771                                                        FIB_LOOKUP_NOREF);
772
773                         /* on error or if no table given do full lookup. This
774                          * is needed for example when nexthops are in the local
775                          * table rather than the given table
776                          */
777                         if (!tbl || err) {
778                                 err = fib_lookup(net, &fl4, &res,
779                                                  FIB_LOOKUP_IGNORE_LINKSTATE);
780                         }
781
782                         if (err) {
783                                 rcu_read_unlock();
784                                 return err;
785                         }
786                 }
787                 err = -EINVAL;
788                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
789                         goto out;
790                 nh->nh_scope = res.scope;
791                 nh->nh_oif = FIB_RES_OIF(res);
792                 nh->nh_dev = dev = FIB_RES_DEV(res);
793                 if (!dev)
794                         goto out;
795                 dev_hold(dev);
796                 if (!netif_carrier_ok(dev))
797                         nh->nh_flags |= RTNH_F_LINKDOWN;
798                 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
799         } else {
800                 struct in_device *in_dev;
801
802                 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
803                         return -EINVAL;
804
805                 rcu_read_lock();
806                 err = -ENODEV;
807                 in_dev = inetdev_by_index(net, nh->nh_oif);
808                 if (!in_dev)
809                         goto out;
810                 err = -ENETDOWN;
811                 if (!(in_dev->dev->flags & IFF_UP))
812                         goto out;
813                 nh->nh_dev = in_dev->dev;
814                 dev_hold(nh->nh_dev);
815                 nh->nh_scope = RT_SCOPE_HOST;
816                 if (!netif_carrier_ok(nh->nh_dev))
817                         nh->nh_flags |= RTNH_F_LINKDOWN;
818                 err = 0;
819         }
820 out:
821         rcu_read_unlock();
822         return err;
823 }
824
825 static inline unsigned int fib_laddr_hashfn(__be32 val)
826 {
827         unsigned int mask = (fib_info_hash_size - 1);
828
829         return ((__force u32)val ^
830                 ((__force u32)val >> 7) ^
831                 ((__force u32)val >> 14)) & mask;
832 }
833
834 static struct hlist_head *fib_info_hash_alloc(int bytes)
835 {
836         if (bytes <= PAGE_SIZE)
837                 return kzalloc(bytes, GFP_KERNEL);
838         else
839                 return (struct hlist_head *)
840                         __get_free_pages(GFP_KERNEL | __GFP_ZERO,
841                                          get_order(bytes));
842 }
843
844 static void fib_info_hash_free(struct hlist_head *hash, int bytes)
845 {
846         if (!hash)
847                 return;
848
849         if (bytes <= PAGE_SIZE)
850                 kfree(hash);
851         else
852                 free_pages((unsigned long) hash, get_order(bytes));
853 }
854
855 static void fib_info_hash_move(struct hlist_head *new_info_hash,
856                                struct hlist_head *new_laddrhash,
857                                unsigned int new_size)
858 {
859         struct hlist_head *old_info_hash, *old_laddrhash;
860         unsigned int old_size = fib_info_hash_size;
861         unsigned int i, bytes;
862
863         spin_lock_bh(&fib_info_lock);
864         old_info_hash = fib_info_hash;
865         old_laddrhash = fib_info_laddrhash;
866         fib_info_hash_size = new_size;
867
868         for (i = 0; i < old_size; i++) {
869                 struct hlist_head *head = &fib_info_hash[i];
870                 struct hlist_node *n;
871                 struct fib_info *fi;
872
873                 hlist_for_each_entry_safe(fi, n, head, fib_hash) {
874                         struct hlist_head *dest;
875                         unsigned int new_hash;
876
877                         new_hash = fib_info_hashfn(fi);
878                         dest = &new_info_hash[new_hash];
879                         hlist_add_head(&fi->fib_hash, dest);
880                 }
881         }
882         fib_info_hash = new_info_hash;
883
884         for (i = 0; i < old_size; i++) {
885                 struct hlist_head *lhead = &fib_info_laddrhash[i];
886                 struct hlist_node *n;
887                 struct fib_info *fi;
888
889                 hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
890                         struct hlist_head *ldest;
891                         unsigned int new_hash;
892
893                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
894                         ldest = &new_laddrhash[new_hash];
895                         hlist_add_head(&fi->fib_lhash, ldest);
896                 }
897         }
898         fib_info_laddrhash = new_laddrhash;
899
900         spin_unlock_bh(&fib_info_lock);
901
902         bytes = old_size * sizeof(struct hlist_head *);
903         fib_info_hash_free(old_info_hash, bytes);
904         fib_info_hash_free(old_laddrhash, bytes);
905 }
906
907 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
908 {
909         nh->nh_saddr = inet_select_addr(nh->nh_dev,
910                                         nh->nh_gw,
911                                         nh->nh_parent->fib_scope);
912         nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
913
914         return nh->nh_saddr;
915 }
916
917 static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
918 {
919         if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
920             fib_prefsrc != cfg->fc_dst) {
921                 u32 tb_id = cfg->fc_table;
922                 int rc;
923
924                 if (tb_id == RT_TABLE_MAIN)
925                         tb_id = RT_TABLE_LOCAL;
926
927                 rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
928                                           fib_prefsrc, tb_id);
929
930                 if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
931                         rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
932                                                   fib_prefsrc, RT_TABLE_LOCAL);
933                 }
934
935                 if (rc != RTN_LOCAL)
936                         return false;
937         }
938         return true;
939 }
940
941 static int
942 fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
943 {
944         bool ecn_ca = false;
945         struct nlattr *nla;
946         int remaining;
947
948         if (!cfg->fc_mx)
949                 return 0;
950
951         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
952                 int type = nla_type(nla);
953                 u32 val;
954
955                 if (!type)
956                         continue;
957                 if (type > RTAX_MAX)
958                         return -EINVAL;
959
960                 if (type == RTAX_CC_ALGO) {
961                         char tmp[TCP_CA_NAME_MAX];
962
963                         nla_strlcpy(tmp, nla, sizeof(tmp));
964                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
965                         if (val == TCP_CA_UNSPEC)
966                                 return -EINVAL;
967                 } else {
968                         val = nla_get_u32(nla);
969                 }
970                 if (type == RTAX_ADVMSS && val > 65535 - 40)
971                         val = 65535 - 40;
972                 if (type == RTAX_MTU && val > 65535 - 15)
973                         val = 65535 - 15;
974                 if (type == RTAX_HOPLIMIT && val > 255)
975                         val = 255;
976                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
977                         return -EINVAL;
978                 fi->fib_metrics[type - 1] = val;
979         }
980
981         if (ecn_ca)
982                 fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
983
984         return 0;
985 }
986
987 struct fib_info *fib_create_info(struct fib_config *cfg)
988 {
989         int err;
990         struct fib_info *fi = NULL;
991         struct fib_info *ofi;
992         int nhs = 1;
993         struct net *net = cfg->fc_nlinfo.nl_net;
994
995         if (cfg->fc_type > RTN_MAX)
996                 goto err_inval;
997
998         /* Fast check to catch the most weird cases */
999         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
1000                 goto err_inval;
1001
1002         if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
1003                 goto err_inval;
1004
1005 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1006         if (cfg->fc_mp) {
1007                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
1008                 if (nhs == 0)
1009                         goto err_inval;
1010         }
1011 #endif
1012
1013         err = -ENOBUFS;
1014         if (fib_info_cnt >= fib_info_hash_size) {
1015                 unsigned int new_size = fib_info_hash_size << 1;
1016                 struct hlist_head *new_info_hash;
1017                 struct hlist_head *new_laddrhash;
1018                 unsigned int bytes;
1019
1020                 if (!new_size)
1021                         new_size = 16;
1022                 bytes = new_size * sizeof(struct hlist_head *);
1023                 new_info_hash = fib_info_hash_alloc(bytes);
1024                 new_laddrhash = fib_info_hash_alloc(bytes);
1025                 if (!new_info_hash || !new_laddrhash) {
1026                         fib_info_hash_free(new_info_hash, bytes);
1027                         fib_info_hash_free(new_laddrhash, bytes);
1028                 } else
1029                         fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
1030
1031                 if (!fib_info_hash_size)
1032                         goto failure;
1033         }
1034
1035         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
1036         if (!fi)
1037                 goto failure;
1038         fib_info_cnt++;
1039         if (cfg->fc_mx) {
1040                 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1041                 if (!fi->fib_metrics)
1042                         goto failure;
1043         } else
1044                 fi->fib_metrics = (u32 *) dst_default_metrics;
1045
1046         fi->fib_net = net;
1047         fi->fib_protocol = cfg->fc_protocol;
1048         fi->fib_scope = cfg->fc_scope;
1049         fi->fib_flags = cfg->fc_flags;
1050         fi->fib_priority = cfg->fc_priority;
1051         fi->fib_prefsrc = cfg->fc_prefsrc;
1052         fi->fib_type = cfg->fc_type;
1053         fi->fib_tb_id = cfg->fc_table;
1054
1055         fi->fib_nhs = nhs;
1056         change_nexthops(fi) {
1057                 nexthop_nh->nh_parent = fi;
1058                 nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
1059                 if (!nexthop_nh->nh_pcpu_rth_output)
1060                         goto failure;
1061         } endfor_nexthops(fi)
1062
1063         err = fib_convert_metrics(fi, cfg);
1064         if (err)
1065                 goto failure;
1066
1067         if (cfg->fc_mp) {
1068 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1069                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
1070                 if (err != 0)
1071                         goto failure;
1072                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
1073                         goto err_inval;
1074                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
1075                         goto err_inval;
1076 #ifdef CONFIG_IP_ROUTE_CLASSID
1077                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
1078                         goto err_inval;
1079 #endif
1080 #else
1081                 goto err_inval;
1082 #endif
1083         } else {
1084                 struct fib_nh *nh = fi->fib_nh;
1085
1086                 if (cfg->fc_encap) {
1087                         struct lwtunnel_state *lwtstate;
1088
1089                         if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
1090                                 goto err_inval;
1091                         err = lwtunnel_build_state(cfg->fc_encap_type,
1092                                                    cfg->fc_encap, AF_INET, cfg,
1093                                                    &lwtstate);
1094                         if (err)
1095                                 goto failure;
1096
1097                         nh->nh_lwtstate = lwtstate_get(lwtstate);
1098                 }
1099                 nh->nh_oif = cfg->fc_oif;
1100                 nh->nh_gw = cfg->fc_gw;
1101                 nh->nh_flags = cfg->fc_flags;
1102 #ifdef CONFIG_IP_ROUTE_CLASSID
1103                 nh->nh_tclassid = cfg->fc_flow;
1104                 if (nh->nh_tclassid)
1105                         fi->fib_net->ipv4.fib_num_tclassid_users++;
1106 #endif
1107 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1108                 nh->nh_weight = 1;
1109 #endif
1110         }
1111
1112         if (fib_props[cfg->fc_type].error) {
1113                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
1114                         goto err_inval;
1115                 goto link_it;
1116         } else {
1117                 switch (cfg->fc_type) {
1118                 case RTN_UNICAST:
1119                 case RTN_LOCAL:
1120                 case RTN_BROADCAST:
1121                 case RTN_ANYCAST:
1122                 case RTN_MULTICAST:
1123                         break;
1124                 default:
1125                         goto err_inval;
1126                 }
1127         }
1128
1129         if (cfg->fc_scope > RT_SCOPE_HOST)
1130                 goto err_inval;
1131
1132         if (cfg->fc_scope == RT_SCOPE_HOST) {
1133                 struct fib_nh *nh = fi->fib_nh;
1134
1135                 /* Local address is added. */
1136                 if (nhs != 1 || nh->nh_gw)
1137                         goto err_inval;
1138                 nh->nh_scope = RT_SCOPE_NOWHERE;
1139                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
1140                 err = -ENODEV;
1141                 if (!nh->nh_dev)
1142                         goto failure;
1143         } else {
1144                 int linkdown = 0;
1145
1146                 change_nexthops(fi) {
1147                         err = fib_check_nh(cfg, fi, nexthop_nh);
1148                         if (err != 0)
1149                                 goto failure;
1150                         if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
1151                                 linkdown++;
1152                 } endfor_nexthops(fi)
1153                 if (linkdown == fi->fib_nhs)
1154                         fi->fib_flags |= RTNH_F_LINKDOWN;
1155         }
1156
1157         if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
1158                 goto err_inval;
1159
1160         change_nexthops(fi) {
1161                 fib_info_update_nh_saddr(net, nexthop_nh);
1162                 fib_add_weight(fi, nexthop_nh);
1163         } endfor_nexthops(fi)
1164
1165         fib_rebalance(fi);
1166
1167 link_it:
1168         ofi = fib_find_info(fi);
1169         if (ofi) {
1170                 fi->fib_dead = 1;
1171                 free_fib_info(fi);
1172                 ofi->fib_treeref++;
1173                 return ofi;
1174         }
1175
1176         fi->fib_treeref++;
1177         atomic_inc(&fi->fib_clntref);
1178         spin_lock_bh(&fib_info_lock);
1179         hlist_add_head(&fi->fib_hash,
1180                        &fib_info_hash[fib_info_hashfn(fi)]);
1181         if (fi->fib_prefsrc) {
1182                 struct hlist_head *head;
1183
1184                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
1185                 hlist_add_head(&fi->fib_lhash, head);
1186         }
1187         change_nexthops(fi) {
1188                 struct hlist_head *head;
1189                 unsigned int hash;
1190
1191                 if (!nexthop_nh->nh_dev)
1192                         continue;
1193                 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
1194                 head = &fib_info_devhash[hash];
1195                 hlist_add_head(&nexthop_nh->nh_hash, head);
1196         } endfor_nexthops(fi)
1197         spin_unlock_bh(&fib_info_lock);
1198         return fi;
1199
1200 err_inval:
1201         err = -EINVAL;
1202
1203 failure:
1204         if (fi) {
1205                 fi->fib_dead = 1;
1206                 free_fib_info(fi);
1207         }
1208
1209         return ERR_PTR(err);
1210 }
1211
1212 int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1213                   u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
1214                   struct fib_info *fi, unsigned int flags)
1215 {
1216         struct nlmsghdr *nlh;
1217         struct rtmsg *rtm;
1218
1219         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
1220         if (!nlh)
1221                 return -EMSGSIZE;
1222
1223         rtm = nlmsg_data(nlh);
1224         rtm->rtm_family = AF_INET;
1225         rtm->rtm_dst_len = dst_len;
1226         rtm->rtm_src_len = 0;
1227         rtm->rtm_tos = tos;
1228         if (tb_id < 256)
1229                 rtm->rtm_table = tb_id;
1230         else
1231                 rtm->rtm_table = RT_TABLE_COMPAT;
1232         if (nla_put_u32(skb, RTA_TABLE, tb_id))
1233                 goto nla_put_failure;
1234         rtm->rtm_type = type;
1235         rtm->rtm_flags = fi->fib_flags;
1236         rtm->rtm_scope = fi->fib_scope;
1237         rtm->rtm_protocol = fi->fib_protocol;
1238
1239         if (rtm->rtm_dst_len &&
1240             nla_put_in_addr(skb, RTA_DST, dst))
1241                 goto nla_put_failure;
1242         if (fi->fib_priority &&
1243             nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
1244                 goto nla_put_failure;
1245         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
1246                 goto nla_put_failure;
1247
1248         if (fi->fib_prefsrc &&
1249             nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
1250                 goto nla_put_failure;
1251         if (fi->fib_nhs == 1) {
1252                 struct in_device *in_dev;
1253
1254                 if (fi->fib_nh->nh_gw &&
1255                     nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
1256                         goto nla_put_failure;
1257                 if (fi->fib_nh->nh_oif &&
1258                     nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
1259                         goto nla_put_failure;
1260                 if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
1261                         in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
1262                         if (in_dev &&
1263                             IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
1264                                 rtm->rtm_flags |= RTNH_F_DEAD;
1265                 }
1266 #ifdef CONFIG_IP_ROUTE_CLASSID
1267                 if (fi->fib_nh[0].nh_tclassid &&
1268                     nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
1269                         goto nla_put_failure;
1270 #endif
1271                 if (fi->fib_nh->nh_lwtstate &&
1272                     lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
1273                         goto nla_put_failure;
1274         }
1275 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1276         if (fi->fib_nhs > 1) {
1277                 struct rtnexthop *rtnh;
1278                 struct nlattr *mp;
1279
1280                 mp = nla_nest_start(skb, RTA_MULTIPATH);
1281                 if (!mp)
1282                         goto nla_put_failure;
1283
1284                 for_nexthops(fi) {
1285                         struct in_device *in_dev;
1286
1287                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1288                         if (!rtnh)
1289                                 goto nla_put_failure;
1290
1291                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1292                         if (nh->nh_flags & RTNH_F_LINKDOWN) {
1293                                 in_dev = __in_dev_get_rtnl(nh->nh_dev);
1294                                 if (in_dev &&
1295                                     IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
1296                                         rtnh->rtnh_flags |= RTNH_F_DEAD;
1297                         }
1298                         rtnh->rtnh_hops = nh->nh_weight - 1;
1299                         rtnh->rtnh_ifindex = nh->nh_oif;
1300
1301                         if (nh->nh_gw &&
1302                             nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
1303                                 goto nla_put_failure;
1304 #ifdef CONFIG_IP_ROUTE_CLASSID
1305                         if (nh->nh_tclassid &&
1306                             nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
1307                                 goto nla_put_failure;
1308 #endif
1309                         if (nh->nh_lwtstate &&
1310                             lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
1311                                 goto nla_put_failure;
1312
1313                         /* length of rtnetlink header + attributes */
1314                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1315                 } endfor_nexthops(fi);
1316
1317                 nla_nest_end(skb, mp);
1318         }
1319 #endif
1320         nlmsg_end(skb, nlh);
1321         return 0;
1322
1323 nla_put_failure:
1324         nlmsg_cancel(skb, nlh);
1325         return -EMSGSIZE;
1326 }
1327
1328 /*
1329  * Update FIB if:
1330  * - local address disappeared -> we must delete all the entries
1331  *   referring to it.
1332  * - device went down -> we must shutdown all nexthops going via it.
1333  */
1334 int fib_sync_down_addr(struct net_device *dev, __be32 local)
1335 {
1336         int ret = 0;
1337         unsigned int hash = fib_laddr_hashfn(local);
1338         struct hlist_head *head = &fib_info_laddrhash[hash];
1339         struct net *net = dev_net(dev);
1340         int tb_id = l3mdev_fib_table(dev);
1341         struct fib_info *fi;
1342
1343         if (!fib_info_laddrhash || local == 0)
1344                 return 0;
1345
1346         hlist_for_each_entry(fi, head, fib_lhash) {
1347                 if (!net_eq(fi->fib_net, net) ||
1348                     fi->fib_tb_id != tb_id)
1349                         continue;
1350                 if (fi->fib_prefsrc == local) {
1351                         fi->fib_flags |= RTNH_F_DEAD;
1352                         ret++;
1353                 }
1354         }
1355         return ret;
1356 }
1357
1358 static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
1359                                  enum fib_event_type event_type)
1360 {
1361         struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
1362         struct fib_nh_notifier_info info = {
1363                 .fib_nh = fib_nh,
1364         };
1365
1366         switch (event_type) {
1367         case FIB_EVENT_NH_ADD:
1368                 if (fib_nh->nh_flags & RTNH_F_DEAD)
1369                         break;
1370                 if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1371                     fib_nh->nh_flags & RTNH_F_LINKDOWN)
1372                         break;
1373                 return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
1374                                           &info.info);
1375         case FIB_EVENT_NH_DEL:
1376                 if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
1377                      fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
1378                     (fib_nh->nh_flags & RTNH_F_DEAD))
1379                         return call_fib_notifiers(dev_net(fib_nh->nh_dev),
1380                                                   event_type, &info.info);
1381         default:
1382                 break;
1383         }
1384
1385         return NOTIFY_DONE;
1386 }
1387
1388 /* Event              force Flags           Description
1389  * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
1390  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
1391  * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
1392  * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
1393  */
1394 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
1395 {
1396         int ret = 0;
1397         int scope = RT_SCOPE_NOWHERE;
1398         struct fib_info *prev_fi = NULL;
1399         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1400         struct hlist_head *head = &fib_info_devhash[hash];
1401         struct fib_nh *nh;
1402
1403         if (force)
1404                 scope = -1;
1405
1406         hlist_for_each_entry(nh, head, nh_hash) {
1407                 struct fib_info *fi = nh->nh_parent;
1408                 int dead;
1409
1410                 BUG_ON(!fi->fib_nhs);
1411                 if (nh->nh_dev != dev || fi == prev_fi)
1412                         continue;
1413                 prev_fi = fi;
1414                 dead = 0;
1415                 change_nexthops(fi) {
1416                         if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1417                                 dead++;
1418                         else if (nexthop_nh->nh_dev == dev &&
1419                                  nexthop_nh->nh_scope != scope) {
1420                                 switch (event) {
1421                                 case NETDEV_DOWN:
1422                                 case NETDEV_UNREGISTER:
1423                                         nexthop_nh->nh_flags |= RTNH_F_DEAD;
1424                                         /* fall through */
1425                                 case NETDEV_CHANGE:
1426                                         nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
1427                                         break;
1428                                 }
1429                                 call_fib_nh_notifiers(nexthop_nh,
1430                                                       FIB_EVENT_NH_DEL);
1431                                 dead++;
1432                         }
1433 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1434                         if (event == NETDEV_UNREGISTER &&
1435                             nexthop_nh->nh_dev == dev) {
1436                                 dead = fi->fib_nhs;
1437                                 break;
1438                         }
1439 #endif
1440                 } endfor_nexthops(fi)
1441                 if (dead == fi->fib_nhs) {
1442                         switch (event) {
1443                         case NETDEV_DOWN:
1444                         case NETDEV_UNREGISTER:
1445                                 fi->fib_flags |= RTNH_F_DEAD;
1446                                 /* fall through */
1447                         case NETDEV_CHANGE:
1448                                 fi->fib_flags |= RTNH_F_LINKDOWN;
1449                                 break;
1450                         }
1451                         ret++;
1452                 }
1453
1454                 fib_rebalance(fi);
1455         }
1456
1457         return ret;
1458 }
1459
1460 /* Must be invoked inside an RCU-protected region. */
1461 static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
1462 {
1463         struct fib_info *fi = NULL, *last_resort = NULL;
1464         struct hlist_head *fa_head = res->fa_head;
1465         struct fib_table *tb = res->table;
1466         u8 slen = 32 - res->prefixlen;
1467         int order = -1, last_idx = -1;
1468         struct fib_alias *fa, *fa1 = NULL;
1469         u32 last_prio = res->fi->fib_priority;
1470         u8 last_tos = 0;
1471
1472         hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
1473                 struct fib_info *next_fi = fa->fa_info;
1474
1475                 if (fa->fa_slen != slen)
1476                         continue;
1477                 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1478                         continue;
1479                 if (fa->tb_id != tb->tb_id)
1480                         continue;
1481                 if (next_fi->fib_priority > last_prio &&
1482                     fa->fa_tos == last_tos) {
1483                         if (last_tos)
1484                                 continue;
1485                         break;
1486                 }
1487                 if (next_fi->fib_flags & RTNH_F_DEAD)
1488                         continue;
1489                 last_tos = fa->fa_tos;
1490                 last_prio = next_fi->fib_priority;
1491
1492                 if (next_fi->fib_scope != res->scope ||
1493                     fa->fa_type != RTN_UNICAST)
1494                         continue;
1495                 if (!next_fi->fib_nh[0].nh_gw ||
1496                     next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1497                         continue;
1498
1499                 fib_alias_accessed(fa);
1500
1501                 if (!fi) {
1502                         if (next_fi != res->fi)
1503                                 break;
1504                         fa1 = fa;
1505                 } else if (!fib_detect_death(fi, order, &last_resort,
1506                                              &last_idx, fa1->fa_default)) {
1507                         fib_result_assign(res, fi);
1508                         fa1->fa_default = order;
1509                         goto out;
1510                 }
1511                 fi = next_fi;
1512                 order++;
1513         }
1514
1515         if (order <= 0 || !fi) {
1516                 if (fa1)
1517                         fa1->fa_default = -1;
1518                 goto out;
1519         }
1520
1521         if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1522                               fa1->fa_default)) {
1523                 fib_result_assign(res, fi);
1524                 fa1->fa_default = order;
1525                 goto out;
1526         }
1527
1528         if (last_idx >= 0)
1529                 fib_result_assign(res, last_resort);
1530         fa1->fa_default = last_idx;
1531 out:
1532         return;
1533 }
1534
1535 /*
1536  * Dead device goes up. We wake up dead nexthops.
1537  * It makes sense only for multipath routes.
1538  */
1539 int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
1540 {
1541         struct fib_info *prev_fi;
1542         unsigned int hash;
1543         struct hlist_head *head;
1544         struct fib_nh *nh;
1545         int ret;
1546
1547         if (!(dev->flags & IFF_UP))
1548                 return 0;
1549
1550         if (nh_flags & RTNH_F_DEAD) {
1551                 unsigned int flags = dev_get_flags(dev);
1552
1553                 if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1554                         nh_flags |= RTNH_F_LINKDOWN;
1555         }
1556
1557         prev_fi = NULL;
1558         hash = fib_devindex_hashfn(dev->ifindex);
1559         head = &fib_info_devhash[hash];
1560         ret = 0;
1561
1562         hlist_for_each_entry(nh, head, nh_hash) {
1563                 struct fib_info *fi = nh->nh_parent;
1564                 int alive;
1565
1566                 BUG_ON(!fi->fib_nhs);
1567                 if (nh->nh_dev != dev || fi == prev_fi)
1568                         continue;
1569
1570                 prev_fi = fi;
1571                 alive = 0;
1572                 change_nexthops(fi) {
1573                         if (!(nexthop_nh->nh_flags & nh_flags)) {
1574                                 alive++;
1575                                 continue;
1576                         }
1577                         if (!nexthop_nh->nh_dev ||
1578                             !(nexthop_nh->nh_dev->flags & IFF_UP))
1579                                 continue;
1580                         if (nexthop_nh->nh_dev != dev ||
1581                             !__in_dev_get_rtnl(dev))
1582                                 continue;
1583                         alive++;
1584                         nexthop_nh->nh_flags &= ~nh_flags;
1585                         call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
1586                 } endfor_nexthops(fi)
1587
1588                 if (alive > 0) {
1589                         fi->fib_flags &= ~nh_flags;
1590                         ret++;
1591                 }
1592
1593                 fib_rebalance(fi);
1594         }
1595
1596         return ret;
1597 }
1598
1599 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1600 static bool fib_good_nh(const struct fib_nh *nh)
1601 {
1602         int state = NUD_REACHABLE;
1603
1604         if (nh->nh_scope == RT_SCOPE_LINK) {
1605                 struct neighbour *n;
1606
1607                 rcu_read_lock_bh();
1608
1609                 n = __ipv4_neigh_lookup_noref(nh->nh_dev,
1610                                               (__force u32)nh->nh_gw);
1611                 if (n)
1612                         state = n->nud_state;
1613
1614                 rcu_read_unlock_bh();
1615         }
1616
1617         return !!(state & NUD_VALID);
1618 }
1619
1620 void fib_select_multipath(struct fib_result *res, int hash)
1621 {
1622         struct fib_info *fi = res->fi;
1623         struct net *net = fi->fib_net;
1624         bool first = false;
1625
1626         for_nexthops(fi) {
1627                 if (hash > atomic_read(&nh->nh_upper_bound))
1628                         continue;
1629
1630                 if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
1631                     fib_good_nh(nh)) {
1632                         res->nh_sel = nhsel;
1633                         return;
1634                 }
1635                 if (!first) {
1636                         res->nh_sel = nhsel;
1637                         first = true;
1638                 }
1639         } endfor_nexthops(fi);
1640 }
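/*
 * Illustration (hypothetical values, not from the original source):
 * with upper bounds 0x1fffffff and 0x7fffffff from fib_rebalance(), a
 * hash of 0x30000000 skips nh0 and lands on nh1.  When the
 * fib_multipath_use_neigh sysctl is enabled and the chosen nexthop's
 * neighbour entry is not NUD_VALID, the loop keeps scanning the
 * remaining in-range nexthops for a usable one and falls back to the
 * first in-range nexthop if none qualifies.
 */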
1641 #endif
1642
1643 void fib_select_path(struct net *net, struct fib_result *res,
1644                      struct flowi4 *fl4, int mp_hash)
1645 {
1646         bool oif_check;
1647
1648         oif_check = (fl4->flowi4_oif == 0 ||
1649                      fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
1650
1651 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1652         if (res->fi->fib_nhs > 1 && oif_check) {
1653                 if (mp_hash < 0)
1654                         mp_hash = get_hash_from_flowi4(fl4) >> 1;
1655
1656                 fib_select_multipath(res, mp_hash);
1657         }
1658         else
1659 #endif
1660         if (!res->prefixlen &&
1661             res->table->tb_num_default > 1 &&
1662             res->type == RTN_UNICAST && oif_check)
1663                 fib_select_default(fl4, res);
1664
1665         if (!fl4->saddr)
1666                 fl4->saddr = FIB_RES_PREFSRC(net, *res);
1667 }
1668 EXPORT_SYMBOL_GPL(fib_select_path);