net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 static DEFINE_SPINLOCK(ptype_lock);
 145 static DEFINE_SPINLOCK(offload_lock);
 146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147 struct list_head ptype_all __read_mostly;       /* Taps */
 148 static struct list_head offload_base __read_mostly;
 149
 150 static int netif_rx_internal(struct sk_buff *skb);
 151 static int call_netdevice_notifiers_info(unsigned long val,
 152                                          struct net_device *dev,
 153                                          struct netdev_notifier_info *info);
 154
 155 /*
 156  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 157  * semaphore.
 158  *
 159  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 160  *
 161  * Writers must hold the rtnl semaphore while they loop through the
 162  * dev_base_head list, and hold dev_base_lock for writing when they do the
 163  * actual updates.  This allows pure readers to access the list even
 164  * while a writer is preparing to update it.
 165  *
 166  * To put it another way, dev_base_lock is held for writing only to
 167  * protect against pure readers; the rtnl semaphore provides the
 168  * protection against other writers.
 169  *
 170  * See, for example usages, register_netdevice() and
 171  * unregister_netdevice(), which must be called with the rtnl
 172  * semaphore held.
 173  */
 174 DEFINE_RWLOCK(dev_base_lock);
 175 EXPORT_SYMBOL(dev_base_lock);
 176
 177 /* protects napi_hash addition/deletion and napi_gen_id */
 178 static DEFINE_SPINLOCK(napi_hash_lock);
 179
 180 static unsigned int napi_gen_id;
 181 static DEFINE_HASHTABLE(napi_hash, 8);
 182
 183 static seqcount_t devnet_rename_seq;
 184
 185 static inline void dev_base_seq_inc(struct net *net)
 186 {
 187         while (++net->dev_base_seq == 0);
 188 }
 189
 190 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 191 {
 192         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 193
 194         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 195 }
 196
 197 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 198 {
 199         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 200 }
 201
 202 static inline void rps_lock(struct softnet_data *sd)
 203 {
 204 #ifdef CONFIG_RPS
 205         spin_lock(&sd->input_pkt_queue.lock);
 206 #endif
 207 }
 208
 209 static inline void rps_unlock(struct softnet_data *sd)
 210 {
 211 #ifdef CONFIG_RPS
 212         spin_unlock(&sd->input_pkt_queue.lock);
 213 #endif
 214 }
 215
 216 /* Device list insertion */
 217 static void list_netdevice(struct net_device *dev)
 218 {
 219         struct net *net = dev_net(dev);
 220
 221         ASSERT_RTNL();
 222
 223         write_lock_bh(&dev_base_lock);
 224         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 225         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 226         hlist_add_head_rcu(&dev->index_hlist,
 227                            dev_index_hash(net, dev->ifindex));
 228         write_unlock_bh(&dev_base_lock);
 229
 230         dev_base_seq_inc(net);
 231 }
 232
 233 /* Device list removal
 234  * caller must respect a RCU grace period before freeing/reusing dev
 235  */
 236 static void unlist_netdevice(struct net_device *dev)
 237 {
 238         ASSERT_RTNL();
 239
 240         /* Unlink dev from the device chain */
 241         write_lock_bh(&dev_base_lock);
 242         list_del_rcu(&dev->dev_list);
 243         hlist_del_rcu(&dev->name_hlist);
 244         hlist_del_rcu(&dev->index_hlist);
 245         write_unlock_bh(&dev_base_lock);
 246
 247         dev_base_seq_inc(dev_net(dev));
 248 }
 249
 250 /*
 251  *      Our notifier list
 252  */
 253
 254 static RAW_NOTIFIER_HEAD(netdev_chain);
 255
 256 /*
 257  *      Device drivers call our routines to queue packets here. We empty the
 258  *      queue in the local softnet handler.
 259  */
 260
 261 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 262 EXPORT_PER_CPU_SYMBOL(softnet_data);
 263
 264 #ifdef CONFIG_LOCKDEP
 265 /*
 266  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 267  * according to dev->type
 268  */
 269 static const unsigned short netdev_lock_type[] =
 270         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 271          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 272          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 273          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 274          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 275          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 276          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 277          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 278          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 279          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 280          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 281          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 282          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 283          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 284          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 285
 286 static const char *const netdev_lock_name[] =
 287         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 288          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 289          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 290          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 291          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 292          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 293          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 294          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 295          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 296          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 297          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 298          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 299          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 300          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 301          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 302
 303 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 304 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 305
 306 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 307 {
 308         int i;
 309
 310         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 311                 if (netdev_lock_type[i] == dev_type)
 312                         return i;
 313         /* the last key is used by default */
 314         return ARRAY_SIZE(netdev_lock_type) - 1;
 315 }
 316
 317 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 318                                                  unsigned short dev_type)
 319 {
 320         int i;
 321
 322         i = netdev_lock_pos(dev_type);
 323         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev->type);
 332         lockdep_set_class_and_name(&dev->addr_list_lock,
 333                                    &netdev_addr_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336 #else
 337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338                                                  unsigned short dev_type)
 339 {
 340 }
 341 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 342 {
 343 }
 344 #endif
 345
 346 /*******************************************************************************
 347
 348                 Protocol management and registration routines
 349
 350 *******************************************************************************/
 351
 352 /*
 353  *      Add a protocol ID to the list. Now that the input handler is
 354  *      smarter we can dispense with all the messy stuff that used to be
 355  *      here.
 356  *
 357  *      BEWARE!!! Protocol handlers, mangling input packets,
 358  *      MUST BE last in hash buckets and checking protocol handlers
 359  *      MUST start from promiscuous ptype_all chain in net_bh.
 360  *      It is true now, do not change it.
 361  *      Explanation follows: if protocol handler, mangling packet, will
 362  *      be the first on list, it is not able to sense, that packet
 363  *      is cloned and should be copied-on-write, so that it will
 364  *      change it and subsequent readers will get broken packet.
 365  *                                                      --ANK (980803)
 366  */
 367
 368 static inline struct list_head *ptype_head(const struct packet_type *pt)
 369 {
 370         if (pt->type == htons(ETH_P_ALL))
 371                 return &ptype_all;
 372         else
 373                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 374 }
 375
 376 /**
 377  *      dev_add_pack - add packet handler
 378  *      @pt: packet type declaration
 379  *
 380  *      Add a protocol handler to the networking stack. The passed &packet_type
 381  *      is linked into kernel lists and may not be freed until it has been
 382  *      removed from the kernel lists.
 383  *
 384  *      This call does not sleep therefore it can not
 385  *      guarantee all CPU's that are in middle of receiving packets
 386  *      will see the new packet type (until the next received packet).
 387  */
 388
 389 void dev_add_pack(struct packet_type *pt)
 390 {
 391         struct list_head *head = ptype_head(pt);
 392
 393         spin_lock(&ptype_lock);
 394         list_add_rcu(&pt->list, head);
 395         spin_unlock(&ptype_lock);
 396 }
 397 EXPORT_SYMBOL(dev_add_pack);
 398
 399 /**
 400  *      __dev_remove_pack        - remove packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Remove a protocol handler that was previously added to the kernel
 404  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 405  *      from the kernel lists and can be freed or reused once this function
 406  *      returns.
 407  *
 408  *      The packet type might still be in use by receivers
 409  *      and must not be freed until after all the CPU's have gone
 410  *      through a quiescent state.
 411  */
 412 void __dev_remove_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415         struct packet_type *pt1;
 416
 417         spin_lock(&ptype_lock);
 418
 419         list_for_each_entry(pt1, head, list) {
 420                 if (pt == pt1) {
 421                         list_del_rcu(&pt->list);
 422                         goto out;
 423                 }
 424         }
 425
 426         pr_warn("dev_remove_pack: %p not found\n", pt);
 427 out:
 428         spin_unlock(&ptype_lock);
 429 }
 430 EXPORT_SYMBOL(__dev_remove_pack);
 431
 432 /**
 433  *      dev_remove_pack  - remove packet handler
 434  *      @pt: packet type declaration
 435  *
 436  *      Remove a protocol handler that was previously added to the kernel
 437  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 438  *      from the kernel lists and can be freed or reused once this function
 439  *      returns.
 440  *
 441  *      This call sleeps to guarantee that no CPU is looking at the packet
 442  *      type after return.
 443  */
 444 void dev_remove_pack(struct packet_type *pt)
 445 {
 446         __dev_remove_pack(pt);
 447
 448         synchronize_net();
 449 }
 450 EXPORT_SYMBOL(dev_remove_pack);
 451
 452
 453 /**
 454  *      dev_add_offload - register offload handlers
 455  *      @po: protocol offload declaration
 456  *
 457  *      Add protocol offload handlers to the networking stack. The passed
 458  *      &proto_offload is linked into kernel lists and may not be freed until
 459  *      it has been removed from the kernel lists.
 460  *
 461  *      This call does not sleep therefore it can not
 462  *      guarantee all CPU's that are in middle of receiving packets
 463  *      will see the new offload handlers (until the next received packet).
 464  */
 465 void dev_add_offload(struct packet_offload *po)
 466 {
 467         struct list_head *head = &offload_base;
 468
 469         spin_lock(&offload_lock);
 470         list_add_rcu(&po->list, head);
 471         spin_unlock(&offload_lock);
 472 }
 473 EXPORT_SYMBOL(dev_add_offload);
 474
 475 /**
 476  *      __dev_remove_offload     - remove offload handler
 477  *      @po: packet offload declaration
 478  *
 479  *      Remove a protocol offload handler that was previously added to the
 480  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 481  *      is removed from the kernel lists and can be freed or reused once this
 482  *      function returns.
 483  *
 484  *      The packet type might still be in use by receivers
 485  *      and must not be freed until after all the CPU's have gone
 486  *      through a quiescent state.
 487  */
 488 static void __dev_remove_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491         struct packet_offload *po1;
 492
 493         spin_lock(&offload_lock);
 494
 495         list_for_each_entry(po1, head, list) {
 496                 if (po == po1) {
 497                         list_del_rcu(&po->list);
 498                         goto out;
 499                 }
 500         }
 501
 502         pr_warn("dev_remove_offload: %p not found\n", po);
 503 out:
 504         spin_unlock(&offload_lock);
 505 }
 506
 507 /**
 508  *      dev_remove_offload       - remove packet offload handler
 509  *      @po: packet offload declaration
 510  *
 511  *      Remove a packet offload handler that was previously added to the kernel
 512  *      offload handlers by dev_add_offload(). The passed &offload_type is
 513  *      removed from the kernel lists and can be freed or reused once this
 514  *      function returns.
 515  *
 516  *      This call sleeps to guarantee that no CPU is looking at the packet
 517  *      type after return.
 518  */
 519 void dev_remove_offload(struct packet_offload *po)
 520 {
 521         __dev_remove_offload(po);
 522
 523         synchronize_net();
 524 }
 525 EXPORT_SYMBOL(dev_remove_offload);
 526
 527 /******************************************************************************
 528
 529                       Device Boot-time Settings Routines
 530
 531 *******************************************************************************/
 532
 533 /* Boot time configuration table */
 534 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 535
 536 /**
 537  *      netdev_boot_setup_add   - add new setup entry
 538  *      @name: name of the device
 539  *      @map: configured settings for the device
 540  *
 541  *      Adds new setup entry to the dev_boot_setup list.  The function
 542  *      returns 0 on error and 1 on success.  This is a generic routine to
 543  *      all netdevices.
 544  */
 545 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 546 {
 547         struct netdev_boot_setup *s;
 548         int i;
 549
 550         s = dev_boot_setup;
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 552                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 553                         memset(s[i].name, 0, sizeof(s[i].name));
 554                         strlcpy(s[i].name, name, IFNAMSIZ);
 555                         memcpy(&s[i].map, map, sizeof(s[i].map));
 556                         break;
 557                 }
 558         }
 559
 560         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 561 }
 562
 563 /**
 564  *      netdev_boot_setup_check - check boot time settings
 565  *      @dev: the netdevice
 566  *
 567  *      Check boot time settings for the device.
 568  *      The found settings are set for the device to be used
 569  *      later in the device probing.
 570  *      Returns 0 if no settings found, 1 if they are.
 571  */
 572 int netdev_boot_setup_check(struct net_device *dev)
 573 {
 574         struct netdev_boot_setup *s = dev_boot_setup;
 575         int i;
 576
 577         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 578                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 579                     !strcmp(dev->name, s[i].name)) {
 580                         dev->irq        = s[i].map.irq;
 581                         dev->base_addr  = s[i].map.base_addr;
 582                         dev->mem_start  = s[i].map.mem_start;
 583                         dev->mem_end    = s[i].map.mem_end;
 584                         return 1;
 585                 }
 586         }
 587         return 0;
 588 }
 589 EXPORT_SYMBOL(netdev_boot_setup_check);
 590
 591
 592 /**
 593  *      netdev_boot_base        - get address from boot time settings
 594  *      @prefix: prefix for network device
 595  *      @unit: id for network device
 596  *
 597  *      Check boot time settings for the base address of device.
 598  *      The found settings are set for the device to be used
 599  *      later in the device probing.
 600  *      Returns 0 if no settings found.
 601  */
 602 unsigned long netdev_boot_base(const char *prefix, int unit)
 603 {
 604         const struct netdev_boot_setup *s = dev_boot_setup;
 605         char name[IFNAMSIZ];
 606         int i;
 607
 608         sprintf(name, "%s%d", prefix, unit);
 609
 610         /*
 611          * If device already registered then return base of 1
 612          * to indicate not to probe for this interface
 613          */
 614         if (__dev_get_by_name(&init_net, name))
 615                 return 1;
 616
 617         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 618                 if (!strcmp(name, s[i].name))
 619                         return s[i].map.base_addr;
 620         return 0;
 621 }
 622
 623 /*
 624  * Saves at boot time configured settings for any netdevice.
 625  */
 626 int __init netdev_boot_setup(char *str)
 627 {
 628         int ints[5];
 629         struct ifmap map;
 630
 631         str = get_options(str, ARRAY_SIZE(ints), ints);
 632         if (!str || !*str)
 633                 return 0;
 634
 635         /* Save settings */
 636         memset(&map, 0, sizeof(map));
 637         if (ints[0] > 0)
 638                 map.irq = ints[1];
 639         if (ints[0] > 1)
 640                 map.base_addr = ints[2];
 641         if (ints[0] > 2)
 642                 map.mem_start = ints[3];
 643         if (ints[0] > 3)
 644                 map.mem_end = ints[4];
 645
 646         /* Add new entry to the list */
 647         return netdev_boot_setup_add(str, &map);
 648 }
 649
 650 __setup("netdev=", netdev_boot_setup);
 651
 652 /*******************************************************************************
 653
 654                             Device Interface Subroutines
 655
 656 *******************************************************************************/
 657
 658 /**
 659  *      __dev_get_by_name       - find a device by its name
 660  *      @net: the applicable net namespace
 661  *      @name: name to find
 662  *
 663  *      Find an interface by name. Must be called under RTNL semaphore
 664  *      or @dev_base_lock. If the name is found a pointer to the device
 665  *      is returned. If the name is not found then %NULL is returned. The
 666  *      reference counters are not incremented so the caller must be
 667  *      careful with locks.
 668  */
 669
 670 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 671 {
 672         struct net_device *dev;
 673         struct hlist_head *head = dev_name_hash(net, name);
 674
 675         hlist_for_each_entry(dev, head, name_hlist)
 676                 if (!strncmp(dev->name, name, IFNAMSIZ))
 677                         return dev;
 678
 679         return NULL;
 680 }
 681 EXPORT_SYMBOL(__dev_get_by_name);
 682
 683 /**
 684  *      dev_get_by_name_rcu     - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name.
 689  *      If the name is found a pointer to the device is returned.
 690  *      If the name is not found then %NULL is returned.
 691  *      The reference counters are not incremented so the caller must be
 692  *      careful with locks. The caller must hold RCU lock.
 693  */
 694
 695 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 696 {
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry_rcu(dev, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(dev_get_by_name_rcu);
 707
 708 /**
 709  *      dev_get_by_name         - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name. This can be called from any
 714  *      context and does its own locking. The returned handle has
 715  *      the usage count incremented and the caller must use dev_put() to
 716  *      release it when it is no longer needed. %NULL is returned if no
 717  *      matching device is found.
 718  */
 719
 720 struct net_device *dev_get_by_name(struct net *net, const char *name)
 721 {
 722         struct net_device *dev;
 723
 724         rcu_read_lock();
 725         dev = dev_get_by_name_rcu(net, name);
 726         if (dev)
 727                 dev_hold(dev);
 728         rcu_read_unlock();
 729         return dev;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_name);
 732
 733 /**
 734  *      __dev_get_by_index - find a device by its ifindex
 735  *      @net: the applicable net namespace
 736  *      @ifindex: index of device
 737  *
 738  *      Search for an interface by index. Returns %NULL if the device
 739  *      is not found or a pointer to the device. The device has not
 740  *      had its reference counter increased so the caller must be careful
 741  *      about locking. The caller must hold either the RTNL semaphore
 742  *      or @dev_base_lock.
 743  */
 744
 745 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 746 {
 747         struct net_device *dev;
 748         struct hlist_head *head = dev_index_hash(net, ifindex);
 749
 750         hlist_for_each_entry(dev, head, index_hlist)
 751                 if (dev->ifindex == ifindex)
 752                         return dev;
 753
 754         return NULL;
 755 }
 756 EXPORT_SYMBOL(__dev_get_by_index);
 757
 758 /**
 759  *      dev_get_by_index_rcu - find a device by its ifindex
 760  *      @net: the applicable net namespace
 761  *      @ifindex: index of device
 762  *
 763  *      Search for an interface by index. Returns %NULL if the device
 764  *      is not found or a pointer to the device. The device has not
 765  *      had its reference counter increased so the caller must be careful
 766  *      about locking. The caller must hold RCU lock.
 767  */
 768
 769 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 770 {
 771         struct net_device *dev;
 772         struct hlist_head *head = dev_index_hash(net, ifindex);
 773
 774         hlist_for_each_entry_rcu(dev, head, index_hlist)
 775                 if (dev->ifindex == ifindex)
 776                         return dev;
 777
 778         return NULL;
 779 }
 780 EXPORT_SYMBOL(dev_get_by_index_rcu);
 781
 782
 783 /**
 784  *      dev_get_by_index - find a device by its ifindex
 785  *      @net: the applicable net namespace
 786  *      @ifindex: index of device
 787  *
 788  *      Search for an interface by index. Returns NULL if the device
 789  *      is not found or a pointer to the device. The device returned has
 790  *      had a reference added and the pointer is safe until the user calls
 791  *      dev_put to indicate they have finished with it.
 792  */
 793
 794 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 795 {
 796         struct net_device *dev;
 797
 798         rcu_read_lock();
 799         dev = dev_get_by_index_rcu(net, ifindex);
 800         if (dev)
 801                 dev_hold(dev);
 802         rcu_read_unlock();
 803         return dev;
 804 }
 805 EXPORT_SYMBOL(dev_get_by_index);
 806
 807 /**
 808  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 809  *      @net: network namespace
 810  *      @name: a pointer to the buffer where the name will be stored.
 811  *      @ifindex: the ifindex of the interface to get the name from.
 812  *
 813  *      The use of raw_seqcount_begin() and cond_resched() before
 814  *      retrying is required as we want to give the writers a chance
 815  *      to complete when CONFIG_PREEMPT is not set.
 816  */
 817 int netdev_get_name(struct net *net, char *name, int ifindex)
 818 {
 819         struct net_device *dev;
 820         unsigned int seq;
 821
 822 retry:
 823         seq = raw_seqcount_begin(&devnet_rename_seq);
 824         rcu_read_lock();
 825         dev = dev_get_by_index_rcu(net, ifindex);
 826         if (!dev) {
 827                 rcu_read_unlock();
 828                 return -ENODEV;
 829         }
 830
 831         strcpy(name, dev->name);
 832         rcu_read_unlock();
 833         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 834                 cond_resched();
 835                 goto retry;
 836         }
 837
 838         return 0;
 839 }
 840
 841 /**
 842  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 843  *      @net: the applicable net namespace
 844  *      @type: media type of device
 845  *      @ha: hardware address
 846  *
 847  *      Search for an interface by MAC address. Returns NULL if the device
 848  *      is not found or a pointer to the device.
 849  *      The caller must hold RCU or RTNL.
 850  *      The returned device has not had its ref count increased
 851  *      and the caller must therefore be careful about locking
 852  *
 853  */
 854
 855 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 856                                        const char *ha)
 857 {
 858         struct net_device *dev;
 859
 860         for_each_netdev_rcu(net, dev)
 861                 if (dev->type == type &&
 862                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 863                         return dev;
 864
 865         return NULL;
 866 }
 867 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 868
 869 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 870 {
 871         struct net_device *dev;
 872
 873         ASSERT_RTNL();
 874         for_each_netdev(net, dev)
 875                 if (dev->type == type)
 876                         return dev;
 877
 878         return NULL;
 879 }
 880 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 881
 882 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 883 {
 884         struct net_device *dev, *ret = NULL;
 885
 886         rcu_read_lock();
 887         for_each_netdev_rcu(net, dev)
 888                 if (dev->type == type) {
 889                         dev_hold(dev);
 890                         ret = dev;
 891                         break;
 892                 }
 893         rcu_read_unlock();
 894         return ret;
 895 }
 896 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 897
 898 /**
 899  *      dev_get_by_flags_rcu - find any device with given flags
 900  *      @net: the applicable net namespace
 901  *      @if_flags: IFF_* values
 902  *      @mask: bitmask of bits in if_flags to check
 903  *
 904  *      Search for any interface with the given flags. Returns NULL if a device
 905  *      is not found or a pointer to the device. Must be called inside
 906  *      rcu_read_lock(), and result refcount is unchanged.
 907  */
 908
 909 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 910                                     unsigned short mask)
 911 {
 912         struct net_device *dev, *ret;
 913
 914         ret = NULL;
 915         for_each_netdev_rcu(net, dev) {
 916                 if (((dev->flags ^ if_flags) & mask) == 0) {
 917                         ret = dev;
 918                         break;
 919                 }
 920         }
 921         return ret;
 922 }
 923 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 924
 925 /**
 926  *      dev_valid_name - check if name is okay for network device
 927  *      @name: name string
 928  *
 929  *      Network device names need to be valid file names to
 930  *      to allow sysfs to work.  We also disallow any kind of
 931  *      whitespace.
 932  */
 933 bool dev_valid_name(const char *name)
 934 {
 935         if (*name == '\0')
 936                 return false;
 937         if (strlen(name) >= IFNAMSIZ)
 938                 return false;
 939         if (!strcmp(name, ".") || !strcmp(name, ".."))
 940                 return false;
 941
 942         while (*name) {
 943                 if (*name == '/' || isspace(*name))
 944                         return false;
 945                 name++;
 946         }
 947         return true;
 948 }
 949 EXPORT_SYMBOL(dev_valid_name);
 950
 951 /**
 952  *      __dev_alloc_name - allocate a name for a device
 953  *      @net: network namespace to allocate the device name in
 954  *      @name: name format string
 955  *      @buf:  scratch buffer and result name string
 956  *
 957  *      Passed a format string - eg "lt%d" it will try and find a suitable
 958  *      id. It scans list of devices to build up a free map, then chooses
 959  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 960  *      while allocating the name and adding the device in order to avoid
 961  *      duplicates.
 962  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 963  *      Returns the number of the unit assigned or a negative errno code.
 964  */
 965
 966 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 967 {
 968         int i = 0;
 969         const char *p;
 970         const int max_netdevices = 8*PAGE_SIZE;
 971         unsigned long *inuse;
 972         struct net_device *d;
 973
 974         p = strnchr(name, IFNAMSIZ-1, '%');
 975         if (p) {
 976                 /*
 977                  * Verify the string as this thing may have come from
 978                  * the user.  There must be either one "%d" and no other "%"
 979                  * characters.
 980                  */
 981                 if (p[1] != 'd' || strchr(p + 2, '%'))
 982                         return -EINVAL;
 983
 984                 /* Use one page as a bit array of possible slots */
 985                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 986                 if (!inuse)
 987                         return -ENOMEM;
 988
 989                 for_each_netdev(net, d) {
 990                         if (!sscanf(d->name, name, &i))
 991                                 continue;
 992                         if (i < 0 || i >= max_netdevices)
 993                                 continue;
 994
 995                         /*  avoid cases where sscanf is not exact inverse of printf */
 996                         snprintf(buf, IFNAMSIZ, name, i);
 997                         if (!strncmp(buf, d->name, IFNAMSIZ))
 998                                 set_bit(i, inuse);
 999                 }
1000
1001                 i = find_first_zero_bit(inuse, max_netdevices);
1002                 free_page((unsigned long) inuse);
1003         }
1004
1005         if (buf != name)
1006                 snprintf(buf, IFNAMSIZ, name, i);
1007         if (!__dev_get_by_name(net, buf))
1008                 return i;
1009
1010         /* It is possible to run out of possible slots
1011          * when the name is long and there isn't enough space left
1012          * for the digits, or if all bits are used.
1013          */
1014         return -ENFILE;
1015 }
1016
1017 /**
1018  *      dev_alloc_name - allocate a name for a device
1019  *      @dev: device
1020  *      @name: name format string
1021  *
1022  *      Passed a format string - eg "lt%d" it will try and find a suitable
1023  *      id. It scans list of devices to build up a free map, then chooses
1024  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1025  *      while allocating the name and adding the device in order to avoid
1026  *      duplicates.
1027  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1028  *      Returns the number of the unit assigned or a negative errno code.
1029  */
1030
1031 int dev_alloc_name(struct net_device *dev, const char *name)
1032 {
1033         char buf[IFNAMSIZ];
1034         struct net *net;
1035         int ret;
1036
1037         BUG_ON(!dev_net(dev));
1038         net = dev_net(dev);
1039         ret = __dev_alloc_name(net, name, buf);
1040         if (ret >= 0)
1041                 strlcpy(dev->name, buf, IFNAMSIZ);
1042         return ret;
1043 }
1044 EXPORT_SYMBOL(dev_alloc_name);
1045
1046 static int dev_alloc_name_ns(struct net *net,
1047                              struct net_device *dev,
1048                              const char *name)
1049 {
1050         char buf[IFNAMSIZ];
1051         int ret;
1052
1053         ret = __dev_alloc_name(net, name, buf);
1054         if (ret >= 0)
1055                 strlcpy(dev->name, buf, IFNAMSIZ);
1056         return ret;
1057 }
1058
1059 static int dev_get_valid_name(struct net *net,
1060                               struct net_device *dev,
1061                               const char *name)
1062 {
1063         BUG_ON(!net);
1064
1065         if (!dev_valid_name(name))
1066                 return -EINVAL;
1067
1068         if (strchr(name, '%'))
1069                 return dev_alloc_name_ns(net, dev, name);
1070         else if (__dev_get_by_name(net, name))
1071                 return -EEXIST;
1072         else if (dev->name != name)
1073                 strlcpy(dev->name, name, IFNAMSIZ);
1074
1075         return 0;
1076 }
1077
1078 /**
1079  *      dev_change_name - change name of a device
1080  *      @dev: device
1081  *      @newname: name (or format string) must be at least IFNAMSIZ
1082  *
1083  *      Change name of a device, can pass format strings "eth%d".
1084  *      for wildcarding.
1085  */
1086 int dev_change_name(struct net_device *dev, const char *newname)
1087 {
1088         char oldname[IFNAMSIZ];
1089         int err = 0;
1090         int ret;
1091         struct net *net;
1092
1093         ASSERT_RTNL();
1094         BUG_ON(!dev_net(dev));
1095
1096         net = dev_net(dev);
1097         if (dev->flags & IFF_UP)
1098                 return -EBUSY;
1099
1100         write_seqcount_begin(&devnet_rename_seq);
1101
1102         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1103                 write_seqcount_end(&devnet_rename_seq);
1104                 return 0;
1105         }
1106
1107         memcpy(oldname, dev->name, IFNAMSIZ);
1108
1109         err = dev_get_valid_name(net, dev, newname);
1110         if (err < 0) {
1111                 write_seqcount_end(&devnet_rename_seq);
1112                 return err;
1113         }
1114
1115 rollback:
1116         ret = device_rename(&dev->dev, dev->name);
1117         if (ret) {
1118                 memcpy(dev->name, oldname, IFNAMSIZ);
1119                 write_seqcount_end(&devnet_rename_seq);
1120                 return ret;
1121         }
1122
1123         write_seqcount_end(&devnet_rename_seq);
1124
1125         netdev_adjacent_rename_links(dev, oldname);
1126
1127         write_lock_bh(&dev_base_lock);
1128         hlist_del_rcu(&dev->name_hlist);
1129         write_unlock_bh(&dev_base_lock);
1130
1131         synchronize_rcu();
1132
1133         write_lock_bh(&dev_base_lock);
1134         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1135         write_unlock_bh(&dev_base_lock);
1136
1137         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1138         ret = notifier_to_errno(ret);
1139
1140         if (ret) {
1141                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1142                 if (err >= 0) {
1143                         err = ret;
1144                         write_seqcount_begin(&devnet_rename_seq);
1145                         memcpy(dev->name, oldname, IFNAMSIZ);
1146                         memcpy(oldname, newname, IFNAMSIZ);
1147                         goto rollback;
1148                 } else {
1149                         pr_err("%s: name change rollback failed: %d\n",
1150                                dev->name, ret);
1151                 }
1152         }
1153
1154         return err;
1155 }
1156
1157 /**
1158  *      dev_set_alias - change ifalias of a device
1159  *      @dev: device
1160  *      @alias: name up to IFALIASZ
1161  *      @len: limit of bytes to copy from info
1162  *
1163  *      Set ifalias for a device,
1164  */
1165 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1166 {
1167         char *new_ifalias;
1168
1169         ASSERT_RTNL();
1170
1171         if (len >= IFALIASZ)
1172                 return -EINVAL;
1173
1174         if (!len) {
1175                 kfree(dev->ifalias);
1176                 dev->ifalias = NULL;
1177                 return 0;
1178         }
1179
1180         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1181         if (!new_ifalias)
1182                 return -ENOMEM;
1183         dev->ifalias = new_ifalias;
1184
1185         strlcpy(dev->ifalias, alias, len+1);
1186         return len;
1187 }
1188
1189
1190 /**
1191  *      netdev_features_change - device changes features
1192  *      @dev: device to cause notification
1193  *
1194  *      Called to indicate a device has changed features.
1195  */
1196 void netdev_features_change(struct net_device *dev)
1197 {
1198         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1199 }
1200 EXPORT_SYMBOL(netdev_features_change);
1201
1202 /**
1203  *      netdev_state_change - device changes state
1204  *      @dev: device to cause notification
1205  *
1206  *      Called to indicate a device has changed state. This function calls
1207  *      the notifier chains for netdev_chain and sends a NEWLINK message
1208  *      to the routing socket.
1209  */
1210 void netdev_state_change(struct net_device *dev)
1211 {
1212         if (dev->flags & IFF_UP) {
1213                 struct netdev_notifier_change_info change_info;
1214
1215                 change_info.flags_changed = 0;
1216                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1217                                               &change_info.info);
1218                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1219         }
1220 }
1221 EXPORT_SYMBOL(netdev_state_change);
1222
1223 /**
1224  *      netdev_notify_peers - notify network peers about existence of @dev
1225  *      @dev: network device
1226  *
1227  * Generate traffic such that interested network peers are aware of
1228  * @dev, such as by generating a gratuitous ARP. This may be used when
1229  * a device wants to inform the rest of the network about some sort of
1230  * reconfiguration such as a failover event or virtual machine
1231  * migration.
1232  */
1233 void netdev_notify_peers(struct net_device *dev)
1234 {
1235         rtnl_lock();
1236         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1237         rtnl_unlock();
1238 }
1239 EXPORT_SYMBOL(netdev_notify_peers);
1240
1241 static int __dev_open(struct net_device *dev)
1242 {
1243         const struct net_device_ops *ops = dev->netdev_ops;
1244         int ret;
1245
1246         ASSERT_RTNL();
1247
1248         if (!netif_device_present(dev))
1249                 return -ENODEV;
1250
1251         /* Block netpoll from trying to do any rx path servicing.
1252          * If we don't do this there is a chance ndo_poll_controller
1253          * or ndo_poll may be running while we open the device
1254          */
1255         netpoll_rx_disable(dev);
1256
1257         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1258         ret = notifier_to_errno(ret);
1259         if (ret)
1260                 return ret;
1261
1262         set_bit(__LINK_STATE_START, &dev->state);
1263
1264         if (ops->ndo_validate_addr)
1265                 ret = ops->ndo_validate_addr(dev);
1266
1267         if (!ret && ops->ndo_open)
1268                 ret = ops->ndo_open(dev);
1269
1270         netpoll_rx_enable(dev);
1271
1272         if (ret)
1273                 clear_bit(__LINK_STATE_START, &dev->state);
1274         else {
1275                 dev->flags |= IFF_UP;
1276                 net_dmaengine_get();
1277                 dev_set_rx_mode(dev);
1278                 dev_activate(dev);
1279                 add_device_randomness(dev->dev_addr, dev->addr_len);
1280         }
1281
1282         return ret;
1283 }
1284
1285 /**
1286  *      dev_open        - prepare an interface for use.
1287  *      @dev:   device to open
1288  *
1289  *      Takes a device from down to up state. The device's private open
1290  *      function is invoked and then the multicast lists are loaded. Finally
1291  *      the device is moved into the up state and a %NETDEV_UP message is
1292  *      sent to the netdev notifier chain.
1293  *
1294  *      Calling this function on an active interface is a nop. On a failure
1295  *      a negative errno code is returned.
1296  */
1297 int dev_open(struct net_device *dev)
1298 {
1299         int ret;
1300
1301         if (dev->flags & IFF_UP)
1302                 return 0;
1303
1304         ret = __dev_open(dev);
1305         if (ret < 0)
1306                 return ret;
1307
1308         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1309         call_netdevice_notifiers(NETDEV_UP, dev);
1310
1311         return ret;
1312 }
1313 EXPORT_SYMBOL(dev_open);
1314
1315 static int __dev_close_many(struct list_head *head)
1316 {
1317         struct net_device *dev;
1318
1319         ASSERT_RTNL();
1320         might_sleep();
1321
1322         list_for_each_entry(dev, head, close_list) {
1323                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1324
1325                 clear_bit(__LINK_STATE_START, &dev->state);
1326
1327                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1328                  * can be even on different cpu. So just clear netif_running().
1329                  *
1330                  * dev->stop() will invoke napi_disable() on all of it's
1331                  * napi_struct instances on this device.
1332                  */
1333                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1334         }
1335
1336         dev_deactivate_many(head);
1337
1338         list_for_each_entry(dev, head, close_list) {
1339                 const struct net_device_ops *ops = dev->netdev_ops;
1340
1341                 /*
1342                  *      Call the device specific close. This cannot fail.
1343                  *      Only if device is UP
1344                  *
1345                  *      We allow it to be called even after a DETACH hot-plug
1346                  *      event.
1347                  */
1348                 if (ops->ndo_stop)
1349                         ops->ndo_stop(dev);
1350
1351                 dev->flags &= ~IFF_UP;
1352                 net_dmaengine_put();
1353         }
1354
1355         return 0;
1356 }
1357
1358 static int __dev_close(struct net_device *dev)
1359 {
1360         int retval;
1361         LIST_HEAD(single);
1362
1363         /* Temporarily disable netpoll until the interface is down */
1364         netpoll_rx_disable(dev);
1365
1366         list_add(&dev->close_list, &single);
1367         retval = __dev_close_many(&single);
1368         list_del(&single);
1369
1370         netpoll_rx_enable(dev);
1371         return retval;
1372 }
1373
1374 static int dev_close_many(struct list_head *head)
1375 {
1376         struct net_device *dev, *tmp;
1377
1378         /* Remove the devices that don't need to be closed */
1379         list_for_each_entry_safe(dev, tmp, head, close_list)
1380                 if (!(dev->flags & IFF_UP))
1381                         list_del_init(&dev->close_list);
1382
1383         __dev_close_many(head);
1384
1385         list_for_each_entry_safe(dev, tmp, head, close_list) {
1386                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1387                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1388                 list_del_init(&dev->close_list);
1389         }
1390
1391         return 0;
1392 }
1393
1394 /**
1395  *      dev_close - shutdown an interface.
1396  *      @dev: device to shutdown
1397  *
1398  *      This function moves an active device into down state. A
1399  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1400  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1401  *      chain.
1402  */
1403 int dev_close(struct net_device *dev)
1404 {
1405         if (dev->flags & IFF_UP) {
1406                 LIST_HEAD(single);
1407
1408                 /* Block netpoll rx while the interface is going down */
1409                 netpoll_rx_disable(dev);
1410
1411                 list_add(&dev->close_list, &single);
1412                 dev_close_many(&single);
1413                 list_del(&single);
1414
1415                 netpoll_rx_enable(dev);
1416         }
1417         return 0;
1418 }
1419 EXPORT_SYMBOL(dev_close);
1420
1421
1422 /**
1423  *      dev_disable_lro - disable Large Receive Offload on a device
1424  *      @dev: device
1425  *
1426  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1427  *      called under RTNL.  This is needed if received packets may be
1428  *      forwarded to another interface.
1429  */
1430 void dev_disable_lro(struct net_device *dev)
1431 {
1432         /*
1433          * If we're trying to disable lro on a vlan device
1434          * use the underlying physical device instead
1435          */
1436         if (is_vlan_dev(dev))
1437                 dev = vlan_dev_real_dev(dev);
1438
1439         /* the same for macvlan devices */
1440         if (netif_is_macvlan(dev))
1441                 dev = macvlan_dev_real_dev(dev);
1442
1443         dev->wanted_features &= ~NETIF_F_LRO;
1444         netdev_update_features(dev);
1445
1446         if (unlikely(dev->features & NETIF_F_LRO))
1447                 netdev_WARN(dev, "failed to disable LRO!\n");
1448 }
1449 EXPORT_SYMBOL(dev_disable_lro);
1450
1451 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1452                                    struct net_device *dev)
1453 {
1454         struct netdev_notifier_info info;
1455
1456         netdev_notifier_info_init(&info, dev);
1457         return nb->notifier_call(nb, val, &info);
1458 }
1459
1460 static int dev_boot_phase = 1;
1461
1462 /**
1463  *      register_netdevice_notifier - register a network notifier block
1464  *      @nb: notifier
1465  *
1466  *      Register a notifier to be called when network device events occur.
1467  *      The notifier passed is linked into the kernel structures and must
1468  *      not be reused until it has been unregistered. A negative errno code
1469  *      is returned on a failure.
1470  *
1471  *      When registered all registration and up events are replayed
1472  *      to the new notifier to allow device to have a race free
1473  *      view of the network device list.
1474  */
1475
1476 int register_netdevice_notifier(struct notifier_block *nb)
1477 {
1478         struct net_device *dev;
1479         struct net_device *last;
1480         struct net *net;
1481         int err;
1482
1483         rtnl_lock();
1484         err = raw_notifier_chain_register(&netdev_chain, nb);
1485         if (err)
1486                 goto unlock;
1487         if (dev_boot_phase)
1488                 goto unlock;
1489         for_each_net(net) {
1490                 for_each_netdev(net, dev) {
1491                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1492                         err = notifier_to_errno(err);
1493                         if (err)
1494                                 goto rollback;
1495
1496                         if (!(dev->flags & IFF_UP))
1497                                 continue;
1498
1499                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1500                 }
1501         }
1502
1503 unlock:
1504         rtnl_unlock();
1505         return err;
1506
1507 rollback:
1508         last = dev;
1509         for_each_net(net) {
1510                 for_each_netdev(net, dev) {
1511                         if (dev == last)
1512                                 goto outroll;
1513
1514                         if (dev->flags & IFF_UP) {
1515                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1516                                                         dev);
1517                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1518                         }
1519                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1520                 }
1521         }
1522
1523 outroll:
1524         raw_notifier_chain_unregister(&netdev_chain, nb);
1525         goto unlock;
1526 }
1527 EXPORT_SYMBOL(register_netdevice_notifier);
1528
1529 /**
1530  *      unregister_netdevice_notifier - unregister a network notifier block
1531  *      @nb: notifier
1532  *
1533  *      Unregister a notifier previously registered by
1534  *      register_netdevice_notifier(). The notifier is unlinked into the
1535  *      kernel structures and may then be reused. A negative errno code
1536  *      is returned on a failure.
1537  *
1538  *      After unregistering unregister and down device events are synthesized
1539  *      for all devices on the device list to the removed notifier to remove
1540  *      the need for special case cleanup code.
1541  */
1542
1543 int unregister_netdevice_notifier(struct notifier_block *nb)
1544 {
1545         struct net_device *dev;
1546         struct net *net;
1547         int err;
1548
1549         rtnl_lock();
1550         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1551         if (err)
1552                 goto unlock;
1553
1554         for_each_net(net) {
1555                 for_each_netdev(net, dev) {
1556                         if (dev->flags & IFF_UP) {
1557                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1558                                                         dev);
1559                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1560                         }
1561                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1562                 }
1563         }
1564 unlock:
1565         rtnl_unlock();
1566         return err;
1567 }
1568 EXPORT_SYMBOL(unregister_netdevice_notifier);
1569
1570 /**
1571  *      call_netdevice_notifiers_info - call all network notifier blocks
1572  *      @val: value passed unmodified to notifier function
1573  *      @dev: net_device pointer passed unmodified to notifier function
1574  *      @info: notifier information data
1575  *
1576  *      Call all network notifier blocks.  Parameters and return value
1577  *      are as for raw_notifier_call_chain().
1578  */
1579
1580 static int call_netdevice_notifiers_info(unsigned long val,
1581                                          struct net_device *dev,
1582                                          struct netdev_notifier_info *info)
1583 {
1584         ASSERT_RTNL();
1585         netdev_notifier_info_init(info, dev);
1586         return raw_notifier_call_chain(&netdev_chain, val, info);
1587 }
1588
1589 /**
1590  *      call_netdevice_notifiers - call all network notifier blocks
1591  *      @val: value passed unmodified to notifier function
1592  *      @dev: net_device pointer passed unmodified to notifier function
1593  *
1594  *      Call all network notifier blocks.  Parameters and return value
1595  *      are as for raw_notifier_call_chain().
1596  */
1597
1598 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1599 {
1600         struct netdev_notifier_info info;
1601
1602         return call_netdevice_notifiers_info(val, dev, &info);
1603 }
1604 EXPORT_SYMBOL(call_netdevice_notifiers);
1605
1606 static struct static_key netstamp_needed __read_mostly;
1607 #ifdef HAVE_JUMP_LABEL
1608 /* We are not allowed to call static_key_slow_dec() from irq context
1609  * If net_disable_timestamp() is called from irq context, defer the
1610  * static_key_slow_dec() calls.
1611  */
1612 static atomic_t netstamp_needed_deferred;
1613 #endif
1614
1615 void net_enable_timestamp(void)
1616 {
1617 #ifdef HAVE_JUMP_LABEL
1618         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1619
1620         if (deferred) {
1621                 while (--deferred)
1622                         static_key_slow_dec(&netstamp_needed);
1623                 return;
1624         }
1625 #endif
1626         static_key_slow_inc(&netstamp_needed);
1627 }
1628 EXPORT_SYMBOL(net_enable_timestamp);
1629
1630 void net_disable_timestamp(void)
1631 {
1632 #ifdef HAVE_JUMP_LABEL
1633         if (in_interrupt()) {
1634                 atomic_inc(&netstamp_needed_deferred);
1635                 return;
1636         }
1637 #endif
1638         static_key_slow_dec(&netstamp_needed);
1639 }
1640 EXPORT_SYMBOL(net_disable_timestamp);
1641
1642 static inline void net_timestamp_set(struct sk_buff *skb)
1643 {
1644         skb->tstamp.tv64 = 0;
1645         if (static_key_false(&netstamp_needed))
1646                 __net_timestamp(skb);
1647 }
1648
1649 #define net_timestamp_check(COND, SKB)                  \
1650         if (static_key_false(&netstamp_needed)) {               \
1651                 if ((COND) && !(SKB)->tstamp.tv64)      \
1652                         __net_timestamp(SKB);           \
1653         }                                               \
1654
1655 static inline bool is_skb_forwardable(struct net_device *dev,
1656                                       struct sk_buff *skb)
1657 {
1658         unsigned int len;
1659
1660         if (!(dev->flags & IFF_UP))
1661                 return false;
1662
1663         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1664         if (skb->len <= len)
1665                 return true;
1666
1667         /* if TSO is enabled, we don't care about the length as the packet
1668          * could be forwarded without being segmented before
1669          */
1670         if (skb_is_gso(skb))
1671                 return true;
1672
1673         return false;
1674 }
1675
1676 /**
1677  * dev_forward_skb - loopback an skb to another netif
1678  *
1679  * @dev: destination network device
1680  * @skb: buffer to forward
1681  *
1682  * return values:
1683  *      NET_RX_SUCCESS  (no congestion)
1684  *      NET_RX_DROP     (packet was dropped, but freed)
1685  *
1686  * dev_forward_skb can be used for injecting an skb from the
1687  * start_xmit function of one device into the receive queue
1688  * of another device.
1689  *
1690  * The receiving device may be in another namespace, so
1691  * we have to clear all information in the skb that could
1692  * impact namespace isolation.
1693  */
1694 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1695 {
1696         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1697                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1698                         atomic_long_inc(&dev->rx_dropped);
1699                         kfree_skb(skb);
1700                         return NET_RX_DROP;
1701                 }
1702         }
1703
1704         if (unlikely(!is_skb_forwardable(dev, skb))) {
1705                 atomic_long_inc(&dev->rx_dropped);
1706                 kfree_skb(skb);
1707                 return NET_RX_DROP;
1708         }
1709
1710         skb_scrub_packet(skb, true);
1711         skb->protocol = eth_type_trans(skb, dev);
1712
1713         return netif_rx_internal(skb);
1714 }
1715 EXPORT_SYMBOL_GPL(dev_forward_skb);
1716
1717 static inline int deliver_skb(struct sk_buff *skb,
1718                               struct packet_type *pt_prev,
1719                               struct net_device *orig_dev)
1720 {
1721         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1722                 return -ENOMEM;
1723         atomic_inc(&skb->users);
1724         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1725 }
1726
1727 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1728 {
1729         if (!ptype->af_packet_priv || !skb->sk)
1730                 return false;
1731
1732         if (ptype->id_match)
1733                 return ptype->id_match(ptype, skb->sk);
1734         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1735                 return true;
1736
1737         return false;
1738 }
1739
1740 /*
1741  *      Support routine. Sends outgoing frames to any network
1742  *      taps currently in use.
1743  */
1744
1745 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1746 {
1747         struct packet_type *ptype;
1748         struct sk_buff *skb2 = NULL;
1749         struct packet_type *pt_prev = NULL;
1750
1751         rcu_read_lock();
1752         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1753                 /* Never send packets back to the socket
1754                  * they originated from - MvS (miquels@drinkel.ow.org)
1755                  */
1756                 if ((ptype->dev == dev || !ptype->dev) &&
1757                     (!skb_loop_sk(ptype, skb))) {
1758                         if (pt_prev) {
1759                                 deliver_skb(skb2, pt_prev, skb->dev);
1760                                 pt_prev = ptype;
1761                                 continue;
1762                         }
1763
1764                         skb2 = skb_clone(skb, GFP_ATOMIC);
1765                         if (!skb2)
1766                                 break;
1767
1768                         net_timestamp_set(skb2);
1769
1770                         /* skb->nh should be correctly
1771                            set by sender, so that the second statement is
1772                            just protection against buggy protocols.
1773                          */
1774                         skb_reset_mac_header(skb2);
1775
1776                         if (skb_network_header(skb2) < skb2->data ||
1777                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1778                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1779                                                      ntohs(skb2->protocol),
1780                                                      dev->name);
1781                                 skb_reset_network_header(skb2);
1782                         }
1783
1784                         skb2->transport_header = skb2->network_header;
1785                         skb2->pkt_type = PACKET_OUTGOING;
1786                         pt_prev = ptype;
1787                 }
1788         }
1789         if (pt_prev)
1790                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1791         rcu_read_unlock();
1792 }
1793
1794 /**
1795  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1796  * @dev: Network device
1797  * @txq: number of queues available
1798  *
1799  * If real_num_tx_queues is changed the tc mappings may no longer be
1800  * valid. To resolve this verify the tc mapping remains valid and if
1801  * not NULL the mapping. With no priorities mapping to this
1802  * offset/count pair it will no longer be used. In the worst case TC0
1803  * is invalid nothing can be done so disable priority mappings. If is
1804  * expected that drivers will fix this mapping if they can before
1805  * calling netif_set_real_num_tx_queues.
1806  */
1807 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1808 {
1809         int i;
1810         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1811
1812         /* If TC0 is invalidated disable TC mapping */
1813         if (tc->offset + tc->count > txq) {
1814                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1815                 dev->num_tc = 0;
1816                 return;
1817         }
1818
1819         /* Invalidated prio to tc mappings set to TC0 */
1820         for (i = 1; i < TC_BITMASK + 1; i++) {
1821                 int q = netdev_get_prio_tc_map(dev, i);
1822
1823                 tc = &dev->tc_to_txq[q];
1824                 if (tc->offset + tc->count > txq) {
1825                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1826                                 i, q);
1827                         netdev_set_prio_tc_map(dev, i, 0);
1828                 }
1829         }
1830 }
1831
1832 #ifdef CONFIG_XPS
1833 static DEFINE_MUTEX(xps_map_mutex);
1834 #define xmap_dereference(P)             \
1835         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1836
1837 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1838                                         int cpu, u16 index)
1839 {
1840         struct xps_map *map = NULL;
1841         int pos;
1842
1843         if (dev_maps)
1844                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1845
1846         for (pos = 0; map && pos < map->len; pos++) {
1847                 if (map->queues[pos] == index) {
1848                         if (map->len > 1) {
1849                                 map->queues[pos] = map->queues[--map->len];
1850                         } else {
1851                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1852                                 kfree_rcu(map, rcu);
1853                                 map = NULL;
1854                         }
1855                         break;
1856                 }
1857         }
1858
1859         return map;
1860 }
1861
1862 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1863 {
1864         struct xps_dev_maps *dev_maps;
1865         int cpu, i;
1866         bool active = false;
1867
1868         mutex_lock(&xps_map_mutex);
1869         dev_maps = xmap_dereference(dev->xps_maps);
1870
1871         if (!dev_maps)
1872                 goto out_no_maps;
1873
1874         for_each_possible_cpu(cpu) {
1875                 for (i = index; i < dev->num_tx_queues; i++) {
1876                         if (!remove_xps_queue(dev_maps, cpu, i))
1877                                 break;
1878                 }
1879                 if (i == dev->num_tx_queues)
1880                         active = true;
1881         }
1882
1883         if (!active) {
1884                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1885                 kfree_rcu(dev_maps, rcu);
1886         }
1887
1888         for (i = index; i < dev->num_tx_queues; i++)
1889                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1890                                              NUMA_NO_NODE);
1891
1892 out_no_maps:
1893         mutex_unlock(&xps_map_mutex);
1894 }
1895
1896 static struct xps_map *expand_xps_map(struct xps_map *map,
1897                                       int cpu, u16 index)
1898 {
1899         struct xps_map *new_map;
1900         int alloc_len = XPS_MIN_MAP_ALLOC;
1901         int i, pos;
1902
1903         for (pos = 0; map && pos < map->len; pos++) {
1904                 if (map->queues[pos] != index)
1905                         continue;
1906                 return map;
1907         }
1908
1909         /* Need to add queue to this CPU's existing map */
1910         if (map) {
1911                 if (pos < map->alloc_len)
1912                         return map;
1913
1914                 alloc_len = map->alloc_len * 2;
1915         }
1916
1917         /* Need to allocate new map to store queue on this CPU's map */
1918         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1919                                cpu_to_node(cpu));
1920         if (!new_map)
1921                 return NULL;
1922
1923         for (i = 0; i < pos; i++)
1924                 new_map->queues[i] = map->queues[i];
1925         new_map->alloc_len = alloc_len;
1926         new_map->len = pos;
1927
1928         return new_map;
1929 }
1930
1931 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1932                         u16 index)
1933 {
1934         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1935         struct xps_map *map, *new_map;
1936         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1937         int cpu, numa_node_id = -2;
1938         bool active = false;
1939
1940         mutex_lock(&xps_map_mutex);
1941
1942         dev_maps = xmap_dereference(dev->xps_maps);
1943
1944         /* allocate memory for queue storage */
1945         for_each_online_cpu(cpu) {
1946                 if (!cpumask_test_cpu(cpu, mask))
1947                         continue;
1948
1949                 if (!new_dev_maps)
1950                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1951                 if (!new_dev_maps) {
1952                         mutex_unlock(&xps_map_mutex);
1953                         return -ENOMEM;
1954                 }
1955
1956                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1957                                  NULL;
1958
1959                 map = expand_xps_map(map, cpu, index);
1960                 if (!map)
1961                         goto error;
1962
1963                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1964         }
1965
1966         if (!new_dev_maps)
1967                 goto out_no_new_maps;
1968
1969         for_each_possible_cpu(cpu) {
1970                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1971                         /* add queue to CPU maps */
1972                         int pos = 0;
1973
1974                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1975                         while ((pos < map->len) && (map->queues[pos] != index))
1976                                 pos++;
1977
1978                         if (pos == map->len)
1979                                 map->queues[map->len++] = index;
1980 #ifdef CONFIG_NUMA
1981                         if (numa_node_id == -2)
1982                                 numa_node_id = cpu_to_node(cpu);
1983                         else if (numa_node_id != cpu_to_node(cpu))
1984                                 numa_node_id = -1;
1985 #endif
1986                 } else if (dev_maps) {
1987                         /* fill in the new device map from the old device map */
1988                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1989                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1990                 }
1991
1992         }
1993
1994         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1995
1996         /* Cleanup old maps */
1997         if (dev_maps) {
1998                 for_each_possible_cpu(cpu) {
1999                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2000                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2001                         if (map && map != new_map)
2002                                 kfree_rcu(map, rcu);
2003                 }
2004
2005                 kfree_rcu(dev_maps, rcu);
2006         }
2007
2008         dev_maps = new_dev_maps;
2009         active = true;
2010
2011 out_no_new_maps:
2012         /* update Tx queue numa node */
2013         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2014                                      (numa_node_id >= 0) ? numa_node_id :
2015                                      NUMA_NO_NODE);
2016
2017         if (!dev_maps)
2018                 goto out_no_maps;
2019
2020         /* removes queue from unused CPUs */
2021         for_each_possible_cpu(cpu) {
2022                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2023                         continue;
2024
2025                 if (remove_xps_queue(dev_maps, cpu, index))
2026                         active = true;
2027         }
2028
2029         /* free map if not active */
2030         if (!active) {
2031                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2032                 kfree_rcu(dev_maps, rcu);
2033         }
2034
2035 out_no_maps:
2036         mutex_unlock(&xps_map_mutex);
2037
2038         return 0;
2039 error:
2040         /* remove any maps that we added */
2041         for_each_possible_cpu(cpu) {
2042                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2043                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2044                                  NULL;
2045                 if (new_map && new_map != map)
2046                         kfree(new_map);
2047         }
2048
2049         mutex_unlock(&xps_map_mutex);
2050
2051         kfree(new_dev_maps);
2052         return -ENOMEM;
2053 }
2054 EXPORT_SYMBOL(netif_set_xps_queue);
2055
2056 #endif
2057 /*
2058  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2059  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2060  */
2061 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2062 {
2063         int rc;
2064
2065         if (txq < 1 || txq > dev->num_tx_queues)
2066                 return -EINVAL;
2067
2068         if (dev->reg_state == NETREG_REGISTERED ||
2069             dev->reg_state == NETREG_UNREGISTERING) {
2070                 ASSERT_RTNL();
2071
2072                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2073                                                   txq);
2074                 if (rc)
2075                         return rc;
2076
2077                 if (dev->num_tc)
2078                         netif_setup_tc(dev, txq);
2079
2080                 if (txq < dev->real_num_tx_queues) {
2081                         qdisc_reset_all_tx_gt(dev, txq);
2082 #ifdef CONFIG_XPS
2083                         netif_reset_xps_queues_gt(dev, txq);
2084 #endif
2085                 }
2086         }
2087
2088         dev->real_num_tx_queues = txq;
2089         return 0;
2090 }
2091 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2092
2093 #ifdef CONFIG_SYSFS
2094 /**
2095  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2096  *      @dev: Network device
2097  *      @rxq: Actual number of RX queues
2098  *
2099  *      This must be called either with the rtnl_lock held or before
2100  *      registration of the net device.  Returns 0 on success, or a
2101  *      negative error code.  If called before registration, it always
2102  *      succeeds.
2103  */
2104 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2105 {
2106         int rc;
2107
2108         if (rxq < 1 || rxq > dev->num_rx_queues)
2109                 return -EINVAL;
2110
2111         if (dev->reg_state == NETREG_REGISTERED) {
2112                 ASSERT_RTNL();
2113
2114                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2115                                                   rxq);
2116                 if (rc)
2117                         return rc;
2118         }
2119
2120         dev->real_num_rx_queues = rxq;
2121         return 0;
2122 }
2123 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2124 #endif
2125
2126 /**
2127  * netif_get_num_default_rss_queues - default number of RSS queues
2128  *
2129  * This routine should set an upper limit on the number of RSS queues
2130  * used by default by multiqueue devices.
2131  */
2132 int netif_get_num_default_rss_queues(void)
2133 {
2134         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2135 }
2136 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2137
2138 static inline void __netif_reschedule(struct Qdisc *q)
2139 {
2140         struct softnet_data *sd;
2141         unsigned long flags;
2142
2143         local_irq_save(flags);
2144         sd = &__get_cpu_var(softnet_data);
2145         q->next_sched = NULL;
2146         *sd->output_queue_tailp = q;
2147         sd->output_queue_tailp = &q->next_sched;
2148         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2149         local_irq_restore(flags);
2150 }
2151
2152 void __netif_schedule(struct Qdisc *q)
2153 {
2154         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2155                 __netif_reschedule(q);
2156 }
2157 EXPORT_SYMBOL(__netif_schedule);
2158
2159 struct dev_kfree_skb_cb {
2160         enum skb_free_reason reason;
2161 };
2162
2163 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2164 {
2165         return (struct dev_kfree_skb_cb *)skb->cb;
2166 }
2167
2168 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2169 {
2170         unsigned long flags;
2171
2172         if (likely(atomic_read(&skb->users) == 1)) {
2173                 smp_rmb();
2174                 atomic_set(&skb->users, 0);
2175         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2176                 return;
2177         }
2178         get_kfree_skb_cb(skb)->reason = reason;
2179         local_irq_save(flags);
2180         skb->next = __this_cpu_read(softnet_data.completion_queue);
2181         __this_cpu_write(softnet_data.completion_queue, skb);
2182         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2183         local_irq_restore(flags);
2184 }
2185 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2186
2187 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2188 {
2189         if (in_irq() || irqs_disabled())
2190                 __dev_kfree_skb_irq(skb, reason);
2191         else
2192                 dev_kfree_skb(skb);
2193 }
2194 EXPORT_SYMBOL(__dev_kfree_skb_any);
2195
2196
2197 /**
2198  * netif_device_detach - mark device as removed
2199  * @dev: network device
2200  *
2201  * Mark device as removed from system and therefore no longer available.
2202  */
2203 void netif_device_detach(struct net_device *dev)
2204 {
2205         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2206             netif_running(dev)) {
2207                 netif_tx_stop_all_queues(dev);
2208         }
2209 }
2210 EXPORT_SYMBOL(netif_device_detach);
2211
2212 /**
2213  * netif_device_attach - mark device as attached
2214  * @dev: network device
2215  *
2216  * Mark device as attached from system and restart if needed.
2217  */
2218 void netif_device_attach(struct net_device *dev)
2219 {
2220         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2221             netif_running(dev)) {
2222                 netif_tx_wake_all_queues(dev);
2223                 __netdev_watchdog_up(dev);
2224         }
2225 }
2226 EXPORT_SYMBOL(netif_device_attach);
2227
2228 static void skb_warn_bad_offload(const struct sk_buff *skb)
2229 {
2230         static const netdev_features_t null_features = 0;
2231         struct net_device *dev = skb->dev;
2232         const char *driver = "";
2233
2234         if (!net_ratelimit())
2235                 return;
2236
2237         if (dev && dev->dev.parent)
2238                 driver = dev_driver_string(dev->dev.parent);
2239
2240         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2241              "gso_type=%d ip_summed=%d\n",
2242              driver, dev ? &dev->features : &null_features,
2243              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2244              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2245              skb_shinfo(skb)->gso_type, skb->ip_summed);
2246 }
2247
2248 /*
2249  * Invalidate hardware checksum when packet is to be mangled, and
2250  * complete checksum manually on outgoing path.
2251  */
2252 int skb_checksum_help(struct sk_buff *skb)
2253 {
2254         __wsum csum;
2255         int ret = 0, offset;
2256
2257         if (skb->ip_summed == CHECKSUM_COMPLETE)
2258                 goto out_set_summed;
2259
2260         if (unlikely(skb_shinfo(skb)->gso_size)) {
2261                 skb_warn_bad_offload(skb);
2262                 return -EINVAL;
2263         }
2264
2265         /* Before computing a checksum, we should make sure no frag could
2266          * be modified by an external entity : checksum could be wrong.
2267          */
2268         if (skb_has_shared_frag(skb)) {
2269                 ret = __skb_linearize(skb);
2270                 if (ret)
2271                         goto out;
2272         }
2273
2274         offset = skb_checksum_start_offset(skb);
2275         BUG_ON(offset >= skb_headlen(skb));
2276         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2277
2278         offset += skb->csum_offset;
2279         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2280
2281         if (skb_cloned(skb) &&
2282             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2283                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2284                 if (ret)
2285                         goto out;
2286         }
2287
2288         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2289 out_set_summed:
2290         skb->ip_summed = CHECKSUM_NONE;
2291 out:
2292         return ret;
2293 }
2294 EXPORT_SYMBOL(skb_checksum_help);
2295
2296 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2297 {
2298         __be16 type = skb->protocol;
2299         int vlan_depth = skb->mac_len;
2300
2301         /* Tunnel gso handlers can set protocol to ethernet. */
2302         if (type == htons(ETH_P_TEB)) {
2303                 struct ethhdr *eth;
2304
2305                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2306                         return 0;
2307
2308                 eth = (struct ethhdr *)skb_mac_header(skb);
2309                 type = eth->h_proto;
2310         }
2311
2312         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2313                 struct vlan_hdr *vh;
2314
2315                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2316                         return 0;
2317
2318                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2319                 type = vh->h_vlan_encapsulated_proto;
2320                 vlan_depth += VLAN_HLEN;
2321         }
2322
2323         *depth = vlan_depth;
2324
2325         return type;
2326 }
2327
2328 /**
2329  *      skb_mac_gso_segment - mac layer segmentation handler.
2330  *      @skb: buffer to segment
2331  *      @features: features for the output path (see dev->features)
2332  */
2333 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2334                                     netdev_features_t features)
2335 {
2336         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2337         struct packet_offload *ptype;
2338         int vlan_depth = skb->mac_len;
2339         __be16 type = skb_network_protocol(skb, &vlan_depth);
2340
2341         if (unlikely(!type))
2342                 return ERR_PTR(-EINVAL);
2343
2344         __skb_pull(skb, vlan_depth);
2345
2346         rcu_read_lock();
2347         list_for_each_entry_rcu(ptype, &offload_base, list) {
2348                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2349                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2350                                 int err;
2351
2352                                 err = ptype->callbacks.gso_send_check(skb);
2353                                 segs = ERR_PTR(err);
2354                                 if (err || skb_gso_ok(skb, features))
2355                                         break;
2356                                 __skb_push(skb, (skb->data -
2357                                                  skb_network_header(skb)));
2358                         }
2359                         segs = ptype->callbacks.gso_segment(skb, features);
2360                         break;
2361                 }
2362         }
2363         rcu_read_unlock();
2364
2365         __skb_push(skb, skb->data - skb_mac_header(skb));
2366
2367         return segs;
2368 }
2369 EXPORT_SYMBOL(skb_mac_gso_segment);
2370
2371
2372 /* openvswitch calls this on rx path, so we need a different check.
2373  */
2374 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2375 {
2376         if (tx_path)
2377                 return skb->ip_summed != CHECKSUM_PARTIAL;
2378         else
2379                 return skb->ip_summed == CHECKSUM_NONE;
2380 }
2381
2382 /**
2383  *      __skb_gso_segment - Perform segmentation on skb.
2384  *      @skb: buffer to segment
2385  *      @features: features for the output path (see dev->features)
2386  *      @tx_path: whether it is called in TX path
2387  *
2388  *      This function segments the given skb and returns a list of segments.
2389  *
2390  *      It may return NULL if the skb requires no segmentation.  This is
2391  *      only possible when GSO is used for verifying header integrity.
2392  */
2393 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2394                                   netdev_features_t features, bool tx_path)
2395 {
2396         if (unlikely(skb_needs_check(skb, tx_path))) {
2397                 int err;
2398
2399                 skb_warn_bad_offload(skb);
2400
2401                 if (skb_header_cloned(skb) &&
2402                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2403                         return ERR_PTR(err);
2404         }
2405
2406         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2407         SKB_GSO_CB(skb)->encap_level = 0;
2408
2409         skb_reset_mac_header(skb);
2410         skb_reset_mac_len(skb);
2411
2412         return skb_mac_gso_segment(skb, features);
2413 }
2414 EXPORT_SYMBOL(__skb_gso_segment);
2415
2416 /* Take action when hardware reception checksum errors are detected. */
2417 #ifdef CONFIG_BUG
2418 void netdev_rx_csum_fault(struct net_device *dev)
2419 {
2420         if (net_ratelimit()) {
2421                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2422                 dump_stack();
2423         }
2424 }
2425 EXPORT_SYMBOL(netdev_rx_csum_fault);
2426 #endif
2427
2428 /* Actually, we should eliminate this check as soon as we know, that:
2429  * 1. IOMMU is present and allows to map all the memory.
2430  * 2. No high memory really exists on this machine.
2431  */
2432
2433 static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
2434 {
2435 #ifdef CONFIG_HIGHMEM
2436         int i;
2437         if (!(dev->features & NETIF_F_HIGHDMA)) {
2438                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2439                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2440                         if (PageHighMem(skb_frag_page(frag)))
2441                                 return 1;
2442                 }
2443         }
2444
2445         if (PCI_DMA_BUS_IS_PHYS) {
2446                 struct device *pdev = dev->dev.parent;
2447
2448                 if (!pdev)
2449                         return 0;
2450                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2451                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2452                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2453                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2454                                 return 1;
2455                 }
2456         }
2457 #endif
2458         return 0;
2459 }
2460
2461 struct dev_gso_cb {
2462         void (*destructor)(struct sk_buff *skb);
2463 };
2464
2465 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2466
2467 static void dev_gso_skb_destructor(struct sk_buff *skb)
2468 {
2469         struct dev_gso_cb *cb;
2470
2471         kfree_skb_list(skb->next);
2472         skb->next = NULL;
2473
2474         cb = DEV_GSO_CB(skb);
2475         if (cb->destructor)
2476                 cb->destructor(skb);
2477 }
2478
2479 /**
2480  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2481  *      @skb: buffer to segment
2482  *      @features: device features as applicable to this skb
2483  *
2484  *      This function segments the given skb and stores the list of segments
2485  *      in skb->next.
2486  */
2487 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2488 {
2489         struct sk_buff *segs;
2490
2491         segs = skb_gso_segment(skb, features);
2492
2493         /* Verifying header integrity only. */
2494         if (!segs)
2495                 return 0;
2496
2497         if (IS_ERR(segs))
2498                 return PTR_ERR(segs);
2499
2500         skb->next = segs;
2501         DEV_GSO_CB(skb)->destructor = skb->destructor;
2502         skb->destructor = dev_gso_skb_destructor;
2503
2504         return 0;
2505 }
2506
2507 static netdev_features_t harmonize_features(struct sk_buff *skb,
2508                                             const struct net_device *dev,
2509                                             netdev_features_t features)
2510 {
2511         int tmp;
2512
2513         if (skb->ip_summed != CHECKSUM_NONE &&
2514             !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
2515                 features &= ~NETIF_F_ALL_CSUM;
2516         } else if (illegal_highdma(dev, skb)) {
2517                 features &= ~NETIF_F_SG;
2518         }
2519
2520         return features;
2521 }
2522
2523 netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2524                                          const struct net_device *dev)
2525 {
2526         __be16 protocol = skb->protocol;
2527         netdev_features_t features = dev->features;
2528
2529         if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
2530                 features &= ~NETIF_F_GSO_MASK;
2531
2532         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2533                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2534                 protocol = veh->h_vlan_encapsulated_proto;
2535         } else if (!vlan_tx_tag_present(skb)) {
2536                 return harmonize_features(skb, dev, features);
2537         }
2538
2539         features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2540                                                NETIF_F_HW_VLAN_STAG_TX);
2541
2542         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2543                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2544                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2545                                 NETIF_F_HW_VLAN_STAG_TX;
2546
2547         return harmonize_features(skb, dev, features);
2548 }
2549 EXPORT_SYMBOL(netif_skb_dev_features);
2550
2551 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2552                         struct netdev_queue *txq)
2553 {
2554         const struct net_device_ops *ops = dev->netdev_ops;
2555         int rc = NETDEV_TX_OK;
2556         unsigned int skb_len;
2557
2558         if (likely(!skb->next)) {
2559                 netdev_features_t features;
2560
2561                 /*
2562                  * If device doesn't need skb->dst, release it right now while
2563                  * its hot in this cpu cache
2564                  */
2565                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2566                         skb_dst_drop(skb);
2567
2568                 features = netif_skb_features(skb);
2569
2570                 if (vlan_tx_tag_present(skb) &&
2571                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2572                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2573                                              vlan_tx_tag_get(skb));
2574                         if (unlikely(!skb))
2575                                 goto out;
2576
2577                         skb->vlan_tci = 0;
2578                 }
2579
2580                 /* If encapsulation offload request, verify we are testing
2581                  * hardware encapsulation features instead of standard
2582                  * features for the netdev
2583                  */
2584                 if (skb->encapsulation)
2585                         features &= dev->hw_enc_features;
2586
2587                 if (netif_needs_gso(skb, features)) {
2588                         if (unlikely(dev_gso_segment(skb, features)))
2589                                 goto out_kfree_skb;
2590                         if (skb->next)
2591                                 goto gso;
2592                 } else {
2593                         if (skb_needs_linearize(skb, features) &&
2594                             __skb_linearize(skb))
2595                                 goto out_kfree_skb;
2596
2597                         /* If packet is not checksummed and device does not
2598                          * support checksumming for this protocol, complete
2599                          * checksumming here.
2600                          */
2601                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2602                                 if (skb->encapsulation)
2603                                         skb_set_inner_transport_header(skb,
2604                                                 skb_checksum_start_offset(skb));
2605                                 else
2606                                         skb_set_transport_header(skb,
2607                                                 skb_checksum_start_offset(skb));
2608                                 if (!(features & NETIF_F_ALL_CSUM) &&
2609                                      skb_checksum_help(skb))
2610                                         goto out_kfree_skb;
2611                         }
2612                 }
2613
2614                 if (!list_empty(&ptype_all))
2615                         dev_queue_xmit_nit(skb, dev);
2616
2617                 skb_len = skb->len;
2618                 trace_net_dev_start_xmit(skb, dev);
2619                 rc = ops->ndo_start_xmit(skb, dev);
2620                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2621                 if (rc == NETDEV_TX_OK)
2622                         txq_trans_update(txq);
2623                 return rc;
2624         }
2625
2626 gso:
2627         do {
2628                 struct sk_buff *nskb = skb->next;
2629
2630                 skb->next = nskb->next;
2631                 nskb->next = NULL;
2632
2633                 if (!list_empty(&ptype_all))
2634                         dev_queue_xmit_nit(nskb, dev);
2635
2636                 skb_len = nskb->len;
2637                 trace_net_dev_start_xmit(nskb, dev);
2638                 rc = ops->ndo_start_xmit(nskb, dev);
2639                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2640                 if (unlikely(rc != NETDEV_TX_OK)) {
2641                         if (rc & ~NETDEV_TX_MASK)
2642                                 goto out_kfree_gso_skb;
2643                         nskb->next = skb->next;
2644                         skb->next = nskb;
2645                         return rc;
2646                 }
2647                 txq_trans_update(txq);
2648                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2649                         return NETDEV_TX_BUSY;
2650         } while (skb->next);
2651
2652 out_kfree_gso_skb:
2653         if (likely(skb->next == NULL)) {
2654                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2655                 consume_skb(skb);
2656                 return rc;
2657         }
2658 out_kfree_skb:
2659         kfree_skb(skb);
2660 out:
2661         return rc;
2662 }
2663 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2664
2665 static void qdisc_pkt_len_init(struct sk_buff *skb)
2666 {
2667         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2668
2669         qdisc_skb_cb(skb)->pkt_len = skb->len;
2670
2671         /* To get more precise estimation of bytes sent on wire,
2672          * we add to pkt_len the headers size of all segments
2673          */
2674         if (shinfo->gso_size)  {
2675                 unsigned int hdr_len;
2676                 u16 gso_segs = shinfo->gso_segs;
2677
2678                 /* mac layer + network layer */
2679                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2680
2681                 /* + transport layer */
2682                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2683                         hdr_len += tcp_hdrlen(skb);
2684                 else
2685                         hdr_len += sizeof(struct udphdr);
2686
2687                 if (shinfo->gso_type & SKB_GSO_DODGY)
2688                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2689                                                 shinfo->gso_size);
2690
2691                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2692         }
2693 }
2694
2695 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2696                                  struct net_device *dev,
2697                                  struct netdev_queue *txq)
2698 {
2699         spinlock_t *root_lock = qdisc_lock(q);
2700         bool contended;
2701         int rc;
2702
2703         qdisc_pkt_len_init(skb);
2704         qdisc_calculate_pkt_len(skb, q);
2705         /*
2706          * Heuristic to force contended enqueues to serialize on a
2707          * separate lock before trying to get qdisc main lock.
2708          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2709          * and dequeue packets faster.
2710          */
2711         contended = qdisc_is_running(q);
2712         if (unlikely(contended))
2713                 spin_lock(&q->busylock);
2714
2715         spin_lock(root_lock);
2716         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2717                 kfree_skb(skb);
2718                 rc = NET_XMIT_DROP;
2719         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2720                    qdisc_run_begin(q)) {
2721                 /*
2722                  * This is a work-conserving queue; there are no old skbs
2723                  * waiting to be sent out; and the qdisc is not running -
2724                  * xmit the skb directly.
2725                  */
2726                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2727                         skb_dst_force(skb);
2728
2729                 qdisc_bstats_update(q, skb);
2730
2731                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2732                         if (unlikely(contended)) {
2733                                 spin_unlock(&q->busylock);
2734                                 contended = false;
2735                         }
2736                         __qdisc_run(q);
2737                 } else
2738                         qdisc_run_end(q);
2739
2740                 rc = NET_XMIT_SUCCESS;
2741         } else {
2742                 skb_dst_force(skb);
2743                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2744                 if (qdisc_run_begin(q)) {
2745                         if (unlikely(contended)) {
2746                                 spin_unlock(&q->busylock);
2747                                 contended = false;
2748                         }
2749                         __qdisc_run(q);
2750                 }
2751         }
2752         spin_unlock(root_lock);
2753         if (unlikely(contended))
2754                 spin_unlock(&q->busylock);
2755         return rc;
2756 }
2757
2758 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2759 static void skb_update_prio(struct sk_buff *skb)
2760 {
2761         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2762
2763         if (!skb->priority && skb->sk && map) {
2764                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2765
2766                 if (prioidx < map->priomap_len)
2767                         skb->priority = map->priomap[prioidx];
2768         }
2769 }
2770 #else
2771 #define skb_update_prio(skb)
2772 #endif
2773
2774 static DEFINE_PER_CPU(int, xmit_recursion);
2775 #define RECURSION_LIMIT 10
2776
2777 /**
2778  *      dev_loopback_xmit - loop back @skb
2779  *      @skb: buffer to transmit
2780  */
2781 int dev_loopback_xmit(struct sk_buff *skb)
2782 {
2783         skb_reset_mac_header(skb);
2784         __skb_pull(skb, skb_network_offset(skb));
2785         skb->pkt_type = PACKET_LOOPBACK;
2786         skb->ip_summed = CHECKSUM_UNNECESSARY;
2787         WARN_ON(!skb_dst(skb));
2788         skb_dst_force(skb);
2789         netif_rx_ni(skb);
2790         return 0;
2791 }
2792 EXPORT_SYMBOL(dev_loopback_xmit);
2793
2794 /**
2795  *      __dev_queue_xmit - transmit a buffer
2796  *      @skb: buffer to transmit
2797  *      @accel_priv: private data used for L2 forwarding offload
2798  *
2799  *      Queue a buffer for transmission to a network device. The caller must
2800  *      have set the device and priority and built the buffer before calling
2801  *      this function. The function can be called from an interrupt.
2802  *
2803  *      A negative errno code is returned on a failure. A success does not
2804  *      guarantee the frame will be transmitted as it may be dropped due
2805  *      to congestion or traffic shaping.
2806  *
2807  * -----------------------------------------------------------------------------------
2808  *      I notice this method can also return errors from the queue disciplines,
2809  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2810  *      be positive.
2811  *
2812  *      Regardless of the return value, the skb is consumed, so it is currently
2813  *      difficult to retry a send to this method.  (You can bump the ref count
2814  *      before sending to hold a reference for retry if you are careful.)
2815  *
2816  *      When calling this method, interrupts MUST be enabled.  This is because
2817  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2818  *          --BLG
2819  */
2820 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2821 {
2822         struct net_device *dev = skb->dev;
2823         struct netdev_queue *txq;
2824         struct Qdisc *q;
2825         int rc = -ENOMEM;
2826
2827         skb_reset_mac_header(skb);
2828
2829         /* Disable soft irqs for various locks below. Also
2830          * stops preemption for RCU.
2831          */
2832         rcu_read_lock_bh();
2833
2834         skb_update_prio(skb);
2835
2836         txq = netdev_pick_tx(dev, skb, accel_priv);
2837         q = rcu_dereference_bh(txq->qdisc);
2838
2839 #ifdef CONFIG_NET_CLS_ACT
2840         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2841 #endif
2842         trace_net_dev_queue(skb);
2843         if (q->enqueue) {
2844                 rc = __dev_xmit_skb(skb, q, dev, txq);
2845                 goto out;
2846         }
2847
2848         /* The device has no queue. Common case for software devices:
2849            loopback, all the sorts of tunnels...
2850
2851            Really, it is unlikely that netif_tx_lock protection is necessary
2852            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2853            counters.)
2854            However, it is possible, that they rely on protection
2855            made by us here.
2856
2857            Check this and shot the lock. It is not prone from deadlocks.
2858            Either shot noqueue qdisc, it is even simpler 8)
2859          */
2860         if (dev->flags & IFF_UP) {
2861                 int cpu = smp_processor_id(); /* ok because BHs are off */
2862
2863                 if (txq->xmit_lock_owner != cpu) {
2864
2865                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2866                                 goto recursion_alert;
2867
2868                         HARD_TX_LOCK(dev, txq, cpu);
2869
2870                         if (!netif_xmit_stopped(txq)) {
2871                                 __this_cpu_inc(xmit_recursion);
2872                                 rc = dev_hard_start_xmit(skb, dev, txq);
2873                                 __this_cpu_dec(xmit_recursion);
2874                                 if (dev_xmit_complete(rc)) {
2875                                         HARD_TX_UNLOCK(dev, txq);
2876                                         goto out;
2877                                 }
2878                         }
2879                         HARD_TX_UNLOCK(dev, txq);
2880                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2881                                              dev->name);
2882                 } else {
2883                         /* Recursion is detected! It is possible,
2884                          * unfortunately
2885                          */
2886 recursion_alert:
2887                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2888                                              dev->name);
2889                 }
2890         }
2891
2892         rc = -ENETDOWN;
2893         rcu_read_unlock_bh();
2894
2895         kfree_skb(skb);
2896         return rc;
2897 out:
2898         rcu_read_unlock_bh();
2899         return rc;
2900 }
2901
2902 int dev_queue_xmit(struct sk_buff *skb)
2903 {
2904         return __dev_queue_xmit(skb, NULL);
2905 }
2906 EXPORT_SYMBOL(dev_queue_xmit);
2907
2908 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2909 {
2910         return __dev_queue_xmit(skb, accel_priv);
2911 }
2912 EXPORT_SYMBOL(dev_queue_xmit_accel);
2913
2914
2915 /*=======================================================================
2916                         Receiver routines
2917   =======================================================================*/
2918
2919 int netdev_max_backlog __read_mostly = 1000;
2920 EXPORT_SYMBOL(netdev_max_backlog);
2921
2922 int netdev_tstamp_prequeue __read_mostly = 1;
2923 int netdev_budget __read_mostly = 300;
2924 int weight_p __read_mostly = 64;            /* old backlog weight */
2925
2926 /* Called with irq disabled */
2927 static inline void ____napi_schedule(struct softnet_data *sd,
2928                                      struct napi_struct *napi)
2929 {
2930         list_add_tail(&napi->poll_list, &sd->poll_list);
2931         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2932 }
2933
2934 #ifdef CONFIG_RPS
2935
2936 /* One global table that all flow-based protocols share. */
2937 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2938 EXPORT_SYMBOL(rps_sock_flow_table);
2939
2940 struct static_key rps_needed __read_mostly;
2941
2942 static struct rps_dev_flow *
2943 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2944             struct rps_dev_flow *rflow, u16 next_cpu)
2945 {
2946         if (next_cpu != RPS_NO_CPU) {
2947 #ifdef CONFIG_RFS_ACCEL
2948                 struct netdev_rx_queue *rxqueue;
2949                 struct rps_dev_flow_table *flow_table;
2950                 struct rps_dev_flow *old_rflow;
2951                 u32 flow_id;
2952                 u16 rxq_index;
2953                 int rc;
2954
2955                 /* Should we steer this flow to a different hardware queue? */
2956                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2957                     !(dev->features & NETIF_F_NTUPLE))
2958                         goto out;
2959                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2960                 if (rxq_index == skb_get_rx_queue(skb))
2961                         goto out;
2962
2963                 rxqueue = dev->_rx + rxq_index;
2964                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2965                 if (!flow_table)
2966                         goto out;
2967                 flow_id = skb->rxhash & flow_table->mask;
2968                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2969                                                         rxq_index, flow_id);
2970                 if (rc < 0)
2971                         goto out;
2972                 old_rflow = rflow;
2973                 rflow = &flow_table->flows[flow_id];
2974                 rflow->filter = rc;
2975                 if (old_rflow->filter == rflow->filter)
2976                         old_rflow->filter = RPS_NO_FILTER;
2977         out:
2978 #endif
2979                 rflow->last_qtail =
2980                         per_cpu(softnet_data, next_cpu).input_queue_head;
2981         }
2982
2983         rflow->cpu = next_cpu;
2984         return rflow;
2985 }
2986
2987 /*
2988  * get_rps_cpu is called from netif_receive_skb and returns the target
2989  * CPU from the RPS map of the receiving queue for a given skb.
2990  * rcu_read_lock must be held on entry.
2991  */
2992 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2993                        struct rps_dev_flow **rflowp)
2994 {
2995         struct netdev_rx_queue *rxqueue;
2996         struct rps_map *map;
2997         struct rps_dev_flow_table *flow_table;
2998         struct rps_sock_flow_table *sock_flow_table;
2999         int cpu = -1;
3000         u16 tcpu;
3001
3002         if (skb_rx_queue_recorded(skb)) {
3003                 u16 index = skb_get_rx_queue(skb);
3004                 if (unlikely(index >= dev->real_num_rx_queues)) {
3005                         WARN_ONCE(dev->real_num_rx_queues > 1,
3006                                   "%s received packet on queue %u, but number "
3007                                   "of RX queues is %u\n",
3008                                   dev->name, index, dev->real_num_rx_queues);
3009                         goto done;
3010                 }
3011                 rxqueue = dev->_rx + index;
3012         } else
3013                 rxqueue = dev->_rx;
3014
3015         map = rcu_dereference(rxqueue->rps_map);
3016         if (map) {
3017                 if (map->len == 1 &&
3018                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3019                         tcpu = map->cpus[0];
3020                         if (cpu_online(tcpu))
3021                                 cpu = tcpu;
3022                         goto done;
3023                 }
3024         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3025                 goto done;
3026         }
3027
3028         skb_reset_network_header(skb);
3029         if (!skb_get_hash(skb))
3030                 goto done;
3031
3032         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3033         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3034         if (flow_table && sock_flow_table) {
3035                 u16 next_cpu;
3036                 struct rps_dev_flow *rflow;
3037
3038                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3039                 tcpu = rflow->cpu;
3040
3041                 next_cpu = sock_flow_table->ents[skb->rxhash &
3042                     sock_flow_table->mask];
3043
3044                 /*
3045                  * If the desired CPU (where last recvmsg was done) is
3046                  * different from current CPU (one in the rx-queue flow
3047                  * table entry), switch if one of the following holds:
3048                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3049                  *   - Current CPU is offline.
3050                  *   - The current CPU's queue tail has advanced beyond the
3051                  *     last packet that was enqueued using this table entry.
3052                  *     This guarantees that all previous packets for the flow
3053                  *     have been dequeued, thus preserving in order delivery.
3054                  */
3055                 if (unlikely(tcpu != next_cpu) &&
3056                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3057                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3058                       rflow->last_qtail)) >= 0)) {
3059                         tcpu = next_cpu;
3060                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3061                 }
3062
3063                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3064                         *rflowp = rflow;
3065                         cpu = tcpu;
3066                         goto done;
3067                 }
3068         }
3069
3070         if (map) {
3071                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3072
3073                 if (cpu_online(tcpu)) {
3074                         cpu = tcpu;
3075                         goto done;
3076                 }
3077         }
3078
3079 done:
3080         return cpu;
3081 }
3082
3083 #ifdef CONFIG_RFS_ACCEL
3084
3085 /**
3086  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3087  * @dev: Device on which the filter was set
3088  * @rxq_index: RX queue index
3089  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3090  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3091  *
3092  * Drivers that implement ndo_rx_flow_steer() should periodically call
3093  * this function for each installed filter and remove the filters for
3094  * which it returns %true.
3095  */
3096 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3097                          u32 flow_id, u16 filter_id)
3098 {
3099         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3100         struct rps_dev_flow_table *flow_table;
3101         struct rps_dev_flow *rflow;
3102         bool expire = true;
3103         int cpu;
3104
3105         rcu_read_lock();
3106         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3107         if (flow_table && flow_id <= flow_table->mask) {
3108                 rflow = &flow_table->flows[flow_id];
3109                 cpu = ACCESS_ONCE(rflow->cpu);
3110                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3111                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3112                            rflow->last_qtail) <
3113                      (int)(10 * flow_table->mask)))
3114                         expire = false;
3115         }
3116         rcu_read_unlock();
3117         return expire;
3118 }
3119 EXPORT_SYMBOL(rps_may_expire_flow);
3120
3121 #endif /* CONFIG_RFS_ACCEL */
3122
3123 /* Called from hardirq (IPI) context */
3124 static void rps_trigger_softirq(void *data)
3125 {
3126         struct softnet_data *sd = data;
3127
3128         ____napi_schedule(sd, &sd->backlog);
3129         sd->received_rps++;
3130 }
3131
3132 #endif /* CONFIG_RPS */
3133
3134 /*
3135  * Check if this softnet_data structure is another cpu one
3136  * If yes, queue it to our IPI list and return 1
3137  * If no, return 0
3138  */
3139 static int rps_ipi_queued(struct softnet_data *sd)
3140 {
3141 #ifdef CONFIG_RPS
3142         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3143
3144         if (sd != mysd) {
3145                 sd->rps_ipi_next = mysd->rps_ipi_list;
3146                 mysd->rps_ipi_list = sd;
3147
3148                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3149                 return 1;
3150         }
3151 #endif /* CONFIG_RPS */
3152         return 0;
3153 }
3154
3155 #ifdef CONFIG_NET_FLOW_LIMIT
3156 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3157 #endif
3158
3159 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3160 {
3161 #ifdef CONFIG_NET_FLOW_LIMIT
3162         struct sd_flow_limit *fl;
3163         struct softnet_data *sd;
3164         unsigned int old_flow, new_flow;
3165
3166         if (qlen < (netdev_max_backlog >> 1))
3167                 return false;
3168
3169         sd = &__get_cpu_var(softnet_data);
3170
3171         rcu_read_lock();
3172         fl = rcu_dereference(sd->flow_limit);
3173         if (fl) {
3174                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3175                 old_flow = fl->history[fl->history_head];
3176                 fl->history[fl->history_head] = new_flow;
3177
3178                 fl->history_head++;
3179                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3180
3181                 if (likely(fl->buckets[old_flow]))
3182                         fl->buckets[old_flow]--;
3183
3184                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3185                         fl->count++;
3186                         rcu_read_unlock();
3187                         return true;
3188                 }
3189         }
3190         rcu_read_unlock();
3191 #endif
3192         return false;
3193 }
3194
3195 /*
3196  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3197  * queue (may be a remote CPU queue).
3198  */
3199 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3200                               unsigned int *qtail)
3201 {
3202         struct softnet_data *sd;
3203         unsigned long flags;
3204         unsigned int qlen;
3205
3206         sd = &per_cpu(softnet_data, cpu);
3207
3208         local_irq_save(flags);
3209
3210         rps_lock(sd);
3211         qlen = skb_queue_len(&sd->input_pkt_queue);
3212         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3213                 if (skb_queue_len(&sd->input_pkt_queue)) {
3214 enqueue:
3215                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3216                         input_queue_tail_incr_save(sd, qtail);
3217                         rps_unlock(sd);
3218                         local_irq_restore(flags);
3219                         return NET_RX_SUCCESS;
3220                 }
3221
3222                 /* Schedule NAPI for backlog device
3223                  * We can use non atomic operation since we own the queue lock
3224                  */
3225                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3226                         if (!rps_ipi_queued(sd))
3227                                 ____napi_schedule(sd, &sd->backlog);
3228                 }
3229                 goto enqueue;
3230         }
3231
3232         sd->dropped++;
3233         rps_unlock(sd);
3234
3235         local_irq_restore(flags);
3236
3237         atomic_long_inc(&skb->dev->rx_dropped);
3238         kfree_skb(skb);
3239         return NET_RX_DROP;
3240 }
3241
3242 static int netif_rx_internal(struct sk_buff *skb)
3243 {
3244         int ret;
3245
3246         /* if netpoll wants it, pretend we never saw it */
3247         if (netpoll_rx(skb))
3248                 return NET_RX_DROP;
3249
3250         net_timestamp_check(netdev_tstamp_prequeue, skb);
3251
3252         trace_netif_rx(skb);
3253 #ifdef CONFIG_RPS
3254         if (static_key_false(&rps_needed)) {
3255                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3256                 int cpu;
3257
3258                 preempt_disable();
3259                 rcu_read_lock();
3260
3261                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3262                 if (cpu < 0)
3263                         cpu = smp_processor_id();
3264
3265                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3266
3267                 rcu_read_unlock();
3268                 preempt_enable();
3269         } else
3270 #endif
3271         {
3272                 unsigned int qtail;
3273                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3274                 put_cpu();
3275         }
3276         return ret;
3277 }
3278
3279 /**
3280  *      netif_rx        -       post buffer to the network code
3281  *      @skb: buffer to post
3282  *
3283  *      This function receives a packet from a device driver and queues it for
3284  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3285  *      may be dropped during processing for congestion control or by the
3286  *      protocol layers.
3287  *
3288  *      return values:
3289  *      NET_RX_SUCCESS  (no congestion)
3290  *      NET_RX_DROP     (packet was dropped)
3291  *
3292  */
3293
3294 int netif_rx(struct sk_buff *skb)
3295 {
3296         trace_netif_rx_entry(skb);
3297
3298         return netif_rx_internal(skb);
3299 }
3300 EXPORT_SYMBOL(netif_rx);
3301
3302 int netif_rx_ni(struct sk_buff *skb)
3303 {
3304         int err;
3305
3306         trace_netif_rx_ni_entry(skb);
3307
3308         preempt_disable();
3309         err = netif_rx_internal(skb);
3310         if (local_softirq_pending())
3311                 do_softirq();
3312         preempt_enable();
3313
3314         return err;
3315 }
3316 EXPORT_SYMBOL(netif_rx_ni);
3317
3318 static void net_tx_action(struct softirq_action *h)
3319 {
3320         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3321
3322         if (sd->completion_queue) {
3323                 struct sk_buff *clist;
3324
3325                 local_irq_disable();
3326                 clist = sd->completion_queue;
3327                 sd->completion_queue = NULL;
3328                 local_irq_enable();
3329
3330                 while (clist) {
3331                         struct sk_buff *skb = clist;
3332                         clist = clist->next;
3333
3334                         WARN_ON(atomic_read(&skb->users));
3335                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3336                                 trace_consume_skb(skb);
3337                         else
3338                                 trace_kfree_skb(skb, net_tx_action);
3339                         __kfree_skb(skb);
3340                 }
3341         }
3342
3343         if (sd->output_queue) {
3344                 struct Qdisc *head;
3345
3346                 local_irq_disable();
3347                 head = sd->output_queue;
3348                 sd->output_queue = NULL;
3349                 sd->output_queue_tailp = &sd->output_queue;
3350                 local_irq_enable();
3351
3352                 while (head) {
3353                         struct Qdisc *q = head;
3354                         spinlock_t *root_lock;
3355
3356                         head = head->next_sched;
3357
3358                         root_lock = qdisc_lock(q);
3359                         if (spin_trylock(root_lock)) {
3360                                 smp_mb__before_clear_bit();
3361                                 clear_bit(__QDISC_STATE_SCHED,
3362                                           &q->state);
3363                                 qdisc_run(q);
3364                                 spin_unlock(root_lock);
3365                         } else {
3366                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3367                                               &q->state)) {
3368                                         __netif_reschedule(q);
3369                                 } else {
3370                                         smp_mb__before_clear_bit();
3371                                         clear_bit(__QDISC_STATE_SCHED,
3372                                                   &q->state);
3373                                 }
3374                         }
3375                 }
3376         }
3377 }
3378
3379 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3380     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3381 /* This hook is defined here for ATM LANE */
3382 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3383                              unsigned char *addr) __read_mostly;
3384 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3385 #endif
3386
3387 #ifdef CONFIG_NET_CLS_ACT
3388 /* TODO: Maybe we should just force sch_ingress to be compiled in
3389  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3390  * a compare and 2 stores extra right now if we dont have it on
3391  * but have CONFIG_NET_CLS_ACT
3392  * NOTE: This doesn't stop any functionality; if you dont have
3393  * the ingress scheduler, you just can't add policies on ingress.
3394  *
3395  */
3396 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3397 {
3398         struct net_device *dev = skb->dev;
3399         u32 ttl = G_TC_RTTL(skb->tc_verd);
3400         int result = TC_ACT_OK;
3401         struct Qdisc *q;
3402
3403         if (unlikely(MAX_RED_LOOP < ttl++)) {
3404                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3405                                      skb->skb_iif, dev->ifindex);
3406                 return TC_ACT_SHOT;
3407         }
3408
3409         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3410         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3411
3412         q = rxq->qdisc;
3413         if (q != &noop_qdisc) {
3414                 spin_lock(qdisc_lock(q));
3415                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3416                         result = qdisc_enqueue_root(skb, q);
3417                 spin_unlock(qdisc_lock(q));
3418         }
3419
3420         return result;
3421 }
3422
3423 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3424                                          struct packet_type **pt_prev,
3425                                          int *ret, struct net_device *orig_dev)
3426 {
3427         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3428
3429         if (!rxq || rxq->qdisc == &noop_qdisc)
3430                 goto out;
3431
3432         if (*pt_prev) {
3433                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3434                 *pt_prev = NULL;
3435         }
3436
3437         switch (ing_filter(skb, rxq)) {
3438         case TC_ACT_SHOT:
3439         case TC_ACT_STOLEN:
3440                 kfree_skb(skb);
3441                 return NULL;
3442         }
3443
3444 out:
3445         skb->tc_verd = 0;
3446         return skb;
3447 }
3448 #endif
3449
3450 /**
3451  *      netdev_rx_handler_register - register receive handler
3452  *      @dev: device to register a handler for
3453  *      @rx_handler: receive handler to register
3454  *      @rx_handler_data: data pointer that is used by rx handler
3455  *
3456  *      Register a receive hander for a device. This handler will then be
3457  *      called from __netif_receive_skb. A negative errno code is returned
3458  *      on a failure.
3459  *
3460  *      The caller must hold the rtnl_mutex.
3461  *
3462  *      For a general description of rx_handler, see enum rx_handler_result.
3463  */
3464 int netdev_rx_handler_register(struct net_device *dev,
3465                                rx_handler_func_t *rx_handler,
3466                                void *rx_handler_data)
3467 {
3468         ASSERT_RTNL();
3469
3470         if (dev->rx_handler)
3471                 return -EBUSY;
3472
3473         /* Note: rx_handler_data must be set before rx_handler */
3474         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3475         rcu_assign_pointer(dev->rx_handler, rx_handler);
3476
3477         return 0;
3478 }
3479 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3480
3481 /**
3482  *      netdev_rx_handler_unregister - unregister receive handler
3483  *      @dev: device to unregister a handler from
3484  *
3485  *      Unregister a receive handler from a device.
3486  *
3487  *      The caller must hold the rtnl_mutex.
3488  */
3489 void netdev_rx_handler_unregister(struct net_device *dev)
3490 {
3491
3492         ASSERT_RTNL();
3493         RCU_INIT_POINTER(dev->rx_handler, NULL);
3494         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3495          * section has a guarantee to see a non NULL rx_handler_data
3496          * as well.
3497          */
3498         synchronize_net();
3499         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3500 }
3501 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3502
3503 /*
3504  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3505  * the special handling of PFMEMALLOC skbs.
3506  */
3507 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3508 {
3509         switch (skb->protocol) {
3510         case __constant_htons(ETH_P_ARP):
3511         case __constant_htons(ETH_P_IP):
3512         case __constant_htons(ETH_P_IPV6):
3513         case __constant_htons(ETH_P_8021Q):
3514         case __constant_htons(ETH_P_8021AD):
3515                 return true;
3516         default:
3517                 return false;
3518         }
3519 }
3520
3521 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3522 {
3523         struct packet_type *ptype, *pt_prev;
3524         rx_handler_func_t *rx_handler;
3525         struct net_device *orig_dev;
3526         struct net_device *null_or_dev;
3527         bool deliver_exact = false;
3528         int ret = NET_RX_DROP;
3529         __be16 type;
3530
3531         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3532
3533         trace_netif_receive_skb(skb);
3534
3535         /* if we've gotten here through NAPI, check netpoll */
3536         if (netpoll_receive_skb(skb))
3537                 goto out;
3538
3539         orig_dev = skb->dev;
3540
3541         skb_reset_network_header(skb);
3542         if (!skb_transport_header_was_set(skb))
3543                 skb_reset_transport_header(skb);
3544         skb_reset_mac_len(skb);
3545
3546         pt_prev = NULL;
3547
3548         rcu_read_lock();
3549
3550 another_round:
3551         skb->skb_iif = skb->dev->ifindex;
3552
3553         __this_cpu_inc(softnet_data.processed);
3554
3555         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3556             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3557                 skb = vlan_untag(skb);
3558                 if (unlikely(!skb))
3559                         goto unlock;
3560         }
3561
3562 #ifdef CONFIG_NET_CLS_ACT
3563         if (skb->tc_verd & TC_NCLS) {
3564                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3565                 goto ncls;
3566         }
3567 #endif
3568
3569         if (pfmemalloc)
3570                 goto skip_taps;
3571
3572         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3573                 if (!ptype->dev || ptype->dev == skb->dev) {
3574                         if (pt_prev)
3575                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3576                         pt_prev = ptype;
3577                 }
3578         }
3579
3580 skip_taps:
3581 #ifdef CONFIG_NET_CLS_ACT
3582         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3583         if (!skb)
3584                 goto unlock;
3585 ncls:
3586 #endif
3587
3588         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3589                 goto drop;
3590
3591         if (vlan_tx_tag_present(skb)) {
3592                 if (pt_prev) {
3593                         ret = deliver_skb(skb, pt_prev, orig_dev);
3594                         pt_prev = NULL;
3595                 }
3596                 if (vlan_do_receive(&skb))
3597                         goto another_round;
3598                 else if (unlikely(!skb))
3599                         goto unlock;
3600         }
3601
3602         rx_handler = rcu_dereference(skb->dev->rx_handler);
3603         if (rx_handler) {
3604                 if (pt_prev) {
3605                         ret = deliver_skb(skb, pt_prev, orig_dev);
3606                         pt_prev = NULL;
3607                 }
3608                 switch (rx_handler(&skb)) {
3609                 case RX_HANDLER_CONSUMED:
3610                         ret = NET_RX_SUCCESS;
3611                         goto unlock;
3612                 case RX_HANDLER_ANOTHER:
3613                         goto another_round;
3614                 case RX_HANDLER_EXACT:
3615                         deliver_exact = true;
3616                 case RX_HANDLER_PASS:
3617                         break;
3618                 default:
3619                         BUG();
3620                 }
3621         }
3622
3623         if (unlikely(vlan_tx_tag_present(skb))) {
3624                 if (vlan_tx_tag_get_id(skb))
3625                         skb->pkt_type = PACKET_OTHERHOST;
3626                 /* Note: we might in the future use prio bits
3627                  * and set skb->priority like in vlan_do_receive()
3628                  * For the time being, just ignore Priority Code Point
3629                  */
3630                 skb->vlan_tci = 0;
3631         }
3632
3633         /* deliver only exact match when indicated */
3634         null_or_dev = deliver_exact ? skb->dev : NULL;
3635
3636         type = skb->protocol;
3637         list_for_each_entry_rcu(ptype,
3638                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3639                 if (ptype->type == type &&
3640                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3641                      ptype->dev == orig_dev)) {
3642                         if (pt_prev)
3643                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3644                         pt_prev = ptype;
3645                 }
3646         }
3647
3648         if (pt_prev) {
3649                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3650                         goto drop;
3651                 else
3652                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3653         } else {
3654 drop:
3655                 atomic_long_inc(&skb->dev->rx_dropped);
3656                 kfree_skb(skb);
3657                 /* Jamal, now you will not able to escape explaining
3658                  * me how you were going to use this. :-)
3659                  */
3660                 ret = NET_RX_DROP;
3661         }
3662
3663 unlock:
3664         rcu_read_unlock();
3665 out:
3666         return ret;
3667 }
3668
3669 static int __netif_receive_skb(struct sk_buff *skb)
3670 {
3671         int ret;
3672
3673         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3674                 unsigned long pflags = current->flags;
3675
3676                 /*
3677                  * PFMEMALLOC skbs are special, they should
3678                  * - be delivered to SOCK_MEMALLOC sockets only
3679                  * - stay away from userspace
3680                  * - have bounded memory usage
3681                  *
3682                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3683                  * context down to all allocation sites.
3684                  */
3685                 current->flags |= PF_MEMALLOC;
3686                 ret = __netif_receive_skb_core(skb, true);
3687                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3688         } else
3689                 ret = __netif_receive_skb_core(skb, false);
3690
3691         return ret;
3692 }
3693
3694 static int netif_receive_skb_internal(struct sk_buff *skb)
3695 {
3696         net_timestamp_check(netdev_tstamp_prequeue, skb);
3697
3698         if (skb_defer_rx_timestamp(skb))
3699                 return NET_RX_SUCCESS;
3700
3701 #ifdef CONFIG_RPS
3702         if (static_key_false(&rps_needed)) {
3703                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3704                 int cpu, ret;
3705
3706                 rcu_read_lock();
3707
3708                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3709
3710                 if (cpu >= 0) {
3711                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3712                         rcu_read_unlock();
3713                         return ret;
3714                 }
3715                 rcu_read_unlock();
3716         }
3717 #endif
3718         return __netif_receive_skb(skb);
3719 }
3720
3721 /**
3722  *      netif_receive_skb - process receive buffer from network
3723  *      @skb: buffer to process
3724  *
3725  *      netif_receive_skb() is the main receive data processing function.
3726  *      It always succeeds. The buffer may be dropped during processing
3727  *      for congestion control or by the protocol layers.
3728  *
3729  *      This function may only be called from softirq context and interrupts
3730  *      should be enabled.
3731  *
3732  *      Return values (usually ignored):
3733  *      NET_RX_SUCCESS: no congestion
3734  *      NET_RX_DROP: packet was dropped
3735  */
3736 int netif_receive_skb(struct sk_buff *skb)
3737 {
3738         trace_netif_receive_skb_entry(skb);
3739
3740         return netif_receive_skb_internal(skb);
3741 }
3742 EXPORT_SYMBOL(netif_receive_skb);
3743
3744 /* Network device is going away, flush any packets still pending
3745  * Called with irqs disabled.
3746  */
3747 static void flush_backlog(void *arg)
3748 {
3749         struct net_device *dev = arg;
3750         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3751         struct sk_buff *skb, *tmp;
3752
3753         rps_lock(sd);
3754         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3755                 if (skb->dev == dev) {
3756                         __skb_unlink(skb, &sd->input_pkt_queue);
3757                         kfree_skb(skb);
3758                         input_queue_head_incr(sd);
3759                 }
3760         }
3761         rps_unlock(sd);
3762
3763         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3764                 if (skb->dev == dev) {
3765                         __skb_unlink(skb, &sd->process_queue);
3766                         kfree_skb(skb);
3767                         input_queue_head_incr(sd);
3768                 }
3769         }
3770 }
3771
3772 static int napi_gro_complete(struct sk_buff *skb)
3773 {
3774         struct packet_offload *ptype;
3775         __be16 type = skb->protocol;
3776         struct list_head *head = &offload_base;
3777         int err = -ENOENT;
3778
3779         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3780
3781         if (NAPI_GRO_CB(skb)->count == 1) {
3782                 skb_shinfo(skb)->gso_size = 0;
3783                 goto out;
3784         }
3785
3786         rcu_read_lock();
3787         list_for_each_entry_rcu(ptype, head, list) {
3788                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3789                         continue;
3790
3791                 err = ptype->callbacks.gro_complete(skb, 0);
3792                 break;
3793         }
3794         rcu_read_unlock();
3795
3796         if (err) {
3797                 WARN_ON(&ptype->list == head);
3798                 kfree_skb(skb);
3799                 return NET_RX_SUCCESS;
3800         }
3801
3802 out:
3803         return netif_receive_skb_internal(skb);
3804 }
3805
3806 /* napi->gro_list contains packets ordered by age.
3807  * youngest packets at the head of it.
3808  * Complete skbs in reverse order to reduce latencies.
3809  */
3810 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3811 {
3812         struct sk_buff *skb, *prev = NULL;
3813
3814         /* scan list and build reverse chain */
3815         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3816                 skb->prev = prev;
3817                 prev = skb;
3818         }
3819
3820         for (skb = prev; skb; skb = prev) {
3821                 skb->next = NULL;
3822
3823                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3824                         return;
3825
3826                 prev = skb->prev;
3827                 napi_gro_complete(skb);
3828                 napi->gro_count--;
3829         }
3830
3831         napi->gro_list = NULL;
3832 }
3833 EXPORT_SYMBOL(napi_gro_flush);
3834
3835 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3836 {
3837         struct sk_buff *p;
3838         unsigned int maclen = skb->dev->hard_header_len;
3839         u32 hash = skb_get_hash_raw(skb);
3840
3841         for (p = napi->gro_list; p; p = p->next) {
3842                 unsigned long diffs;
3843
3844                 NAPI_GRO_CB(p)->flush = 0;
3845
3846                 if (hash != skb_get_hash_raw(p)) {
3847                         NAPI_GRO_CB(p)->same_flow = 0;
3848                         continue;
3849                 }
3850
3851                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3852                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3853                 if (maclen == ETH_HLEN)
3854                         diffs |= compare_ether_header(skb_mac_header(p),
3855                                                       skb_gro_mac_header(skb));
3856                 else if (!diffs)
3857                         diffs = memcmp(skb_mac_header(p),
3858                                        skb_gro_mac_header(skb),
3859                                        maclen);
3860                 NAPI_GRO_CB(p)->same_flow = !diffs;
3861         }
3862 }
3863
3864 static void skb_gro_reset_offset(struct sk_buff *skb)
3865 {
3866         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3867         const skb_frag_t *frag0 = &pinfo->frags[0];
3868
3869         NAPI_GRO_CB(skb)->data_offset = 0;
3870         NAPI_GRO_CB(skb)->frag0 = NULL;
3871         NAPI_GRO_CB(skb)->frag0_len = 0;
3872
3873         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3874             pinfo->nr_frags &&
3875             !PageHighMem(skb_frag_page(frag0))) {
3876                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3877                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3878         }
3879 }
3880
3881 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3882 {
3883         struct sk_buff **pp = NULL;
3884         struct packet_offload *ptype;
3885         __be16 type = skb->protocol;
3886         struct list_head *head = &offload_base;
3887         int same_flow;
3888         enum gro_result ret;
3889
3890         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3891                 goto normal;
3892
3893         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3894                 goto normal;
3895
3896         skb_gro_reset_offset(skb);
3897         gro_list_prepare(napi, skb);
3898         NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3899
3900         rcu_read_lock();
3901         list_for_each_entry_rcu(ptype, head, list) {
3902                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3903                         continue;
3904
3905                 skb_set_network_header(skb, skb_gro_offset(skb));
3906                 skb_reset_mac_len(skb);
3907                 NAPI_GRO_CB(skb)->same_flow = 0;
3908                 NAPI_GRO_CB(skb)->flush = 0;
3909                 NAPI_GRO_CB(skb)->free = 0;
3910                 NAPI_GRO_CB(skb)->udp_mark = 0;
3911
3912                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3913                 break;
3914         }
3915         rcu_read_unlock();
3916
3917         if (&ptype->list == head)
3918                 goto normal;
3919
3920         same_flow = NAPI_GRO_CB(skb)->same_flow;
3921         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3922
3923         if (pp) {
3924                 struct sk_buff *nskb = *pp;
3925
3926                 *pp = nskb->next;
3927                 nskb->next = NULL;
3928                 napi_gro_complete(nskb);
3929                 napi->gro_count--;
3930         }
3931
3932         if (same_flow)
3933                 goto ok;
3934
3935         if (NAPI_GRO_CB(skb)->flush)
3936                 goto normal;
3937
3938         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
3939                 struct sk_buff *nskb = napi->gro_list;
3940
3941                 /* locate the end of the list to select the 'oldest' flow */
3942                 while (nskb->next) {
3943                         pp = &nskb->next;
3944                         nskb = *pp;
3945                 }
3946                 *pp = NULL;
3947                 nskb->next = NULL;
3948                 napi_gro_complete(nskb);
3949         } else {
3950                 napi->gro_count++;
3951         }
3952         NAPI_GRO_CB(skb)->count = 1;
3953         NAPI_GRO_CB(skb)->age = jiffies;
3954         NAPI_GRO_CB(skb)->last = skb;
3955         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3956         skb->next = napi->gro_list;
3957         napi->gro_list = skb;
3958         ret = GRO_HELD;
3959
3960 pull:
3961         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3962                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3963
3964                 BUG_ON(skb->end - skb->tail < grow);
3965
3966                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3967
3968                 skb->tail += grow;
3969                 skb->data_len -= grow;
3970
3971                 skb_shinfo(skb)->frags[0].page_offset += grow;
3972                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3973
3974                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3975                         skb_frag_unref(skb, 0);
3976                         memmove(skb_shinfo(skb)->frags,
3977                                 skb_shinfo(skb)->frags + 1,
3978                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3979                 }
3980         }
3981
3982 ok:
3983         return ret;
3984
3985 normal:
3986         ret = GRO_NORMAL;
3987         goto pull;
3988 }
3989
3990 struct packet_offload *gro_find_receive_by_type(__be16 type)
3991 {
3992         struct list_head *offload_head = &offload_base;
3993         struct packet_offload *ptype;
3994
3995         list_for_each_entry_rcu(ptype, offload_head, list) {
3996                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3997                         continue;
3998                 return ptype;
3999         }
4000         return NULL;
4001 }
4002 EXPORT_SYMBOL(gro_find_receive_by_type);
4003
4004 struct packet_offload *gro_find_complete_by_type(__be16 type)
4005 {
4006         struct list_head *offload_head = &offload_base;
4007         struct packet_offload *ptype;
4008
4009         list_for_each_entry_rcu(ptype, offload_head, list) {
4010                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4011                         continue;
4012                 return ptype;
4013         }
4014         return NULL;
4015 }
4016 EXPORT_SYMBOL(gro_find_complete_by_type);
4017
4018 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4019 {
4020         switch (ret) {
4021         case GRO_NORMAL:
4022                 if (netif_receive_skb_internal(skb))
4023                         ret = GRO_DROP;
4024                 break;
4025
4026         case GRO_DROP:
4027                 kfree_skb(skb);
4028                 break;
4029
4030         case GRO_MERGED_FREE:
4031                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4032                         kmem_cache_free(skbuff_head_cache, skb);
4033                 else
4034                         __kfree_skb(skb);
4035                 break;
4036
4037         case GRO_HELD:
4038         case GRO_MERGED:
4039                 break;
4040         }
4041
4042         return ret;
4043 }
4044
4045 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4046 {
4047         trace_napi_gro_receive_entry(skb);
4048
4049         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4050 }
4051 EXPORT_SYMBOL(napi_gro_receive);
4052
4053 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4054 {
4055         __skb_pull(skb, skb_headlen(skb));
4056         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4057         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4058         skb->vlan_tci = 0;
4059         skb->dev = napi->dev;
4060         skb->skb_iif = 0;
4061         skb->encapsulation = 0;
4062         skb_shinfo(skb)->gso_type = 0;
4063         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4064
4065         napi->skb = skb;
4066 }
4067
4068 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4069 {
4070         struct sk_buff *skb = napi->skb;
4071
4072         if (!skb) {
4073                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4074                 napi->skb = skb;
4075         }
4076         return skb;
4077 }
4078 EXPORT_SYMBOL(napi_get_frags);
4079
4080 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
4081                                gro_result_t ret)
4082 {
4083         switch (ret) {
4084         case GRO_NORMAL:
4085                 if (netif_receive_skb_internal(skb))
4086                         ret = GRO_DROP;
4087                 break;
4088
4089         case GRO_DROP:
4090         case GRO_MERGED_FREE:
4091                 napi_reuse_skb(napi, skb);
4092                 break;
4093
4094         case GRO_HELD:
4095         case GRO_MERGED:
4096                 break;
4097         }
4098
4099         return ret;
4100 }
4101
4102 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4103 {
4104         struct sk_buff *skb = napi->skb;
4105
4106         napi->skb = NULL;
4107
4108         if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
4109                 napi_reuse_skb(napi, skb);
4110                 return NULL;
4111         }
4112         skb->protocol = eth_type_trans(skb, skb->dev);
4113
4114         return skb;
4115 }
4116
4117 gro_result_t napi_gro_frags(struct napi_struct *napi)
4118 {
4119         struct sk_buff *skb = napi_frags_skb(napi);
4120
4121         if (!skb)
4122                 return GRO_DROP;
4123
4124         trace_napi_gro_frags_entry(skb);
4125
4126         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4127 }
4128 EXPORT_SYMBOL(napi_gro_frags);
4129
4130 /*
4131  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4132  * Note: called with local irq disabled, but exits with local irq enabled.
4133  */
4134 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4135 {
4136 #ifdef CONFIG_RPS
4137         struct softnet_data *remsd = sd->rps_ipi_list;
4138
4139         if (remsd) {
4140                 sd->rps_ipi_list = NULL;
4141
4142                 local_irq_enable();
4143
4144                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4145                 while (remsd) {
4146                         struct softnet_data *next = remsd->rps_ipi_next;
4147
4148                         if (cpu_online(remsd->cpu))
4149                                 __smp_call_function_single(remsd->cpu,
4150                                                            &remsd->csd, 0);
4151                         remsd = next;
4152                 }
4153         } else
4154 #endif
4155                 local_irq_enable();
4156 }
4157
4158 static int process_backlog(struct napi_struct *napi, int quota)
4159 {
4160         int work = 0;
4161         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4162
4163 #ifdef CONFIG_RPS
4164         /* Check if we have pending ipi, its better to send them now,
4165          * not waiting net_rx_action() end.
4166          */
4167         if (sd->rps_ipi_list) {
4168                 local_irq_disable();
4169                 net_rps_action_and_irq_enable(sd);
4170         }
4171 #endif
4172         napi->weight = weight_p;
4173         local_irq_disable();
4174         while (work < quota) {
4175                 struct sk_buff *skb;
4176                 unsigned int qlen;
4177
4178                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4179                         local_irq_enable();
4180                         __netif_receive_skb(skb);
4181                         local_irq_disable();
4182                         input_queue_head_incr(sd);
4183                         if (++work >= quota) {
4184                                 local_irq_enable();
4185                                 return work;
4186                         }
4187                 }
4188
4189                 rps_lock(sd);
4190                 qlen = skb_queue_len(&sd->input_pkt_queue);
4191                 if (qlen)
4192                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4193                                                    &sd->process_queue);
4194
4195                 if (qlen < quota - work) {
4196                         /*
4197                          * Inline a custom version of __napi_complete().
4198                          * only current cpu owns and manipulates this napi,
4199                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4200                          * we can use a plain write instead of clear_bit(),
4201                          * and we dont need an smp_mb() memory barrier.
4202                          */
4203                         list_del(&napi->poll_list);
4204                         napi->state = 0;
4205
4206                         quota = work + qlen;
4207                 }
4208                 rps_unlock(sd);
4209         }
4210         local_irq_enable();
4211
4212         return work;
4213 }
4214
4215 /**
4216  * __napi_schedule - schedule for receive
4217  * @n: entry to schedule
4218  *
4219  * The entry's receive function will be scheduled to run
4220  */
4221 void __napi_schedule(struct napi_struct *n)
4222 {
4223         unsigned long flags;
4224
4225         local_irq_save(flags);
4226         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4227         local_irq_restore(flags);
4228 }
4229 EXPORT_SYMBOL(__napi_schedule);
4230
4231 void __napi_complete(struct napi_struct *n)
4232 {
4233         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4234         BUG_ON(n->gro_list);
4235
4236         list_del(&n->poll_list);
4237         smp_mb__before_clear_bit();
4238         clear_bit(NAPI_STATE_SCHED, &n->state);
4239 }
4240 EXPORT_SYMBOL(__napi_complete);
4241
4242 void napi_complete(struct napi_struct *n)
4243 {
4244         unsigned long flags;
4245
4246         /*
4247          * don't let napi dequeue from the cpu poll list
4248          * just in case its running on a different cpu
4249          */
4250         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4251                 return;
4252
4253         napi_gro_flush(n, false);
4254         local_irq_save(flags);
4255         __napi_complete(n);
4256         local_irq_restore(flags);
4257 }
4258 EXPORT_SYMBOL(napi_complete);
4259
4260 /* must be called under rcu_read_lock(), as we dont take a reference */
4261 struct napi_struct *napi_by_id(unsigned int napi_id)
4262 {
4263         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4264         struct napi_struct *napi;
4265
4266         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4267                 if (napi->napi_id == napi_id)
4268                         return napi;
4269
4270         return NULL;
4271 }
4272 EXPORT_SYMBOL_GPL(napi_by_id);
4273
4274 void napi_hash_add(struct napi_struct *napi)
4275 {
4276         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4277
4278                 spin_lock(&napi_hash_lock);
4279
4280                 /* 0 is not a valid id, we also skip an id that is taken
4281                  * we expect both events to be extremely rare
4282                  */
4283                 napi->napi_id = 0;
4284                 while (!napi->napi_id) {
4285                         napi->napi_id = ++napi_gen_id;
4286                         if (napi_by_id(napi->napi_id))
4287                                 napi->napi_id = 0;
4288                 }
4289
4290                 hlist_add_head_rcu(&napi->napi_hash_node,
4291                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4292
4293                 spin_unlock(&napi_hash_lock);
4294         }
4295 }
4296 EXPORT_SYMBOL_GPL(napi_hash_add);
4297
4298 /* Warning : caller is responsible to make sure rcu grace period
4299  * is respected before freeing memory containing @napi
4300  */
4301 void napi_hash_del(struct napi_struct *napi)
4302 {
4303         spin_lock(&napi_hash_lock);
4304
4305         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4306                 hlist_del_rcu(&napi->napi_hash_node);
4307
4308         spin_unlock(&napi_hash_lock);
4309 }
4310 EXPORT_SYMBOL_GPL(napi_hash_del);
4311
4312 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4313                     int (*poll)(struct napi_struct *, int), int weight)
4314 {
4315         INIT_LIST_HEAD(&napi->poll_list);
4316         napi->gro_count = 0;
4317         napi->gro_list = NULL;
4318         napi->skb = NULL;
4319         napi->poll = poll;
4320         if (weight > NAPI_POLL_WEIGHT)
4321                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4322                             weight, dev->name);
4323         napi->weight = weight;
4324         list_add(&napi->dev_list, &dev->napi_list);
4325         napi->dev = dev;
4326 #ifdef CONFIG_NETPOLL
4327         spin_lock_init(&napi->poll_lock);
4328         napi->poll_owner = -1;
4329 #endif
4330         set_bit(NAPI_STATE_SCHED, &napi->state);
4331 }
4332 EXPORT_SYMBOL(netif_napi_add);
4333
4334 void netif_napi_del(struct napi_struct *napi)
4335 {
4336         list_del_init(&napi->dev_list);
4337         napi_free_frags(napi);
4338
4339         kfree_skb_list(napi->gro_list);
4340         napi->gro_list = NULL;
4341         napi->gro_count = 0;
4342 }
4343 EXPORT_SYMBOL(netif_napi_del);
4344
4345 static void net_rx_action(struct softirq_action *h)
4346 {
4347         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4348         unsigned long time_limit = jiffies + 2;
4349         int budget = netdev_budget;
4350         void *have;
4351
4352         local_irq_disable();
4353
4354         while (!list_empty(&sd->poll_list)) {
4355                 struct napi_struct *n;
4356                 int work, weight;
4357
4358                 /* If softirq window is exhuasted then punt.
4359                  * Allow this to run for 2 jiffies since which will allow
4360                  * an average latency of 1.5/HZ.
4361                  */
4362                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4363                         goto softnet_break;
4364
4365                 local_irq_enable();
4366
4367                 /* Even though interrupts have been re-enabled, this
4368                  * access is safe because interrupts can only add new
4369                  * entries to the tail of this list, and only ->poll()
4370                  * calls can remove this head entry from the list.
4371                  */
4372                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4373
4374                 have = netpoll_poll_lock(n);
4375
4376                 weight = n->weight;
4377
4378                 /* This NAPI_STATE_SCHED test is for avoiding a race
4379                  * with netpoll's poll_napi().  Only the entity which
4380                  * obtains the lock and sees NAPI_STATE_SCHED set will
4381                  * actually make the ->poll() call.  Therefore we avoid
4382                  * accidentally calling ->poll() when NAPI is not scheduled.
4383                  */
4384                 work = 0;
4385                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4386                         work = n->poll(n, weight);
4387                         trace_napi_poll(n);
4388                 }
4389
4390                 WARN_ON_ONCE(work > weight);
4391
4392                 budget -= work;
4393
4394                 local_irq_disable();
4395
4396                 /* Drivers must not modify the NAPI state if they
4397                  * consume the entire weight.  In such cases this code
4398                  * still "owns" the NAPI instance and therefore can
4399                  * move the instance around on the list at-will.
4400                  */
4401                 if (unlikely(work == weight)) {
4402                         if (unlikely(napi_disable_pending(n))) {
4403                                 local_irq_enable();
4404                                 napi_complete(n);
4405                                 local_irq_disable();
4406                         } else {
4407                                 if (n->gro_list) {
4408                                         /* flush too old packets
4409                                          * If HZ < 1000, flush all packets.
4410                                          */
4411                                         local_irq_enable();
4412                                         napi_gro_flush(n, HZ >= 1000);
4413                                         local_irq_disable();
4414                                 }
4415                                 list_move_tail(&n->poll_list, &sd->poll_list);
4416                         }
4417                 }
4418
4419                 netpoll_poll_unlock(have);
4420         }
4421 out:
4422         net_rps_action_and_irq_enable(sd);
4423
4424 #ifdef CONFIG_NET_DMA
4425         /*
4426          * There may not be any more sk_buffs coming right now, so push
4427          * any pending DMA copies to hardware
4428          */
4429         dma_issue_pending_all();
4430 #endif
4431
4432         return;
4433
4434 softnet_break:
4435         sd->time_squeeze++;
4436         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4437         goto out;
4438 }
4439
4440 struct netdev_adjacent {
4441         struct net_device *dev;
4442
4443         /* upper master flag, there can only be one master device per list */
4444         bool master;
4445
4446         /* counter for the number of times this device was added to us */
4447         u16 ref_nr;
4448
4449         /* private field for the users */
4450         void *private;
4451
4452         struct list_head list;
4453         struct rcu_head rcu;
4454 };
4455
4456 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4457                                                  struct net_device *adj_dev,
4458                                                  struct list_head *adj_list)
4459 {
4460         struct netdev_adjacent *adj;
4461
4462         list_for_each_entry(adj, adj_list, list) {
4463                 if (adj->dev == adj_dev)
4464                         return adj;
4465         }
4466         return NULL;
4467 }
4468
4469 /**
4470  * netdev_has_upper_dev - Check if device is linked to an upper device
4471  * @dev: device
4472  * @upper_dev: upper device to check
4473  *
4474  * Find out if a device is linked to specified upper device and return true
4475  * in case it is. Note that this checks only immediate upper device,
4476  * not through a complete stack of devices. The caller must hold the RTNL lock.
4477  */
4478 bool netdev_has_upper_dev(struct net_device *dev,
4479                           struct net_device *upper_dev)
4480 {
4481         ASSERT_RTNL();
4482
4483         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4484 }
4485 EXPORT_SYMBOL(netdev_has_upper_dev);
4486
4487 /**
4488  * netdev_has_any_upper_dev - Check if device is linked to some device
4489  * @dev: device
4490  *
4491  * Find out if a device is linked to an upper device and return true in case
4492  * it is. The caller must hold the RTNL lock.
4493  */
4494 static bool netdev_has_any_upper_dev(struct net_device *dev)
4495 {
4496         ASSERT_RTNL();
4497
4498         return !list_empty(&dev->all_adj_list.upper);
4499 }
4500
4501 /**
4502  * netdev_master_upper_dev_get - Get master upper device
4503  * @dev: device
4504  *
4505  * Find a master upper device and return pointer to it or NULL in case
4506  * it's not there. The caller must hold the RTNL lock.
4507  */
4508 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4509 {
4510         struct netdev_adjacent *upper;
4511
4512         ASSERT_RTNL();
4513
4514         if (list_empty(&dev->adj_list.upper))
4515                 return NULL;
4516
4517         upper = list_first_entry(&dev->adj_list.upper,
4518                                  struct netdev_adjacent, list);
4519         if (likely(upper->master))
4520                 return upper->dev;
4521         return NULL;
4522 }
4523 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4524
4525 void *netdev_adjacent_get_private(struct list_head *adj_list)
4526 {
4527         struct netdev_adjacent *adj;
4528
4529         adj = list_entry(adj_list, struct netdev_adjacent, list);
4530
4531         return adj->private;
4532 }
4533 EXPORT_SYMBOL(netdev_adjacent_get_private);
4534
4535 /**
4536  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4537  * @dev: device
4538  * @iter: list_head ** of the current position
4539  *
4540  * Gets the next device from the dev's upper list, starting from iter
4541  * position. The caller must hold RCU read lock.
4542  */
4543 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4544                                                      struct list_head **iter)
4545 {
4546         struct netdev_adjacent *upper;
4547
4548         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4549
4550         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4551
4552         if (&upper->list == &dev->all_adj_list.upper)
4553                 return NULL;
4554
4555         *iter = &upper->list;
4556
4557         return upper->dev;
4558 }
4559 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4560
4561 /**
4562  * netdev_lower_get_next_private - Get the next ->private from the
4563  *                                 lower neighbour list
4564  * @dev: device
4565  * @iter: list_head ** of the current position
4566  *
4567  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4568  * list, starting from iter position. The caller must hold either hold the
4569  * RTNL lock or its own locking that guarantees that the neighbour lower
4570  * list will remain unchainged.
4571  */
4572 void *netdev_lower_get_next_private(struct net_device *dev,
4573                                     struct list_head **iter)
4574 {
4575         struct netdev_adjacent *lower;
4576
4577         lower = list_entry(*iter, struct netdev_adjacent, list);
4578
4579         if (&lower->list == &dev->adj_list.lower)
4580                 return NULL;
4581
4582         if (iter)
4583                 *iter = lower->list.next;
4584
4585         return lower->private;
4586 }
4587 EXPORT_SYMBOL(netdev_lower_get_next_private);
4588
4589 /**
4590  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4591  *                                     lower neighbour list, RCU
4592  *                                     variant
4593  * @dev: device
4594  * @iter: list_head ** of the current position
4595  *
4596  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4597  * list, starting from iter position. The caller must hold RCU read lock.
4598  */
4599 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4600                                         struct list_head **iter)
4601 {
4602         struct netdev_adjacent *lower;
4603
4604         WARN_ON_ONCE(!rcu_read_lock_held());
4605
4606         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4607
4608         if (&lower->list == &dev->adj_list.lower)
4609                 return NULL;
4610
4611         if (iter)
4612                 *iter = &lower->list;
4613
4614         return lower->private;
4615 }
4616 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4617
4618 /**
4619  * netdev_lower_get_next - Get the next device from the lower neighbour
4620  *                         list
4621  * @dev: device
4622  * @iter: list_head ** of the current position
4623  *
4624  * Gets the next netdev_adjacent from the dev's lower neighbour
4625  * list, starting from iter position. The caller must hold RTNL lock or
4626  * its own locking that guarantees that the neighbour lower
4627  * list will remain unchainged.
4628  */
4629 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4630 {
4631         struct netdev_adjacent *lower;
4632
4633         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4634
4635         if (&lower->list == &dev->adj_list.lower)
4636                 return NULL;
4637
4638         *iter = &lower->list;
4639
4640         return lower->dev;
4641 }
4642 EXPORT_SYMBOL(netdev_lower_get_next);
4643
4644 /**
4645  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4646  *                                     lower neighbour list, RCU
4647  *                                     variant
4648  * @dev: device
4649  *
4650  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4651  * list. The caller must hold RCU read lock.
4652  */
4653 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4654 {
4655         struct netdev_adjacent *lower;
4656
4657         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4658                         struct netdev_adjacent, list);
4659         if (lower)
4660                 return lower->private;
4661         return NULL;
4662 }
4663 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4664
4665 /**
4666  * netdev_master_upper_dev_get_rcu - Get master upper device
4667  * @dev: device
4668  *
4669  * Find a master upper device and return pointer to it or NULL in case
4670  * it's not there. The caller must hold the RCU read lock.
4671  */
4672 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4673 {
4674         struct netdev_adjacent *upper;
4675
4676         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4677                                        struct netdev_adjacent, list);
4678         if (upper && likely(upper->master))
4679                 return upper->dev;
4680         return NULL;
4681 }
4682 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4683
4684 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4685                               struct net_device *adj_dev,
4686                               struct list_head *dev_list)
4687 {
4688         char linkname[IFNAMSIZ+7];
4689         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4690                 "upper_%s" : "lower_%s", adj_dev->name);
4691         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4692                                  linkname);
4693 }
4694 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4695                                char *name,
4696                                struct list_head *dev_list)
4697 {
4698         char linkname[IFNAMSIZ+7];
4699         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4700                 "upper_%s" : "lower_%s", name);
4701         sysfs_remove_link(&(dev->dev.kobj), linkname);
4702 }
4703
4704 #define netdev_adjacent_is_neigh_list(dev, dev_list) \
4705                 (dev_list == &dev->adj_list.upper || \
4706                  dev_list == &dev->adj_list.lower)
4707
4708 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4709                                         struct net_device *adj_dev,
4710                                         struct list_head *dev_list,
4711                                         void *private, bool master)
4712 {
4713         struct netdev_adjacent *adj;
4714         int ret;
4715
4716         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4717
4718         if (adj) {
4719                 adj->ref_nr++;
4720                 return 0;
4721         }
4722
4723         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4724         if (!adj)
4725                 return -ENOMEM;
4726
4727         adj->dev = adj_dev;
4728         adj->master = master;
4729         adj->ref_nr = 1;
4730         adj->private = private;
4731         dev_hold(adj_dev);
4732
4733         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4734                  adj_dev->name, dev->name, adj_dev->name);
4735
4736         if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4737                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4738                 if (ret)
4739                         goto free_adj;
4740         }
4741
4742         /* Ensure that master link is always the first item in list. */
4743         if (master) {
4744                 ret = sysfs_create_link(&(dev->dev.kobj),
4745                                         &(adj_dev->dev.kobj), "master");
4746                 if (ret)
4747                         goto remove_symlinks;
4748
4749                 list_add_rcu(&adj->list, dev_list);
4750         } else {
4751                 list_add_tail_rcu(&adj->list, dev_list);
4752         }
4753
4754         return 0;
4755
4756 remove_symlinks:
4757         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4758                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4759 free_adj:
4760         kfree(adj);
4761         dev_put(adj_dev);
4762
4763         return ret;
4764 }
4765
4766 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4767                                          struct net_device *adj_dev,
4768                                          struct list_head *dev_list)
4769 {
4770         struct netdev_adjacent *adj;
4771
4772         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4773
4774         if (!adj) {
4775                 pr_err("tried to remove device %s from %s\n",
4776                        dev->name, adj_dev->name);
4777                 BUG();
4778         }
4779
4780         if (adj->ref_nr > 1) {
4781                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4782                          adj->ref_nr-1);
4783                 adj->ref_nr--;
4784                 return;
4785         }
4786
4787         if (adj->master)
4788                 sysfs_remove_link(&(dev->dev.kobj), "master");
4789
4790         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4791                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4792
4793         list_del_rcu(&adj->list);
4794         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4795                  adj_dev->name, dev->name, adj_dev->name);
4796         dev_put(adj_dev);
4797         kfree_rcu(adj, rcu);
4798 }
4799
4800 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4801                                             struct net_device *upper_dev,
4802                                             struct list_head *up_list,
4803                                             struct list_head *down_list,
4804                                             void *private, bool master)
4805 {
4806         int ret;
4807
4808         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4809                                            master);
4810         if (ret)
4811                 return ret;
4812
4813         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4814                                            false);
4815         if (ret) {
4816                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4817                 return ret;
4818         }
4819
4820         return 0;
4821 }
4822
4823 static int __netdev_adjacent_dev_link(struct net_device *dev,
4824                                       struct net_device *upper_dev)
4825 {
4826         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4827                                                 &dev->all_adj_list.upper,
4828                                                 &upper_dev->all_adj_list.lower,
4829                                                 NULL, false);
4830 }
4831
4832 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4833                                                struct net_device *upper_dev,
4834                                                struct list_head *up_list,
4835                                                struct list_head *down_list)
4836 {
4837         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4838         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4839 }
4840
4841 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4842                                          struct net_device *upper_dev)
4843 {
4844         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4845                                            &dev->all_adj_list.upper,
4846                                            &upper_dev->all_adj_list.lower);
4847 }
4848
4849 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4850                                                 struct net_device *upper_dev,
4851                                                 void *private, bool master)
4852 {
4853         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4854
4855         if (ret)
4856                 return ret;
4857
4858         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4859                                                &dev->adj_list.upper,
4860                                                &upper_dev->adj_list.lower,
4861                                                private, master);
4862         if (ret) {
4863                 __netdev_adjacent_dev_unlink(dev, upper_dev);
4864                 return ret;
4865         }
4866
4867         return 0;
4868 }
4869
4870 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4871                                                    struct net_device *upper_dev)
4872 {
4873         __netdev_adjacent_dev_unlink(dev, upper_dev);
4874         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4875                                            &dev->adj_list.upper,
4876                                            &upper_dev->adj_list.lower);
4877 }
4878
4879 static int __netdev_upper_dev_link(struct net_device *dev,
4880                                    struct net_device *upper_dev, bool master,
4881                                    void *private)
4882 {
4883         struct netdev_adjacent *i, *j, *to_i, *to_j;
4884         int ret = 0;
4885
4886         ASSERT_RTNL();
4887
4888         if (dev == upper_dev)
4889                 return -EBUSY;
4890
4891         /* To prevent loops, check if dev is not upper device to upper_dev. */
4892         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4893                 return -EBUSY;
4894
4895         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4896                 return -EEXIST;
4897
4898         if (master && netdev_master_upper_dev_get(dev))
4899                 return -EBUSY;
4900
4901         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4902                                                    master);
4903         if (ret)
4904                 return ret;
4905
4906         /* Now that we linked these devs, make all the upper_dev's
4907          * all_adj_list.upper visible to every dev's all_adj_list.lower an
4908          * versa, and don't forget the devices itself. All of these
4909          * links are non-neighbours.
4910          */
4911         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4912                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4913                         pr_debug("Interlinking %s with %s, non-neighbour\n",
4914                                  i->dev->name, j->dev->name);
4915                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
4916                         if (ret)
4917                                 goto rollback_mesh;
4918                 }
4919         }
4920
4921         /* add dev to every upper_dev's upper device */
4922         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4923                 pr_debug("linking %s's upper device %s with %s\n",
4924                          upper_dev->name, i->dev->name, dev->name);
4925                 ret = __netdev_adjacent_dev_link(dev, i->dev);
4926                 if (ret)
4927                         goto rollback_upper_mesh;
4928         }
4929
4930         /* add upper_dev to every dev's lower device */
4931         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4932                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
4933                          i->dev->name, upper_dev->name);
4934                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
4935                 if (ret)
4936                         goto rollback_lower_mesh;
4937         }
4938
4939         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4940         return 0;
4941
4942 rollback_lower_mesh:
4943         to_i = i;
4944         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4945                 if (i == to_i)
4946                         break;
4947                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
4948         }
4949
4950         i = NULL;
4951
4952 rollback_upper_mesh:
4953         to_i = i;
4954         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4955                 if (i == to_i)
4956                         break;
4957                 __netdev_adjacent_dev_unlink(dev, i->dev);
4958         }
4959
4960         i = j = NULL;
4961
4962 rollback_mesh:
4963         to_i = i;
4964         to_j = j;
4965         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4966                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4967                         if (i == to_i && j == to_j)
4968                                 break;
4969                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
4970                 }
4971                 if (i == to_i)
4972                         break;
4973         }
4974
4975         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4976
4977         return ret;
4978 }
4979
4980 /**
4981  * netdev_upper_dev_link - Add a link to the upper device
4982  * @dev: device
4983  * @upper_dev: new upper device
4984  *
4985  * Adds a link to device which is upper to this one. The caller must hold
4986  * the RTNL lock. On a failure a negative errno code is returned.
4987  * On success the reference counts are adjusted and the function
4988  * returns zero.
4989  */
4990 int netdev_upper_dev_link(struct net_device *dev,
4991                           struct net_device *upper_dev)
4992 {
4993         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4994 }
4995 EXPORT_SYMBOL(netdev_upper_dev_link);
4996
4997 /**
4998  * netdev_master_upper_dev_link - Add a master link to the upper device
4999  * @dev: device
5000  * @upper_dev: new upper device
5001  *
5002  * Adds a link to device which is upper to this one. In this case, only
5003  * one master upper device can be linked, although other non-master devices
5004  * might be linked as well. The caller must hold the RTNL lock.
5005  * On a failure a negative errno code is returned. On success the reference
5006  * counts are adjusted and the function returns zero.
5007  */
5008 int netdev_master_upper_dev_link(struct net_device *dev,
5009                                  struct net_device *upper_dev)
5010 {
5011         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5012 }
5013 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5014
5015 int netdev_master_upper_dev_link_private(struct net_device *dev,
5016                                          struct net_device *upper_dev,
5017                                          void *private)
5018 {
5019         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5020 }
5021 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5022
5023 /**
5024  * netdev_upper_dev_unlink - Removes a link to upper device
5025  * @dev: device
5026  * @upper_dev: new upper device
5027  *
5028  * Removes a link to device which is upper to this one. The caller must hold
5029  * the RTNL lock.
5030  */
5031 void netdev_upper_dev_unlink(struct net_device *dev,
5032                              struct net_device *upper_dev)
5033 {
5034         struct netdev_adjacent *i, *j;
5035         ASSERT_RTNL();
5036
5037         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5038
5039         /* Here is the tricky part. We must remove all dev's lower
5040          * devices from all upper_dev's upper devices and vice
5041          * versa, to maintain the graph relationship.
5042          */
5043         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5044                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5045                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5046
5047         /* remove also the devices itself from lower/upper device
5048          * list
5049          */
5050         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5051                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5052
5053         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5054                 __netdev_adjacent_dev_unlink(dev, i->dev);
5055
5056         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5057 }
5058 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5059
5060 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5061 {
5062         struct netdev_adjacent *iter;
5063
5064         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5065                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5066                                           &iter->dev->adj_list.lower);
5067                 netdev_adjacent_sysfs_add(iter->dev, dev,
5068                                           &iter->dev->adj_list.lower);
5069         }
5070
5071         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5072                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5073                                           &iter->dev->adj_list.upper);
5074                 netdev_adjacent_sysfs_add(iter->dev, dev,
5075                                           &iter->dev->adj_list.upper);
5076         }
5077 }
5078
5079 void *netdev_lower_dev_get_private(struct net_device *dev,
5080                                    struct net_device *lower_dev)
5081 {
5082         struct netdev_adjacent *lower;
5083
5084         if (!lower_dev)
5085                 return NULL;
5086         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5087         if (!lower)
5088                 return NULL;
5089
5090         return lower->private;
5091 }
5092 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5093
5094
5095 int dev_get_nest_level(struct net_device *dev,
5096                        bool (*type_check)(struct net_device *dev))
5097 {
5098         struct net_device *lower = NULL;
5099         struct list_head *iter;
5100         int max_nest = -1;
5101         int nest;
5102
5103         ASSERT_RTNL();
5104
5105         netdev_for_each_lower_dev(dev, lower, iter) {
5106                 nest = dev_get_nest_level(lower, type_check);
5107                 if (max_nest < nest)
5108                         max_nest = nest;
5109         }
5110
5111         if (type_check(dev))
5112                 max_nest++;
5113
5114         return max_nest;
5115 }
5116 EXPORT_SYMBOL(dev_get_nest_level);
5117
5118 static void dev_change_rx_flags(struct net_device *dev, int flags)
5119 {
5120         const struct net_device_ops *ops = dev->netdev_ops;
5121
5122         if (ops->ndo_change_rx_flags)
5123                 ops->ndo_change_rx_flags(dev, flags);
5124 }
5125
5126 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5127 {
5128         unsigned int old_flags = dev->flags;
5129         kuid_t uid;
5130         kgid_t gid;
5131
5132         ASSERT_RTNL();
5133
5134         dev->flags |= IFF_PROMISC;
5135         dev->promiscuity += inc;
5136         if (dev->promiscuity == 0) {
5137                 /*
5138                  * Avoid overflow.
5139                  * If inc causes overflow, untouch promisc and return error.
5140                  */
5141                 if (inc < 0)
5142                         dev->flags &= ~IFF_PROMISC;
5143                 else {
5144                         dev->promiscuity -= inc;
5145                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5146                                 dev->name);
5147                         return -EOVERFLOW;
5148                 }
5149         }
5150         if (dev->flags != old_flags) {
5151                 pr_info("device %s %s promiscuous mode\n",
5152                         dev->name,
5153                         dev->flags & IFF_PROMISC ? "entered" : "left");
5154                 if (audit_enabled) {
5155                         current_uid_gid(&uid, &gid);
5156                         audit_log(current->audit_context, GFP_ATOMIC,
5157                                 AUDIT_ANOM_PROMISCUOUS,
5158                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5159                                 dev->name, (dev->flags & IFF_PROMISC),
5160                                 (old_flags & IFF_PROMISC),
5161                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5162                                 from_kuid(&init_user_ns, uid),
5163                                 from_kgid(&init_user_ns, gid),
5164                                 audit_get_sessionid(current));
5165                 }
5166
5167                 dev_change_rx_flags(dev, IFF_PROMISC);
5168         }
5169         if (notify)
5170                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5171         return 0;
5172 }
5173
5174 /**
5175  *      dev_set_promiscuity     - update promiscuity count on a device
5176  *      @dev: device
5177  *      @inc: modifier
5178  *
5179  *      Add or remove promiscuity from a device. While the count in the device
5180  *      remains above zero the interface remains promiscuous. Once it hits zero
5181  *      the device reverts back to normal filtering operation. A negative inc
5182  *      value is used to drop promiscuity on the device.
5183  *      Return 0 if successful or a negative errno code on error.
5184  */
5185 int dev_set_promiscuity(struct net_device *dev, int inc)
5186 {
5187         unsigned int old_flags = dev->flags;
5188         int err;
5189
5190         err = __dev_set_promiscuity(dev, inc, true);
5191         if (err < 0)
5192                 return err;
5193         if (dev->flags != old_flags)
5194                 dev_set_rx_mode(dev);
5195         return err;
5196 }
5197 EXPORT_SYMBOL(dev_set_promiscuity);
5198
5199 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5200 {
5201         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5202
5203         ASSERT_RTNL();
5204
5205         dev->flags |= IFF_ALLMULTI;
5206         dev->allmulti += inc;
5207         if (dev->allmulti == 0) {
5208                 /*
5209                  * Avoid overflow.
5210                  * If inc causes overflow, untouch allmulti and return error.
5211                  */
5212                 if (inc < 0)
5213                         dev->flags &= ~IFF_ALLMULTI;
5214                 else {
5215                         dev->allmulti -= inc;
5216                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5217                                 dev->name);
5218                         return -EOVERFLOW;
5219                 }
5220         }
5221         if (dev->flags ^ old_flags) {
5222                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5223                 dev_set_rx_mode(dev);
5224                 if (notify)
5225                         __dev_notify_flags(dev, old_flags,
5226                                            dev->gflags ^ old_gflags);
5227         }
5228         return 0;
5229 }
5230
5231 /**
5232  *      dev_set_allmulti        - update allmulti count on a device
5233  *      @dev: device
5234  *      @inc: modifier
5235  *
5236  *      Add or remove reception of all multicast frames to a device. While the
5237  *      count in the device remains above zero the interface remains listening
5238  *      to all interfaces. Once it hits zero the device reverts back to normal
5239  *      filtering operation. A negative @inc value is used to drop the counter
5240  *      when releasing a resource needing all multicasts.
5241  *      Return 0 if successful or a negative errno code on error.
5242  */
5243
5244 int dev_set_allmulti(struct net_device *dev, int inc)
5245 {
5246         return __dev_set_allmulti(dev, inc, true);
5247 }
5248 EXPORT_SYMBOL(dev_set_allmulti);
5249
5250 /*
5251  *      Upload unicast and multicast address lists to device and
5252  *      configure RX filtering. When the device doesn't support unicast
5253  *      filtering it is put in promiscuous mode while unicast addresses
5254  *      are present.
5255  */
5256 void __dev_set_rx_mode(struct net_device *dev)
5257 {
5258         const struct net_device_ops *ops = dev->netdev_ops;
5259
5260         /* dev_open will call this function so the list will stay sane. */
5261         if (!(dev->flags&IFF_UP))
5262                 return;
5263
5264         if (!netif_device_present(dev))
5265                 return;
5266
5267         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5268                 /* Unicast addresses changes may only happen under the rtnl,
5269                  * therefore calling __dev_set_promiscuity here is safe.
5270                  */
5271                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5272                         __dev_set_promiscuity(dev, 1, false);
5273                         dev->uc_promisc = true;
5274                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5275                         __dev_set_promiscuity(dev, -1, false);
5276                         dev->uc_promisc = false;
5277                 }
5278         }
5279
5280         if (ops->ndo_set_rx_mode)
5281                 ops->ndo_set_rx_mode(dev);
5282 }
5283
5284 void dev_set_rx_mode(struct net_device *dev)
5285 {
5286         netif_addr_lock_bh(dev);
5287         __dev_set_rx_mode(dev);
5288         netif_addr_unlock_bh(dev);
5289 }
5290
5291 /**
5292  *      dev_get_flags - get flags reported to userspace
5293  *      @dev: device
5294  *
5295  *      Get the combination of flag bits exported through APIs to userspace.
5296  */
5297 unsigned int dev_get_flags(const struct net_device *dev)
5298 {
5299         unsigned int flags;
5300
5301         flags = (dev->flags & ~(IFF_PROMISC |
5302                                 IFF_ALLMULTI |
5303                                 IFF_RUNNING |
5304                                 IFF_LOWER_UP |
5305                                 IFF_DORMANT)) |
5306                 (dev->gflags & (IFF_PROMISC |
5307                                 IFF_ALLMULTI));
5308
5309         if (netif_running(dev)) {
5310                 if (netif_oper_up(dev))
5311                         flags |= IFF_RUNNING;
5312                 if (netif_carrier_ok(dev))
5313                         flags |= IFF_LOWER_UP;
5314                 if (netif_dormant(dev))
5315                         flags |= IFF_DORMANT;
5316         }
5317
5318         return flags;
5319 }
5320 EXPORT_SYMBOL(dev_get_flags);
5321
5322 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5323 {
5324         unsigned int old_flags = dev->flags;
5325         int ret;
5326
5327         ASSERT_RTNL();
5328
5329         /*
5330          *      Set the flags on our device.
5331          */
5332
5333         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5334                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5335                                IFF_AUTOMEDIA)) |
5336                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5337                                     IFF_ALLMULTI));
5338
5339         /*
5340          *      Load in the correct multicast list now the flags have changed.
5341          */
5342
5343         if ((old_flags ^ flags) & IFF_MULTICAST)
5344                 dev_change_rx_flags(dev, IFF_MULTICAST);
5345
5346         dev_set_rx_mode(dev);
5347
5348         /*
5349          *      Have we downed the interface. We handle IFF_UP ourselves
5350          *      according to user attempts to set it, rather than blindly
5351          *      setting it.
5352          */
5353
5354         ret = 0;
5355         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5356                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5357
5358                 if (!ret)
5359                         dev_set_rx_mode(dev);
5360         }
5361
5362         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5363                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5364                 unsigned int old_flags = dev->flags;
5365
5366                 dev->gflags ^= IFF_PROMISC;
5367
5368                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5369                         if (dev->flags != old_flags)
5370                                 dev_set_rx_mode(dev);
5371         }
5372
5373         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5374            is important. Some (broken) drivers set IFF_PROMISC, when
5375            IFF_ALLMULTI is requested not asking us and not reporting.
5376          */
5377         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5378                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5379
5380                 dev->gflags ^= IFF_ALLMULTI;
5381                 __dev_set_allmulti(dev, inc, false);
5382         }
5383
5384         return ret;
5385 }
5386
5387 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5388                         unsigned int gchanges)
5389 {
5390         unsigned int changes = dev->flags ^ old_flags;
5391
5392         if (gchanges)
5393                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5394
5395         if (changes & IFF_UP) {
5396                 if (dev->flags & IFF_UP)
5397                         call_netdevice_notifiers(NETDEV_UP, dev);
5398                 else
5399                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5400         }
5401
5402         if (dev->flags & IFF_UP &&
5403             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5404                 struct netdev_notifier_change_info change_info;
5405
5406                 change_info.flags_changed = changes;
5407                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5408                                               &change_info.info);
5409         }
5410 }
5411
5412 /**
5413  *      dev_change_flags - change device settings
5414  *      @dev: device
5415  *      @flags: device state flags
5416  *
5417  *      Change settings on device based state flags. The flags are
5418  *      in the userspace exported format.
5419  */
5420 int dev_change_flags(struct net_device *dev, unsigned int flags)
5421 {
5422         int ret;
5423         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5424
5425         ret = __dev_change_flags(dev, flags);
5426         if (ret < 0)
5427                 return ret;
5428
5429         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5430         __dev_notify_flags(dev, old_flags, changes);
5431         return ret;
5432 }
5433 EXPORT_SYMBOL(dev_change_flags);
5434
5435 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5436 {
5437         const struct net_device_ops *ops = dev->netdev_ops;
5438
5439         if (ops->ndo_change_mtu)
5440                 return ops->ndo_change_mtu(dev, new_mtu);
5441
5442         dev->mtu = new_mtu;
5443         return 0;
5444 }
5445
5446 /**
5447  *      dev_set_mtu - Change maximum transfer unit
5448  *      @dev: device
5449  *      @new_mtu: new transfer unit
5450  *
5451  *      Change the maximum transfer size of the network device.
5452  */
5453 int dev_set_mtu(struct net_device *dev, int new_mtu)
5454 {
5455         int err, orig_mtu;
5456
5457         if (new_mtu == dev->mtu)
5458                 return 0;
5459
5460         /*      MTU must be positive.    */
5461         if (new_mtu < 0)
5462                 return -EINVAL;
5463
5464         if (!netif_device_present(dev))
5465                 return -ENODEV;
5466
5467         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5468         err = notifier_to_errno(err);
5469         if (err)
5470                 return err;
5471
5472         orig_mtu = dev->mtu;
5473         err = __dev_set_mtu(dev, new_mtu);
5474
5475         if (!err) {
5476                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5477                 err = notifier_to_errno(err);
5478                 if (err) {
5479                         /* setting mtu back and notifying everyone again,
5480                          * so that they have a chance to revert changes.
5481                          */
5482                         __dev_set_mtu(dev, orig_mtu);
5483                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5484                 }
5485         }
5486         return err;
5487 }
5488 EXPORT_SYMBOL(dev_set_mtu);
5489
5490 /**
5491  *      dev_set_group - Change group this device belongs to
5492  *      @dev: device
5493  *      @new_group: group this device should belong to
5494  */
5495 void dev_set_group(struct net_device *dev, int new_group)
5496 {
5497         dev->group = new_group;
5498 }
5499 EXPORT_SYMBOL(dev_set_group);
5500
5501 /**
5502  *      dev_set_mac_address - Change Media Access Control Address
5503  *      @dev: device
5504  *      @sa: new address
5505  *
5506  *      Change the hardware (MAC) address of the device
5507  */
5508 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5509 {
5510         const struct net_device_ops *ops = dev->netdev_ops;
5511         int err;
5512
5513         if (!ops->ndo_set_mac_address)
5514                 return -EOPNOTSUPP;
5515         if (sa->sa_family != dev->type)
5516                 return -EINVAL;
5517         if (!netif_device_present(dev))
5518                 return -ENODEV;
5519         err = ops->ndo_set_mac_address(dev, sa);
5520         if (err)
5521                 return err;
5522         dev->addr_assign_type = NET_ADDR_SET;
5523         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5524         add_device_randomness(dev->dev_addr, dev->addr_len);
5525         return 0;
5526 }
5527 EXPORT_SYMBOL(dev_set_mac_address);
5528
5529 /**
5530  *      dev_change_carrier - Change device carrier
5531  *      @dev: device
5532  *      @new_carrier: new value
5533  *
5534  *      Change device carrier
5535  */
5536 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5537 {
5538         const struct net_device_ops *ops = dev->netdev_ops;
5539
5540         if (!ops->ndo_change_carrier)
5541                 return -EOPNOTSUPP;
5542         if (!netif_device_present(dev))
5543                 return -ENODEV;
5544         return ops->ndo_change_carrier(dev, new_carrier);
5545 }
5546 EXPORT_SYMBOL(dev_change_carrier);
5547
5548 /**
5549  *      dev_get_phys_port_id - Get device physical port ID
5550  *      @dev: device
5551  *      @ppid: port ID
5552  *
5553  *      Get device physical port ID
5554  */
5555 int dev_get_phys_port_id(struct net_device *dev,
5556                          struct netdev_phys_port_id *ppid)
5557 {
5558         const struct net_device_ops *ops = dev->netdev_ops;
5559
5560         if (!ops->ndo_get_phys_port_id)
5561                 return -EOPNOTSUPP;
5562         return ops->ndo_get_phys_port_id(dev, ppid);
5563 }
5564 EXPORT_SYMBOL(dev_get_phys_port_id);
5565
5566 /**
5567  *      dev_new_index   -       allocate an ifindex
5568  *      @net: the applicable net namespace
5569  *
5570  *      Returns a suitable unique value for a new device interface
5571  *      number.  The caller must hold the rtnl semaphore or the
5572  *      dev_base_lock to be sure it remains unique.
5573  */
5574 static int dev_new_index(struct net *net)
5575 {
5576         int ifindex = net->ifindex;
5577         for (;;) {
5578                 if (++ifindex <= 0)
5579                         ifindex = 1;
5580                 if (!__dev_get_by_index(net, ifindex))
5581                         return net->ifindex = ifindex;
5582         }
5583 }
5584
5585 /* Delayed registration/unregisteration */
5586 static LIST_HEAD(net_todo_list);
5587 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5588
5589 static void net_set_todo(struct net_device *dev)
5590 {
5591         list_add_tail(&dev->todo_list, &net_todo_list);
5592         dev_net(dev)->dev_unreg_count++;
5593 }
5594
5595 static void rollback_registered_many(struct list_head *head)
5596 {
5597         struct net_device *dev, *tmp;
5598         LIST_HEAD(close_head);
5599
5600         BUG_ON(dev_boot_phase);
5601         ASSERT_RTNL();
5602
5603         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5604                 /* Some devices call without registering
5605                  * for initialization unwind. Remove those
5606                  * devices and proceed with the remaining.
5607                  */
5608                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5609                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5610                                  dev->name, dev);
5611
5612                         WARN_ON(1);
5613                         list_del(&dev->unreg_list);
5614                         continue;
5615                 }
5616                 dev->dismantle = true;
5617                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5618         }
5619
5620         /* If device is running, close it first. */
5621         list_for_each_entry(dev, head, unreg_list)
5622                 list_add_tail(&dev->close_list, &close_head);
5623         dev_close_many(&close_head);
5624
5625         list_for_each_entry(dev, head, unreg_list) {
5626                 /* And unlink it from device chain. */
5627                 unlist_netdevice(dev);
5628
5629                 dev->reg_state = NETREG_UNREGISTERING;
5630         }
5631
5632         synchronize_net();
5633
5634         list_for_each_entry(dev, head, unreg_list) {
5635                 /* Shutdown queueing discipline. */
5636                 dev_shutdown(dev);
5637
5638
5639                 /* Notify protocols, that we are about to destroy
5640                    this device. They should clean all the things.
5641                 */
5642                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5643
5644                 if (!dev->rtnl_link_ops ||
5645                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5646                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5647
5648                 /*
5649                  *      Flush the unicast and multicast chains
5650                  */
5651                 dev_uc_flush(dev);
5652                 dev_mc_flush(dev);
5653
5654                 if (dev->netdev_ops->ndo_uninit)
5655                         dev->netdev_ops->ndo_uninit(dev);
5656
5657                 /* Notifier chain MUST detach us all upper devices. */
5658                 WARN_ON(netdev_has_any_upper_dev(dev));
5659
5660                 /* Remove entries from kobject tree */
5661                 netdev_unregister_kobject(dev);
5662 #ifdef CONFIG_XPS
5663                 /* Remove XPS queueing entries */
5664                 netif_reset_xps_queues_gt(dev, 0);
5665 #endif
5666         }
5667
5668         synchronize_net();
5669
5670         list_for_each_entry(dev, head, unreg_list)
5671                 dev_put(dev);
5672 }
5673
5674 static void rollback_registered(struct net_device *dev)
5675 {
5676         LIST_HEAD(single);
5677
5678         list_add(&dev->unreg_list, &single);
5679         rollback_registered_many(&single);
5680         list_del(&single);
5681 }
5682
5683 static netdev_features_t netdev_fix_features(struct net_device *dev,
5684         netdev_features_t features)
5685 {
5686         /* Fix illegal checksum combinations */
5687         if ((features & NETIF_F_HW_CSUM) &&
5688             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5689                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5690                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5691         }
5692
5693         /* TSO requires that SG is present as well. */
5694         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5695                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5696                 features &= ~NETIF_F_ALL_TSO;
5697         }
5698
5699         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5700                                         !(features & NETIF_F_IP_CSUM)) {
5701                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5702                 features &= ~NETIF_F_TSO;
5703                 features &= ~NETIF_F_TSO_ECN;
5704         }
5705
5706         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5707                                          !(features & NETIF_F_IPV6_CSUM)) {
5708                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5709                 features &= ~NETIF_F_TSO6;
5710         }
5711
5712         /* TSO ECN requires that TSO is present as well. */
5713         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5714                 features &= ~NETIF_F_TSO_ECN;
5715
5716         /* Software GSO depends on SG. */
5717         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5718                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5719                 features &= ~NETIF_F_GSO;
5720         }
5721
5722         /* UFO needs SG and checksumming */
5723         if (features & NETIF_F_UFO) {
5724                 /* maybe split UFO into V4 and V6? */
5725                 if (!((features & NETIF_F_GEN_CSUM) ||
5726                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5727                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5728                         netdev_dbg(dev,
5729                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5730                         features &= ~NETIF_F_UFO;
5731                 }
5732
5733                 if (!(features & NETIF_F_SG)) {
5734                         netdev_dbg(dev,
5735                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5736                         features &= ~NETIF_F_UFO;
5737                 }
5738         }
5739
5740         return features;
5741 }
5742
5743 int __netdev_update_features(struct net_device *dev)
5744 {
5745         netdev_features_t features;
5746         int err = 0;
5747
5748         ASSERT_RTNL();
5749
5750         features = netdev_get_wanted_features(dev);
5751
5752         if (dev->netdev_ops->ndo_fix_features)
5753                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5754
5755         /* driver might be less strict about feature dependencies */
5756         features = netdev_fix_features(dev, features);
5757
5758         if (dev->features == features)
5759                 return 0;
5760
5761         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5762                 &dev->features, &features);
5763
5764         if (dev->netdev_ops->ndo_set_features)
5765                 err = dev->netdev_ops->ndo_set_features(dev, features);
5766
5767         if (unlikely(err < 0)) {
5768                 netdev_err(dev,
5769                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5770                         err, &features, &dev->features);
5771                 return -1;
5772         }
5773
5774         if (!err)
5775                 dev->features = features;
5776
5777         return 1;
5778 }
5779
5780 /**
5781  *      netdev_update_features - recalculate device features
5782  *      @dev: the device to check
5783  *
5784  *      Recalculate dev->features set and send notifications if it
5785  *      has changed. Should be called after driver or hardware dependent
5786  *      conditions might have changed that influence the features.
5787  */
5788 void netdev_update_features(struct net_device *dev)
5789 {
5790         if (__netdev_update_features(dev))
5791                 netdev_features_change(dev);
5792 }
5793 EXPORT_SYMBOL(netdev_update_features);
5794
5795 /**
5796  *      netdev_change_features - recalculate device features
5797  *      @dev: the device to check
5798  *
5799  *      Recalculate dev->features set and send notifications even
5800  *      if they have not changed. Should be called instead of
5801  *      netdev_update_features() if also dev->vlan_features might
5802  *      have changed to allow the changes to be propagated to stacked
5803  *      VLAN devices.
5804  */
5805 void netdev_change_features(struct net_device *dev)
5806 {
5807         __netdev_update_features(dev);
5808         netdev_features_change(dev);
5809 }
5810 EXPORT_SYMBOL(netdev_change_features);
5811
5812 /**
5813  *      netif_stacked_transfer_operstate -      transfer operstate
5814  *      @rootdev: the root or lower level device to transfer state from
5815  *      @dev: the device to transfer operstate to
5816  *
5817  *      Transfer operational state from root to device. This is normally
5818  *      called when a stacking relationship exists between the root
5819  *      device and the device(a leaf device).
5820  */
5821 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5822                                         struct net_device *dev)
5823 {
5824         if (rootdev->operstate == IF_OPER_DORMANT)
5825                 netif_dormant_on(dev);
5826         else
5827                 netif_dormant_off(dev);
5828
5829         if (netif_carrier_ok(rootdev)) {
5830                 if (!netif_carrier_ok(dev))
5831                         netif_carrier_on(dev);
5832         } else {
5833                 if (netif_carrier_ok(dev))
5834                         netif_carrier_off(dev);
5835         }
5836 }
5837 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5838
5839 #ifdef CONFIG_SYSFS
5840 static int netif_alloc_rx_queues(struct net_device *dev)
5841 {
5842         unsigned int i, count = dev->num_rx_queues;
5843         struct netdev_rx_queue *rx;
5844
5845         BUG_ON(count < 1);
5846
5847         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5848         if (!rx)
5849                 return -ENOMEM;
5850
5851         dev->_rx = rx;
5852
5853         for (i = 0; i < count; i++)
5854                 rx[i].dev = dev;
5855         return 0;
5856 }
5857 #endif
5858
5859 static void netdev_init_one_queue(struct net_device *dev,
5860                                   struct netdev_queue *queue, void *_unused)
5861 {
5862         /* Initialize queue lock */
5863         spin_lock_init(&queue->_xmit_lock);
5864         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5865         queue->xmit_lock_owner = -1;
5866         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5867         queue->dev = dev;
5868 #ifdef CONFIG_BQL
5869         dql_init(&queue->dql, HZ);
5870 #endif
5871 }
5872
5873 static void netif_free_tx_queues(struct net_device *dev)
5874 {
5875         if (is_vmalloc_addr(dev->_tx))
5876                 vfree(dev->_tx);
5877         else
5878                 kfree(dev->_tx);
5879 }
5880
5881 static int netif_alloc_netdev_queues(struct net_device *dev)
5882 {
5883         unsigned int count = dev->num_tx_queues;
5884         struct netdev_queue *tx;
5885         size_t sz = count * sizeof(*tx);
5886
5887         BUG_ON(count < 1 || count > 0xffff);
5888
5889         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5890         if (!tx) {
5891                 tx = vzalloc(sz);
5892                 if (!tx)
5893                         return -ENOMEM;
5894         }
5895         dev->_tx = tx;
5896
5897         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5898         spin_lock_init(&dev->tx_global_lock);
5899
5900         return 0;
5901 }
5902
5903 /**
5904  *      register_netdevice      - register a network device
5905  *      @dev: device to register
5906  *
5907  *      Take a completed network device structure and add it to the kernel
5908  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5909  *      chain. 0 is returned on success. A negative errno code is returned
5910  *      on a failure to set up the device, or if the name is a duplicate.
5911  *
5912  *      Callers must hold the rtnl semaphore. You may want
5913  *      register_netdev() instead of this.
5914  *
5915  *      BUGS:
5916  *      The locking appears insufficient to guarantee two parallel registers
5917  *      will not get the same name.
5918  */
5919
5920 int register_netdevice(struct net_device *dev)
5921 {
5922         int ret;
5923         struct net *net = dev_net(dev);
5924
5925         BUG_ON(dev_boot_phase);
5926         ASSERT_RTNL();
5927
5928         might_sleep();
5929
5930         /* When net_device's are persistent, this will be fatal. */
5931         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5932         BUG_ON(!net);
5933
5934         spin_lock_init(&dev->addr_list_lock);
5935         netdev_set_addr_lockdep_class(dev);
5936
5937         dev->iflink = -1;
5938
5939         ret = dev_get_valid_name(net, dev, dev->name);
5940         if (ret < 0)
5941                 goto out;
5942
5943         /* Init, if this function is available */
5944         if (dev->netdev_ops->ndo_init) {
5945                 ret = dev->netdev_ops->ndo_init(dev);
5946                 if (ret) {
5947                         if (ret > 0)
5948                                 ret = -EIO;
5949                         goto out;
5950                 }
5951         }
5952
5953         if (((dev->hw_features | dev->features) &
5954              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5955             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5956              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5957                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5958                 ret = -EINVAL;
5959                 goto err_uninit;
5960         }
5961
5962         ret = -EBUSY;
5963         if (!dev->ifindex)
5964                 dev->ifindex = dev_new_index(net);
5965         else if (__dev_get_by_index(net, dev->ifindex))
5966                 goto err_uninit;
5967
5968         if (dev->iflink == -1)
5969                 dev->iflink = dev->ifindex;
5970
5971         /* Transfer changeable features to wanted_features and enable
5972          * software offloads (GSO and GRO).
5973          */
5974         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5975         dev->features |= NETIF_F_SOFT_FEATURES;
5976         dev->wanted_features = dev->features & dev->hw_features;
5977
5978         if (!(dev->flags & IFF_LOOPBACK)) {
5979                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5980         }
5981
5982         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5983          */
5984         dev->vlan_features |= NETIF_F_HIGHDMA;
5985
5986         /* Make NETIF_F_SG inheritable to tunnel devices.
5987          */
5988         dev->hw_enc_features |= NETIF_F_SG;
5989
5990         /* Make NETIF_F_SG inheritable to MPLS.
5991          */
5992         dev->mpls_features |= NETIF_F_SG;
5993
5994         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5995         ret = notifier_to_errno(ret);
5996         if (ret)
5997                 goto err_uninit;
5998
5999         ret = netdev_register_kobject(dev);
6000         if (ret)
6001                 goto err_uninit;
6002         dev->reg_state = NETREG_REGISTERED;
6003
6004         __netdev_update_features(dev);
6005
6006         /*
6007          *      Default initial state at registry is that the
6008          *      device is present.
6009          */
6010
6011         set_bit(__LINK_STATE_PRESENT, &dev->state);
6012
6013         linkwatch_init_dev(dev);
6014
6015         dev_init_scheduler(dev);
6016         dev_hold(dev);
6017         list_netdevice(dev);
6018         add_device_randomness(dev->dev_addr, dev->addr_len);
6019
6020         /* If the device has permanent device address, driver should
6021          * set dev_addr and also addr_assign_type should be set to
6022          * NET_ADDR_PERM (default value).
6023          */
6024         if (dev->addr_assign_type == NET_ADDR_PERM)
6025                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6026
6027         /* Notify protocols, that a new device appeared. */
6028         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6029         ret = notifier_to_errno(ret);
6030         if (ret) {
6031                 rollback_registered(dev);
6032                 dev->reg_state = NETREG_UNREGISTERED;
6033         }
6034         /*
6035          *      Prevent userspace races by waiting until the network
6036          *      device is fully setup before sending notifications.
6037          */
6038         if (!dev->rtnl_link_ops ||
6039             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6040                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6041
6042 out:
6043         return ret;
6044
6045 err_uninit:
6046         if (dev->netdev_ops->ndo_uninit)
6047                 dev->netdev_ops->ndo_uninit(dev);
6048         goto out;
6049 }
6050 EXPORT_SYMBOL(register_netdevice);
6051
6052 /**
6053  *      init_dummy_netdev       - init a dummy network device for NAPI
6054  *      @dev: device to init
6055  *
6056  *      This takes a network device structure and initialize the minimum
6057  *      amount of fields so it can be used to schedule NAPI polls without
6058  *      registering a full blown interface. This is to be used by drivers
6059  *      that need to tie several hardware interfaces to a single NAPI
6060  *      poll scheduler due to HW limitations.
6061  */
6062 int init_dummy_netdev(struct net_device *dev)
6063 {
6064         /* Clear everything. Note we don't initialize spinlocks
6065          * are they aren't supposed to be taken by any of the
6066          * NAPI code and this dummy netdev is supposed to be
6067          * only ever used for NAPI polls
6068          */
6069         memset(dev, 0, sizeof(struct net_device));
6070
6071         /* make sure we BUG if trying to hit standard
6072          * register/unregister code path
6073          */
6074         dev->reg_state = NETREG_DUMMY;
6075
6076         /* NAPI wants this */
6077         INIT_LIST_HEAD(&dev->napi_list);
6078
6079         /* a dummy interface is started by default */
6080         set_bit(__LINK_STATE_PRESENT, &dev->state);
6081         set_bit(__LINK_STATE_START, &dev->state);
6082
6083         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6084          * because users of this 'device' dont need to change
6085          * its refcount.
6086          */
6087
6088         return 0;
6089 }
6090 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6091
6092
6093 /**
6094  *      register_netdev - register a network device
6095  *      @dev: device to register
6096  *
6097  *      Take a completed network device structure and add it to the kernel
6098  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6099  *      chain. 0 is returned on success. A negative errno code is returned
6100  *      on a failure to set up the device, or if the name is a duplicate.
6101  *
6102  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6103  *      and expands the device name if you passed a format string to
6104  *      alloc_netdev.
6105  */
6106 int register_netdev(struct net_device *dev)
6107 {
6108         int err;
6109
6110         rtnl_lock();
6111         err = register_netdevice(dev);
6112         rtnl_unlock();
6113         return err;
6114 }
6115 EXPORT_SYMBOL(register_netdev);
6116
6117 int netdev_refcnt_read(const struct net_device *dev)
6118 {
6119         int i, refcnt = 0;
6120
6121         for_each_possible_cpu(i)
6122                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6123         return refcnt;
6124 }
6125 EXPORT_SYMBOL(netdev_refcnt_read);
6126
6127 /**
6128  * netdev_wait_allrefs - wait until all references are gone.
6129  * @dev: target net_device
6130  *
6131  * This is called when unregistering network devices.
6132  *
6133  * Any protocol or device that holds a reference should register
6134  * for netdevice notification, and cleanup and put back the
6135  * reference if they receive an UNREGISTER event.
6136  * We can get stuck here if buggy protocols don't correctly
6137  * call dev_put.
6138  */
6139 static void netdev_wait_allrefs(struct net_device *dev)
6140 {
6141         unsigned long rebroadcast_time, warning_time;
6142         int refcnt;
6143
6144         linkwatch_forget_dev(dev);
6145
6146         rebroadcast_time = warning_time = jiffies;
6147         refcnt = netdev_refcnt_read(dev);
6148
6149         while (refcnt != 0) {
6150                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6151                         rtnl_lock();
6152
6153                         /* Rebroadcast unregister notification */
6154                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6155
6156                         __rtnl_unlock();
6157                         rcu_barrier();
6158                         rtnl_lock();
6159
6160                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6161                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6162                                      &dev->state)) {
6163                                 /* We must not have linkwatch events
6164                                  * pending on unregister. If this
6165                                  * happens, we simply run the queue
6166                                  * unscheduled, resulting in a noop
6167                                  * for this device.
6168                                  */
6169                                 linkwatch_run_queue();
6170                         }
6171
6172                         __rtnl_unlock();
6173
6174                         rebroadcast_time = jiffies;
6175                 }
6176
6177                 msleep(250);
6178
6179                 refcnt = netdev_refcnt_read(dev);
6180
6181                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6182                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6183                                  dev->name, refcnt);
6184                         warning_time = jiffies;
6185                 }
6186         }
6187 }
6188
6189 /* The sequence is:
6190  *
6191  *      rtnl_lock();
6192  *      ...
6193  *      register_netdevice(x1);
6194  *      register_netdevice(x2);
6195  *      ...
6196  *      unregister_netdevice(y1);
6197  *      unregister_netdevice(y2);
6198  *      ...
6199  *      rtnl_unlock();
6200  *      free_netdev(y1);
6201  *      free_netdev(y2);
6202  *
6203  * We are invoked by rtnl_unlock().
6204  * This allows us to deal with problems:
6205  * 1) We can delete sysfs objects which invoke hotplug
6206  *    without deadlocking with linkwatch via keventd.
6207  * 2) Since we run with the RTNL semaphore not held, we can sleep
6208  *    safely in order to wait for the netdev refcnt to drop to zero.
6209  *
6210  * We must not return until all unregister events added during
6211  * the interval the lock was held have been completed.
6212  */
6213 void netdev_run_todo(void)
6214 {
6215         struct list_head list;
6216
6217         /* Snapshot list, allow later requests */
6218         list_replace_init(&net_todo_list, &list);
6219
6220         __rtnl_unlock();
6221
6222
6223         /* Wait for rcu callbacks to finish before next phase */
6224         if (!list_empty(&list))
6225                 rcu_barrier();
6226
6227         while (!list_empty(&list)) {
6228                 struct net_device *dev
6229                         = list_first_entry(&list, struct net_device, todo_list);
6230                 list_del(&dev->todo_list);
6231
6232                 rtnl_lock();
6233                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6234                 __rtnl_unlock();
6235
6236                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6237                         pr_err("network todo '%s' but state %d\n",
6238                                dev->name, dev->reg_state);
6239                         dump_stack();
6240                         continue;
6241                 }
6242
6243                 dev->reg_state = NETREG_UNREGISTERED;
6244
6245                 on_each_cpu(flush_backlog, dev, 1);
6246
6247                 netdev_wait_allrefs(dev);
6248
6249                 /* paranoia */
6250                 BUG_ON(netdev_refcnt_read(dev));
6251                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6252                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6253                 WARN_ON(dev->dn_ptr);
6254
6255                 if (dev->destructor)
6256                         dev->destructor(dev);
6257
6258                 /* Report a network device has been unregistered */
6259                 rtnl_lock();
6260                 dev_net(dev)->dev_unreg_count--;
6261                 __rtnl_unlock();
6262                 wake_up(&netdev_unregistering_wq);
6263
6264                 /* Free network device */
6265                 kobject_put(&dev->dev.kobj);
6266         }
6267 }
6268
6269 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6270  * fields in the same order, with only the type differing.
6271  */
6272 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6273                              const struct net_device_stats *netdev_stats)
6274 {
6275 #if BITS_PER_LONG == 64
6276         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6277         memcpy(stats64, netdev_stats, sizeof(*stats64));
6278 #else
6279         size_t i, n = sizeof(*stats64) / sizeof(u64);
6280         const unsigned long *src = (const unsigned long *)netdev_stats;
6281         u64 *dst = (u64 *)stats64;
6282
6283         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6284                      sizeof(*stats64) / sizeof(u64));
6285         for (i = 0; i < n; i++)
6286                 dst[i] = src[i];
6287 #endif
6288 }
6289 EXPORT_SYMBOL(netdev_stats_to_stats64);
6290
6291 /**
6292  *      dev_get_stats   - get network device statistics
6293  *      @dev: device to get statistics from
6294  *      @storage: place to store stats
6295  *
6296  *      Get network statistics from device. Return @storage.
6297  *      The device driver may provide its own method by setting
6298  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6299  *      otherwise the internal statistics structure is used.
6300  */
6301 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6302                                         struct rtnl_link_stats64 *storage)
6303 {
6304         const struct net_device_ops *ops = dev->netdev_ops;
6305
6306         if (ops->ndo_get_stats64) {
6307                 memset(storage, 0, sizeof(*storage));
6308                 ops->ndo_get_stats64(dev, storage);
6309         } else if (ops->ndo_get_stats) {
6310                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6311         } else {
6312                 netdev_stats_to_stats64(storage, &dev->stats);
6313         }
6314         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6315         return storage;
6316 }
6317 EXPORT_SYMBOL(dev_get_stats);
6318
6319 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6320 {
6321         struct netdev_queue *queue = dev_ingress_queue(dev);
6322
6323 #ifdef CONFIG_NET_CLS_ACT
6324         if (queue)
6325                 return queue;
6326         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6327         if (!queue)
6328                 return NULL;
6329         netdev_init_one_queue(dev, queue, NULL);
6330         queue->qdisc = &noop_qdisc;
6331         queue->qdisc_sleeping = &noop_qdisc;
6332         rcu_assign_pointer(dev->ingress_queue, queue);
6333 #endif
6334         return queue;
6335 }
6336
6337 static const struct ethtool_ops default_ethtool_ops;
6338
6339 void netdev_set_default_ethtool_ops(struct net_device *dev,
6340                                     const struct ethtool_ops *ops)
6341 {
6342         if (dev->ethtool_ops == &default_ethtool_ops)
6343                 dev->ethtool_ops = ops;
6344 }
6345 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6346
6347 void netdev_freemem(struct net_device *dev)
6348 {
6349         char *addr = (char *)dev - dev->padded;
6350
6351         if (is_vmalloc_addr(addr))
6352                 vfree(addr);
6353         else
6354                 kfree(addr);
6355 }
6356
6357 /**
6358  *      alloc_netdev_mqs - allocate network device
6359  *      @sizeof_priv:   size of private data to allocate space for
6360  *      @name:          device name format string
6361  *      @setup:         callback to initialize device
6362  *      @txqs:          the number of TX subqueues to allocate
6363  *      @rxqs:          the number of RX subqueues to allocate
6364  *
6365  *      Allocates a struct net_device with private data area for driver use
6366  *      and performs basic initialization.  Also allocates subqueue structs
6367  *      for each queue on the device.
6368  */
6369 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6370                 void (*setup)(struct net_device *),
6371                 unsigned int txqs, unsigned int rxqs)
6372 {
6373         struct net_device *dev;
6374         size_t alloc_size;
6375         struct net_device *p;
6376
6377         BUG_ON(strlen(name) >= sizeof(dev->name));
6378
6379         if (txqs < 1) {
6380                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6381                 return NULL;
6382         }
6383
6384 #ifdef CONFIG_SYSFS
6385         if (rxqs < 1) {
6386                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6387                 return NULL;
6388         }
6389 #endif
6390
6391         alloc_size = sizeof(struct net_device);
6392         if (sizeof_priv) {
6393                 /* ensure 32-byte alignment of private area */
6394                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6395                 alloc_size += sizeof_priv;
6396         }
6397         /* ensure 32-byte alignment of whole construct */
6398         alloc_size += NETDEV_ALIGN - 1;
6399
6400         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6401         if (!p)
6402                 p = vzalloc(alloc_size);
6403         if (!p)
6404                 return NULL;
6405
6406         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6407         dev->padded = (char *)dev - (char *)p;
6408
6409         dev->pcpu_refcnt = alloc_percpu(int);
6410         if (!dev->pcpu_refcnt)
6411                 goto free_dev;
6412
6413         if (dev_addr_init(dev))
6414                 goto free_pcpu;
6415
6416         dev_mc_init(dev);
6417         dev_uc_init(dev);
6418
6419         dev_net_set(dev, &init_net);
6420
6421         dev->gso_max_size = GSO_MAX_SIZE;
6422         dev->gso_max_segs = GSO_MAX_SEGS;
6423
6424         INIT_LIST_HEAD(&dev->napi_list);
6425         INIT_LIST_HEAD(&dev->unreg_list);
6426         INIT_LIST_HEAD(&dev->close_list);
6427         INIT_LIST_HEAD(&dev->link_watch_list);
6428         INIT_LIST_HEAD(&dev->adj_list.upper);
6429         INIT_LIST_HEAD(&dev->adj_list.lower);
6430         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6431         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6432         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6433         setup(dev);
6434
6435         dev->num_tx_queues = txqs;
6436         dev->real_num_tx_queues = txqs;
6437         if (netif_alloc_netdev_queues(dev))
6438                 goto free_all;
6439
6440 #ifdef CONFIG_SYSFS
6441         dev->num_rx_queues = rxqs;
6442         dev->real_num_rx_queues = rxqs;
6443         if (netif_alloc_rx_queues(dev))
6444                 goto free_all;
6445 #endif
6446
6447         strcpy(dev->name, name);
6448         dev->group = INIT_NETDEV_GROUP;
6449         if (!dev->ethtool_ops)
6450                 dev->ethtool_ops = &default_ethtool_ops;
6451         return dev;
6452
6453 free_all:
6454         free_netdev(dev);
6455         return NULL;
6456
6457 free_pcpu:
6458         free_percpu(dev->pcpu_refcnt);
6459         netif_free_tx_queues(dev);
6460 #ifdef CONFIG_SYSFS
6461         kfree(dev->_rx);
6462 #endif
6463
6464 free_dev:
6465         netdev_freemem(dev);
6466         return NULL;
6467 }
6468 EXPORT_SYMBOL(alloc_netdev_mqs);
6469
6470 /**
6471  *      free_netdev - free network device
6472  *      @dev: device
6473  *
6474  *      This function does the last stage of destroying an allocated device
6475  *      interface. The reference to the device object is released.
6476  *      If this is the last reference then it will be freed.
6477  */
6478 void free_netdev(struct net_device *dev)
6479 {
6480         struct napi_struct *p, *n;
6481
6482         release_net(dev_net(dev));
6483
6484         netif_free_tx_queues(dev);
6485 #ifdef CONFIG_SYSFS
6486         kfree(dev->_rx);
6487 #endif
6488
6489         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6490
6491         /* Flush device addresses */
6492         dev_addr_flush(dev);
6493
6494         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6495                 netif_napi_del(p);
6496
6497         free_percpu(dev->pcpu_refcnt);
6498         dev->pcpu_refcnt = NULL;
6499
6500         /*  Compatibility with error handling in drivers */
6501         if (dev->reg_state == NETREG_UNINITIALIZED) {
6502                 netdev_freemem(dev);
6503                 return;
6504         }
6505
6506         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6507         dev->reg_state = NETREG_RELEASED;
6508
6509         /* will free via device release */
6510         put_device(&dev->dev);
6511 }
6512 EXPORT_SYMBOL(free_netdev);
6513
6514 /**
6515  *      synchronize_net -  Synchronize with packet receive processing
6516  *
6517  *      Wait for packets currently being received to be done.
6518  *      Does not block later packets from starting.
6519  */
6520 void synchronize_net(void)
6521 {
6522         might_sleep();
6523         if (rtnl_is_locked())
6524                 synchronize_rcu_expedited();
6525         else
6526                 synchronize_rcu();
6527 }
6528 EXPORT_SYMBOL(synchronize_net);
6529
6530 /**
6531  *      unregister_netdevice_queue - remove device from the kernel
6532  *      @dev: device
6533  *      @head: list
6534  *
6535  *      This function shuts down a device interface and removes it
6536  *      from the kernel tables.
6537  *      If head not NULL, device is queued to be unregistered later.
6538  *
6539  *      Callers must hold the rtnl semaphore.  You may want
6540  *      unregister_netdev() instead of this.
6541  */
6542
6543 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6544 {
6545         ASSERT_RTNL();
6546
6547         if (head) {
6548                 list_move_tail(&dev->unreg_list, head);
6549         } else {
6550                 rollback_registered(dev);
6551                 /* Finish processing unregister after unlock */
6552                 net_set_todo(dev);
6553         }
6554 }
6555 EXPORT_SYMBOL(unregister_netdevice_queue);
6556
6557 /**
6558  *      unregister_netdevice_many - unregister many devices
6559  *      @head: list of devices
6560  *
6561  *  Note: As most callers use a stack allocated list_head,
6562  *  we force a list_del() to make sure stack wont be corrupted later.
6563  */
6564 void unregister_netdevice_many(struct list_head *head)
6565 {
6566         struct net_device *dev;
6567
6568         if (!list_empty(head)) {
6569                 rollback_registered_many(head);
6570                 list_for_each_entry(dev, head, unreg_list)
6571                         net_set_todo(dev);
6572                 list_del(head);
6573         }
6574 }
6575 EXPORT_SYMBOL(unregister_netdevice_many);
6576
6577 /**
6578  *      unregister_netdev - remove device from the kernel
6579  *      @dev: device
6580  *
6581  *      This function shuts down a device interface and removes it
6582  *      from the kernel tables.
6583  *
6584  *      This is just a wrapper for unregister_netdevice that takes
6585  *      the rtnl semaphore.  In general you want to use this and not
6586  *      unregister_netdevice.
6587  */
6588 void unregister_netdev(struct net_device *dev)
6589 {
6590         rtnl_lock();
6591         unregister_netdevice(dev);
6592         rtnl_unlock();
6593 }
6594 EXPORT_SYMBOL(unregister_netdev);
6595
6596 /**
6597  *      dev_change_net_namespace - move device to different nethost namespace
6598  *      @dev: device
6599  *      @net: network namespace
6600  *      @pat: If not NULL name pattern to try if the current device name
6601  *            is already taken in the destination network namespace.
6602  *
6603  *      This function shuts down a device interface and moves it
6604  *      to a new network namespace. On success 0 is returned, on
6605  *      a failure a netagive errno code is returned.
6606  *
6607  *      Callers must hold the rtnl semaphore.
6608  */
6609
6610 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6611 {
6612         int err;
6613
6614         ASSERT_RTNL();
6615
6616         /* Don't allow namespace local devices to be moved. */
6617         err = -EINVAL;
6618         if (dev->features & NETIF_F_NETNS_LOCAL)
6619                 goto out;
6620
6621         /* Ensure the device has been registrered */
6622         if (dev->reg_state != NETREG_REGISTERED)
6623                 goto out;
6624
6625         /* Get out if there is nothing todo */
6626         err = 0;
6627         if (net_eq(dev_net(dev), net))
6628                 goto out;
6629
6630         /* Pick the destination device name, and ensure
6631          * we can use it in the destination network namespace.
6632          */
6633         err = -EEXIST;
6634         if (__dev_get_by_name(net, dev->name)) {
6635                 /* We get here if we can't use the current device name */
6636                 if (!pat)
6637                         goto out;
6638                 if (dev_get_valid_name(net, dev, pat) < 0)
6639                         goto out;
6640         }
6641
6642         /*
6643          * And now a mini version of register_netdevice unregister_netdevice.
6644          */
6645
6646         /* If device is running close it first. */
6647         dev_close(dev);
6648
6649         /* And unlink it from device chain */
6650         err = -ENODEV;
6651         unlist_netdevice(dev);
6652
6653         synchronize_net();
6654
6655         /* Shutdown queueing discipline. */
6656         dev_shutdown(dev);
6657
6658         /* Notify protocols, that we are about to destroy
6659            this device. They should clean all the things.
6660
6661            Note that dev->reg_state stays at NETREG_REGISTERED.
6662            This is wanted because this way 8021q and macvlan know
6663            the device is just moving and can keep their slaves up.
6664         */
6665         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6666         rcu_barrier();
6667         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6668         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6669
6670         /*
6671          *      Flush the unicast and multicast chains
6672          */
6673         dev_uc_flush(dev);
6674         dev_mc_flush(dev);
6675
6676         /* Send a netdev-removed uevent to the old namespace */
6677         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6678
6679         /* Actually switch the network namespace */
6680         dev_net_set(dev, net);
6681
6682         /* If there is an ifindex conflict assign a new one */
6683         if (__dev_get_by_index(net, dev->ifindex)) {
6684                 int iflink = (dev->iflink == dev->ifindex);
6685                 dev->ifindex = dev_new_index(net);
6686                 if (iflink)
6687                         dev->iflink = dev->ifindex;
6688         }
6689
6690         /* Send a netdev-add uevent to the new namespace */
6691         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6692
6693         /* Fixup kobjects */
6694         err = device_rename(&dev->dev, dev->name);
6695         WARN_ON(err);
6696
6697         /* Add the device back in the hashes */
6698         list_netdevice(dev);
6699
6700         /* Notify protocols, that a new device appeared. */
6701         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6702
6703         /*
6704          *      Prevent userspace races by waiting until the network
6705          *      device is fully setup before sending notifications.
6706          */
6707         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6708
6709         synchronize_net();
6710         err = 0;
6711 out:
6712         return err;
6713 }
6714 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6715
6716 static int dev_cpu_callback(struct notifier_block *nfb,
6717                             unsigned long action,
6718                             void *ocpu)
6719 {
6720         struct sk_buff **list_skb;
6721         struct sk_buff *skb;
6722         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6723         struct softnet_data *sd, *oldsd;
6724
6725         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6726                 return NOTIFY_OK;
6727
6728         local_irq_disable();
6729         cpu = smp_processor_id();
6730         sd = &per_cpu(softnet_data, cpu);
6731         oldsd = &per_cpu(softnet_data, oldcpu);
6732
6733         /* Find end of our completion_queue. */
6734         list_skb = &sd->completion_queue;
6735         while (*list_skb)
6736                 list_skb = &(*list_skb)->next;
6737         /* Append completion queue from offline CPU. */
6738         *list_skb = oldsd->completion_queue;
6739         oldsd->completion_queue = NULL;
6740
6741         /* Append output queue from offline CPU. */
6742         if (oldsd->output_queue) {
6743                 *sd->output_queue_tailp = oldsd->output_queue;
6744                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6745                 oldsd->output_queue = NULL;
6746                 oldsd->output_queue_tailp = &oldsd->output_queue;
6747         }
6748         /* Append NAPI poll list from offline CPU. */
6749         if (!list_empty(&oldsd->poll_list)) {
6750                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6751                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6752         }
6753
6754         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6755         local_irq_enable();
6756
6757         /* Process offline CPU's input_pkt_queue */
6758         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6759                 netif_rx_internal(skb);
6760                 input_queue_head_incr(oldsd);
6761         }
6762         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6763                 netif_rx_internal(skb);
6764                 input_queue_head_incr(oldsd);
6765         }
6766
6767         return NOTIFY_OK;
6768 }
6769
6770
6771 /**
6772  *      netdev_increment_features - increment feature set by one
6773  *      @all: current feature set
6774  *      @one: new feature set
6775  *      @mask: mask feature set
6776  *
6777  *      Computes a new feature set after adding a device with feature set
6778  *      @one to the master device with current feature set @all.  Will not
6779  *      enable anything that is off in @mask. Returns the new feature set.
6780  */
6781 netdev_features_t netdev_increment_features(netdev_features_t all,
6782         netdev_features_t one, netdev_features_t mask)
6783 {
6784         if (mask & NETIF_F_GEN_CSUM)
6785                 mask |= NETIF_F_ALL_CSUM;
6786         mask |= NETIF_F_VLAN_CHALLENGED;
6787
6788         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6789         all &= one | ~NETIF_F_ALL_FOR_ALL;
6790
6791         /* If one device supports hw checksumming, set for all. */
6792         if (all & NETIF_F_GEN_CSUM)
6793                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6794
6795         return all;
6796 }
6797 EXPORT_SYMBOL(netdev_increment_features);
6798
6799 static struct hlist_head * __net_init netdev_create_hash(void)
6800 {
6801         int i;
6802         struct hlist_head *hash;
6803
6804         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6805         if (hash != NULL)
6806                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6807                         INIT_HLIST_HEAD(&hash[i]);
6808
6809         return hash;
6810 }
6811
6812 /* Initialize per network namespace state */
6813 static int __net_init netdev_init(struct net *net)
6814 {
6815         if (net != &init_net)
6816                 INIT_LIST_HEAD(&net->dev_base_head);
6817
6818         net->dev_name_head = netdev_create_hash();
6819         if (net->dev_name_head == NULL)
6820                 goto err_name;
6821
6822         net->dev_index_head = netdev_create_hash();
6823         if (net->dev_index_head == NULL)
6824                 goto err_idx;
6825
6826         return 0;
6827
6828 err_idx:
6829         kfree(net->dev_name_head);
6830 err_name:
6831         return -ENOMEM;
6832 }
6833
6834 /**
6835  *      netdev_drivername - network driver for the device
6836  *      @dev: network device
6837  *
6838  *      Determine network driver for device.
6839  */
6840 const char *netdev_drivername(const struct net_device *dev)
6841 {
6842         const struct device_driver *driver;
6843         const struct device *parent;
6844         const char *empty = "";
6845
6846         parent = dev->dev.parent;
6847         if (!parent)
6848                 return empty;
6849
6850         driver = parent->driver;
6851         if (driver && driver->name)
6852                 return driver->name;
6853         return empty;
6854 }
6855
6856 static int __netdev_printk(const char *level, const struct net_device *dev,
6857                            struct va_format *vaf)
6858 {
6859         int r;
6860
6861         if (dev && dev->dev.parent) {
6862                 r = dev_printk_emit(level[1] - '0',
6863                                     dev->dev.parent,
6864                                     "%s %s %s: %pV",
6865                                     dev_driver_string(dev->dev.parent),
6866                                     dev_name(dev->dev.parent),
6867                                     netdev_name(dev), vaf);
6868         } else if (dev) {
6869                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6870         } else {
6871                 r = printk("%s(NULL net_device): %pV", level, vaf);
6872         }
6873
6874         return r;
6875 }
6876
6877 int netdev_printk(const char *level, const struct net_device *dev,
6878                   const char *format, ...)
6879 {
6880         struct va_format vaf;
6881         va_list args;
6882         int r;
6883
6884         va_start(args, format);
6885
6886         vaf.fmt = format;
6887         vaf.va = &args;
6888
6889         r = __netdev_printk(level, dev, &vaf);
6890
6891         va_end(args);
6892
6893         return r;
6894 }
6895 EXPORT_SYMBOL(netdev_printk);
6896
6897 #define define_netdev_printk_level(func, level)                 \
6898 int func(const struct net_device *dev, const char *fmt, ...)    \
6899 {                                                               \
6900         int r;                                                  \
6901         struct va_format vaf;                                   \
6902         va_list args;                                           \
6903                                                                 \
6904         va_start(args, fmt);                                    \
6905                                                                 \
6906         vaf.fmt = fmt;                                          \
6907         vaf.va = &args;                                         \
6908                                                                 \
6909         r = __netdev_printk(level, dev, &vaf);                  \
6910                                                                 \
6911         va_end(args);                                           \
6912                                                                 \
6913         return r;                                               \
6914 }                                                               \
6915 EXPORT_SYMBOL(func);
6916
6917 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6918 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6919 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6920 define_netdev_printk_level(netdev_err, KERN_ERR);
6921 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6922 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6923 define_netdev_printk_level(netdev_info, KERN_INFO);
6924
6925 static void __net_exit netdev_exit(struct net *net)
6926 {
6927         kfree(net->dev_name_head);
6928         kfree(net->dev_index_head);
6929 }
6930
6931 static struct pernet_operations __net_initdata netdev_net_ops = {
6932         .init = netdev_init,
6933         .exit = netdev_exit,
6934 };
6935
6936 static void __net_exit default_device_exit(struct net *net)
6937 {
6938         struct net_device *dev, *aux;
6939         /*
6940          * Push all migratable network devices back to the
6941          * initial network namespace
6942          */
6943         rtnl_lock();
6944         for_each_netdev_safe(net, dev, aux) {
6945                 int err;
6946                 char fb_name[IFNAMSIZ];
6947
6948                 /* Ignore unmoveable devices (i.e. loopback) */
6949                 if (dev->features & NETIF_F_NETNS_LOCAL)
6950                         continue;
6951
6952                 /* Leave virtual devices for the generic cleanup */
6953                 if (dev->rtnl_link_ops)
6954                         continue;
6955
6956                 /* Push remaining network devices to init_net */
6957                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6958                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6959                 if (err) {
6960                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6961                                  __func__, dev->name, err);
6962                         BUG();
6963                 }
6964         }
6965         rtnl_unlock();
6966 }
6967
6968 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6969 {
6970         /* Return with the rtnl_lock held when there are no network
6971          * devices unregistering in any network namespace in net_list.
6972          */
6973         struct net *net;
6974         bool unregistering;
6975         DEFINE_WAIT(wait);
6976
6977         for (;;) {
6978                 prepare_to_wait(&netdev_unregistering_wq, &wait,
6979                                 TASK_UNINTERRUPTIBLE);
6980                 unregistering = false;
6981                 rtnl_lock();
6982                 list_for_each_entry(net, net_list, exit_list) {
6983                         if (net->dev_unreg_count > 0) {
6984                                 unregistering = true;
6985                                 break;
6986                         }
6987                 }
6988                 if (!unregistering)
6989                         break;
6990                 __rtnl_unlock();
6991                 schedule();
6992         }
6993         finish_wait(&netdev_unregistering_wq, &wait);
6994 }
6995
6996 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6997 {
6998         /* At exit all network devices most be removed from a network
6999          * namespace.  Do this in the reverse order of registration.
7000          * Do this across as many network namespaces as possible to
7001          * improve batching efficiency.
7002          */
7003         struct net_device *dev;
7004         struct net *net;
7005         LIST_HEAD(dev_kill_list);
7006
7007         /* To prevent network device cleanup code from dereferencing
7008          * loopback devices or network devices that have been freed
7009          * wait here for all pending unregistrations to complete,
7010          * before unregistring the loopback device and allowing the
7011          * network namespace be freed.
7012          *
7013          * The netdev todo list containing all network devices
7014          * unregistrations that happen in default_device_exit_batch
7015          * will run in the rtnl_unlock() at the end of
7016          * default_device_exit_batch.
7017          */
7018         rtnl_lock_unregistering(net_list);
7019         list_for_each_entry(net, net_list, exit_list) {
7020                 for_each_netdev_reverse(net, dev) {
7021                         if (dev->rtnl_link_ops)
7022                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7023                         else
7024                                 unregister_netdevice_queue(dev, &dev_kill_list);
7025                 }
7026         }
7027         unregister_netdevice_many(&dev_kill_list);
7028         rtnl_unlock();
7029 }
7030
7031 static struct pernet_operations __net_initdata default_device_ops = {
7032         .exit = default_device_exit,
7033         .exit_batch = default_device_exit_batch,
7034 };
7035
7036 /*
7037  *      Initialize the DEV module. At boot time this walks the device list and
7038  *      unhooks any devices that fail to initialise (normally hardware not
7039  *      present) and leaves us with a valid list of present and active devices.
7040  *
7041  */
7042
7043 /*
7044  *       This is called single threaded during boot, so no need
7045  *       to take the rtnl semaphore.
7046  */
7047 static int __init net_dev_init(void)
7048 {
7049         int i, rc = -ENOMEM;
7050
7051         BUG_ON(!dev_boot_phase);
7052
7053         if (dev_proc_init())
7054                 goto out;
7055
7056         if (netdev_kobject_init())
7057                 goto out;
7058
7059         INIT_LIST_HEAD(&ptype_all);
7060         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7061                 INIT_LIST_HEAD(&ptype_base[i]);
7062
7063         INIT_LIST_HEAD(&offload_base);
7064
7065         if (register_pernet_subsys(&netdev_net_ops))
7066                 goto out;
7067
7068         /*
7069          *      Initialise the packet receive queues.
7070          */
7071
7072         for_each_possible_cpu(i) {
7073                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7074
7075                 skb_queue_head_init(&sd->input_pkt_queue);
7076                 skb_queue_head_init(&sd->process_queue);
7077                 INIT_LIST_HEAD(&sd->poll_list);
7078                 sd->output_queue_tailp = &sd->output_queue;
7079 #ifdef CONFIG_RPS
7080                 sd->csd.func = rps_trigger_softirq;
7081                 sd->csd.info = sd;
7082                 sd->cpu = i;
7083 #endif
7084
7085                 sd->backlog.poll = process_backlog;
7086                 sd->backlog.weight = weight_p;
7087         }
7088
7089         dev_boot_phase = 0;
7090
7091         /* The loopback device is special if any other network devices
7092          * is present in a network namespace the loopback device must
7093          * be present. Since we now dynamically allocate and free the
7094          * loopback device ensure this invariant is maintained by
7095          * keeping the loopback device as the first device on the
7096          * list of network devices.  Ensuring the loopback devices
7097          * is the first device that appears and the last network device
7098          * that disappears.
7099          */
7100         if (register_pernet_device(&loopback_net_ops))
7101                 goto out;
7102
7103         if (register_pernet_device(&default_device_ops))
7104                 goto out;
7105
7106         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7107         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7108
7109         hotcpu_notifier(dev_cpu_callback, 0);
7110         dst_init();
7111         rc = 0;
7112 out:
7113         return rc;
7114 }
7115
7116 subsys_initcall(net_dev_init);