net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <linux/pci.h>
 132 #include <linux/inetdevice.h>
 133
 134 #include "net-sysfs.h"
 135
 136 /* Instead of increasing this, you should create a hash table. */
 137 #define MAX_GRO_SKBS 8
 138
 139 /* This should be increased if a protocol with a bigger head is added. */
 140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 141
 142 /*
 143  *      The list of packet types we will receive (as opposed to discard)
 144  *      and the routines to invoke.
 145  *
 *      Why 16? Because with 16 the only overlap we get on a hash of the
 *      low nibble of the protocol value is RARP/SNAP/X.25.
 148  *
 149  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 150  *             sure which should go first, but I bet it won't make much
 151  *             difference if we are running VLANs.  The good news is that
 152  *             this protocol won't be in the list unless compiled in, so
 153  *             the average user (w/out VLANs) will not be adversely affected.
 154  *             --BLG
 155  *
 156  *              0800    IP
 157  *              8100    802.1Q VLAN
 158  *              0001    802.3
 159  *              0002    AX.25
 160  *              0004    802.2
 161  *              8035    RARP
 162  *              0005    SNAP
 163  *              0805    X.25
 164  *              0806    ARP
 165  *              8137    IPX
 166  *              0009    Localtalk
 167  *              86DD    IPv6
 168  */
 169
/* Number of buckets in ptype_base and the mask used to pick one. */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Taken by dev_add_pack() and __dev_remove_pack() around list updates. */
static DEFINE_SPINLOCK(ptype_lock);
/* Per-protocol handler lists, indexed by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;        /* Taps */
 176
 177 /*
 178  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 179  * semaphore.
 180  *
 181  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 182  *
 183  * Writers must hold the rtnl semaphore while they loop through the
 184  * dev_base_head list, and hold dev_base_lock for writing when they do the
 185  * actual updates.  This allows pure readers to access the list even
 186  * while a writer is preparing to update it.
 187  *
 188  * To put it another way, dev_base_lock is held for writing only to
 189  * protect against pure readers; the rtnl semaphore provides the
 190  * protection against other writers.
 191  *
 192  * See, for example usages, register_netdevice() and
 193  * unregister_netdevice(), which must be called with the rtnl
 194  * semaphore held.
 195  */
 196 DEFINE_RWLOCK(dev_base_lock);
 197 EXPORT_SYMBOL(dev_base_lock);
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208 }
 209
 210 static inline void rps_lock(struct softnet_data *sd)
 211 {
 212 #ifdef CONFIG_RPS
 213         spin_lock(&sd->input_pkt_queue.lock);
 214 #endif
 215 }
 216
 217 static inline void rps_unlock(struct softnet_data *sd)
 218 {
 219 #ifdef CONFIG_RPS
 220         spin_unlock(&sd->input_pkt_queue.lock);
 221 #endif
 222 }
 223
 224 /* Device list insertion */
 225 static int list_netdevice(struct net_device *dev)
 226 {
 227         struct net *net = dev_net(dev);
 228
 229         ASSERT_RTNL();
 230
 231         write_lock_bh(&dev_base_lock);
 232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234         hlist_add_head_rcu(&dev->index_hlist,
 235                            dev_index_hash(net, dev->ifindex));
 236         write_unlock_bh(&dev_base_lock);
 237         return 0;
 238 }
 239
/* Device list removal
 *
 * Unlinks @dev from the device list and from the name and ifindex hash
 * tables, using the RCU list primitives under dev_base_lock held for
 * writing. The RTNL must be held (asserted).
 *
 * The caller must respect an RCU grace period before freeing/reusing dev.
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);
}
 254
 255 /*
 256  *      Our notifier list
 257  */
 258
 259 static RAW_NOTIFIER_HEAD(netdev_chain);
 260
 261 /*
 262  *      Device drivers call our routines to queue packets here. We empty the
 263  *      queue in the local softnet handler.
 264  */
 265
 266 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 267 EXPORT_PER_CPU_SYMBOL(softnet_data);
 268
 269 #ifdef CONFIG_LOCKDEP
 270 /*
 271  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 272  * according to dev->type
 273  */
/*
 * Device types (compared against dev->type) that get a lockdep class of
 * their own. netdev_lock_pos() searches this table; the index it returns
 * selects the matching entry in netdev_lock_name[] and in the two key
 * arrays below, so all four arrays must stay in the same order.
 * The final entry is the fallback for types not listed here.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};

/* Lock class names, one per netdev_lock_type[] entry, in the same order. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for the xmit lock and the address list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314 {
 315         int i;
 316
 317         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318                 if (netdev_lock_type[i] == dev_type)
 319                         return i;
 320         /* the last key is used by default */
 321         return ARRAY_SIZE(netdev_lock_type) - 1;
 322 }
 323
 324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325                                                  unsigned short dev_type)
 326 {
 327         int i;
 328
 329         i = netdev_lock_pos(dev_type);
 330         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331                                    netdev_lock_name[i]);
 332 }
 333
 334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335 {
 336         int i;
 337
 338         i = netdev_lock_pos(dev->type);
 339         lockdep_set_class_and_name(&dev->addr_list_lock,
 340                                    &netdev_addr_lock_key[i],
 341                                    netdev_lock_name[i]);
 342 }
 343 #else
/* Empty stand-ins used when CONFIG_LOCKDEP is not set. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 351 #endif
 352
 353 /*******************************************************************************
 354
 355                 Protocol management and registration routines
 356
 357 *******************************************************************************/
 358
/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets
 *      MUST BE last in their hash buckets, and the check of protocol
 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *      This is true now; do not change it.
 *      Explanation: if a protocol handler that mangles packets were
 *      first on the list, it would not be able to sense that the packet
 *      is cloned and should be copied-on-write, so it would change the
 *      packet and subsequent readers would get a broken packet.
 *                                                      --ANK (980803)
 */

 374
 375 static inline struct list_head *ptype_head(const struct packet_type *pt)
 376 {
 377         if (pt->type == htons(ETH_P_ALL))
 378                 return &ptype_all;
 379         else
 380                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 381 }
 382
 383 /**
 384  *      dev_add_pack - add packet handler
 385  *      @pt: packet type declaration
 386  *
 387  *      Add a protocol handler to the networking stack. The passed &packet_type
 388  *      is linked into kernel lists and may not be freed until it has been
 389  *      removed from the kernel lists.
 390  *
 391  *      This call does not sleep therefore it can not
 392  *      guarantee all CPU's that are in middle of receiving packets
 393  *      will see the new packet type (until the next received packet).
 394  */
 395
 396 void dev_add_pack(struct packet_type *pt)
 397 {
 398         struct list_head *head = ptype_head(pt);
 399
 400         spin_lock(&ptype_lock);
 401         list_add_rcu(&pt->list, head);
 402         spin_unlock(&ptype_lock);
 403 }
 404 EXPORT_SYMBOL(dev_add_pack);
 405
 406 /**
 407  *      __dev_remove_pack        - remove packet handler
 408  *      @pt: packet type declaration
 409  *
 410  *      Remove a protocol handler that was previously added to the kernel
 411  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 412  *      from the kernel lists and can be freed or reused once this function
 413  *      returns.
 414  *
 415  *      The packet type might still be in use by receivers
 416  *      and must not be freed until after all the CPU's have gone
 417  *      through a quiescent state.
 418  */
 419 void __dev_remove_pack(struct packet_type *pt)
 420 {
 421         struct list_head *head = ptype_head(pt);
 422         struct packet_type *pt1;
 423
 424         spin_lock(&ptype_lock);
 425
 426         list_for_each_entry(pt1, head, list) {
 427                 if (pt == pt1) {
 428                         list_del_rcu(&pt->list);
 429                         goto out;
 430                 }
 431         }
 432
 433         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 434 out:
 435         spin_unlock(&ptype_lock);
 436 }
 437 EXPORT_SYMBOL(__dev_remove_pack);
 438
/**
 *      dev_remove_pack  - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        /* Unlink the handler from its list. */
        __dev_remove_pack(pt);

        /* Wait before returning, so the caller may free @pt afterwards. */
        synchronize_net();
}
 457 EXPORT_SYMBOL(dev_remove_pack);
 458
 459 /******************************************************************************
 460
 461                       Device Boot-time Settings Routines
 462
 463 *******************************************************************************/
 464
 465 /* Boot time configuration table */
 466 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 467
 468 /**
 469  *      netdev_boot_setup_add   - add new setup entry
 470  *      @name: name of the device
 471  *      @map: configured settings for the device
 472  *
 473  *      Adds new setup entry to the dev_boot_setup list.  The function
 474  *      returns 0 on error and 1 on success.  This is a generic routine to
 475  *      all netdevices.
 476  */
 477 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 478 {
 479         struct netdev_boot_setup *s;
 480         int i;
 481
 482         s = dev_boot_setup;
 483         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 484                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 485                         memset(s[i].name, 0, sizeof(s[i].name));
 486                         strlcpy(s[i].name, name, IFNAMSIZ);
 487                         memcpy(&s[i].map, map, sizeof(s[i].map));
 488                         break;
 489                 }
 490         }
 491
 492         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 493 }
 494
 495 /**
 496  *      netdev_boot_setup_check - check boot time settings
 497  *      @dev: the netdevice
 498  *
 499  *      Check boot time settings for the device.
 500  *      The found settings are set for the device to be used
 501  *      later in the device probing.
 502  *      Returns 0 if no settings found, 1 if they are.
 503  */
 504 int netdev_boot_setup_check(struct net_device *dev)
 505 {
 506         struct netdev_boot_setup *s = dev_boot_setup;
 507         int i;
 508
 509         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 510                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 511                     !strcmp(dev->name, s[i].name)) {
 512                         dev->irq        = s[i].map.irq;
 513                         dev->base_addr  = s[i].map.base_addr;
 514                         dev->mem_start  = s[i].map.mem_start;
 515                         dev->mem_end    = s[i].map.mem_end;
 516                         return 1;
 517                 }
 518         }
 519         return 0;
 520 }
 521 EXPORT_SYMBOL(netdev_boot_setup_check);
 522
 523
 524 /**
 525  *      netdev_boot_base        - get address from boot time settings
 526  *      @prefix: prefix for network device
 527  *      @unit: id for network device
 528  *
 529  *      Check boot time settings for the base address of device.
 530  *      The found settings are set for the device to be used
 531  *      later in the device probing.
 532  *      Returns 0 if no settings found.
 533  */
 534 unsigned long netdev_boot_base(const char *prefix, int unit)
 535 {
 536         const struct netdev_boot_setup *s = dev_boot_setup;
 537         char name[IFNAMSIZ];
 538         int i;
 539
 540         sprintf(name, "%s%d", prefix, unit);
 541
 542         /*
 543          * If device already registered then return base of 1
 544          * to indicate not to probe for this interface
 545          */
 546         if (__dev_get_by_name(&init_net, name))
 547                 return 1;
 548
 549         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 550                 if (!strcmp(name, s[i].name))
 551                         return s[i].map.base_addr;
 552         return 0;
 553 }
 554
 555 /*
 556  * Saves at boot time configured settings for any netdevice.
 557  */
 558 int __init netdev_boot_setup(char *str)
 559 {
 560         int ints[5];
 561         struct ifmap map;
 562
 563         str = get_options(str, ARRAY_SIZE(ints), ints);
 564         if (!str || !*str)
 565                 return 0;
 566
 567         /* Save settings */
 568         memset(&map, 0, sizeof(map));
 569         if (ints[0] > 0)
 570                 map.irq = ints[1];
 571         if (ints[0] > 1)
 572                 map.base_addr = ints[2];
 573         if (ints[0] > 2)
 574                 map.mem_start = ints[3];
 575         if (ints[0] > 3)
 576                 map.mem_end = ints[4];
 577
 578         /* Add new entry to the list */
 579         return netdev_boot_setup_add(str, &map);
 580 }
 581
 582 __setup("netdev=", netdev_boot_setup);
 583
 584 /*******************************************************************************
 585
 586                             Device Interface Subroutines
 587
 588 *******************************************************************************/
 589
 590 /**
 591  *      __dev_get_by_name       - find a device by its name
 592  *      @net: the applicable net namespace
 593  *      @name: name to find
 594  *
 595  *      Find an interface by name. Must be called under RTNL semaphore
 596  *      or @dev_base_lock. If the name is found a pointer to the device
 597  *      is returned. If the name is not found then %NULL is returned. The
 598  *      reference counters are not incremented so the caller must be
 599  *      careful with locks.
 600  */
 601
 602 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 603 {
 604         struct hlist_node *p;
 605         struct net_device *dev;
 606         struct hlist_head *head = dev_name_hash(net, name);
 607
 608         hlist_for_each_entry(dev, p, head, name_hlist)
 609                 if (!strncmp(dev->name, name, IFNAMSIZ))
 610                         return dev;
 611
 612         return NULL;
 613 }
 614 EXPORT_SYMBOL(__dev_get_by_name);
 615
 616 /**
 617  *      dev_get_by_name_rcu     - find a device by its name
 618  *      @net: the applicable net namespace
 619  *      @name: name to find
 620  *
 621  *      Find an interface by name.
 622  *      If the name is found a pointer to the device is returned.
 623  *      If the name is not found then %NULL is returned.
 624  *      The reference counters are not incremented so the caller must be
 625  *      careful with locks. The caller must hold RCU lock.
 626  */
 627
 628 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 629 {
 630         struct hlist_node *p;
 631         struct net_device *dev;
 632         struct hlist_head *head = dev_name_hash(net, name);
 633
 634         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 635                 if (!strncmp(dev->name, name, IFNAMSIZ))
 636                         return dev;
 637
 638         return NULL;
 639 }
 640 EXPORT_SYMBOL(dev_get_by_name_rcu);
 641
 642 /**
 643  *      dev_get_by_name         - find a device by its name
 644  *      @net: the applicable net namespace
 645  *      @name: name to find
 646  *
 647  *      Find an interface by name. This can be called from any
 648  *      context and does its own locking. The returned handle has
 649  *      the usage count incremented and the caller must use dev_put() to
 650  *      release it when it is no longer needed. %NULL is returned if no
 651  *      matching device is found.
 652  */
 653
 654 struct net_device *dev_get_by_name(struct net *net, const char *name)
 655 {
 656         struct net_device *dev;
 657
 658         rcu_read_lock();
 659         dev = dev_get_by_name_rcu(net, name);
 660         if (dev)
 661                 dev_hold(dev);
 662         rcu_read_unlock();
 663         return dev;
 664 }
 665 EXPORT_SYMBOL(dev_get_by_name);
 666
 667 /**
 668  *      __dev_get_by_index - find a device by its ifindex
 669  *      @net: the applicable net namespace
 670  *      @ifindex: index of device
 671  *
 672  *      Search for an interface by index. Returns %NULL if the device
 673  *      is not found or a pointer to the device. The device has not
 674  *      had its reference counter increased so the caller must be careful
 675  *      about locking. The caller must hold either the RTNL semaphore
 676  *      or @dev_base_lock.
 677  */
 678
 679 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 680 {
 681         struct hlist_node *p;
 682         struct net_device *dev;
 683         struct hlist_head *head = dev_index_hash(net, ifindex);
 684
 685         hlist_for_each_entry(dev, p, head, index_hlist)
 686                 if (dev->ifindex == ifindex)
 687                         return dev;
 688
 689         return NULL;
 690 }
 691 EXPORT_SYMBOL(__dev_get_by_index);
 692
 693 /**
 694  *      dev_get_by_index_rcu - find a device by its ifindex
 695  *      @net: the applicable net namespace
 696  *      @ifindex: index of device
 697  *
 698  *      Search for an interface by index. Returns %NULL if the device
 699  *      is not found or a pointer to the device. The device has not
 700  *      had its reference counter increased so the caller must be careful
 701  *      about locking. The caller must hold RCU lock.
 702  */
 703
 704 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 705 {
 706         struct hlist_node *p;
 707         struct net_device *dev;
 708         struct hlist_head *head = dev_index_hash(net, ifindex);
 709
 710         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 711                 if (dev->ifindex == ifindex)
 712                         return dev;
 713
 714         return NULL;
 715 }
 716 EXPORT_SYMBOL(dev_get_by_index_rcu);
 717
 718
 719 /**
 720  *      dev_get_by_index - find a device by its ifindex
 721  *      @net: the applicable net namespace
 722  *      @ifindex: index of device
 723  *
 724  *      Search for an interface by index. Returns NULL if the device
 725  *      is not found or a pointer to the device. The device returned has
 726  *      had a reference added and the pointer is safe until the user calls
 727  *      dev_put to indicate they have finished with it.
 728  */
 729
 730 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 731 {
 732         struct net_device *dev;
 733
 734         rcu_read_lock();
 735         dev = dev_get_by_index_rcu(net, ifindex);
 736         if (dev)
 737                 dev_hold(dev);
 738         rcu_read_unlock();
 739         return dev;
 740 }
 741 EXPORT_SYMBOL(dev_get_by_index);
 742
 743 /**
 744  *      dev_getbyhwaddr - find a device by its hardware address
 745  *      @net: the applicable net namespace
 746  *      @type: media type of device
 747  *      @ha: hardware address
 748  *
 749  *      Search for an interface by MAC address. Returns NULL if the device
 750  *      is not found or a pointer to the device. The caller must hold the
 751  *      rtnl semaphore. The returned device has not had its ref count increased
 752  *      and the caller must therefore be careful about locking
 753  *
 754  *      BUGS:
 755  *      If the API was consistent this would be __dev_get_by_hwaddr
 756  */
 757
 758 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 759 {
 760         struct net_device *dev;
 761
 762         ASSERT_RTNL();
 763
 764         for_each_netdev(net, dev)
 765                 if (dev->type == type &&
 766                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 767                         return dev;
 768
 769         return NULL;
 770 }
 771 EXPORT_SYMBOL(dev_getbyhwaddr);
 772
 773 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 774 {
 775         struct net_device *dev;
 776
 777         ASSERT_RTNL();
 778         for_each_netdev(net, dev)
 779                 if (dev->type == type)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 785
 786 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 787 {
 788         struct net_device *dev, *ret = NULL;
 789
 790         rcu_read_lock();
 791         for_each_netdev_rcu(net, dev)
 792                 if (dev->type == type) {
 793                         dev_hold(dev);
 794                         ret = dev;
 795                         break;
 796                 }
 797         rcu_read_unlock();
 798         return ret;
 799 }
 800 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 801
 802 /**
 803  *      dev_get_by_flags_rcu - find any device with given flags
 804  *      @net: the applicable net namespace
 805  *      @if_flags: IFF_* values
 806  *      @mask: bitmask of bits in if_flags to check
 807  *
 808  *      Search for any interface with the given flags. Returns NULL if a device
 809  *      is not found or a pointer to the device. Must be called inside
 810  *      rcu_read_lock(), and result refcount is unchanged.
 811  */
 812
 813 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 814                                     unsigned short mask)
 815 {
 816         struct net_device *dev, *ret;
 817
 818         ret = NULL;
 819         for_each_netdev_rcu(net, dev) {
 820                 if (((dev->flags ^ if_flags) & mask) == 0) {
 821                         ret = dev;
 822                         break;
 823                 }
 824         }
 825         return ret;
 826 }
 827 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 828
 829 /**
 830  *      dev_valid_name - check if name is okay for network device
 831  *      @name: name string
 832  *
 833  *      Network device names need to be valid file names to
 834  *      to allow sysfs to work.  We also disallow any kind of
 835  *      whitespace.
 836  */
 837 int dev_valid_name(const char *name)
 838 {
 839         if (*name == '\0')
 840                 return 0;
 841         if (strlen(name) >= IFNAMSIZ)
 842                 return 0;
 843         if (!strcmp(name, ".") || !strcmp(name, ".."))
 844                 return 0;
 845
 846         while (*name) {
 847                 if (*name == '/' || isspace(*name))
 848                         return 0;
 849                 name++;
 850         }
 851         return 1;
 852 }
 853 EXPORT_SYMBOL(dev_valid_name);
 854
 855 /**
 856  *      __dev_alloc_name - allocate a name for a device
 857  *      @net: network namespace to allocate the device name in
 858  *      @name: name format string
 859  *      @buf:  scratch buffer and result name string
 860  *
 861  *      Passed a format string - eg "lt%d" it will try and find a suitable
 862  *      id. It scans list of devices to build up a free map, then chooses
 863  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 864  *      while allocating the name and adding the device in order to avoid
 865  *      duplicates.
 866  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 867  *      Returns the number of the unit assigned or a negative errno code.
 868  */
 869
 870 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 871 {
 872         int i = 0;
 873         const char *p;
 874         const int max_netdevices = 8*PAGE_SIZE;
 875         unsigned long *inuse;
 876         struct net_device *d;
 877
 878         p = strnchr(name, IFNAMSIZ-1, '%');
 879         if (p) {
 880                 /*
 881                  * Verify the string as this thing may have come from
 882                  * the user.  There must be either one "%d" and no other "%"
 883                  * characters.
 884                  */
 885                 if (p[1] != 'd' || strchr(p + 2, '%'))
 886                         return -EINVAL;
 887
 888                 /* Use one page as a bit array of possible slots */
 889                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 890                 if (!inuse)
 891                         return -ENOMEM;
 892
 893                 for_each_netdev(net, d) {
 894                         if (!sscanf(d->name, name, &i))
 895                                 continue;
 896                         if (i < 0 || i >= max_netdevices)
 897                                 continue;
 898
 899                         /*  avoid cases where sscanf is not exact inverse of printf */
 900                         snprintf(buf, IFNAMSIZ, name, i);
 901                         if (!strncmp(buf, d->name, IFNAMSIZ))
 902                                 set_bit(i, inuse);
 903                 }
 904
 905                 i = find_first_zero_bit(inuse, max_netdevices);
 906                 free_page((unsigned long) inuse);
 907         }
 908
 909         if (buf != name)
 910                 snprintf(buf, IFNAMSIZ, name, i);
 911         if (!__dev_get_by_name(net, buf))
 912                 return i;
 913
 914         /* It is possible to run out of possible slots
 915          * when the name is long and there isn't enough space left
 916          * for the digits, or if all bits are used.
 917          */
 918         return -ENFILE;
 919 }
 920
 921 /**
 922  *      dev_alloc_name - allocate a name for a device
 923  *      @dev: device
 924  *      @name: name format string
 925  *
 926  *      Passed a format string - eg "lt%d" it will try and find a suitable
 927  *      id. It scans list of devices to build up a free map, then chooses
 928  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 929  *      while allocating the name and adding the device in order to avoid
 930  *      duplicates.
 931  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 932  *      Returns the number of the unit assigned or a negative errno code.
 933  */
 934
 935 int dev_alloc_name(struct net_device *dev, const char *name)
 936 {
 937         char buf[IFNAMSIZ];
 938         struct net *net;
 939         int ret;
 940
 941         BUG_ON(!dev_net(dev));
 942         net = dev_net(dev);
 943         ret = __dev_alloc_name(net, name, buf);
 944         if (ret >= 0)
 945                 strlcpy(dev->name, buf, IFNAMSIZ);
 946         return ret;
 947 }
 948 EXPORT_SYMBOL(dev_alloc_name);
 949
 950 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 951 {
 952         struct net *net;
 953
 954         BUG_ON(!dev_net(dev));
 955         net = dev_net(dev);
 956
 957         if (!dev_valid_name(name))
 958                 return -EINVAL;
 959
 960         if (fmt && strchr(name, '%'))
 961                 return dev_alloc_name(dev, name);
 962         else if (__dev_get_by_name(net, name))
 963                 return -EEXIST;
 964         else if (dev->name != name)
 965                 strlcpy(dev->name, name, IFNAMSIZ);
 966
 967         return 0;
 968 }
 969
 970 /**
 971  *      dev_change_name - change name of a device
 972  *      @dev: device
 973  *      @newname: name (or format string) must be at least IFNAMSIZ
 974  *
 975  *      Change name of a device, can pass format strings "eth%d".
 976  *      for wildcarding.
 977  */
 978 int dev_change_name(struct net_device *dev, const char *newname)
 979 {
 980         char oldname[IFNAMSIZ];
 981         int err = 0;
 982         int ret;
 983         struct net *net;
 984
 985         ASSERT_RTNL();
 986         BUG_ON(!dev_net(dev));
 987
 988         net = dev_net(dev);
 989         if (dev->flags & IFF_UP)
 990                 return -EBUSY;
 991
 992         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 993                 return 0;
 994
 995         memcpy(oldname, dev->name, IFNAMSIZ);
 996
 997         err = dev_get_valid_name(dev, newname, 1);
 998         if (err < 0)
 999                 return err;
1000
1001 rollback:
1002         ret = device_rename(&dev->dev, dev->name);
1003         if (ret) {
1004                 memcpy(dev->name, oldname, IFNAMSIZ);
1005                 return ret;
1006         }
1007
1008         write_lock_bh(&dev_base_lock);
1009         hlist_del(&dev->name_hlist);
1010         write_unlock_bh(&dev_base_lock);
1011
1012         synchronize_rcu();
1013
1014         write_lock_bh(&dev_base_lock);
1015         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1016         write_unlock_bh(&dev_base_lock);
1017
1018         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1019         ret = notifier_to_errno(ret);
1020
1021         if (ret) {
1022                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1023                 if (err >= 0) {
1024                         err = ret;
1025                         memcpy(dev->name, oldname, IFNAMSIZ);
1026                         goto rollback;
1027                 } else {
1028                         printk(KERN_ERR
1029                                "%s: name change rollback failed: %d.\n",
1030                                dev->name, ret);
1031                 }
1032         }
1033
1034         return err;
1035 }
1036
1037 /**
1038  *      dev_set_alias - change ifalias of a device
1039  *      @dev: device
1040  *      @alias: name up to IFALIASZ
1041  *      @len: limit of bytes to copy from info
1042  *
1043  *      Set ifalias for a device,
1044  */
1045 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1046 {
1047         ASSERT_RTNL();
1048
1049         if (len >= IFALIASZ)
1050                 return -EINVAL;
1051
1052         if (!len) {
1053                 if (dev->ifalias) {
1054                         kfree(dev->ifalias);
1055                         dev->ifalias = NULL;
1056                 }
1057                 return 0;
1058         }
1059
1060         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1061         if (!dev->ifalias)
1062                 return -ENOMEM;
1063
1064         strlcpy(dev->ifalias, alias, len+1);
1065         return len;
1066 }
1067
1068
1069 /**
1070  *      netdev_features_change - device changes features
1071  *      @dev: device to cause notification
1072  *
1073  *      Called to indicate a device has changed features.
1074  */
1075 void netdev_features_change(struct net_device *dev)
1076 {
1077         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1078 }
1079 EXPORT_SYMBOL(netdev_features_change);
1080
1081 /**
1082  *      netdev_state_change - device changes state
1083  *      @dev: device to cause notification
1084  *
1085  *      Called to indicate a device has changed state. This function calls
1086  *      the notifier chains for netdev_chain and sends a NEWLINK message
1087  *      to the routing socket.
1088  */
1089 void netdev_state_change(struct net_device *dev)
1090 {
1091         if (dev->flags & IFF_UP) {
1092                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1093                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1094         }
1095 }
1096 EXPORT_SYMBOL(netdev_state_change);
1097
/*
 * Pass @event for @dev down the netdevice notifier chain and hand the
 * chain's return value back to the caller unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        int chain_ret;

        chain_ret = call_netdevice_notifiers(event, dev);
        return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1103
1104 /**
1105  *      dev_load        - load a network module
1106  *      @net: the applicable net namespace
1107  *      @name: name of interface
1108  *
1109  *      If a network interface is not present and the process has suitable
1110  *      privileges this function loads the module. If module loading is not
1111  *      available in this kernel then it becomes a nop.
1112  */
1113
1114 void dev_load(struct net *net, const char *name)
1115 {
1116         struct net_device *dev;
1117
1118         rcu_read_lock();
1119         dev = dev_get_by_name_rcu(net, name);
1120         rcu_read_unlock();
1121
1122         if (!dev && capable(CAP_NET_ADMIN))
1123                 request_module("%s", name);
1124 }
1125 EXPORT_SYMBOL(dev_load);
1126
1127 static int __dev_open(struct net_device *dev)
1128 {
1129         const struct net_device_ops *ops = dev->netdev_ops;
1130         int ret;
1131
1132         ASSERT_RTNL();
1133
1134         /*
1135          *      Is it even present?
1136          */
1137         if (!netif_device_present(dev))
1138                 return -ENODEV;
1139
1140         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1141         ret = notifier_to_errno(ret);
1142         if (ret)
1143                 return ret;
1144
1145         /*
1146          *      Call device private open method
1147          */
1148         set_bit(__LINK_STATE_START, &dev->state);
1149
1150         if (ops->ndo_validate_addr)
1151                 ret = ops->ndo_validate_addr(dev);
1152
1153         if (!ret && ops->ndo_open)
1154                 ret = ops->ndo_open(dev);
1155
1156         /*
1157          *      If it went open OK then:
1158          */
1159
1160         if (ret)
1161                 clear_bit(__LINK_STATE_START, &dev->state);
1162         else {
1163                 /*
1164                  *      Set the flags.
1165                  */
1166                 dev->flags |= IFF_UP;
1167
1168                 /*
1169                  *      Enable NET_DMA
1170                  */
1171                 net_dmaengine_get();
1172
1173                 /*
1174                  *      Initialize multicasting status
1175                  */
1176                 dev_set_rx_mode(dev);
1177
1178                 /*
1179                  *      Wakeup transmit queue engine
1180                  */
1181                 dev_activate(dev);
1182         }
1183
1184         return ret;
1185 }
1186
1187 /**
1188  *      dev_open        - prepare an interface for use.
1189  *      @dev:   device to open
1190  *
1191  *      Takes a device from down to up state. The device's private open
1192  *      function is invoked and then the multicast lists are loaded. Finally
1193  *      the device is moved into the up state and a %NETDEV_UP message is
1194  *      sent to the netdev notifier chain.
1195  *
1196  *      Calling this function on an active interface is a nop. On a failure
1197  *      a negative errno code is returned.
1198  */
1199 int dev_open(struct net_device *dev)
1200 {
1201         int ret;
1202
1203         /*
1204          *      Is it already up?
1205          */
1206         if (dev->flags & IFF_UP)
1207                 return 0;
1208
1209         /*
1210          *      Open device
1211          */
1212         ret = __dev_open(dev);
1213         if (ret < 0)
1214                 return ret;
1215
1216         /*
1217          *      ... and announce new interface.
1218          */
1219         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220         call_netdevice_notifiers(NETDEV_UP, dev);
1221
1222         return ret;
1223 }
1224 EXPORT_SYMBOL(dev_open);
1225
1226 static int __dev_close(struct net_device *dev)
1227 {
1228         const struct net_device_ops *ops = dev->netdev_ops;
1229
1230         ASSERT_RTNL();
1231         might_sleep();
1232
1233         /*
1234          *      Tell people we are going down, so that they can
1235          *      prepare to death, when device is still operating.
1236          */
1237         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239         clear_bit(__LINK_STATE_START, &dev->state);
1240
1241         /* Synchronize to scheduled poll. We cannot touch poll list,
1242          * it can be even on different cpu. So just clear netif_running().
1243          *
1244          * dev->stop() will invoke napi_disable() on all of it's
1245          * napi_struct instances on this device.
1246          */
1247         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248
1249         dev_deactivate(dev);
1250
1251         /*
1252          *      Call the device specific close. This cannot fail.
1253          *      Only if device is UP
1254          *
1255          *      We allow it to be called even after a DETACH hot-plug
1256          *      event.
1257          */
1258         if (ops->ndo_stop)
1259                 ops->ndo_stop(dev);
1260
1261         /*
1262          *      Device is now down.
1263          */
1264
1265         dev->flags &= ~IFF_UP;
1266
1267         /*
1268          *      Shutdown NET_DMA
1269          */
1270         net_dmaengine_put();
1271
1272         return 0;
1273 }
1274
1275 /**
1276  *      dev_close - shutdown an interface.
1277  *      @dev: device to shutdown
1278  *
1279  *      This function moves an active device into down state. A
1280  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1281  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1282  *      chain.
1283  */
1284 int dev_close(struct net_device *dev)
1285 {
1286         if (!(dev->flags & IFF_UP))
1287                 return 0;
1288
1289         __dev_close(dev);
1290
1291         /*
1292          * Tell people we are down
1293          */
1294         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1295         call_netdevice_notifiers(NETDEV_DOWN, dev);
1296
1297         return 0;
1298 }
1299 EXPORT_SYMBOL(dev_close);
1300
1301
1302 /**
1303  *      dev_disable_lro - disable Large Receive Offload on a device
1304  *      @dev: device
1305  *
1306  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1307  *      called under RTNL.  This is needed if received packets may be
1308  *      forwarded to another interface.
1309  */
1310 void dev_disable_lro(struct net_device *dev)
1311 {
1312         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1313             dev->ethtool_ops->set_flags) {
1314                 u32 flags = dev->ethtool_ops->get_flags(dev);
1315                 if (flags & ETH_FLAG_LRO) {
1316                         flags &= ~ETH_FLAG_LRO;
1317                         dev->ethtool_ops->set_flags(dev, flags);
1318                 }
1319         }
1320         WARN_ON(dev->features & NETIF_F_LRO);
1321 }
1322 EXPORT_SYMBOL(dev_disable_lro);
1323
1324
/*
 * While non-zero, register_netdevice_notifier() links the notifier into
 * the chain but does not replay REGISTER/UP events for existing devices.
 * NOTE(review): presumably cleared once boot-time network init is done;
 * the writer is not in this part of the file - confirm.
 */
static int dev_boot_phase = 1;
1326
1327 /*
1328  *      Device change register/unregister. These are not inline or static
1329  *      as we export them to the world.
1330  */
1331
1332 /**
1333  *      register_netdevice_notifier - register a network notifier block
1334  *      @nb: notifier
1335  *
1336  *      Register a notifier to be called when network device events occur.
1337  *      The notifier passed is linked into the kernel structures and must
1338  *      not be reused until it has been unregistered. A negative errno code
1339  *      is returned on a failure.
1340  *
1341  *      When registered all registration and up events are replayed
1342  *      to the new notifier to allow device to have a race free
1343  *      view of the network device list.
1344  */
1345
1346 int register_netdevice_notifier(struct notifier_block *nb)
1347 {
1348         struct net_device *dev;
1349         struct net_device *last;
1350         struct net *net;
1351         int err;
1352
1353         rtnl_lock();
1354         err = raw_notifier_chain_register(&netdev_chain, nb);
1355         if (err)
1356                 goto unlock;
1357         if (dev_boot_phase)
1358                 goto unlock;
1359         for_each_net(net) {
1360                 for_each_netdev(net, dev) {
1361                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1362                         err = notifier_to_errno(err);
1363                         if (err)
1364                                 goto rollback;
1365
1366                         if (!(dev->flags & IFF_UP))
1367                                 continue;
1368
1369                         nb->notifier_call(nb, NETDEV_UP, dev);
1370                 }
1371         }
1372
1373 unlock:
1374         rtnl_unlock();
1375         return err;
1376
1377 rollback:
1378         last = dev;
1379         for_each_net(net) {
1380                 for_each_netdev(net, dev) {
1381                         if (dev == last)
1382                                 break;
1383
1384                         if (dev->flags & IFF_UP) {
1385                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1386                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1387                         }
1388                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1389                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1390                 }
1391         }
1392
1393         raw_notifier_chain_unregister(&netdev_chain, nb);
1394         goto unlock;
1395 }
1396 EXPORT_SYMBOL(register_netdevice_notifier);
1397
1398 /**
1399  *      unregister_netdevice_notifier - unregister a network notifier block
1400  *      @nb: notifier
1401  *
1402  *      Unregister a notifier previously registered by
1403  *      register_netdevice_notifier(). The notifier is unlinked into the
1404  *      kernel structures and may then be reused. A negative errno code
1405  *      is returned on a failure.
1406  */
1407
1408 int unregister_netdevice_notifier(struct notifier_block *nb)
1409 {
1410         int err;
1411
1412         rtnl_lock();
1413         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1414         rtnl_unlock();
1415         return err;
1416 }
1417 EXPORT_SYMBOL(unregister_netdevice_notifier);
1418
1419 /**
1420  *      call_netdevice_notifiers - call all network notifier blocks
1421  *      @val: value passed unmodified to notifier function
1422  *      @dev: net_device pointer passed unmodified to notifier function
1423  *
1424  *      Call all network notifier blocks.  Parameters and return value
1425  *      are as for raw_notifier_call_chain().
1426  */
1427
1428 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1429 {
1430         ASSERT_RTNL();
1431         return raw_notifier_call_chain(&netdev_chain, val, dev);
1432 }
1433
/*
 * When > 0 there are consumers of rx skb time stamps.  Incremented by
 * net_enable_timestamp() and decremented by net_disable_timestamp();
 * read by net_timestamp_set() and net_timestamp_check().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1436
1437 void net_enable_timestamp(void)
1438 {
1439         atomic_inc(&netstamp_needed);
1440 }
1441 EXPORT_SYMBOL(net_enable_timestamp);
1442
1443 void net_disable_timestamp(void)
1444 {
1445         atomic_dec(&netstamp_needed);
1446 }
1447 EXPORT_SYMBOL(net_disable_timestamp);
1448
1449 static inline void net_timestamp_set(struct sk_buff *skb)
1450 {
1451         if (atomic_read(&netstamp_needed))
1452                 __net_timestamp(skb);
1453         else
1454                 skb->tstamp.tv64 = 0;
1455 }
1456
1457 static inline void net_timestamp_check(struct sk_buff *skb)
1458 {
1459         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1460                 __net_timestamp(skb);
1461 }
1462
1463 /**
1464  * dev_forward_skb - loopback an skb to another netif
1465  *
1466  * @dev: destination network device
1467  * @skb: buffer to forward
1468  *
1469  * return values:
1470  *      NET_RX_SUCCESS  (no congestion)
1471  *      NET_RX_DROP     (packet was dropped, but freed)
1472  *
1473  * dev_forward_skb can be used for injecting an skb from the
1474  * start_xmit function of one device into the receive queue
1475  * of another device.
1476  *
1477  * The receiving device may be in another namespace, so
1478  * we have to clear all information in the skb that could
1479  * impact namespace isolation.
1480  */
1481 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1482 {
1483         skb_orphan(skb);
1484         nf_reset(skb);
1485
1486         if (!(dev->flags & IFF_UP) ||
1487             (skb->len > (dev->mtu + dev->hard_header_len))) {
1488                 kfree_skb(skb);
1489                 return NET_RX_DROP;
1490         }
1491         skb_set_dev(skb, dev);
1492         skb->tstamp.tv64 = 0;
1493         skb->pkt_type = PACKET_HOST;
1494         skb->protocol = eth_type_trans(skb, dev);
1495         return netif_rx(skb);
1496 }
1497 EXPORT_SYMBOL_GPL(dev_forward_skb);
1498
1499 /*
1500  *      Support routine. Sends outgoing frames to any network
1501  *      taps currently in use.
1502  */
1503
1504 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1505 {
1506         struct packet_type *ptype;
1507
1508 #ifdef CONFIG_NET_CLS_ACT
1509         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1510                 net_timestamp_set(skb);
1511 #else
1512         net_timestamp_set(skb);
1513 #endif
1514
1515         rcu_read_lock();
1516         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1517                 /* Never send packets back to the socket
1518                  * they originated from - MvS (miquels@drinkel.ow.org)
1519                  */
1520                 if ((ptype->dev == dev || !ptype->dev) &&
1521                     (ptype->af_packet_priv == NULL ||
1522                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1523                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1524                         if (!skb2)
1525                                 break;
1526
1527                         /* skb->nh should be correctly
1528                            set by sender, so that the second statement is
1529                            just protection against buggy protocols.
1530                          */
1531                         skb_reset_mac_header(skb2);
1532
1533                         if (skb_network_header(skb2) < skb2->data ||
1534                             skb2->network_header > skb2->tail) {
1535                                 if (net_ratelimit())
1536                                         printk(KERN_CRIT "protocol %04x is "
1537                                                "buggy, dev %s\n",
1538                                                ntohs(skb2->protocol),
1539                                                dev->name);
1540                                 skb_reset_network_header(skb2);
1541                         }
1542
1543                         skb2->transport_header = skb2->network_header;
1544                         skb2->pkt_type = PACKET_OUTGOING;
1545                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1546                 }
1547         }
1548         rcu_read_unlock();
1549 }
1550
1551 /*
1552  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1553  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1554  */
1555 void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1556 {
1557         unsigned int real_num = dev->real_num_tx_queues;
1558
1559         if (unlikely(txq > dev->num_tx_queues))
1560                 ;
1561         else if (txq > real_num)
1562                 dev->real_num_tx_queues = txq;
1563         else if (txq < real_num) {
1564                 dev->real_num_tx_queues = txq;
1565                 qdisc_reset_all_tx_gt(dev, txq);
1566         }
1567 }
1568 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1569
1570 #ifdef CONFIG_RPS
1571 /**
1572  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1573  *      @dev: Network device
1574  *      @rxq: Actual number of RX queues
1575  *
1576  *      This must be called either with the rtnl_lock held or before
1577  *      registration of the net device.  Returns 0 on success, or a
1578  *      negative error code.  If called before registration, it also
1579  *      sets the maximum number of queues, and always succeeds.
1580  */
1581 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1582 {
1583         int rc;
1584
1585         if (dev->reg_state == NETREG_REGISTERED) {
1586                 ASSERT_RTNL();
1587
1588                 if (rxq > dev->num_rx_queues)
1589                         return -EINVAL;
1590
1591                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1592                                                   rxq);
1593                 if (rc)
1594                         return rc;
1595         } else {
1596                 dev->num_rx_queues = rxq;
1597         }
1598
1599         dev->real_num_rx_queues = rxq;
1600         return 0;
1601 }
1602 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1603 #endif
1604
1605 static inline void __netif_reschedule(struct Qdisc *q)
1606 {
1607         struct softnet_data *sd;
1608         unsigned long flags;
1609
1610         local_irq_save(flags);
1611         sd = &__get_cpu_var(softnet_data);
1612         q->next_sched = NULL;
1613         *sd->output_queue_tailp = q;
1614         sd->output_queue_tailp = &q->next_sched;
1615         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1616         local_irq_restore(flags);
1617 }
1618
1619 void __netif_schedule(struct Qdisc *q)
1620 {
1621         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1622                 __netif_reschedule(q);
1623 }
1624 EXPORT_SYMBOL(__netif_schedule);
1625
1626 void dev_kfree_skb_irq(struct sk_buff *skb)
1627 {
1628         if (atomic_dec_and_test(&skb->users)) {
1629                 struct softnet_data *sd;
1630                 unsigned long flags;
1631
1632                 local_irq_save(flags);
1633                 sd = &__get_cpu_var(softnet_data);
1634                 skb->next = sd->completion_queue;
1635                 sd->completion_queue = skb;
1636                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1637                 local_irq_restore(flags);
1638         }
1639 }
1640 EXPORT_SYMBOL(dev_kfree_skb_irq);
1641
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard
 * interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1650
1651
1652 /**
1653  * netif_device_detach - mark device as removed
1654  * @dev: network device
1655  *
1656  * Mark device as removed from system and therefore no longer available.
1657  */
1658 void netif_device_detach(struct net_device *dev)
1659 {
1660         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1661             netif_running(dev)) {
1662                 netif_tx_stop_all_queues(dev);
1663         }
1664 }
1665 EXPORT_SYMBOL(netif_device_detach);
1666
1667 /**
1668  * netif_device_attach - mark device as attached
1669  * @dev: network device
1670  *
1671  * Mark device as attached from system and restart if needed.
1672  */
1673 void netif_device_attach(struct net_device *dev)
1674 {
1675         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1676             netif_running(dev)) {
1677                 netif_tx_wake_all_queues(dev);
1678                 __netdev_watchdog_up(dev);
1679         }
1680 }
1681 EXPORT_SYMBOL(netif_device_attach);
1682
1683 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1684 {
1685         return ((features & NETIF_F_GEN_CSUM) ||
1686                 ((features & NETIF_F_IP_CSUM) &&
1687                  protocol == htons(ETH_P_IP)) ||
1688                 ((features & NETIF_F_IPV6_CSUM) &&
1689                  protocol == htons(ETH_P_IPV6)) ||
1690                 ((features & NETIF_F_FCOE_CRC) &&
1691                  protocol == htons(ETH_P_FCOE)));
1692 }
1693
1694 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1695 {
1696         if (can_checksum_protocol(dev->features, skb->protocol))
1697                 return true;
1698
1699         if (skb->protocol == htons(ETH_P_8021Q)) {
1700                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1701                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1702                                           veh->h_vlan_encapsulated_proto))
1703                         return true;
1704         }
1705
1706         return false;
1707 }
1708
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst of the skb is always dropped.  If the skb is owned by a device
 * already and that device lives in another namespace than @dev, all data
 * private to the old namespace is reset before the new device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        bool crossing_ns;

        skb_dst_drop(skb);

        crossing_ns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
        if (crossing_ns) {
                /* Nothing namespace-private may leak into the new owner. */
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }

        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1738
1739 /*
1740  * Invalidate hardware checksum when packet is to be mangled, and
1741  * complete checksum manually on outgoing path.
1742  */
1743 int skb_checksum_help(struct sk_buff *skb)
1744 {
1745         __wsum csum;
1746         int ret = 0, offset;
1747
1748         if (skb->ip_summed == CHECKSUM_COMPLETE)
1749                 goto out_set_summed;
1750
1751         if (unlikely(skb_shinfo(skb)->gso_size)) {
1752                 /* Let GSO fix up the checksum. */
1753                 goto out_set_summed;
1754         }
1755
1756         offset = skb->csum_start - skb_headroom(skb);
1757         BUG_ON(offset >= skb_headlen(skb));
1758         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1759
1760         offset += skb->csum_offset;
1761         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1762
1763         if (skb_cloned(skb) &&
1764             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1765                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1766                 if (ret)
1767                         goto out;
1768         }
1769
1770         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1771 out_set_summed:
1772         skb->ip_summed = CHECKSUM_NONE;
1773 out:
1774         return ret;
1775 }
1776 EXPORT_SYMBOL(skb_checksum_help);
1777
1778 /**
1779  *      skb_gso_segment - Perform segmentation on skb.
1780  *      @skb: buffer to segment
1781  *      @features: features for the output path (see dev->features)
1782  *
1783  *      This function segments the given skb and returns a list of segments.
1784  *
1785  *      It may return NULL if the skb requires no segmentation.  This is
1786  *      only possible when GSO is used for verifying header integrity.
1787  */
1788 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1789 {
1790         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1791         struct packet_type *ptype;
1792         __be16 type = skb->protocol;
1793         int err;
1794
1795         skb_reset_mac_header(skb);
1796         skb->mac_len = skb->network_header - skb->mac_header;
1797         __skb_pull(skb, skb->mac_len);
1798
1799         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1800                 struct net_device *dev = skb->dev;
1801                 struct ethtool_drvinfo info = {};
1802
1803                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1804                         dev->ethtool_ops->get_drvinfo(dev, &info);
1805
1806                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1807                         "ip_summed=%d",
1808                      info.driver, dev ? dev->features : 0L,
1809                      skb->sk ? skb->sk->sk_route_caps : 0L,
1810                      skb->len, skb->data_len, skb->ip_summed);
1811
1812                 if (skb_header_cloned(skb) &&
1813                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1814                         return ERR_PTR(err);
1815         }
1816
1817         rcu_read_lock();
1818         list_for_each_entry_rcu(ptype,
1819                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1820                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1821                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1822                                 err = ptype->gso_send_check(skb);
1823                                 segs = ERR_PTR(err);
1824                                 if (err || skb_gso_ok(skb, features))
1825                                         break;
1826                                 __skb_push(skb, (skb->data -
1827                                                  skb_network_header(skb)));
1828                         }
1829                         segs = ptype->gso_segment(skb, features);
1830                         break;
1831                 }
1832         }
1833         rcu_read_unlock();
1834
1835         __skb_push(skb, skb->data - skb_mac_header(skb));
1836
1837         return segs;
1838 }
1839 EXPORT_SYMBOL(skb_gso_segment);
1840
/*
 * Report a hardware receive checksum failure: log the device name
 * ("<unknown>" when @dev is NULL) and dump the stack.  Output is rate
 * limited through net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *ifname;

	if (!net_ratelimit())
		return;

	ifname = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", ifname);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1853
/*
 * Check whether @skb has page fragments that @dev cannot DMA from.
 *
 * Only meaningful with CONFIG_HIGHMEM; otherwise always 0.  Returns 1
 * when
 *   - the device lacks NETIF_F_HIGHDMA and a fragment sits in highmem, or
 *   - PCI_DMA_BUS_IS_PHYS holds, the device has a parent, and a fragment
 *     lies beyond the parent's dma_mask (or the parent has no dma_mask).
 *
 * Actually, this check should be eliminated as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(shinfo->frags[i].page))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		dma_addr_t addr = page_to_phys(shinfo->frags[i].page);

		/* last byte of the page must be reachable through the mask */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
1883
1884 struct dev_gso_cb {
1885         void (*destructor)(struct sk_buff *skb);
1886 };
1887
1888 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1889
1890 static void dev_gso_skb_destructor(struct sk_buff *skb)
1891 {
1892         struct dev_gso_cb *cb;
1893
1894         do {
1895                 struct sk_buff *nskb = skb->next;
1896
1897                 skb->next = nskb->next;
1898                 nskb->next = NULL;
1899                 kfree_skb(nskb);
1900         } while (skb->next);
1901
1902         cb = DEV_GSO_CB(skb);
1903         if (cb->destructor)
1904                 cb->destructor(skb);
1905 }
1906
1907 /**
1908  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1909  *      @skb: buffer to segment
1910  *
1911  *      This function segments the given skb and stores the list of segments
1912  *      in skb->next.
1913  */
1914 static int dev_gso_segment(struct sk_buff *skb)
1915 {
1916         struct net_device *dev = skb->dev;
1917         struct sk_buff *segs;
1918         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1919                                          NETIF_F_SG : 0);
1920
1921         segs = skb_gso_segment(skb, features);
1922
1923         /* Verifying header integrity only. */
1924         if (!segs)
1925                 return 0;
1926
1927         if (IS_ERR(segs))
1928                 return PTR_ERR(segs);
1929
1930         skb->next = segs;
1931         DEV_GSO_CB(skb)->destructor = skb->destructor;
1932         skb->destructor = dev_gso_skb_destructor;
1933
1934         return 0;
1935 }
1936
1937 /*
1938  * Try to orphan skb early, right before transmission by the device.
1939  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1940  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1941  */
1942 static inline void skb_orphan_try(struct sk_buff *skb)
1943 {
1944         struct sock *sk = skb->sk;
1945
1946         if (sk && !skb_shinfo(skb)->tx_flags) {
1947                 /* skb_tx_hash() wont be able to get sk.
1948                  * We copy sk_hash into skb->rxhash
1949                  */
1950                 if (!skb->rxhash)
1951                         skb->rxhash = sk->sk_hash;
1952                 skb_orphan(skb);
1953         }
1954 }
1955
1956 /*
1957  * Returns true if either:
1958  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
1959  *      2. skb is fragmented and the device does not support SG, or if
1960  *         at least one of fragments is in highmem and device does not
1961  *         support DMA from it.
1962  */
1963 static inline int skb_needs_linearize(struct sk_buff *skb,
1964                                       struct net_device *dev)
1965 {
1966         return skb_is_nonlinear(skb) &&
1967                ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
1968                 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1969                                               illegal_highdma(dev, skb))));
1970 }
1971
1972 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1973                         struct netdev_queue *txq)
1974 {
1975         const struct net_device_ops *ops = dev->netdev_ops;
1976         int rc = NETDEV_TX_OK;
1977
1978         if (likely(!skb->next)) {
1979                 if (!list_empty(&ptype_all))
1980                         dev_queue_xmit_nit(skb, dev);
1981
1982                 /*
1983                  * If device doesnt need skb->dst, release it right now while
1984                  * its hot in this cpu cache
1985                  */
1986                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1987                         skb_dst_drop(skb);
1988
1989                 skb_orphan_try(skb);
1990
1991                 if (netif_needs_gso(dev, skb)) {
1992                         if (unlikely(dev_gso_segment(skb)))
1993                                 goto out_kfree_skb;
1994                         if (skb->next)
1995                                 goto gso;
1996                 } else {
1997                         if (skb_needs_linearize(skb, dev) &&
1998                             __skb_linearize(skb))
1999                                 goto out_kfree_skb;
2000
2001                         /* If packet is not checksummed and device does not
2002                          * support checksumming for this protocol, complete
2003                          * checksumming here.
2004                          */
2005                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2006                                 skb_set_transport_header(skb, skb->csum_start -
2007                                               skb_headroom(skb));
2008                                 if (!dev_can_checksum(dev, skb) &&
2009                                      skb_checksum_help(skb))
2010                                         goto out_kfree_skb;
2011                         }
2012                 }
2013
2014                 rc = ops->ndo_start_xmit(skb, dev);
2015                 if (rc == NETDEV_TX_OK)
2016                         txq_trans_update(txq);
2017                 return rc;
2018         }
2019
2020 gso:
2021         do {
2022                 struct sk_buff *nskb = skb->next;
2023
2024                 skb->next = nskb->next;
2025                 nskb->next = NULL;
2026
2027                 /*
2028                  * If device doesnt need nskb->dst, release it right now while
2029                  * its hot in this cpu cache
2030                  */
2031                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2032                         skb_dst_drop(nskb);
2033
2034                 rc = ops->ndo_start_xmit(nskb, dev);
2035                 if (unlikely(rc != NETDEV_TX_OK)) {
2036                         if (rc & ~NETDEV_TX_MASK)
2037                                 goto out_kfree_gso_skb;
2038                         nskb->next = skb->next;
2039                         skb->next = nskb;
2040                         return rc;
2041                 }
2042                 txq_trans_update(txq);
2043                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2044                         return NETDEV_TX_BUSY;
2045         } while (skb->next);
2046
2047 out_kfree_gso_skb:
2048         if (likely(skb->next == NULL))
2049                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2050 out_kfree_skb:
2051         kfree_skb(skb);
2052         return rc;
2053 }
2054
2055 static u32 hashrnd __read_mostly;
2056
2057 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
2058 {
2059         u32 hash;
2060
2061         if (skb_rx_queue_recorded(skb)) {
2062                 hash = skb_get_rx_queue(skb);
2063                 while (unlikely(hash >= dev->real_num_tx_queues))
2064                         hash -= dev->real_num_tx_queues;
2065                 return hash;
2066         }
2067
2068         if (skb->sk && skb->sk->sk_hash)
2069                 hash = skb->sk->sk_hash;
2070         else
2071                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2072         hash = jhash_1word(hash, hashrnd);
2073
2074         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2075 }
2076 EXPORT_SYMBOL(skb_tx_hash);
2077
2078 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2079 {
2080         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2081                 if (net_ratelimit()) {
2082                         pr_warning("%s selects TX queue %d, but "
2083                                 "real number of TX queues is %d\n",
2084                                 dev->name, queue_index, dev->real_num_tx_queues);
2085                 }
2086                 return 0;
2087         }
2088         return queue_index;
2089 }
2090
2091 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2092                                         struct sk_buff *skb)
2093 {
2094         int queue_index;
2095         const struct net_device_ops *ops = dev->netdev_ops;
2096
2097         if (ops->ndo_select_queue) {
2098                 queue_index = ops->ndo_select_queue(dev, skb);
2099                 queue_index = dev_cap_txqueue(dev, queue_index);
2100         } else {
2101                 struct sock *sk = skb->sk;
2102                 queue_index = sk_tx_queue_get(sk);
2103                 if (queue_index < 0) {
2104
2105                         queue_index = 0;
2106                         if (dev->real_num_tx_queues > 1)
2107                                 queue_index = skb_tx_hash(dev, skb);
2108
2109                         if (sk) {
2110                                 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2111
2112                                 if (dst && skb_dst(skb) == dst)
2113                                         sk_tx_queue_set(sk, queue_index);
2114                         }
2115                 }
2116         }
2117
2118         skb_set_queue_mapping(skb, queue_index);
2119         return netdev_get_tx_queue(dev, queue_index);
2120 }
2121
2122 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2123                                  struct net_device *dev,
2124                                  struct netdev_queue *txq)
2125 {
2126         spinlock_t *root_lock = qdisc_lock(q);
2127         bool contended = qdisc_is_running(q);
2128         int rc;
2129
2130         /*
2131          * Heuristic to force contended enqueues to serialize on a
2132          * separate lock before trying to get qdisc main lock.
2133          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2134          * and dequeue packets faster.
2135          */
2136         if (unlikely(contended))
2137                 spin_lock(&q->busylock);
2138
2139         spin_lock(root_lock);
2140         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2141                 kfree_skb(skb);
2142                 rc = NET_XMIT_DROP;
2143         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2144                    qdisc_run_begin(q)) {
2145                 /*
2146                  * This is a work-conserving queue; there are no old skbs
2147                  * waiting to be sent out; and the qdisc is not running -
2148                  * xmit the skb directly.
2149                  */
2150                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2151                         skb_dst_force(skb);
2152                 __qdisc_update_bstats(q, skb->len);
2153                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2154                         if (unlikely(contended)) {
2155                                 spin_unlock(&q->busylock);
2156                                 contended = false;
2157                         }
2158                         __qdisc_run(q);
2159                 } else
2160                         qdisc_run_end(q);
2161
2162                 rc = NET_XMIT_SUCCESS;
2163         } else {
2164                 skb_dst_force(skb);
2165                 rc = qdisc_enqueue_root(skb, q);
2166                 if (qdisc_run_begin(q)) {
2167                         if (unlikely(contended)) {
2168                                 spin_unlock(&q->busylock);
2169                                 contended = false;
2170                         }
2171                         __qdisc_run(q);
2172                 }
2173         }
2174         spin_unlock(root_lock);
2175         if (unlikely(contended))
2176                 spin_unlock(&q->busylock);
2177         return rc;
2178 }
2179
2180 static DEFINE_PER_CPU(int, xmit_recursion);
2181 #define RECURSION_LIMIT 3
2182
2183 /**
2184  *      dev_queue_xmit - transmit a buffer
2185  *      @skb: buffer to transmit
2186  *
2187  *      Queue a buffer for transmission to a network device. The caller must
2188  *      have set the device and priority and built the buffer before calling
2189  *      this function. The function can be called from an interrupt.
2190  *
2191  *      A negative errno code is returned on a failure. A success does not
2192  *      guarantee the frame will be transmitted as it may be dropped due
2193  *      to congestion or traffic shaping.
2194  *
2195  * -----------------------------------------------------------------------------------
2196  *      I notice this method can also return errors from the queue disciplines,
2197  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2198  *      be positive.
2199  *
2200  *      Regardless of the return value, the skb is consumed, so it is currently
2201  *      difficult to retry a send to this method.  (You can bump the ref count
2202  *      before sending to hold a reference for retry if you are careful.)
2203  *
2204  *      When calling this method, interrupts MUST be enabled.  This is because
2205  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2206  *          --BLG
2207  */
2208 int dev_queue_xmit(struct sk_buff *skb)
2209 {
2210         struct net_device *dev = skb->dev;
2211         struct netdev_queue *txq;
2212         struct Qdisc *q;
2213         int rc = -ENOMEM;
2214
2215         /* Disable soft irqs for various locks below. Also
2216          * stops preemption for RCU.
2217          */
2218         rcu_read_lock_bh();
2219
2220         txq = dev_pick_tx(dev, skb);
2221         q = rcu_dereference_bh(txq->qdisc);
2222
2223 #ifdef CONFIG_NET_CLS_ACT
2224         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2225 #endif
2226         if (q->enqueue) {
2227                 rc = __dev_xmit_skb(skb, q, dev, txq);
2228                 goto out;
2229         }
2230
2231         /* The device has no queue. Common case for software devices:
2232            loopback, all the sorts of tunnels...
2233
2234            Really, it is unlikely that netif_tx_lock protection is necessary
2235            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2236            counters.)
2237            However, it is possible, that they rely on protection
2238            made by us here.
2239
2240            Check this and shot the lock. It is not prone from deadlocks.
2241            Either shot noqueue qdisc, it is even simpler 8)
2242          */
2243         if (dev->flags & IFF_UP) {
2244                 int cpu = smp_processor_id(); /* ok because BHs are off */
2245
2246                 if (txq->xmit_lock_owner != cpu) {
2247
2248                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2249                                 goto recursion_alert;
2250
2251                         HARD_TX_LOCK(dev, txq, cpu);
2252
2253                         if (!netif_tx_queue_stopped(txq)) {
2254                                 __this_cpu_inc(xmit_recursion);
2255                                 rc = dev_hard_start_xmit(skb, dev, txq);
2256                                 __this_cpu_dec(xmit_recursion);
2257                                 if (dev_xmit_complete(rc)) {
2258                                         HARD_TX_UNLOCK(dev, txq);
2259                                         goto out;
2260                                 }
2261                         }
2262                         HARD_TX_UNLOCK(dev, txq);
2263                         if (net_ratelimit())
2264                                 printk(KERN_CRIT "Virtual device %s asks to "
2265                                        "queue packet!\n", dev->name);
2266                 } else {
2267                         /* Recursion is detected! It is possible,
2268                          * unfortunately
2269                          */
2270 recursion_alert:
2271                         if (net_ratelimit())
2272                                 printk(KERN_CRIT "Dead loop on virtual device "
2273                                        "%s, fix it urgently!\n", dev->name);
2274                 }
2275         }
2276
2277         rc = -ENETDOWN;
2278         rcu_read_unlock_bh();
2279
2280         kfree_skb(skb);
2281         return rc;
2282 out:
2283         rcu_read_unlock_bh();
2284         return rc;
2285 }
2286 EXPORT_SYMBOL(dev_queue_xmit);
2287
2288
2289 /*=======================================================================
2290                         Receiver routines
2291   =======================================================================*/
2292
/*
 * Receive-path tunables.
 *
 * netdev_max_backlog: enqueue_to_backlog() drops a packet once the
 * target CPU's input_pkt_queue is longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
/* When non-zero, netif_rx() calls net_timestamp_check() before queueing. */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; presumably the softirq poll budget -- confirm. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2297
2298 /* Called with irq disabled */
2299 static inline void ____napi_schedule(struct softnet_data *sd,
2300                                      struct napi_struct *napi)
2301 {
2302         list_add_tail(&napi->poll_list, &sd->poll_list);
2303         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2304 }
2305
2306 /*
2307  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2308  * and src/dst port numbers. Returns a non-zero hash number on success
2309  * and 0 on failure.
2310  */
2311 __u32 __skb_get_rxhash(struct sk_buff *skb)
2312 {
2313         int nhoff, hash = 0, poff;
2314         struct ipv6hdr *ip6;
2315         struct iphdr *ip;
2316         u8 ip_proto;
2317         u32 addr1, addr2, ihl;
2318         union {
2319                 u32 v32;
2320                 u16 v16[2];
2321         } ports;
2322
2323         nhoff = skb_network_offset(skb);
2324
2325         switch (skb->protocol) {
2326         case __constant_htons(ETH_P_IP):
2327                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2328                         goto done;
2329
2330                 ip = (struct iphdr *) (skb->data + nhoff);
2331                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2332                         ip_proto = 0;
2333                 else
2334                         ip_proto = ip->protocol;
2335                 addr1 = (__force u32) ip->saddr;
2336                 addr2 = (__force u32) ip->daddr;
2337                 ihl = ip->ihl;
2338                 break;
2339         case __constant_htons(ETH_P_IPV6):
2340                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2341                         goto done;
2342
2343                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2344                 ip_proto = ip6->nexthdr;
2345                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2346                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2347                 ihl = (40 >> 2);
2348                 break;
2349         default:
2350                 goto done;
2351         }
2352
2353         ports.v32 = 0;
2354         poff = proto_ports_offset(ip_proto);
2355         if (poff >= 0) {
2356                 nhoff += ihl * 4 + poff;
2357                 if (pskb_may_pull(skb, nhoff + 4)) {
2358                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2359                         if (ports.v16[1] < ports.v16[0])
2360                                 swap(ports.v16[0], ports.v16[1]);
2361                 }
2362         }
2363
2364         /* get a consistent hash (same value on both flow directions) */
2365         if (addr2 < addr1)
2366                 swap(addr1, addr2);
2367
2368         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2369         if (!hash)
2370                 hash = 1;
2371
2372 done:
2373         return hash;
2374 }
2375 EXPORT_SYMBOL(__skb_get_rxhash);
2376
2377 #ifdef CONFIG_RPS
2378
2379 /* One global table that all flow-based protocols share. */
2380 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2381 EXPORT_SYMBOL(rps_sock_flow_table);
2382
2383 /*
2384  * get_rps_cpu is called from netif_receive_skb and returns the target
2385  * CPU from the RPS map of the receiving queue for a given skb.
2386  * rcu_read_lock must be held on entry.
2387  */
2388 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2389                        struct rps_dev_flow **rflowp)
2390 {
2391         struct netdev_rx_queue *rxqueue;
2392         struct rps_map *map = NULL;
2393         struct rps_dev_flow_table *flow_table;
2394         struct rps_sock_flow_table *sock_flow_table;
2395         int cpu = -1;
2396         u16 tcpu;
2397
2398         if (skb_rx_queue_recorded(skb)) {
2399                 u16 index = skb_get_rx_queue(skb);
2400                 if (unlikely(index >= dev->real_num_rx_queues)) {
2401                         WARN_ONCE(dev->real_num_rx_queues > 1,
2402                                   "%s received packet on queue %u, but number "
2403                                   "of RX queues is %u\n",
2404                                   dev->name, index, dev->real_num_rx_queues);
2405                         goto done;
2406                 }
2407                 rxqueue = dev->_rx + index;
2408         } else
2409                 rxqueue = dev->_rx;
2410
2411         if (rxqueue->rps_map) {
2412                 map = rcu_dereference(rxqueue->rps_map);
2413                 if (map && map->len == 1) {
2414                         tcpu = map->cpus[0];
2415                         if (cpu_online(tcpu))
2416                                 cpu = tcpu;
2417                         goto done;
2418                 }
2419         } else if (!rxqueue->rps_flow_table) {
2420                 goto done;
2421         }
2422
2423         skb_reset_network_header(skb);
2424         if (!skb_get_rxhash(skb))
2425                 goto done;
2426
2427         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2428         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2429         if (flow_table && sock_flow_table) {
2430                 u16 next_cpu;
2431                 struct rps_dev_flow *rflow;
2432
2433                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2434                 tcpu = rflow->cpu;
2435
2436                 next_cpu = sock_flow_table->ents[skb->rxhash &
2437                     sock_flow_table->mask];
2438
2439                 /*
2440                  * If the desired CPU (where last recvmsg was done) is
2441                  * different from current CPU (one in the rx-queue flow
2442                  * table entry), switch if one of the following holds:
2443                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2444                  *   - Current CPU is offline.
2445                  *   - The current CPU's queue tail has advanced beyond the
2446                  *     last packet that was enqueued using this table entry.
2447                  *     This guarantees that all previous packets for the flow
2448                  *     have been dequeued, thus preserving in order delivery.
2449                  */
2450                 if (unlikely(tcpu != next_cpu) &&
2451                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2452                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2453                       rflow->last_qtail)) >= 0)) {
2454                         tcpu = rflow->cpu = next_cpu;
2455                         if (tcpu != RPS_NO_CPU)
2456                                 rflow->last_qtail = per_cpu(softnet_data,
2457                                     tcpu).input_queue_head;
2458                 }
2459                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2460                         *rflowp = rflow;
2461                         cpu = tcpu;
2462                         goto done;
2463                 }
2464         }
2465
2466         if (map) {
2467                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2468
2469                 if (cpu_online(tcpu)) {
2470                         cpu = tcpu;
2471                         goto done;
2472                 }
2473         }
2474
2475 done:
2476         return cpu;
2477 }
2478
2479 /* Called from hardirq (IPI) context */
2480 static void rps_trigger_softirq(void *data)
2481 {
2482         struct softnet_data *sd = data;
2483
2484         ____napi_schedule(sd, &sd->backlog);
2485         sd->received_rps++;
2486 }
2487
2488 #endif /* CONFIG_RPS */
2489
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it into this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is this CPU's own structure (or CONFIG_RPS is off), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* push sd on the head of the local IPI list */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2510
2511 /*
2512  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2513  * queue (may be a remote CPU queue).
2514  */
2515 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2516                               unsigned int *qtail)
2517 {
2518         struct softnet_data *sd;
2519         unsigned long flags;
2520
2521         sd = &per_cpu(softnet_data, cpu);
2522
2523         local_irq_save(flags);
2524
2525         rps_lock(sd);
2526         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2527                 if (skb_queue_len(&sd->input_pkt_queue)) {
2528 enqueue:
2529                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2530                         input_queue_tail_incr_save(sd, qtail);
2531                         rps_unlock(sd);
2532                         local_irq_restore(flags);
2533                         return NET_RX_SUCCESS;
2534                 }
2535
2536                 /* Schedule NAPI for backlog device
2537                  * We can use non atomic operation since we own the queue lock
2538                  */
2539                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2540                         if (!rps_ipi_queued(sd))
2541                                 ____napi_schedule(sd, &sd->backlog);
2542                 }
2543                 goto enqueue;
2544         }
2545
2546         sd->dropped++;
2547         rps_unlock(sd);
2548
2549         local_irq_restore(flags);
2550
2551         kfree_skb(skb);
2552         return NET_RX_DROP;
2553 }
2554
2555 /**
2556  *      netif_rx        -       post buffer to the network code
2557  *      @skb: buffer to post
2558  *
2559  *      This function receives a packet from a device driver and queues it for
2560  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2561  *      may be dropped during processing for congestion control or by the
2562  *      protocol layers.
2563  *
2564  *      return values:
2565  *      NET_RX_SUCCESS  (no congestion)
2566  *      NET_RX_DROP     (packet was dropped)
2567  *
2568  */
2569
2570 int netif_rx(struct sk_buff *skb)
2571 {
2572         int ret;
2573
2574         /* if netpoll wants it, pretend we never saw it */
2575         if (netpoll_rx(skb))
2576                 return NET_RX_DROP;
2577
2578         if (netdev_tstamp_prequeue)
2579                 net_timestamp_check(skb);
2580
2581 #ifdef CONFIG_RPS
2582         {
2583                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2584                 int cpu;
2585
2586                 preempt_disable();
2587                 rcu_read_lock();
2588
2589                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2590                 if (cpu < 0)
2591                         cpu = smp_processor_id();
2592
2593                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2594
2595                 rcu_read_unlock();
2596                 preempt_enable();
2597         }
2598 #else
2599         {
2600                 unsigned int qtail;
2601                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2602                 put_cpu();
2603         }
2604 #endif
2605         return ret;
2606 }
2607 EXPORT_SYMBOL(netif_rx);
2608
/*
 * netif_rx_ni - call netif_rx() and run any softirq left pending
 * @skb: buffer to post
 *
 * Preemption is disabled around the call; if a softirq is pending
 * afterwards it is processed before preemption is re-enabled.
 * Returns the value returned by netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2622
2623 static void net_tx_action(struct softirq_action *h)
2624 {
2625         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2626
2627         if (sd->completion_queue) {
2628                 struct sk_buff *clist;
2629
2630                 local_irq_disable();
2631                 clist = sd->completion_queue;
2632                 sd->completion_queue = NULL;
2633                 local_irq_enable();
2634
2635                 while (clist) {
2636                         struct sk_buff *skb = clist;
2637                         clist = clist->next;
2638
2639                         WARN_ON(atomic_read(&skb->users));
2640                         __kfree_skb(skb);
2641                 }
2642         }
2643
2644         if (sd->output_queue) {
2645                 struct Qdisc *head;
2646
2647                 local_irq_disable();
2648                 head = sd->output_queue;
2649                 sd->output_queue = NULL;
2650                 sd->output_queue_tailp = &sd->output_queue;
2651                 local_irq_enable();
2652
2653                 while (head) {
2654                         struct Qdisc *q = head;
2655                         spinlock_t *root_lock;
2656
2657                         head = head->next_sched;
2658
2659                         root_lock = qdisc_lock(q);
2660                         if (spin_trylock(root_lock)) {
2661                                 smp_mb__before_clear_bit();
2662                                 clear_bit(__QDISC_STATE_SCHED,
2663                                           &q->state);
2664                                 qdisc_run(q);
2665                                 spin_unlock(root_lock);
2666                         } else {
2667                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2668                                               &q->state)) {
2669                                         __netif_reschedule(q);
2670                                 } else {
2671                                         smp_mb__before_clear_bit();
2672                                         clear_bit(__QDISC_STATE_SCHED,
2673                                                   &q->state);
2674                                 }
2675                         }
2676                 }
2677         }
2678 }
2679
2680 static inline int deliver_skb(struct sk_buff *skb,
2681                               struct packet_type *pt_prev,
2682                               struct net_device *orig_dev)
2683 {
2684         atomic_inc(&skb->users);
2685         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2686 }
2687
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * Hook pointer defined here on behalf of ATM LANE.  It only exists when
 * both bridging and ATM LANE are configured (built in or as modules).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr)
        __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
2695
2696 #ifdef CONFIG_NET_CLS_ACT
2697 /* TODO: Maybe we should just force sch_ingress to be compiled in
2698  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2699  * a compare and 2 stores extra right now if we dont have it on
2700  * but have CONFIG_NET_CLS_ACT
2701  * NOTE: This doesnt stop any functionality; if you dont have
2702  * the ingress scheduler, you just cant add policies on ingress.
2703  *
2704  */
2705 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2706 {
2707         struct net_device *dev = skb->dev;
2708         u32 ttl = G_TC_RTTL(skb->tc_verd);
2709         int result = TC_ACT_OK;
2710         struct Qdisc *q;
2711
2712         if (unlikely(MAX_RED_LOOP < ttl++)) {
2713                 if (net_ratelimit())
2714                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2715                                skb->skb_iif, dev->ifindex);
2716                 return TC_ACT_SHOT;
2717         }
2718
2719         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2720         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2721
2722         q = rxq->qdisc;
2723         if (q != &noop_qdisc) {
2724                 spin_lock(qdisc_lock(q));
2725                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2726                         result = qdisc_enqueue_root(skb, q);
2727                 spin_unlock(qdisc_lock(q));
2728         }
2729
2730         return result;
2731 }
2732
2733 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2734                                          struct packet_type **pt_prev,
2735                                          int *ret, struct net_device *orig_dev)
2736 {
2737         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2738
2739         if (!rxq || rxq->qdisc == &noop_qdisc)
2740                 goto out;
2741
2742         if (*pt_prev) {
2743                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2744                 *pt_prev = NULL;
2745         }
2746
2747         switch (ing_filter(skb, rxq)) {
2748         case TC_ACT_SHOT:
2749         case TC_ACT_STOLEN:
2750                 kfree_skb(skb);
2751                 return NULL;
2752         }
2753
2754 out:
2755         skb->tc_verd = 0;
2756         return skb;
2757 }
2758 #endif
2759
2760 /*
2761  *      netif_nit_deliver - deliver received packets to network taps
2762  *      @skb: buffer
2763  *
2764  *      This function is used to deliver incoming packets to network
2765  *      taps. It should be used when the normal netif_receive_skb path
2766  *      is bypassed, for example because of VLAN acceleration.
2767  */
2768 void netif_nit_deliver(struct sk_buff *skb)
2769 {
2770         struct packet_type *ptype;
2771
2772         if (list_empty(&ptype_all))
2773                 return;
2774
2775         skb_reset_network_header(skb);
2776         skb_reset_transport_header(skb);
2777         skb->mac_len = skb->network_header - skb->mac_header;
2778
2779         rcu_read_lock();
2780         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2781                 if (!ptype->dev || ptype->dev == skb->dev)
2782                         deliver_skb(skb, ptype, skb->dev);
2783         }
2784         rcu_read_unlock();
2785 }
2786
2787 /**
2788  *      netdev_rx_handler_register - register receive handler
2789  *      @dev: device to register a handler for
2790  *      @rx_handler: receive handler to register
2791  *      @rx_handler_data: data pointer that is used by rx handler
2792  *
2793  *      Register a receive hander for a device. This handler will then be
2794  *      called from __netif_receive_skb. A negative errno code is returned
2795  *      on a failure.
2796  *
2797  *      The caller must hold the rtnl_mutex.
2798  */
2799 int netdev_rx_handler_register(struct net_device *dev,
2800                                rx_handler_func_t *rx_handler,
2801                                void *rx_handler_data)
2802 {
2803         ASSERT_RTNL();
2804
2805         if (dev->rx_handler)
2806                 return -EBUSY;
2807
2808         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2809         rcu_assign_pointer(dev->rx_handler, rx_handler);
2810
2811         return 0;
2812 }
2813 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2814
2815 /**
2816  *      netdev_rx_handler_unregister - unregister receive handler
2817  *      @dev: device to unregister a handler from
2818  *
2819  *      Unregister a receive hander from a device.
2820  *
2821  *      The caller must hold the rtnl_mutex.
2822  */
2823 void netdev_rx_handler_unregister(struct net_device *dev)
2824 {
2825
2826         ASSERT_RTNL();
2827         rcu_assign_pointer(dev->rx_handler, NULL);
2828         rcu_assign_pointer(dev->rx_handler_data, NULL);
2829 }
2830 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2831
2832 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2833                                               struct net_device *master)
2834 {
2835         if (skb->pkt_type == PACKET_HOST) {
2836                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2837
2838                 memcpy(dest, master->dev_addr, ETH_ALEN);
2839         }
2840 }
2841
2842 /* On bonding slaves other than the currently active slave, suppress
2843  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2844  * ARP on active-backup slaves with arp_validate enabled.
2845  */
2846 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2847 {
2848         struct net_device *dev = skb->dev;
2849
2850         if (master->priv_flags & IFF_MASTER_ARPMON)
2851                 dev->last_rx = jiffies;
2852
2853         if ((master->priv_flags & IFF_MASTER_ALB) &&
2854             (master->priv_flags & IFF_BRIDGE_PORT)) {
2855                 /* Do address unmangle. The local destination address
2856                  * will be always the one master has. Provides the right
2857                  * functionality in a bridge.
2858                  */
2859                 skb_bond_set_mac_by_master(skb, master);
2860         }
2861
2862         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2863                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2864                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
2865                         return 0;
2866
2867                 if (master->priv_flags & IFF_MASTER_ALB) {
2868                         if (skb->pkt_type != PACKET_BROADCAST &&
2869                             skb->pkt_type != PACKET_MULTICAST)
2870                                 return 0;
2871                 }
2872                 if (master->priv_flags & IFF_MASTER_8023AD &&
2873                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2874                         return 0;
2875
2876                 return 1;
2877         }
2878         return 0;
2879 }
2880 EXPORT_SYMBOL(__skb_bond_should_drop);
2881
2882 static int __netif_receive_skb(struct sk_buff *skb)
2883 {
2884         struct packet_type *ptype, *pt_prev;
2885         rx_handler_func_t *rx_handler;
2886         struct net_device *orig_dev;
2887         struct net_device *master;
2888         struct net_device *null_or_orig;
2889         struct net_device *orig_or_bond;
2890         int ret = NET_RX_DROP;
2891         __be16 type;
2892
2893         if (!netdev_tstamp_prequeue)
2894                 net_timestamp_check(skb);
2895
2896         if (vlan_tx_tag_present(skb))
2897                 vlan_hwaccel_do_receive(skb);
2898
2899         /* if we've gotten here through NAPI, check netpoll */
2900         if (netpoll_receive_skb(skb))
2901                 return NET_RX_DROP;
2902
2903         if (!skb->skb_iif)
2904                 skb->skb_iif = skb->dev->ifindex;
2905
2906         /*
2907          * bonding note: skbs received on inactive slaves should only
2908          * be delivered to pkt handlers that are exact matches.  Also
2909          * the deliver_no_wcard flag will be set.  If packet handlers
2910          * are sensitive to duplicate packets these skbs will need to
2911          * be dropped at the handler.  The vlan accel path may have
2912          * already set the deliver_no_wcard flag.
2913          */
2914         null_or_orig = NULL;
2915         orig_dev = skb->dev;
2916         master = ACCESS_ONCE(orig_dev->master);
2917         if (skb->deliver_no_wcard)
2918                 null_or_orig = orig_dev;
2919         else if (master) {
2920                 if (skb_bond_should_drop(skb, master)) {
2921                         skb->deliver_no_wcard = 1;
2922                         null_or_orig = orig_dev; /* deliver only exact match */
2923                 } else
2924                         skb->dev = master;
2925         }
2926
2927         __this_cpu_inc(softnet_data.processed);
2928         skb_reset_network_header(skb);
2929         skb_reset_transport_header(skb);
2930         skb->mac_len = skb->network_header - skb->mac_header;
2931
2932         pt_prev = NULL;
2933
2934         rcu_read_lock();
2935
2936 #ifdef CONFIG_NET_CLS_ACT
2937         if (skb->tc_verd & TC_NCLS) {
2938                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2939                 goto ncls;
2940         }
2941 #endif
2942
2943         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2944                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2945                     ptype->dev == orig_dev) {
2946                         if (pt_prev)
2947                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2948                         pt_prev = ptype;
2949                 }
2950         }
2951
2952 #ifdef CONFIG_NET_CLS_ACT
2953         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2954         if (!skb)
2955                 goto out;
2956 ncls:
2957 #endif
2958
2959         /* Handle special case of bridge or macvlan */
2960         rx_handler = rcu_dereference(skb->dev->rx_handler);
2961         if (rx_handler) {
2962                 if (pt_prev) {
2963                         ret = deliver_skb(skb, pt_prev, orig_dev);
2964                         pt_prev = NULL;
2965                 }
2966                 skb = rx_handler(skb);
2967                 if (!skb)
2968                         goto out;
2969         }
2970
2971         /*
2972          * Make sure frames received on VLAN interfaces stacked on
2973          * bonding interfaces still make their way to any base bonding
2974          * device that may have registered for a specific ptype.  The
2975          * handler may have to adjust skb->dev and orig_dev.
2976          */
2977         orig_or_bond = orig_dev;
2978         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2979             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2980                 orig_or_bond = vlan_dev_real_dev(skb->dev);
2981         }
2982
2983         type = skb->protocol;
2984         list_for_each_entry_rcu(ptype,
2985                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2986                 if (ptype->type == type && (ptype->dev == null_or_orig ||
2987                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
2988                      ptype->dev == orig_or_bond)) {
2989                         if (pt_prev)
2990                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2991                         pt_prev = ptype;
2992                 }
2993         }
2994
2995         if (pt_prev) {
2996                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2997         } else {
2998                 kfree_skb(skb);
2999                 /* Jamal, now you will not able to escape explaining
3000                  * me how you were going to use this. :-)
3001                  */
3002                 ret = NET_RX_DROP;
3003         }
3004
3005 out:
3006         rcu_read_unlock();
3007         return ret;
3008 }
3009
3010 /**
3011  *      netif_receive_skb - process receive buffer from network
3012  *      @skb: buffer to process
3013  *
3014  *      netif_receive_skb() is the main receive data processing function.
3015  *      It always succeeds. The buffer may be dropped during processing
3016  *      for congestion control or by the protocol layers.
3017  *
3018  *      This function may only be called from softirq context and interrupts
3019  *      should be enabled.
3020  *
3021  *      Return values (usually ignored):
3022  *      NET_RX_SUCCESS: no congestion
3023  *      NET_RX_DROP: packet was dropped
3024  */
3025 int netif_receive_skb(struct sk_buff *skb)
3026 {
3027         if (netdev_tstamp_prequeue)
3028                 net_timestamp_check(skb);
3029
3030         if (skb_defer_rx_timestamp(skb))
3031                 return NET_RX_SUCCESS;
3032
3033 #ifdef CONFIG_RPS
3034         {
3035                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3036                 int cpu, ret;
3037
3038                 rcu_read_lock();
3039
3040                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3041
3042                 if (cpu >= 0) {
3043                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3044                         rcu_read_unlock();
3045                 } else {
3046                         rcu_read_unlock();
3047                         ret = __netif_receive_skb(skb);
3048                 }
3049
3050                 return ret;
3051         }
3052 #else
3053         return __netif_receive_skb(skb);
3054 #endif
3055 }
3056 EXPORT_SYMBOL(netif_receive_skb);
3057
3058 /* Network device is going away, flush any packets still pending
3059  * Called with irqs disabled.
3060  */
3061 static void flush_backlog(void *arg)
3062 {
3063         struct net_device *dev = arg;
3064         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3065         struct sk_buff *skb, *tmp;
3066
3067         rps_lock(sd);
3068         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3069                 if (skb->dev == dev) {
3070                         __skb_unlink(skb, &sd->input_pkt_queue);
3071                         kfree_skb(skb);
3072                         input_queue_head_incr(sd);
3073                 }
3074         }
3075         rps_unlock(sd);
3076
3077         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3078                 if (skb->dev == dev) {
3079                         __skb_unlink(skb, &sd->process_queue);
3080                         kfree_skb(skb);
3081                         input_queue_head_incr(sd);
3082                 }
3083         }
3084 }
3085
3086 static int napi_gro_complete(struct sk_buff *skb)
3087 {
3088         struct packet_type *ptype;
3089         __be16 type = skb->protocol;
3090         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3091         int err = -ENOENT;
3092
3093         if (NAPI_GRO_CB(skb)->count == 1) {
3094                 skb_shinfo(skb)->gso_size = 0;
3095                 goto out;
3096         }
3097
3098         rcu_read_lock();
3099         list_for_each_entry_rcu(ptype, head, list) {
3100                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3101                         continue;
3102
3103                 err = ptype->gro_complete(skb);
3104                 break;
3105         }
3106         rcu_read_unlock();
3107
3108         if (err) {
3109                 WARN_ON(&ptype->list == head);
3110                 kfree_skb(skb);
3111                 return NET_RX_SUCCESS;
3112         }
3113
3114 out:
3115         return netif_receive_skb(skb);
3116 }
3117
3118 inline void napi_gro_flush(struct napi_struct *napi)
3119 {
3120         struct sk_buff *skb, *next;
3121
3122         for (skb = napi->gro_list; skb; skb = next) {
3123                 next = skb->next;
3124                 skb->next = NULL;
3125                 napi_gro_complete(skb);
3126         }
3127
3128         napi->gro_count = 0;
3129         napi->gro_list = NULL;
3130 }
3131 EXPORT_SYMBOL(napi_gro_flush);
3132
3133 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3134 {
3135         struct sk_buff **pp = NULL;
3136         struct packet_type *ptype;
3137         __be16 type = skb->protocol;
3138         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3139         int same_flow;
3140         int mac_len;
3141         enum gro_result ret;
3142
3143         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3144                 goto normal;
3145
3146         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3147                 goto normal;
3148
3149         rcu_read_lock();
3150         list_for_each_entry_rcu(ptype, head, list) {
3151                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3152                         continue;
3153
3154                 skb_set_network_header(skb, skb_gro_offset(skb));
3155                 mac_len = skb->network_header - skb->mac_header;
3156                 skb->mac_len = mac_len;
3157                 NAPI_GRO_CB(skb)->same_flow = 0;
3158                 NAPI_GRO_CB(skb)->flush = 0;
3159                 NAPI_GRO_CB(skb)->free = 0;
3160
3161                 pp = ptype->gro_receive(&napi->gro_list, skb);
3162                 break;
3163         }
3164         rcu_read_unlock();
3165
3166         if (&ptype->list == head)
3167                 goto normal;
3168
3169         same_flow = NAPI_GRO_CB(skb)->same_flow;
3170         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3171
3172         if (pp) {
3173                 struct sk_buff *nskb = *pp;
3174
3175                 *pp = nskb->next;
3176                 nskb->next = NULL;
3177                 napi_gro_complete(nskb);
3178                 napi->gro_count--;
3179         }
3180
3181         if (same_flow)
3182                 goto ok;
3183
3184         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3185                 goto normal;
3186
3187         napi->gro_count++;
3188         NAPI_GRO_CB(skb)->count = 1;
3189         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3190         skb->next = napi->gro_list;
3191         napi->gro_list = skb;
3192         ret = GRO_HELD;
3193
3194 pull:
3195         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3196                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3197
3198                 BUG_ON(skb->end - skb->tail < grow);
3199
3200                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3201
3202                 skb->tail += grow;
3203                 skb->data_len -= grow;
3204
3205                 skb_shinfo(skb)->frags[0].page_offset += grow;
3206                 skb_shinfo(skb)->frags[0].size -= grow;
3207
3208                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3209                         put_page(skb_shinfo(skb)->frags[0].page);
3210                         memmove(skb_shinfo(skb)->frags,
3211                                 skb_shinfo(skb)->frags + 1,
3212                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3213                 }
3214         }
3215
3216 ok:
3217         return ret;
3218
3219 normal:
3220         ret = GRO_NORMAL;
3221         goto pull;
3222 }
3223 EXPORT_SYMBOL(dev_gro_receive);
3224
3225 static inline gro_result_t
3226 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3227 {
3228         struct sk_buff *p;
3229
3230         for (p = napi->gro_list; p; p = p->next) {
3231                 unsigned long diffs;
3232
3233                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3234                 diffs |= compare_ether_header(skb_mac_header(p),
3235                                               skb_gro_mac_header(skb));
3236                 NAPI_GRO_CB(p)->same_flow = !diffs;
3237                 NAPI_GRO_CB(p)->flush = 0;
3238         }
3239
3240         return dev_gro_receive(napi, skb);
3241 }
3242
3243 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3244 {
3245         switch (ret) {
3246         case GRO_NORMAL:
3247                 if (netif_receive_skb(skb))
3248                         ret = GRO_DROP;
3249                 break;
3250
3251         case GRO_DROP:
3252         case GRO_MERGED_FREE:
3253                 kfree_skb(skb);
3254                 break;
3255
3256         case GRO_HELD:
3257         case GRO_MERGED:
3258                 break;
3259         }
3260
3261         return ret;
3262 }
3263 EXPORT_SYMBOL(napi_skb_finish);
3264
3265 void skb_gro_reset_offset(struct sk_buff *skb)
3266 {
3267         NAPI_GRO_CB(skb)->data_offset = 0;
3268         NAPI_GRO_CB(skb)->frag0 = NULL;
3269         NAPI_GRO_CB(skb)->frag0_len = 0;
3270
3271         if (skb->mac_header == skb->tail &&
3272             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3273                 NAPI_GRO_CB(skb)->frag0 =
3274                         page_address(skb_shinfo(skb)->frags[0].page) +
3275                         skb_shinfo(skb)->frags[0].page_offset;
3276                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3277         }
3278 }
3279 EXPORT_SYMBOL(skb_gro_reset_offset);
3280
3281 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3282 {
3283         skb_gro_reset_offset(skb);
3284
3285         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3286 }
3287 EXPORT_SYMBOL(napi_gro_receive);
3288
3289 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3290 {
3291         __skb_pull(skb, skb_headlen(skb));
3292         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3293
3294         napi->skb = skb;
3295 }
3296 EXPORT_SYMBOL(napi_reuse_skb);
3297
3298 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3299 {
3300         struct sk_buff *skb = napi->skb;
3301
3302         if (!skb) {
3303                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3304                 if (skb)
3305                         napi->skb = skb;
3306         }
3307         return skb;
3308 }
3309 EXPORT_SYMBOL(napi_get_frags);
3310
3311 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3312                                gro_result_t ret)
3313 {
3314         switch (ret) {
3315         case GRO_NORMAL:
3316         case GRO_HELD:
3317                 skb->protocol = eth_type_trans(skb, skb->dev);
3318
3319                 if (ret == GRO_HELD)
3320                         skb_gro_pull(skb, -ETH_HLEN);
3321                 else if (netif_receive_skb(skb))
3322                         ret = GRO_DROP;
3323                 break;
3324
3325         case GRO_DROP:
3326         case GRO_MERGED_FREE:
3327                 napi_reuse_skb(napi, skb);
3328                 break;
3329
3330         case GRO_MERGED:
3331                 break;
3332         }
3333
3334         return ret;
3335 }
3336 EXPORT_SYMBOL(napi_frags_finish);
3337
3338 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3339 {
3340         struct sk_buff *skb = napi->skb;
3341         struct ethhdr *eth;
3342         unsigned int hlen;
3343         unsigned int off;
3344
3345         napi->skb = NULL;
3346
3347         skb_reset_mac_header(skb);
3348         skb_gro_reset_offset(skb);
3349
3350         off = skb_gro_offset(skb);
3351         hlen = off + sizeof(*eth);
3352         eth = skb_gro_header_fast(skb, off);
3353         if (skb_gro_header_hard(skb, hlen)) {
3354                 eth = skb_gro_header_slow(skb, hlen, off);
3355                 if (unlikely(!eth)) {
3356                         napi_reuse_skb(napi, skb);
3357                         skb = NULL;
3358                         goto out;
3359                 }
3360         }
3361
3362         skb_gro_pull(skb, sizeof(*eth));
3363
3364         /*
3365          * This works because the only protocols we care about don't require
3366          * special handling.  We'll fix it up properly at the end.
3367          */
3368         skb->protocol = eth->h_proto;
3369
3370 out:
3371         return skb;
3372 }
3373 EXPORT_SYMBOL(napi_frags_skb);
3374
3375 gro_result_t napi_gro_frags(struct napi_struct *napi)
3376 {
3377         struct sk_buff *skb = napi_frags_skb(napi);
3378
3379         if (!skb)
3380                 return GRO_DROP;
3381
3382         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3383 }
3384 EXPORT_SYMBOL(napi_gro_frags);
3385
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;

        if (remsd) {
                /* Detach the list before interrupts are re-enabled. */
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                while (remsd) {
                        struct softnet_data *cur = remsd;

                        remsd = cur->rps_ipi_next;

                        if (cpu_online(cur->cpu))
                                __smp_call_function_single(cur->cpu,
                                                           &cur->csd, 0);
                }
                return;
        }
#endif
        local_irq_enable();
}
3413
/*
 * Poll handler for the backlog napi embedded in a softnet_data.
 * Hands up to @quota queued skbs to __netif_receive_skb() and returns
 * the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
        /* Check if we have pending IPIs; it's better to send them now
         * than to wait for the end of net_rx_action().
         */
        if (sd->rps_ipi_list) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
#endif
        napi->weight = weight_p;
        local_irq_disable();
        while (work < quota) {
                struct sk_buff *skb;
                unsigned int qlen;

                /*
                 * Drain process_queue; interrupts are enabled only while
                 * an skb is being delivered.
                 */
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_irq_enable();
                        __netif_receive_skb(skb);
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
                                /* Quota consumed: return with irqs enabled. */
                                local_irq_enable();
                                return work;
                        }
                }

                /* process_queue is empty: refill it from input_pkt_queue. */
                rps_lock(sd);
                qlen = skb_queue_len(&sd->input_pkt_queue);
                if (qlen)
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);

                if (qlen < quota - work) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * Only the current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we don't need an smp_mb() memory barrier.
                         */
                        list_del(&napi->poll_list);
                        napi->state = 0;

                        /* Shrink the quota so the loop stops after the spliced skbs. */
                        quota = work + qlen;
                }
                rps_unlock(sd);
        }
        local_irq_enable();

        return work;
}
3470
3471 /**
3472  * __napi_schedule - schedule for receive
3473  * @n: entry to schedule
3474  *
3475  * The entry's receive function will be scheduled to run
3476  */
3477 void __napi_schedule(struct napi_struct *n)
3478 {
3479         unsigned long flags;
3480
3481         local_irq_save(flags);
3482         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3483         local_irq_restore(flags);
3484 }
3485 EXPORT_SYMBOL(__napi_schedule);
3486
/*
 * Remove @n from the poll list it is linked on and clear
 * NAPI_STATE_SCHED. This function does no irq handling of its own;
 * napi_complete() in this file wraps it in local_irq_save()/restore().
 */
void __napi_complete(struct napi_struct *n)
{
        /* Completing a napi that is not scheduled is a bug. */
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
        /* The GRO list must already be empty at this point. */
        BUG_ON(n->gro_list);

        list_del(&n->poll_list);
        /* Barrier before the clear_bit() that drops NAPI_STATE_SCHED. */
        smp_mb__before_clear_bit();
        clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
3497
3498 void napi_complete(struct napi_struct *n)
3499 {
3500         unsigned long flags;
3501
3502         /*
3503          * don't let napi dequeue from the cpu poll list
3504          * just in case its running on a different cpu
3505          */
3506         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3507                 return;
3508
3509         napi_gro_flush(n);
3510         local_irq_save(flags);
3511         __napi_complete(n);
3512         local_irq_restore(flags);
3513 }
3514 EXPORT_SYMBOL(napi_complete);
3515
3516 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3517                     int (*poll)(struct napi_struct *, int), int weight)
3518 {
3519         INIT_LIST_HEAD(&napi->poll_list);
3520         napi->gro_count = 0;
3521         napi->gro_list = NULL;
3522         napi->skb = NULL;
3523         napi->poll = poll;
3524         napi->weight = weight;
3525         list_add(&napi->dev_list, &dev->napi_list);
3526         napi->dev = dev;
3527 #ifdef CONFIG_NETPOLL
3528         spin_lock_init(&napi->poll_lock);
3529         napi->poll_owner = -1;
3530 #endif
3531         set_bit(NAPI_STATE_SCHED, &napi->state);
3532 }
3533 EXPORT_SYMBOL(netif_napi_add);
3534
3535 void netif_napi_del(struct napi_struct *napi)
3536 {
3537         struct sk_buff *skb, *next;
3538
3539         list_del_init(&napi->dev_list);
3540         napi_free_frags(napi);
3541
3542         for (skb = napi->gro_list; skb; skb = next) {
3543                 next = skb->next;
3544                 skb->next = NULL;
3545                 kfree_skb(skb);
3546         }
3547
3548         napi->gro_list = NULL;
3549         napi->gro_count = 0;
3550 }
3551 EXPORT_SYMBOL(netif_napi_del);
3552
/*
 * Receive softirq handler: polls the napi instances queued on this
 * cpu's poll_list until the list is empty, the packet budget is used
 * up, or the time limit has passed. In the latter two cases
 * NET_RX_SOFTIRQ is raised again before returning.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(&sd->poll_list)) {
                struct napi_struct *n;
                int work, weight;

                /* If the softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which gives
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                        work = n->poll(n, weight);
                        trace_napi_poll(n);
                }

                /* A poll routine must never report more work than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n))) {
                                /* napi_complete() is called with irqs enabled. */
                                local_irq_enable();
                                napi_complete(n);
                                local_irq_disable();
                        } else
                                list_move_tail(&n->poll_list, &sd->poll_list);
                }

                netpoll_poll_unlock(have);
        }
out:
        /* Sends any pending RPS IPIs and re-enables local interrupts. */
        net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        /* Budget or time ran out: count the squeeze and reschedule ourselves. */
        sd->time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
3638
3639 static gifconf_func_t *gifconf_list[NPROTO];
3640
3641 /**
3642  *      register_gifconf        -       register a SIOCGIF handler
3643  *      @family: Address family
3644  *      @gifconf: Function handler
3645  *
3646  *      Register protocol dependent address dumping routines. The handler
3647  *      that is passed must not be freed or reused until it has been replaced
3648  *      by another handler.
3649  */
3650 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3651 {
3652         if (family >= NPROTO)
3653                 return -EINVAL;
3654         gifconf_list[family] = gifconf;
3655         return 0;
3656 }
3657 EXPORT_SYMBOL(register_gifconf);
3658
3659
3660 /*
3661  *      Map an interface index to its name (SIOCGIFNAME)
3662  */
3663
3664 /*
3665  *      We need this ioctl for efficient implementation of the
3666  *      if_indextoname() function required by the IPv6 API.  Without
3667  *      it, we would have to search all the interfaces to find a
3668  *      match.  --pb
3669  */
3670
3671 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3672 {
3673         struct net_device *dev;
3674         struct ifreq ifr;
3675
3676         /*
3677          *      Fetch the caller's info block.
3678          */
3679
3680         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3681                 return -EFAULT;
3682
3683         rcu_read_lock();
3684         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3685         if (!dev) {
3686                 rcu_read_unlock();
3687                 return -ENODEV;
3688         }
3689
3690         strcpy(ifr.ifr_name, dev->name);
3691         rcu_read_unlock();
3692
3693         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3694                 return -EFAULT;
3695         return 0;
3696 }
3697
3698 /*
3699  *      Perform a SIOCGIFCONF call. This structure will change
3700  *      size eventually, and there is nothing I can do about it.
3701  *      Thus we will need a 'compatibility mode'.
3702  */
3703
3704 static int dev_ifconf(struct net *net, char __user *arg)
3705 {
3706         struct ifconf ifc;
3707         struct net_device *dev;
3708         char __user *pos;
3709         int len;
3710         int total;
3711         int i;
3712
3713         /*
3714          *      Fetch the caller's info block.
3715          */
3716
3717         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3718                 return -EFAULT;
3719
3720         pos = ifc.ifc_buf;
3721         len = ifc.ifc_len;
3722
3723         /*
3724          *      Loop over the interfaces, and write an info block for each.
3725          */
3726
3727         total = 0;
3728         for_each_netdev(net, dev) {
3729                 for (i = 0; i < NPROTO; i++) {
3730                         if (gifconf_list[i]) {
3731                                 int done;
3732                                 if (!pos)
3733                                         done = gifconf_list[i](dev, NULL, 0);
3734                                 else
3735                                         done = gifconf_list[i](dev, pos + total,
3736                                                                len - total);
3737                                 if (done < 0)
3738                                         return -EFAULT;
3739                                 total += done;
3740                         }
3741                 }
3742         }
3743
3744         /*
3745          *      All done.  Write the updated control block back to the caller.
3746          */
3747         ifc.ifc_len = total;
3748
3749         /*
3750          *      Both BSD and Solaris return 0 here, so we do too.
3751          */
3752         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3753 }
3754
3755 #ifdef CONFIG_PROC_FS
3756 /*
3757  *      This is invoked by the /proc filesystem handler to display a device
3758  *      in detail.
3759  */
3760 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3761         __acquires(RCU)
3762 {
3763         struct net *net = seq_file_net(seq);
3764         loff_t off;
3765         struct net_device *dev;
3766
3767         rcu_read_lock();
3768         if (!*pos)
3769                 return SEQ_START_TOKEN;
3770
3771         off = 1;
3772         for_each_netdev_rcu(net, dev)
3773                 if (off++ == *pos)
3774                         return dev;
3775
3776         return NULL;
3777 }
3778
3779 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3780 {
3781         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3782                                   first_net_device(seq_file_net(seq)) :
3783                                   next_net_device((struct net_device *)v);
3784
3785         ++*pos;
3786         return rcu_dereference(dev);
3787 }
3788
/* seq_file stop callback: releases the RCU read lock taken in dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
3794
3795 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3796 {
3797         struct rtnl_link_stats64 temp;
3798         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3799
3800         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3801                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3802                    dev->name, stats->rx_bytes, stats->rx_packets,
3803                    stats->rx_errors,
3804                    stats->rx_dropped + stats->rx_missed_errors,
3805                    stats->rx_fifo_errors,
3806                    stats->rx_length_errors + stats->rx_over_errors +
3807                     stats->rx_crc_errors + stats->rx_frame_errors,
3808                    stats->rx_compressed, stats->multicast,
3809                    stats->tx_bytes, stats->tx_packets,
3810                    stats->tx_errors, stats->tx_dropped,
3811                    stats->tx_fifo_errors, stats->collisions,
3812                    stats->tx_carrier_errors +
3813                     stats->tx_aborted_errors +
3814                     stats->tx_window_errors +
3815                     stats->tx_heartbeat_errors,
3816                    stats->tx_compressed);
3817 }
3818
3819 /*
3820  *      Called from the PROCfs module. This now uses the new arbitrary sized
3821  *      /proc/net interface to create /proc/net/dev
3822  */
3823 static int dev_seq_show(struct seq_file *seq, void *v)
3824 {
3825         if (v == SEQ_START_TOKEN)
3826                 seq_puts(seq, "Inter-|   Receive                            "
3827                               "                    |  Transmit\n"
3828                               " face |bytes    packets errs drop fifo frame "
3829                               "compressed multicast|bytes    packets errs "
3830                               "drop fifo colls carrier compressed\n");
3831         else
3832                 dev_seq_printf_stats(seq, v);
3833         return 0;
3834 }
3835
3836 static struct softnet_data *softnet_get_online(loff_t *pos)
3837 {
3838         struct softnet_data *sd = NULL;
3839
3840         while (*pos < nr_cpu_ids)
3841                 if (cpu_online(*pos)) {
3842                         sd = &per_cpu(softnet_data, *pos);
3843                         break;
3844                 } else
3845                         ++*pos;
3846         return sd;
3847 }
3848
3849 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3850 {
3851         return softnet_get_online(pos);
3852 }
3853
3854 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3855 {
3856         ++*pos;
3857         return softnet_get_online(pos);
3858 }
3859
/* seq_file stop callback: nothing to release, softnet_seq_start() takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3863
3864 static int softnet_seq_show(struct seq_file *seq, void *v)
3865 {
3866         struct softnet_data *sd = v;
3867
3868         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3869                    sd->processed, sd->dropped, sd->time_squeeze, 0,
3870                    0, 0, 0, 0, /* was fastroute */
3871                    sd->cpu_collision, sd->received_rps);
3872         return 0;
3873 }
3874
/* seq_file iterator callbacks installed by dev_seq_open(). */
static const struct seq_operations dev_seq_ops = {
        .start = dev_seq_start,
        .next  = dev_seq_next,
        .stop  = dev_seq_stop,
        .show  = dev_seq_show,
};
3881
3882 static int dev_seq_open(struct inode *inode, struct file *file)
3883 {
3884         return seq_open_net(inode, file, &dev_seq_ops,
3885                             sizeof(struct seq_net_private));
3886 }
3887
/* File operations registered for the "dev" proc entry in dev_proc_net_init(). */
static const struct file_operations dev_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = dev_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};

/* seq_file iterator callbacks installed by softnet_seq_open(). */
static const struct seq_operations softnet_seq_ops = {
        .start = softnet_seq_start,
        .next  = softnet_seq_next,
        .stop  = softnet_seq_stop,
        .show  = softnet_seq_show,
};
3902
3903 static int softnet_seq_open(struct inode *inode, struct file *file)
3904 {
3905         return seq_open(file, &softnet_seq_ops);
3906 }
3907
/* File operations registered for the "softnet_stat" proc entry in dev_proc_net_init(). */
static const struct file_operations softnet_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = softnet_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
3915
3916 static void *ptype_get_idx(loff_t pos)
3917 {
3918         struct packet_type *pt = NULL;
3919         loff_t i = 0;
3920         int t;
3921
3922         list_for_each_entry_rcu(pt, &ptype_all, list) {
3923                 if (i == pos)
3924                         return pt;
3925                 ++i;
3926         }
3927
3928         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3929                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3930                         if (i == pos)
3931                                 return pt;
3932                         ++i;
3933                 }
3934         }
3935         return NULL;
3936 }
3937
3938 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3939         __acquires(RCU)
3940 {
3941         rcu_read_lock();
3942         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3943 }
3944
3945 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3946 {
3947         struct packet_type *pt;
3948         struct list_head *nxt;
3949         int hash;
3950
3951         ++*pos;
3952         if (v == SEQ_START_TOKEN)
3953                 return ptype_get_idx(0);
3954
3955         pt = v;
3956         nxt = pt->list.next;
3957         if (pt->type == htons(ETH_P_ALL)) {
3958                 if (nxt != &ptype_all)
3959                         goto found;
3960                 hash = 0;
3961                 nxt = ptype_base[0].next;
3962         } else
3963                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3964
3965         while (nxt == &ptype_base[hash]) {
3966                 if (++hash >= PTYPE_HASH_SIZE)
3967                         return NULL;
3968                 nxt = ptype_base[hash].next;
3969         }
3970 found:
3971         return list_entry(nxt, struct packet_type, list);
3972 }
3973
/* seq_file stop callback: releases the RCU read lock taken in ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
3979
3980 static int ptype_seq_show(struct seq_file *seq, void *v)
3981 {
3982         struct packet_type *pt = v;
3983
3984         if (v == SEQ_START_TOKEN)
3985                 seq_puts(seq, "Type Device      Function\n");
3986         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3987                 if (pt->type == htons(ETH_P_ALL))
3988                         seq_puts(seq, "ALL ");
3989                 else
3990                         seq_printf(seq, "%04x", ntohs(pt->type));
3991
3992                 seq_printf(seq, " %-8s %pF\n",
3993                            pt->dev ? pt->dev->name : "", pt->func);
3994         }
3995
3996         return 0;
3997 }
3998
/* seq_file iterator callbacks installed by ptype_seq_open(). */
static const struct seq_operations ptype_seq_ops = {
        .start = ptype_seq_start,
        .next  = ptype_seq_next,
        .stop  = ptype_seq_stop,
        .show  = ptype_seq_show,
};
4005
4006 static int ptype_seq_open(struct inode *inode, struct file *file)
4007 {
4008         return seq_open_net(inode, file, &ptype_seq_ops,
4009                         sizeof(struct seq_net_private));
4010 }
4011
/* File operations registered for the "ptype" proc entry in dev_proc_net_init(). */
static const struct file_operations ptype_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = ptype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
4019
4020
4021 static int __net_init dev_proc_net_init(struct net *net)
4022 {
4023         int rc = -ENOMEM;
4024
4025         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4026                 goto out;
4027         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4028                 goto out_dev;
4029         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4030                 goto out_softnet;
4031
4032         if (wext_proc_init(net))
4033                 goto out_ptype;
4034         rc = 0;
4035 out:
4036         return rc;
4037 out_ptype:
4038         proc_net_remove(net, "ptype");
4039 out_softnet:
4040         proc_net_remove(net, "softnet_stat");
4041 out_dev:
4042         proc_net_remove(net, "dev");
4043         goto out;
4044 }
4045
/*
 * Per-namespace exit: undo dev_proc_net_init() in reverse order, wext
 * first and then the three proc entries.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
        wext_proc_exit(net);

        proc_net_remove(net, "ptype");
        proc_net_remove(net, "softnet_stat");
        proc_net_remove(net, "dev");
}
4054
/* Per-namespace hooks registered by dev_proc_init(). */
static struct pernet_operations __net_initdata dev_proc_ops = {
        .init = dev_proc_net_init,
        .exit = dev_proc_net_exit,
};
4059
4060 static int __init dev_proc_init(void)
4061 {
4062         return register_pernet_subsys(&dev_proc_ops);
4063 }
4064 #else
4065 #define dev_proc_init() 0
4066 #endif  /* CONFIG_PROC_FS */
4067
4068
4069 /**
4070  *      netdev_set_master       -       set up master/slave pair
4071  *      @slave: slave device
4072  *      @master: new master device
4073  *
4074  *      Changes the master device of the slave. Pass %NULL to break the
4075  *      bonding. The caller must hold the RTNL semaphore. On a failure
4076  *      a negative errno code is returned. On success the reference counts
4077  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4078  *      function returns zero.
4079  */
4080 int netdev_set_master(struct net_device *slave, struct net_device *master)
4081 {
4082         struct net_device *old = slave->master;
4083
4084         ASSERT_RTNL();
4085
4086         if (master) {
4087                 if (old)
4088                         return -EBUSY;
4089                 dev_hold(master);
4090         }
4091
4092         slave->master = master;
4093
4094         if (old) {
4095                 synchronize_net();
4096                 dev_put(old);
4097         }
4098         if (master)
4099                 slave->flags |= IFF_SLAVE;
4100         else
4101                 slave->flags &= ~IFF_SLAVE;
4102
4103         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4104         return 0;
4105 }
4106 EXPORT_SYMBOL(netdev_set_master);
4107
4108 static void dev_change_rx_flags(struct net_device *dev, int flags)
4109 {
4110         const struct net_device_ops *ops = dev->netdev_ops;
4111
4112         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4113                 ops->ndo_change_rx_flags(dev, flags);
4114 }
4115
4116 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4117 {
4118         unsigned short old_flags = dev->flags;
4119         uid_t uid;
4120         gid_t gid;
4121
4122         ASSERT_RTNL();
4123
4124         dev->flags |= IFF_PROMISC;
4125         dev->promiscuity += inc;
4126         if (dev->promiscuity == 0) {
4127                 /*
4128                  * Avoid overflow.
4129                  * If inc causes overflow, untouch promisc and return error.
4130                  */
4131                 if (inc < 0)
4132                         dev->flags &= ~IFF_PROMISC;
4133                 else {
4134                         dev->promiscuity -= inc;
4135                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4136                                 "set promiscuity failed, promiscuity feature "
4137                                 "of device might be broken.\n", dev->name);
4138                         return -EOVERFLOW;
4139                 }
4140         }
4141         if (dev->flags != old_flags) {
4142                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4143                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4144                                                                "left");
4145                 if (audit_enabled) {
4146                         current_uid_gid(&uid, &gid);
4147                         audit_log(current->audit_context, GFP_ATOMIC,
4148                                 AUDIT_ANOM_PROMISCUOUS,
4149                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4150                                 dev->name, (dev->flags & IFF_PROMISC),
4151                                 (old_flags & IFF_PROMISC),
4152                                 audit_get_loginuid(current),
4153                                 uid, gid,
4154                                 audit_get_sessionid(current));
4155                 }
4156
4157                 dev_change_rx_flags(dev, IFF_PROMISC);
4158         }
4159         return 0;
4160 }
4161
4162 /**
4163  *      dev_set_promiscuity     - update promiscuity count on a device
4164  *      @dev: device
4165  *      @inc: modifier
4166  *
4167  *      Add or remove promiscuity from a device. While the count in the device
4168  *      remains above zero the interface remains promiscuous. Once it hits zero
4169  *      the device reverts back to normal filtering operation. A negative inc
4170  *      value is used to drop promiscuity on the device.
4171  *      Return 0 if successful or a negative errno code on error.
4172  */
4173 int dev_set_promiscuity(struct net_device *dev, int inc)
4174 {
4175         unsigned short old_flags = dev->flags;
4176         int err;
4177
4178         err = __dev_set_promiscuity(dev, inc);
4179         if (err < 0)
4180                 return err;
4181         if (dev->flags != old_flags)
4182                 dev_set_rx_mode(dev);
4183         return err;
4184 }
4185 EXPORT_SYMBOL(dev_set_promiscuity);
4186
4187 /**
4188  *      dev_set_allmulti        - update allmulti count on a device
4189  *      @dev: device
4190  *      @inc: modifier
4191  *
4192  *      Add or remove reception of all multicast frames to a device. While the
4193  *      count in the device remains above zero the interface remains listening
4194  *      to all interfaces. Once it hits zero the device reverts back to normal
4195  *      filtering operation. A negative @inc value is used to drop the counter
4196  *      when releasing a resource needing all multicasts.
4197  *      Return 0 if successful or a negative errno code on error.
4198  */
4199
4200 int dev_set_allmulti(struct net_device *dev, int inc)
4201 {
4202         unsigned short old_flags = dev->flags;
4203
4204         ASSERT_RTNL();
4205
4206         dev->flags |= IFF_ALLMULTI;
4207         dev->allmulti += inc;
4208         if (dev->allmulti == 0) {
4209                 /*
4210                  * Avoid overflow.
4211                  * If inc causes overflow, untouch allmulti and return error.
4212                  */
4213                 if (inc < 0)
4214                         dev->flags &= ~IFF_ALLMULTI;
4215                 else {
4216                         dev->allmulti -= inc;
4217                         printk(KERN_WARNING "%s: allmulti touches roof, "
4218                                 "set allmulti failed, allmulti feature of "
4219                                 "device might be broken.\n", dev->name);
4220                         return -EOVERFLOW;
4221                 }
4222         }
4223         if (dev->flags ^ old_flags) {
4224                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4225                 dev_set_rx_mode(dev);
4226         }
4227         return 0;
4228 }
4229 EXPORT_SYMBOL(dev_set_allmulti);
4230
4231 /*
4232  *      Upload unicast and multicast address lists to device and
4233  *      configure RX filtering. When the device doesn't support unicast
4234  *      filtering it is put in promiscuous mode while unicast addresses
4235  *      are present.
4236  */
4237 void __dev_set_rx_mode(struct net_device *dev)
4238 {
4239         const struct net_device_ops *ops = dev->netdev_ops;
4240
4241         /* dev_open will call this function so the list will stay sane. */
4242         if (!(dev->flags&IFF_UP))
4243                 return;
4244
4245         if (!netif_device_present(dev))
4246                 return;
4247
4248         if (ops->ndo_set_rx_mode)
4249                 ops->ndo_set_rx_mode(dev);
4250         else {
4251                 /* Unicast addresses changes may only happen under the rtnl,
4252                  * therefore calling __dev_set_promiscuity here is safe.
4253                  */
4254                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4255                         __dev_set_promiscuity(dev, 1);
4256                         dev->uc_promisc = 1;
4257                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4258                         __dev_set_promiscuity(dev, -1);
4259                         dev->uc_promisc = 0;
4260                 }
4261
4262                 if (ops->ndo_set_multicast_list)
4263                         ops->ndo_set_multicast_list(dev);
4264         }
4265 }
4266
/* Locked wrapper: runs __dev_set_rx_mode() with the device address lock held (BH variant). */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
4273
4274 /**
4275  *      dev_get_flags - get flags reported to userspace
4276  *      @dev: device
4277  *
4278  *      Get the combination of flag bits exported through APIs to userspace.
4279  */
4280 unsigned dev_get_flags(const struct net_device *dev)
4281 {
4282         unsigned flags;
4283
4284         flags = (dev->flags & ~(IFF_PROMISC |
4285                                 IFF_ALLMULTI |
4286                                 IFF_RUNNING |
4287                                 IFF_LOWER_UP |
4288                                 IFF_DORMANT)) |
4289                 (dev->gflags & (IFF_PROMISC |
4290                                 IFF_ALLMULTI));
4291
4292         if (netif_running(dev)) {
4293                 if (netif_oper_up(dev))
4294                         flags |= IFF_RUNNING;
4295                 if (netif_carrier_ok(dev))
4296                         flags |= IFF_LOWER_UP;
4297                 if (netif_dormant(dev))
4298                         flags |= IFF_DORMANT;
4299         }
4300
4301         return flags;
4302 }
4303 EXPORT_SYMBOL(dev_get_flags);
4304
/*
 * Apply the userspace-format @flags to @dev: store the directly settable
 * bits, open or close the device when IFF_UP differs, and synchronise
 * the promiscuity and allmulti counters with the requested
 * IFF_PROMISC / IFF_ALLMULTI bits. The caller must hold the RTNL
 * semaphore. Returns 0, or the result of the open/close call.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
        int old_flags = dev->flags;
        int ret;

        ASSERT_RTNL();

        /*
         *      Set the flags on our device. Only the first group of bits
         *      is taken from the caller; IFF_UP, IFF_VOLATILE, IFF_PROMISC
         *      and IFF_ALLMULTI keep their current value here.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         *      Load in the correct multicast list now the flags have changed.
         */

        if ((old_flags ^ flags) & IFF_MULTICAST)
                dev_change_rx_flags(dev, IFF_MULTICAST);

        dev_set_rx_mode(dev);

        /*
         *      Have we downed the interface? We handle IFF_UP ourselves
         *      according to user attempts to set it, rather than blindly
         *      setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

                if (!ret)
                        dev_set_rx_mode(dev);
        }

        /* gflags records the user's request; the counter follows it. */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? 1 : -1;

                dev->gflags ^= IFF_PROMISC;
                dev_set_promiscuity(dev, inc);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
           is important. Some (broken) drivers set IFF_PROMISC, when
           IFF_ALLMULTI is requested not asking us and not reporting.
         */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

                dev->gflags ^= IFF_ALLMULTI;
                dev_set_allmulti(dev, inc);
        }

        return ret;
}
4365
4366 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4367 {
4368         unsigned int changes = dev->flags ^ old_flags;
4369
4370         if (changes & IFF_UP) {
4371                 if (dev->flags & IFF_UP)
4372                         call_netdevice_notifiers(NETDEV_UP, dev);
4373                 else
4374                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4375         }
4376
4377         if (dev->flags & IFF_UP &&
4378             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4379                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4380 }
4381
4382 /**
4383  *      dev_change_flags - change device settings
4384  *      @dev: device
4385  *      @flags: device state flags
4386  *
4387  *      Change settings on device based state flags. The flags are
4388  *      in the userspace exported format.
4389  */
4390 int dev_change_flags(struct net_device *dev, unsigned flags)
4391 {
4392         int ret, changes;
4393         int old_flags = dev->flags;
4394
4395         ret = __dev_change_flags(dev, flags);
4396         if (ret < 0)
4397                 return ret;
4398
4399         changes = old_flags ^ dev->flags;
4400         if (changes)
4401                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4402
4403         __dev_notify_flags(dev, old_flags);
4404         return ret;
4405 }
4406 EXPORT_SYMBOL(dev_change_flags);
4407
4408 /**
4409  *      dev_set_mtu - Change maximum transfer unit
4410  *      @dev: device
4411  *      @new_mtu: new transfer unit
4412  *
4413  *      Change the maximum transfer size of the network device.
4414  */
4415 int dev_set_mtu(struct net_device *dev, int new_mtu)
4416 {
4417         const struct net_device_ops *ops = dev->netdev_ops;
4418         int err;
4419
4420         if (new_mtu == dev->mtu)
4421                 return 0;
4422
4423         /*      MTU must be positive.    */
4424         if (new_mtu < 0)
4425                 return -EINVAL;
4426
4427         if (!netif_device_present(dev))
4428                 return -ENODEV;
4429
4430         err = 0;
4431         if (ops->ndo_change_mtu)
4432                 err = ops->ndo_change_mtu(dev, new_mtu);
4433         else
4434                 dev->mtu = new_mtu;
4435
4436         if (!err && dev->flags & IFF_UP)
4437                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4438         return err;
4439 }
4440 EXPORT_SYMBOL(dev_set_mtu);
4441
4442 /**
4443  *      dev_set_mac_address - Change Media Access Control Address
4444  *      @dev: device
4445  *      @sa: new address
4446  *
4447  *      Change the hardware (MAC) address of the device
4448  */
4449 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4450 {
4451         const struct net_device_ops *ops = dev->netdev_ops;
4452         int err;
4453
4454         if (!ops->ndo_set_mac_address)
4455                 return -EOPNOTSUPP;
4456         if (sa->sa_family != dev->type)
4457                 return -EINVAL;
4458         if (!netif_device_present(dev))
4459                 return -ENODEV;
4460         err = ops->ndo_set_mac_address(dev, sa);
4461         if (!err)
4462                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4463         return err;
4464 }
4465 EXPORT_SYMBOL(dev_set_mac_address);
4466
4467 /*
4468  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4469  */
4470 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4471 {
4472         int err;
4473         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4474
4475         if (!dev)
4476                 return -ENODEV;
4477
4478         switch (cmd) {
4479         case SIOCGIFFLAGS:      /* Get interface flags */
4480                 ifr->ifr_flags = (short) dev_get_flags(dev);
4481                 return 0;
4482
4483         case SIOCGIFMETRIC:     /* Get the metric on the interface
4484                                    (currently unused) */
4485                 ifr->ifr_metric = 0;
4486                 return 0;
4487
4488         case SIOCGIFMTU:        /* Get the MTU of a device */
4489                 ifr->ifr_mtu = dev->mtu;
4490                 return 0;
4491
4492         case SIOCGIFHWADDR:
4493                 if (!dev->addr_len)
4494                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4495                 else
4496                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4497                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4498                 ifr->ifr_hwaddr.sa_family = dev->type;
4499                 return 0;
4500
4501         case SIOCGIFSLAVE:
4502                 err = -EINVAL;
4503                 break;
4504
4505         case SIOCGIFMAP:
4506                 ifr->ifr_map.mem_start = dev->mem_start;
4507                 ifr->ifr_map.mem_end   = dev->mem_end;
4508                 ifr->ifr_map.base_addr = dev->base_addr;
4509                 ifr->ifr_map.irq       = dev->irq;
4510                 ifr->ifr_map.dma       = dev->dma;
4511                 ifr->ifr_map.port      = dev->if_port;
4512                 return 0;
4513
4514         case SIOCGIFINDEX:
4515                 ifr->ifr_ifindex = dev->ifindex;
4516                 return 0;
4517
4518         case SIOCGIFTXQLEN:
4519                 ifr->ifr_qlen = dev->tx_queue_len;
4520                 return 0;
4521
4522         default:
4523                 /* dev_ioctl() should ensure this case
4524                  * is never reached
4525                  */
4526                 WARN_ON(1);
4527                 err = -EINVAL;
4528                 break;
4529
4530         }
4531         return err;
4532 }
4533
4534 /*
4535  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4536  */
4537 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4538 {
4539         int err;
4540         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4541         const struct net_device_ops *ops;
4542
4543         if (!dev)
4544                 return -ENODEV;
4545
4546         ops = dev->netdev_ops;
4547
4548         switch (cmd) {
4549         case SIOCSIFFLAGS:      /* Set interface flags */
4550                 return dev_change_flags(dev, ifr->ifr_flags);
4551
4552         case SIOCSIFMETRIC:     /* Set the metric on the interface
4553                                    (currently unused) */
4554                 return -EOPNOTSUPP;
4555
4556         case SIOCSIFMTU:        /* Set the MTU of a device */
4557                 return dev_set_mtu(dev, ifr->ifr_mtu);
4558
4559         case SIOCSIFHWADDR:
4560                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4561
4562         case SIOCSIFHWBROADCAST:
4563                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4564                         return -EINVAL;
4565                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4566                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4567                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4568                 return 0;
4569
4570         case SIOCSIFMAP:
4571                 if (ops->ndo_set_config) {
4572                         if (!netif_device_present(dev))
4573                                 return -ENODEV;
4574                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4575                 }
4576                 return -EOPNOTSUPP;
4577
4578         case SIOCADDMULTI:
4579                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4580                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4581                         return -EINVAL;
4582                 if (!netif_device_present(dev))
4583                         return -ENODEV;
4584                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4585
4586         case SIOCDELMULTI:
4587                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4588                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4589                         return -EINVAL;
4590                 if (!netif_device_present(dev))
4591                         return -ENODEV;
4592                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4593
4594         case SIOCSIFTXQLEN:
4595                 if (ifr->ifr_qlen < 0)
4596                         return -EINVAL;
4597                 dev->tx_queue_len = ifr->ifr_qlen;
4598                 return 0;
4599
4600         case SIOCSIFNAME:
4601                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4602                 return dev_change_name(dev, ifr->ifr_newname);
4603
4604         /*
4605          *      Unknown or private ioctl
4606          */
4607         default:
4608                 if ((cmd >= SIOCDEVPRIVATE &&
4609                     cmd <= SIOCDEVPRIVATE + 15) ||
4610                     cmd == SIOCBONDENSLAVE ||
4611                     cmd == SIOCBONDRELEASE ||
4612                     cmd == SIOCBONDSETHWADDR ||
4613                     cmd == SIOCBONDSLAVEINFOQUERY ||
4614                     cmd == SIOCBONDINFOQUERY ||
4615                     cmd == SIOCBONDCHANGEACTIVE ||
4616                     cmd == SIOCGMIIPHY ||
4617                     cmd == SIOCGMIIREG ||
4618                     cmd == SIOCSMIIREG ||
4619                     cmd == SIOCBRADDIF ||
4620                     cmd == SIOCBRDELIF ||
4621                     cmd == SIOCSHWTSTAMP ||
4622                     cmd == SIOCWANDEV) {
4623                         err = -EOPNOTSUPP;
4624                         if (ops->ndo_do_ioctl) {
4625                                 if (netif_device_present(dev))
4626                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4627                                 else
4628                                         err = -ENODEV;
4629                         }
4630                 } else
4631                         err = -EINVAL;
4632
4633         }
4634         return err;
4635 }
4636
4637 /*
4638  *      This function handles all "interface"-type I/O control requests. The actual
4639  *      'doing' part of this is dev_ifsioc above.
4640  */
4641
4642 /**
4643  *      dev_ioctl       -       network device ioctl
4644  *      @net: the applicable net namespace
4645  *      @cmd: command to issue
4646  *      @arg: pointer to a struct ifreq in user space
4647  *
4648  *      Issue ioctl functions to devices. This is normally called by the
4649  *      user space syscall interfaces but can sometimes be useful for
4650  *      other purposes. The return value is the return from the syscall if
4651  *      positive or a negative errno code on error.
4652  */
4653
4654 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4655 {
4656         struct ifreq ifr;
4657         int ret;
4658         char *colon;
4659
4660         /* One special case: SIOCGIFCONF takes ifconf argument
4661            and requires shared lock, because it sleeps writing
4662            to user space.
4663          */
4664
4665         if (cmd == SIOCGIFCONF) {
4666                 rtnl_lock();
4667                 ret = dev_ifconf(net, (char __user *) arg);
4668                 rtnl_unlock();
4669                 return ret;
4670         }
4671         if (cmd == SIOCGIFNAME)
4672                 return dev_ifname(net, (struct ifreq __user *)arg);
4673
4674         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4675                 return -EFAULT;
4676
4677         ifr.ifr_name[IFNAMSIZ-1] = 0;
4678
4679         colon = strchr(ifr.ifr_name, ':');
4680         if (colon)
4681                 *colon = 0;
4682
4683         /*
4684          *      See which interface the caller is talking about.
4685          */
4686
4687         switch (cmd) {
4688         /*
4689          *      These ioctl calls:
4690          *      - can be done by all.
4691          *      - atomic and do not require locking.
4692          *      - return a value
4693          */
4694         case SIOCGIFFLAGS:
4695         case SIOCGIFMETRIC:
4696         case SIOCGIFMTU:
4697         case SIOCGIFHWADDR:
4698         case SIOCGIFSLAVE:
4699         case SIOCGIFMAP:
4700         case SIOCGIFINDEX:
4701         case SIOCGIFTXQLEN:
4702                 dev_load(net, ifr.ifr_name);
4703                 rcu_read_lock();
4704                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4705                 rcu_read_unlock();
4706                 if (!ret) {
4707                         if (colon)
4708                                 *colon = ':';
4709                         if (copy_to_user(arg, &ifr,
4710                                          sizeof(struct ifreq)))
4711                                 ret = -EFAULT;
4712                 }
4713                 return ret;
4714
4715         case SIOCETHTOOL:
4716                 dev_load(net, ifr.ifr_name);
4717                 rtnl_lock();
4718                 ret = dev_ethtool(net, &ifr);
4719                 rtnl_unlock();
4720                 if (!ret) {
4721                         if (colon)
4722                                 *colon = ':';
4723                         if (copy_to_user(arg, &ifr,
4724                                          sizeof(struct ifreq)))
4725                                 ret = -EFAULT;
4726                 }
4727                 return ret;
4728
4729         /*
4730          *      These ioctl calls:
4731          *      - require superuser power.
4732          *      - require strict serialization.
4733          *      - return a value
4734          */
4735         case SIOCGMIIPHY:
4736         case SIOCGMIIREG:
4737         case SIOCSIFNAME:
4738                 if (!capable(CAP_NET_ADMIN))
4739                         return -EPERM;
4740                 dev_load(net, ifr.ifr_name);
4741                 rtnl_lock();
4742                 ret = dev_ifsioc(net, &ifr, cmd);
4743                 rtnl_unlock();
4744                 if (!ret) {
4745                         if (colon)
4746                                 *colon = ':';
4747                         if (copy_to_user(arg, &ifr,
4748                                          sizeof(struct ifreq)))
4749                                 ret = -EFAULT;
4750                 }
4751                 return ret;
4752
4753         /*
4754          *      These ioctl calls:
4755          *      - require superuser power.
4756          *      - require strict serialization.
4757          *      - do not return a value
4758          */
4759         case SIOCSIFFLAGS:
4760         case SIOCSIFMETRIC:
4761         case SIOCSIFMTU:
4762         case SIOCSIFMAP:
4763         case SIOCSIFHWADDR:
4764         case SIOCSIFSLAVE:
4765         case SIOCADDMULTI:
4766         case SIOCDELMULTI:
4767         case SIOCSIFHWBROADCAST:
4768         case SIOCSIFTXQLEN:
4769         case SIOCSMIIREG:
4770         case SIOCBONDENSLAVE:
4771         case SIOCBONDRELEASE:
4772         case SIOCBONDSETHWADDR:
4773         case SIOCBONDCHANGEACTIVE:
4774         case SIOCBRADDIF:
4775         case SIOCBRDELIF:
4776         case SIOCSHWTSTAMP:
4777                 if (!capable(CAP_NET_ADMIN))
4778                         return -EPERM;
4779                 /* fall through */
4780         case SIOCBONDSLAVEINFOQUERY:
4781         case SIOCBONDINFOQUERY:
4782                 dev_load(net, ifr.ifr_name);
4783                 rtnl_lock();
4784                 ret = dev_ifsioc(net, &ifr, cmd);
4785                 rtnl_unlock();
4786                 return ret;
4787
4788         case SIOCGIFMEM:
4789                 /* Get the per device memory space. We can add this but
4790                  * currently do not support it */
4791         case SIOCSIFMEM:
4792                 /* Set the per device memory buffer space.
4793                  * Not applicable in our case */
4794         case SIOCSIFLINK:
4795                 return -EINVAL;
4796
4797         /*
4798          *      Unknown or private ioctl.
4799          */
4800         default:
4801                 if (cmd == SIOCWANDEV ||
4802                     (cmd >= SIOCDEVPRIVATE &&
4803                      cmd <= SIOCDEVPRIVATE + 15)) {
4804                         dev_load(net, ifr.ifr_name);
4805                         rtnl_lock();
4806                         ret = dev_ifsioc(net, &ifr, cmd);
4807                         rtnl_unlock();
4808                         if (!ret && copy_to_user(arg, &ifr,
4809                                                  sizeof(struct ifreq)))
4810                                 ret = -EFAULT;
4811                         return ret;
4812                 }
4813                 /* Take care of Wireless Extensions */
4814                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4815                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4816                 return -EINVAL;
4817         }
4818 }
4819
4820
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Advances a function-static counter, restarting at 1 whenever it
 *      stops being positive, until it reaches a value that
 *      __dev_get_by_index() does not find in @net, and returns that value.
 *      The caller must hold the rtnl semaphore or the dev_base_lock to be
 *      sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
4839
4840 /* Delayed registration/unregisteration */
4841 static LIST_HEAD(net_todo_list);
4842
4843 static void net_set_todo(struct net_device *dev)
4844 {
4845         list_add_tail(&dev->todo_list, &net_todo_list);
4846 }
4847
4848 static void rollback_registered_many(struct list_head *head)
4849 {
4850         struct net_device *dev, *tmp;
4851
4852         BUG_ON(dev_boot_phase);
4853         ASSERT_RTNL();
4854
4855         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4856                 /* Some devices call without registering
4857                  * for initialization unwind. Remove those
4858                  * devices and proceed with the remaining.
4859                  */
4860                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4861                         pr_debug("unregister_netdevice: device %s/%p never "
4862                                  "was registered\n", dev->name, dev);
4863
4864                         WARN_ON(1);
4865                         list_del(&dev->unreg_list);
4866                         continue;
4867                 }
4868
4869                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4870
4871                 /* If device is running, close it first. */
4872                 dev_close(dev);
4873
4874                 /* And unlink it from device chain. */
4875                 unlist_netdevice(dev);
4876
4877                 dev->reg_state = NETREG_UNREGISTERING;
4878         }
4879
4880         synchronize_net();
4881
4882         list_for_each_entry(dev, head, unreg_list) {
4883                 /* Shutdown queueing discipline. */
4884                 dev_shutdown(dev);
4885
4886
4887                 /* Notify protocols, that we are about to destroy
4888                    this device. They should clean all the things.
4889                 */
4890                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4891
4892                 if (!dev->rtnl_link_ops ||
4893                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4894                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4895
4896                 /*
4897                  *      Flush the unicast and multicast chains
4898                  */
4899                 dev_uc_flush(dev);
4900                 dev_mc_flush(dev);
4901
4902                 if (dev->netdev_ops->ndo_uninit)
4903                         dev->netdev_ops->ndo_uninit(dev);
4904
4905                 /* Notifier chain MUST detach us from master device. */
4906                 WARN_ON(dev->master);
4907
4908                 /* Remove entries from kobject tree */
4909                 netdev_unregister_kobject(dev);
4910         }
4911
4912         /* Process any work delayed until the end of the batch */
4913         dev = list_first_entry(head, struct net_device, unreg_list);
4914         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4915
4916         rcu_barrier();
4917
4918         list_for_each_entry(dev, head, unreg_list)
4919                 dev_put(dev);
4920 }
4921
4922 static void rollback_registered(struct net_device *dev)
4923 {
4924         LIST_HEAD(single);
4925
4926         list_add(&dev->unreg_list, &single);
4927         rollback_registered_many(&single);
4928 }
4929
4930 static void __netdev_init_queue_locks_one(struct net_device *dev,
4931                                           struct netdev_queue *dev_queue,
4932                                           void *_unused)
4933 {
4934         spin_lock_init(&dev_queue->_xmit_lock);
4935         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4936         dev_queue->xmit_lock_owner = -1;
4937 }
4938
4939 static void netdev_init_queue_locks(struct net_device *dev)
4940 {
4941         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4942 }
4943
4944 unsigned long netdev_fix_features(unsigned long features, const char *name)
4945 {
4946         /* Fix illegal SG+CSUM combinations. */
4947         if ((features & NETIF_F_SG) &&
4948             !(features & NETIF_F_ALL_CSUM)) {
4949                 if (name)
4950                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4951                                "checksum feature.\n", name);
4952                 features &= ~NETIF_F_SG;
4953         }
4954
4955         /* TSO requires that SG is present as well. */
4956         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4957                 if (name)
4958                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4959                                "SG feature.\n", name);
4960                 features &= ~NETIF_F_TSO;
4961         }
4962
4963         if (features & NETIF_F_UFO) {
4964                 if (!(features & NETIF_F_GEN_CSUM)) {
4965                         if (name)
4966                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4967                                        "since no NETIF_F_HW_CSUM feature.\n",
4968                                        name);
4969                         features &= ~NETIF_F_UFO;
4970                 }
4971
4972                 if (!(features & NETIF_F_SG)) {
4973                         if (name)
4974                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4975                                        "since no NETIF_F_SG feature.\n", name);
4976                         features &= ~NETIF_F_UFO;
4977                 }
4978         }
4979
4980         return features;
4981 }
4982 EXPORT_SYMBOL(netdev_fix_features);
4983
4984 /**
4985  *      netif_stacked_transfer_operstate -      transfer operstate
4986  *      @rootdev: the root or lower level device to transfer state from
4987  *      @dev: the device to transfer operstate to
4988  *
4989  *      Transfer operational state from root to device. This is normally
4990  *      called when a stacking relationship exists between the root
4991  *      device and the device(a leaf device).
4992  */
4993 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4994                                         struct net_device *dev)
4995 {
4996         if (rootdev->operstate == IF_OPER_DORMANT)
4997                 netif_dormant_on(dev);
4998         else
4999                 netif_dormant_off(dev);
5000
5001         if (netif_carrier_ok(rootdev)) {
5002                 if (!netif_carrier_ok(dev))
5003                         netif_carrier_on(dev);
5004         } else {
5005                 if (netif_carrier_ok(dev))
5006                         netif_carrier_off(dev);
5007         }
5008 }
5009 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5010
/*
 *      Allocate the receive queue array of @dev.
 *
 *      With CONFIG_RPS, dev->num_rx_queues zeroed entries are allocated and
 *      stored in dev->_rx; the first entry's count is set to the number of
 *      queues and every entry's "first" pointer refers back to the first
 *      entry. Without CONFIG_RPS, or with zero queues, nothing is done.
 *
 *      Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
#ifdef CONFIG_RPS
        unsigned int count = dev->num_rx_queues;
        struct netdev_rx_queue *queues, *q;

        if (!count)
                return 0;

        queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
        if (!queues) {
                pr_err("netdev: Unable to allocate %u rx queues.\n",
                       count);
                return -ENOMEM;
        }

        dev->_rx = queues;
        atomic_set(&queues[0].count, count);

        /*
         * Every element points at the first element of the array, which
         * holds the reference count.
         */
        for (q = queues; q < queues + count; q++)
                q->first = queues;
#endif
        return 0;
}
5038
5039 /**
5040  *      register_netdevice      - register a network device
5041  *      @dev: device to register
5042  *
5043  *      Take a completed network device structure and add it to the kernel
5044  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5045  *      chain. 0 is returned on success. A negative errno code is returned
5046  *      on a failure to set up the device, or if the name is a duplicate.
5047  *
5048  *      Callers must hold the rtnl semaphore. You may want
5049  *      register_netdev() instead of this.
5050  *
5051  *      BUGS:
5052  *      The locking appears insufficient to guarantee two parallel registers
5053  *      will not get the same name.
5054  */
5055
5056 int register_netdevice(struct net_device *dev)
5057 {
5058         int ret;
5059         struct net *net = dev_net(dev);
5060
5061         BUG_ON(dev_boot_phase);
5062         ASSERT_RTNL();
5063
5064         might_sleep();
5065
5066         /* When net_device's are persistent, this will be fatal. */
5067         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5068         BUG_ON(!net);
5069
5070         spin_lock_init(&dev->addr_list_lock);
5071         netdev_set_addr_lockdep_class(dev);
5072         netdev_init_queue_locks(dev);
5073
5074         dev->iflink = -1;
5075
5076         ret = netif_alloc_rx_queues(dev);
5077         if (ret)
5078                 goto out;
5079
5080         /* Init, if this function is available */
5081         if (dev->netdev_ops->ndo_init) {
5082                 ret = dev->netdev_ops->ndo_init(dev);
5083                 if (ret) {
5084                         if (ret > 0)
5085                                 ret = -EIO;
5086                         goto out;
5087                 }
5088         }
5089
5090         ret = dev_get_valid_name(dev, dev->name, 0);
5091         if (ret)
5092                 goto err_uninit;
5093
5094         dev->ifindex = dev_new_index(net);
5095         if (dev->iflink == -1)
5096                 dev->iflink = dev->ifindex;
5097
5098         /* Fix illegal checksum combinations */
5099         if ((dev->features & NETIF_F_HW_CSUM) &&
5100             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5101                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5102                        dev->name);
5103                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5104         }
5105
5106         if ((dev->features & NETIF_F_NO_CSUM) &&
5107             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5108                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5109                        dev->name);
5110                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5111         }
5112
5113         dev->features = netdev_fix_features(dev->features, dev->name);
5114
5115         /* Enable software GSO if SG is supported. */
5116         if (dev->features & NETIF_F_SG)
5117                 dev->features |= NETIF_F_GSO;
5118
5119         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5120          * vlan_dev_init() will do the dev->features check, so these features
5121          * are enabled only if supported by underlying device.
5122          */
5123         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5124
5125         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5126         ret = notifier_to_errno(ret);
5127         if (ret)
5128                 goto err_uninit;
5129
5130         ret = netdev_register_kobject(dev);
5131         if (ret)
5132                 goto err_uninit;
5133         dev->reg_state = NETREG_REGISTERED;
5134
5135         /*
5136          *      Default initial state at registry is that the
5137          *      device is present.
5138          */
5139
5140         set_bit(__LINK_STATE_PRESENT, &dev->state);
5141
5142         dev_init_scheduler(dev);
5143         dev_hold(dev);
5144         list_netdevice(dev);
5145
5146         /* Notify protocols, that a new device appeared. */
5147         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5148         ret = notifier_to_errno(ret);
5149         if (ret) {
5150                 rollback_registered(dev);
5151                 dev->reg_state = NETREG_UNREGISTERED;
5152         }
5153         /*
5154          *      Prevent userspace races by waiting until the network
5155          *      device is fully setup before sending notifications.
5156          */
5157         if (!dev->rtnl_link_ops ||
5158             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5159                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5160
5161 out:
5162         return ret;
5163
5164 err_uninit:
5165         if (dev->netdev_ops->ndo_uninit)
5166                 dev->netdev_ops->ndo_uninit(dev);
5167         goto out;
5168 }
5169 EXPORT_SYMBOL(register_netdevice);
5170
5171 /**
5172  *      init_dummy_netdev       - init a dummy network device for NAPI
5173  *      @dev: device to init
5174  *
5175  *      This takes a network device structure and initialize the minimum
5176  *      amount of fields so it can be used to schedule NAPI polls without
5177  *      registering a full blown interface. This is to be used by drivers
5178  *      that need to tie several hardware interfaces to a single NAPI
5179  *      poll scheduler due to HW limitations.
5180  */
5181 int init_dummy_netdev(struct net_device *dev)
5182 {
5183         /* Clear everything. Note we don't initialize spinlocks
5184          * are they aren't supposed to be taken by any of the
5185          * NAPI code and this dummy netdev is supposed to be
5186          * only ever used for NAPI polls
5187          */
5188         memset(dev, 0, sizeof(struct net_device));
5189
5190         /* make sure we BUG if trying to hit standard
5191          * register/unregister code path
5192          */
5193         dev->reg_state = NETREG_DUMMY;
5194
5195         /* initialize the ref count */
5196         atomic_set(&dev->refcnt, 1);
5197
5198         /* NAPI wants this */
5199         INIT_LIST_HEAD(&dev->napi_list);
5200
5201         /* a dummy interface is started by default */
5202         set_bit(__LINK_STATE_PRESENT, &dev->state);
5203         set_bit(__LINK_STATE_START, &dev->state);
5204
5205         return 0;
5206 }
5207 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5208
5209
5210 /**
5211  *      register_netdev - register a network device
5212  *      @dev: device to register
5213  *
5214  *      Take a completed network device structure and add it to the kernel
5215  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5216  *      chain. 0 is returned on success. A negative errno code is returned
5217  *      on a failure to set up the device, or if the name is a duplicate.
5218  *
5219  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5220  *      and expands the device name if you passed a format string to
5221  *      alloc_netdev.
5222  */
5223 int register_netdev(struct net_device *dev)
5224 {
5225         int err;
5226
5227         rtnl_lock();
5228
5229         /*
5230          * If the name is a format string the caller wants us to do a
5231          * name allocation.
5232          */
5233         if (strchr(dev->name, '%')) {
5234                 err = dev_alloc_name(dev, dev->name);
5235                 if (err < 0)
5236                         goto out;
5237         }
5238
5239         err = register_netdevice(dev);
5240 out:
5241         rtnl_unlock();
5242         return err;
5243 }
5244 EXPORT_SYMBOL(register_netdev);
5245
5246 /*
5247  * netdev_wait_allrefs - wait until all references are gone.
5248  *
5249  * This is called when unregistering network devices.
5250  *
5251  * Any protocol or device that holds a reference should register
5252  * for netdevice notification, and cleanup and put back the
5253  * reference if they receive an UNREGISTER event.
5254  * We can get stuck here if buggy protocols don't correctly
5255  * call dev_put.
5256  */
5257 static void netdev_wait_allrefs(struct net_device *dev)
5258 {
5259         unsigned long rebroadcast_time, warning_time;
5260
5261         linkwatch_forget_dev(dev);
5262
5263         rebroadcast_time = warning_time = jiffies;
5264         while (atomic_read(&dev->refcnt) != 0) {
5265                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5266                         rtnl_lock();
5267
5268                         /* Rebroadcast unregister notification */
5269                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5270                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5271                          * should have already handle it the first time */
5272
5273                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5274                                      &dev->state)) {
5275                                 /* We must not have linkwatch events
5276                                  * pending on unregister. If this
5277                                  * happens, we simply run the queue
5278                                  * unscheduled, resulting in a noop
5279                                  * for this device.
5280                                  */
5281                                 linkwatch_run_queue();
5282                         }
5283
5284                         __rtnl_unlock();
5285
5286                         rebroadcast_time = jiffies;
5287                 }
5288
5289                 msleep(250);
5290
5291                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5292                         printk(KERN_EMERG "unregister_netdevice: "
5293                                "waiting for %s to become free. Usage "
5294                                "count = %d\n",
5295                                dev->name, atomic_read(&dev->refcnt));
5296                         warning_time = jiffies;
5297                 }
5298         }
5299 }
5300
5301 /* The sequence is:
5302  *
5303  *      rtnl_lock();
5304  *      ...
5305  *      register_netdevice(x1);
5306  *      register_netdevice(x2);
5307  *      ...
5308  *      unregister_netdevice(y1);
5309  *      unregister_netdevice(y2);
5310  *      ...
5311  *      rtnl_unlock();
5312  *      free_netdev(y1);
5313  *      free_netdev(y2);
5314  *
5315  * We are invoked by rtnl_unlock().
5316  * This allows us to deal with problems:
5317  * 1) We can delete sysfs objects which invoke hotplug
5318  *    without deadlocking with linkwatch via keventd.
5319  * 2) Since we run with the RTNL semaphore not held, we can sleep
5320  *    safely in order to wait for the netdev refcnt to drop to zero.
5321  *
5322  * We must not return until all unregister events added during
5323  * the interval the lock was held have been completed.
5324  */
5325 void netdev_run_todo(void)
5326 {
5327         struct list_head list;
5328
5329         /* Snapshot list, allow later requests */
5330         list_replace_init(&net_todo_list, &list);
5331
5332         __rtnl_unlock();
5333
5334         while (!list_empty(&list)) {
5335                 struct net_device *dev
5336                         = list_first_entry(&list, struct net_device, todo_list);
5337                 list_del(&dev->todo_list);
5338
5339                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5340                         printk(KERN_ERR "network todo '%s' but state %d\n",
5341                                dev->name, dev->reg_state);
5342                         dump_stack();
5343                         continue;
5344                 }
5345
5346                 dev->reg_state = NETREG_UNREGISTERED;
5347
5348                 on_each_cpu(flush_backlog, dev, 1);
5349
5350                 netdev_wait_allrefs(dev);
5351
5352                 /* paranoia */
5353                 BUG_ON(atomic_read(&dev->refcnt));
5354                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5355                 WARN_ON(dev->ip6_ptr);
5356                 WARN_ON(dev->dn_ptr);
5357
5358                 if (dev->destructor)
5359                         dev->destructor(dev);
5360
5361                 /* Free network device */
5362                 kobject_put(&dev->dev.kobj);
5363         }
5364 }
5365
5366 /**
5367  *      dev_txq_stats_fold - fold tx_queues stats
5368  *      @dev: device to get statistics from
5369  *      @stats: struct rtnl_link_stats64 to hold results
5370  */
5371 void dev_txq_stats_fold(const struct net_device *dev,
5372                         struct rtnl_link_stats64 *stats)
5373 {
5374         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5375         unsigned int i;
5376         struct netdev_queue *txq;
5377
5378         for (i = 0; i < dev->num_tx_queues; i++) {
5379                 txq = netdev_get_tx_queue(dev, i);
5380                 spin_lock_bh(&txq->_xmit_lock);
5381                 tx_bytes   += txq->tx_bytes;
5382                 tx_packets += txq->tx_packets;
5383                 tx_dropped += txq->tx_dropped;
5384                 spin_unlock_bh(&txq->_xmit_lock);
5385         }
5386         if (tx_bytes || tx_packets || tx_dropped) {
5387                 stats->tx_bytes   = tx_bytes;
5388                 stats->tx_packets = tx_packets;
5389                 stats->tx_dropped = tx_dropped;
5390         }
5391 }
5392 EXPORT_SYMBOL(dev_txq_stats_fold);
5393
5394 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5395  * fields in the same order, with only the type differing.
5396  */
5397 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5398                                     const struct net_device_stats *netdev_stats)
5399 {
5400 #if BITS_PER_LONG == 64
5401         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5402         memcpy(stats64, netdev_stats, sizeof(*stats64));
5403 #else
5404         size_t i, n = sizeof(*stats64) / sizeof(u64);
5405         const unsigned long *src = (const unsigned long *)netdev_stats;
5406         u64 *dst = (u64 *)stats64;
5407
5408         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5409                      sizeof(*stats64) / sizeof(u64));
5410         for (i = 0; i < n; i++)
5411                 dst[i] = src[i];
5412 #endif
5413 }
5414
5415 /**
5416  *      dev_get_stats   - get network device statistics
5417  *      @dev: device to get statistics from
5418  *      @storage: place to store stats
5419  *
5420  *      Get network statistics from device. Return @storage.
5421  *      The device driver may provide its own method by setting
5422  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5423  *      otherwise the internal statistics structure is used.
5424  */
5425 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5426                                         struct rtnl_link_stats64 *storage)
5427 {
5428         const struct net_device_ops *ops = dev->netdev_ops;
5429
5430         if (ops->ndo_get_stats64) {
5431                 memset(storage, 0, sizeof(*storage));
5432                 return ops->ndo_get_stats64(dev, storage);
5433         }
5434         if (ops->ndo_get_stats) {
5435                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5436                 return storage;
5437         }
5438         netdev_stats_to_stats64(storage, &dev->stats);
5439         dev_txq_stats_fold(dev, storage);
5440         return storage;
5441 }
5442 EXPORT_SYMBOL(dev_get_stats);
5443
5444 static void netdev_init_one_queue(struct net_device *dev,
5445                                   struct netdev_queue *queue,
5446                                   void *_unused)
5447 {
5448         queue->dev = dev;
5449 }
5450
5451 static void netdev_init_queues(struct net_device *dev)
5452 {
5453         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5454         spin_lock_init(&dev->tx_global_lock);
5455 }
5456
/*
 * Return the ingress queue of @dev.
 *
 * With CONFIG_NET_CLS_ACT, a missing ingress queue is allocated on
 * demand, initialized with noop_qdisc as both its qdisc and sleeping
 * qdisc, and published through dev->ingress_queue; NULL is returned if
 * that allocation fails.  Without CONFIG_NET_CLS_ACT the current value
 * of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			__netdev_init_queue_locks_one(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5475
5476 /**
5477  *      alloc_netdev_mq - allocate network device
5478  *      @sizeof_priv:   size of private data to allocate space for
5479  *      @name:          device name format string
5480  *      @setup:         callback to initialize device
5481  *      @queue_count:   the number of subqueues to allocate
5482  *
5483  *      Allocates a struct net_device with private data area for driver use
5484  *      and performs basic initialization.  Also allocates subquue structs
5485  *      for each queue on the device at the end of the netdevice.
5486  */
5487 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5488                 void (*setup)(struct net_device *), unsigned int queue_count)
5489 {
5490         struct netdev_queue *tx;
5491         struct net_device *dev;
5492         size_t alloc_size;
5493         struct net_device *p;
5494
5495         BUG_ON(strlen(name) >= sizeof(dev->name));
5496
5497         alloc_size = sizeof(struct net_device);
5498         if (sizeof_priv) {
5499                 /* ensure 32-byte alignment of private area */
5500                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5501                 alloc_size += sizeof_priv;
5502         }
5503         /* ensure 32-byte alignment of whole construct */
5504         alloc_size += NETDEV_ALIGN - 1;
5505
5506         p = kzalloc(alloc_size, GFP_KERNEL);
5507         if (!p) {
5508                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5509                 return NULL;
5510         }
5511
5512         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5513         if (!tx) {
5514                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5515                        "tx qdiscs.\n");
5516                 goto free_p;
5517         }
5518
5519
5520         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5521         dev->padded = (char *)dev - (char *)p;
5522
5523         if (dev_addr_init(dev))
5524                 goto free_tx;
5525
5526         dev_mc_init(dev);
5527         dev_uc_init(dev);
5528
5529         dev_net_set(dev, &init_net);
5530
5531         dev->_tx = tx;
5532         dev->num_tx_queues = queue_count;
5533         dev->real_num_tx_queues = queue_count;
5534
5535 #ifdef CONFIG_RPS
5536         dev->num_rx_queues = queue_count;
5537         dev->real_num_rx_queues = queue_count;
5538 #endif
5539
5540         dev->gso_max_size = GSO_MAX_SIZE;
5541
5542         netdev_init_queues(dev);
5543
5544         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5545         dev->ethtool_ntuple_list.count = 0;
5546         INIT_LIST_HEAD(&dev->napi_list);
5547         INIT_LIST_HEAD(&dev->unreg_list);
5548         INIT_LIST_HEAD(&dev->link_watch_list);
5549         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5550         setup(dev);
5551         strcpy(dev->name, name);
5552         return dev;
5553
5554 free_tx:
5555         kfree(tx);
5556 free_p:
5557         kfree(p);
5558         return NULL;
5559 }
5560 EXPORT_SYMBOL(alloc_netdev_mq);
5561
5562 /**
5563  *      free_netdev - free network device
5564  *      @dev: device
5565  *
5566  *      This function does the last stage of destroying an allocated device
5567  *      interface. The reference to the device object is released.
5568  *      If this is the last reference then it will be freed.
5569  */
5570 void free_netdev(struct net_device *dev)
5571 {
5572         struct napi_struct *p, *n;
5573
5574         release_net(dev_net(dev));
5575
5576         kfree(dev->_tx);
5577
5578         kfree(rcu_dereference_raw(dev->ingress_queue));
5579
5580         /* Flush device addresses */
5581         dev_addr_flush(dev);
5582
5583         /* Clear ethtool n-tuple list */
5584         ethtool_ntuple_flush(dev);
5585
5586         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5587                 netif_napi_del(p);
5588
5589         /*  Compatibility with error handling in drivers */
5590         if (dev->reg_state == NETREG_UNINITIALIZED) {
5591                 kfree((char *)dev - dev->padded);
5592                 return;
5593         }
5594
5595         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5596         dev->reg_state = NETREG_RELEASED;
5597
5598         /* will free via device release */
5599         put_device(&dev->dev);
5600 }
5601 EXPORT_SYMBOL(free_netdev);
5602
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	Implemented as synchronize_rcu(); may sleep, which is annotated
 *	with might_sleep().
 */
void synchronize_net(void)
{
	/* flag callers in atomic context before we block */
	might_sleep();

	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5615
5616 /**
5617  *      unregister_netdevice_queue - remove device from the kernel
5618  *      @dev: device
5619  *      @head: list
5620  *
5621  *      This function shuts down a device interface and removes it
5622  *      from the kernel tables.
5623  *      If head not NULL, device is queued to be unregistered later.
5624  *
5625  *      Callers must hold the rtnl semaphore.  You may want
5626  *      unregister_netdev() instead of this.
5627  */
5628
5629 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5630 {
5631         ASSERT_RTNL();
5632
5633         if (head) {
5634                 list_move_tail(&dev->unreg_list, head);
5635         } else {
5636                 rollback_registered(dev);
5637                 /* Finish processing unregister after unlock */
5638                 net_set_todo(dev);
5639         }
5640 }
5641 EXPORT_SYMBOL(unregister_netdevice_queue);
5642
5643 /**
5644  *      unregister_netdevice_many - unregister many devices
5645  *      @head: list of devices
5646  */
5647 void unregister_netdevice_many(struct list_head *head)
5648 {
5649         struct net_device *dev;
5650
5651         if (!list_empty(head)) {
5652                 rollback_registered_many(head);
5653                 list_for_each_entry(dev, head, unreg_list)
5654                         net_set_todo(dev);
5655         }
5656 }
5657 EXPORT_SYMBOL(unregister_netdevice_many);
5658
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It simply calls unregister_netdevice() with the rtnl semaphore
 *	taken around the call.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5677
5678 /**
5679  *      dev_change_net_namespace - move device to different nethost namespace
5680  *      @dev: device
5681  *      @net: network namespace
5682  *      @pat: If not NULL name pattern to try if the current device name
5683  *            is already taken in the destination network namespace.
5684  *
5685  *      This function shuts down a device interface and moves it
5686  *      to a new network namespace. On success 0 is returned, on
5687  *      a failure a netagive errno code is returned.
5688  *
5689  *      Callers must hold the rtnl semaphore.
5690  */
5691
5692 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5693 {
5694         int err;
5695
5696         ASSERT_RTNL();
5697
5698         /* Don't allow namespace local devices to be moved. */
5699         err = -EINVAL;
5700         if (dev->features & NETIF_F_NETNS_LOCAL)
5701                 goto out;
5702
5703         /* Ensure the device has been registrered */
5704         err = -EINVAL;
5705         if (dev->reg_state != NETREG_REGISTERED)
5706                 goto out;
5707
5708         /* Get out if there is nothing todo */
5709         err = 0;
5710         if (net_eq(dev_net(dev), net))
5711                 goto out;
5712
5713         /* Pick the destination device name, and ensure
5714          * we can use it in the destination network namespace.
5715          */
5716         err = -EEXIST;
5717         if (__dev_get_by_name(net, dev->name)) {
5718                 /* We get here if we can't use the current device name */
5719                 if (!pat)
5720                         goto out;
5721                 if (dev_get_valid_name(dev, pat, 1))
5722                         goto out;
5723         }
5724
5725         /*
5726          * And now a mini version of register_netdevice unregister_netdevice.
5727          */
5728
5729         /* If device is running close it first. */
5730         dev_close(dev);
5731
5732         /* And unlink it from device chain */
5733         err = -ENODEV;
5734         unlist_netdevice(dev);
5735
5736         synchronize_net();
5737
5738         /* Shutdown queueing discipline. */
5739         dev_shutdown(dev);
5740
5741         /* Notify protocols, that we are about to destroy
5742            this device. They should clean all the things.
5743
5744            Note that dev->reg_state stays at NETREG_REGISTERED.
5745            This is wanted because this way 8021q and macvlan know
5746            the device is just moving and can keep their slaves up.
5747         */
5748         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5749         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5750
5751         /*
5752          *      Flush the unicast and multicast chains
5753          */
5754         dev_uc_flush(dev);
5755         dev_mc_flush(dev);
5756
5757         /* Actually switch the network namespace */
5758         dev_net_set(dev, net);
5759
5760         /* If there is an ifindex conflict assign a new one */
5761         if (__dev_get_by_index(net, dev->ifindex)) {
5762                 int iflink = (dev->iflink == dev->ifindex);
5763                 dev->ifindex = dev_new_index(net);
5764                 if (iflink)
5765                         dev->iflink = dev->ifindex;
5766         }
5767
5768         /* Fixup kobjects */
5769         err = device_rename(&dev->dev, dev->name);
5770         WARN_ON(err);
5771
5772         /* Add the device back in the hashes */
5773         list_netdevice(dev);
5774
5775         /* Notify protocols, that a new device appeared. */
5776         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5777
5778         /*
5779          *      Prevent userspace races by waiting until the network
5780          *      device is fully setup before sending notifications.
5781          */
5782         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5783
5784         synchronize_net();
5785         err = 0;
5786 out:
5787         return err;
5788 }
5789 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5790
5791 static int dev_cpu_callback(struct notifier_block *nfb,
5792                             unsigned long action,
5793                             void *ocpu)
5794 {
5795         struct sk_buff **list_skb;
5796         struct sk_buff *skb;
5797         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5798         struct softnet_data *sd, *oldsd;
5799
5800         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5801                 return NOTIFY_OK;
5802
5803         local_irq_disable();
5804         cpu = smp_processor_id();
5805         sd = &per_cpu(softnet_data, cpu);
5806         oldsd = &per_cpu(softnet_data, oldcpu);
5807
5808         /* Find end of our completion_queue. */
5809         list_skb = &sd->completion_queue;
5810         while (*list_skb)
5811                 list_skb = &(*list_skb)->next;
5812         /* Append completion queue from offline CPU. */
5813         *list_skb = oldsd->completion_queue;
5814         oldsd->completion_queue = NULL;
5815
5816         /* Append output queue from offline CPU. */
5817         if (oldsd->output_queue) {
5818                 *sd->output_queue_tailp = oldsd->output_queue;
5819                 sd->output_queue_tailp = oldsd->output_queue_tailp;
5820                 oldsd->output_queue = NULL;
5821                 oldsd->output_queue_tailp = &oldsd->output_queue;
5822         }
5823
5824         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5825         local_irq_enable();
5826
5827         /* Process offline CPU's input_pkt_queue */
5828         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5829                 netif_rx(skb);
5830                 input_queue_head_incr(oldsd);
5831         }
5832         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5833                 netif_rx(skb);
5834                 input_queue_head_incr(oldsd);
5835         }
5836
5837         return NOTIFY_OK;
5838 }
5839
5840
5841 /**
5842  *      netdev_increment_features - increment feature set by one
5843  *      @all: current feature set
5844  *      @one: new feature set
5845  *      @mask: mask feature set
5846  *
5847  *      Computes a new feature set after adding a device with feature set
5848  *      @one to the master device with current feature set @all.  Will not
5849  *      enable anything that is off in @mask. Returns the new feature set.
5850  */
5851 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5852                                         unsigned long mask)
5853 {
5854         /* If device needs checksumming, downgrade to it. */
5855         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5856                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5857         else if (mask & NETIF_F_ALL_CSUM) {
5858                 /* If one device supports v4/v6 checksumming, set for all. */
5859                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5860                     !(all & NETIF_F_GEN_CSUM)) {
5861                         all &= ~NETIF_F_ALL_CSUM;
5862                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5863                 }
5864
5865                 /* If one device supports hw checksumming, set for all. */
5866                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5867                         all &= ~NETIF_F_ALL_CSUM;
5868                         all |= NETIF_F_HW_CSUM;
5869                 }
5870         }
5871
5872         one |= NETIF_F_ALL_CSUM;
5873
5874         one |= all & NETIF_F_ONE_FOR_ALL;
5875         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5876         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5877
5878         return all;
5879 }
5880 EXPORT_SYMBOL(netdev_increment_features);
5881
5882 static struct hlist_head *netdev_create_hash(void)
5883 {
5884         int i;
5885         struct hlist_head *hash;
5886
5887         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5888         if (hash != NULL)
5889                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5890                         INIT_HLIST_HEAD(&hash[i]);
5891
5892         return hash;
5893 }
5894
5895 /* Initialize per network namespace state */
5896 static int __net_init netdev_init(struct net *net)
5897 {
5898         INIT_LIST_HEAD(&net->dev_base_head);
5899
5900         net->dev_name_head = netdev_create_hash();
5901         if (net->dev_name_head == NULL)
5902                 goto err_name;
5903
5904         net->dev_index_head = netdev_create_hash();
5905         if (net->dev_index_head == NULL)
5906                 goto err_idx;
5907
5908         return 0;
5909
5910 err_idx:
5911         kfree(net->dev_name_head);
5912 err_name:
5913         return -ENOMEM;
5914 }
5915
5916 /**
5917  *      netdev_drivername - network driver for the device
5918  *      @dev: network device
5919  *      @buffer: buffer for resulting name
5920  *      @len: size of buffer
5921  *
5922  *      Determine network driver for device.
5923  */
5924 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5925 {
5926         const struct device_driver *driver;
5927         const struct device *parent;
5928
5929         if (len <= 0 || !buffer)
5930                 return buffer;
5931         buffer[0] = 0;
5932
5933         parent = dev->dev.parent;
5934
5935         if (!parent)
5936                 return buffer;
5937
5938         driver = parent->driver;
5939         if (driver && driver->name)
5940                 strlcpy(buffer, driver->name, len);
5941         return buffer;
5942 }
5943
5944 static int __netdev_printk(const char *level, const struct net_device *dev,
5945                            struct va_format *vaf)
5946 {
5947         int r;
5948
5949         if (dev && dev->dev.parent)
5950                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
5951                                netdev_name(dev), vaf);
5952         else if (dev)
5953                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5954         else
5955                 r = printk("%s(NULL net_device): %pV", level, vaf);
5956
5957         return r;
5958 }
5959
5960 int netdev_printk(const char *level, const struct net_device *dev,
5961                   const char *format, ...)
5962 {
5963         struct va_format vaf;
5964         va_list args;
5965         int r;
5966
5967         va_start(args, format);
5968
5969         vaf.fmt = format;
5970         vaf.va = &args;
5971
5972         r = __netdev_printk(level, dev, &vaf);
5973         va_end(args);
5974
5975         return r;
5976 }
5977 EXPORT_SYMBOL(netdev_printk);
5978
5979 #define define_netdev_printk_level(func, level)                 \
5980 int func(const struct net_device *dev, const char *fmt, ...)    \
5981 {                                                               \
5982         int r;                                                  \
5983         struct va_format vaf;                                   \
5984         va_list args;                                           \
5985                                                                 \
5986         va_start(args, fmt);                                    \
5987                                                                 \
5988         vaf.fmt = fmt;                                          \
5989         vaf.va = &args;                                         \
5990                                                                 \
5991         r = __netdev_printk(level, dev, &vaf);                  \
5992         va_end(args);                                           \
5993                                                                 \
5994         return r;                                               \
5995 }                                                               \
5996 EXPORT_SYMBOL(func);
5997
5998 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5999 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6000 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6001 define_netdev_printk_level(netdev_err, KERN_ERR);
6002 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6003 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6004 define_netdev_printk_level(netdev_info, KERN_INFO);
6005
6006 static void __net_exit netdev_exit(struct net *net)
6007 {
6008         kfree(net->dev_name_head);
6009         kfree(net->dev_index_head);
6010 }
6011
6012 static struct pernet_operations __net_initdata netdev_net_ops = {
6013         .init = netdev_init,
6014         .exit = netdev_exit,
6015 };
6016
6017 static void __net_exit default_device_exit(struct net *net)
6018 {
6019         struct net_device *dev, *aux;
6020         /*
6021          * Push all migratable network devices back to the
6022          * initial network namespace
6023          */
6024         rtnl_lock();
6025         for_each_netdev_safe(net, dev, aux) {
6026                 int err;
6027                 char fb_name[IFNAMSIZ];
6028
6029                 /* Ignore unmoveable devices (i.e. loopback) */
6030                 if (dev->features & NETIF_F_NETNS_LOCAL)
6031                         continue;
6032
6033                 /* Leave virtual devices for the generic cleanup */
6034                 if (dev->rtnl_link_ops)
6035                         continue;
6036
6037                 /* Push remaing network devices to init_net */
6038                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6039                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6040                 if (err) {
6041                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6042                                 __func__, dev->name, err);
6043                         BUG();
6044                 }
6045         }
6046         rtnl_unlock();
6047 }
6048
6049 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6050 {
6051         /* At exit all network devices most be removed from a network
6052          * namespace.  Do this in the reverse order of registeration.
6053          * Do this across as many network namespaces as possible to
6054          * improve batching efficiency.
6055          */
6056         struct net_device *dev;
6057         struct net *net;
6058         LIST_HEAD(dev_kill_list);
6059
6060         rtnl_lock();
6061         list_for_each_entry(net, net_list, exit_list) {
6062                 for_each_netdev_reverse(net, dev) {
6063                         if (dev->rtnl_link_ops)
6064                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6065                         else
6066                                 unregister_netdevice_queue(dev, &dev_kill_list);
6067                 }
6068         }
6069         unregister_netdevice_many(&dev_kill_list);
6070         rtnl_unlock();
6071 }
6072
6073 static struct pernet_operations __net_initdata default_device_ops = {
6074         .exit = default_device_exit,
6075         .exit_batch = default_device_exit_batch,
6076 };
6077
6078 /*
6079  *      Initialize the DEV module. At boot time this walks the device list and
6080  *      unhooks any devices that fail to initialise (normally hardware not
6081  *      present) and leaves us with a valid list of present and active devices.
6082  *
6083  */
6084
6085 /*
6086  *       This is called single threaded during boot, so no need
6087  *       to take the rtnl semaphore.
6088  */
6089 static int __init net_dev_init(void)
6090 {
6091         int i, rc = -ENOMEM;
6092
6093         BUG_ON(!dev_boot_phase);
6094
6095         if (dev_proc_init())
6096                 goto out;
6097
6098         if (netdev_kobject_init())
6099                 goto out;
6100
6101         INIT_LIST_HEAD(&ptype_all);
6102         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6103                 INIT_LIST_HEAD(&ptype_base[i]);
6104
6105         if (register_pernet_subsys(&netdev_net_ops))
6106                 goto out;
6107
6108         /*
6109          *      Initialise the packet receive queues.
6110          */
6111
6112         for_each_possible_cpu(i) {
6113                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6114
6115                 memset(sd, 0, sizeof(*sd));
6116                 skb_queue_head_init(&sd->input_pkt_queue);
6117                 skb_queue_head_init(&sd->process_queue);
6118                 sd->completion_queue = NULL;
6119                 INIT_LIST_HEAD(&sd->poll_list);
6120                 sd->output_queue = NULL;
6121                 sd->output_queue_tailp = &sd->output_queue;
6122 #ifdef CONFIG_RPS
6123                 sd->csd.func = rps_trigger_softirq;
6124                 sd->csd.info = sd;
6125                 sd->csd.flags = 0;
6126                 sd->cpu = i;
6127 #endif
6128
6129                 sd->backlog.poll = process_backlog;
6130                 sd->backlog.weight = weight_p;
6131                 sd->backlog.gro_list = NULL;
6132                 sd->backlog.gro_count = 0;
6133         }
6134
6135         dev_boot_phase = 0;
6136
6137         /* The loopback device is special if any other network devices
6138          * is present in a network namespace the loopback device must
6139          * be present. Since we now dynamically allocate and free the
6140          * loopback device ensure this invariant is maintained by
6141          * keeping the loopback device as the first device on the
6142          * list of network devices.  Ensuring the loopback devices
6143          * is the first device that appears and the last network device
6144          * that disappears.
6145          */
6146         if (register_pernet_device(&loopback_net_ops))
6147                 goto out;
6148
6149         if (register_pernet_device(&default_device_ops))
6150                 goto out;
6151
6152         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6153         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6154
6155         hotcpu_notifier(dev_cpu_callback, 0);
6156         dst_init();
6157         dev_mcast_init();
6158         rc = 0;
6159 out:
6160         return rc;
6161 }
6162
6163 subsys_initcall(net_dev_init);
6164
6165 static int __init initialize_hashrnd(void)
6166 {
6167         get_random_bytes(&hashrnd, sizeof(hashrnd));
6168         return 0;
6169 }
6170
6171 late_initcall_sync(initialize_hashrnd);
6172