Merge branch 'for-linus2' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 8 Jan 2012 20:19:57 +0000 (12:19 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 8 Jan 2012 20:19:57 +0000 (12:19 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 8 Jan 2012 20:19:57 +0000 (12:19 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 8 Jan 2012 20:19:57 +0000 (12:19 -0800)
diff --combined Documentation/filesystems/debugfs.txt

index f04066a,9281a95..6872c91
--- 1/Documentation/filesystems/debugfs.txt
--- 2/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@@ -35,7 -35,7 +35,7 @@@ described below will work
   
   The most general way to create a file within a debugfs directory is with:
   
-     struct dentry *debugfs_create_file(const char *name, mode_t mode,
+     struct dentry *debugfs_create_file(const char *name, umode_t mode,
                                        struct dentry *parent, void *data,
                                        const struct file_operations *fops);
   
@@@ -53,13 -53,13 +53,13 @@@ actually necessary; the debugfs code pr
   for simple situations.  Files containing a single integer value can be
   created with any of:
   
-     struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+     struct dentry *debugfs_create_u8(const char *name, umode_t mode,
                                      struct dentry *parent, u8 *value);
-     struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+     struct dentry *debugfs_create_u16(const char *name, umode_t mode,
                                       struct dentry *parent, u16 *value);
-     struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+     struct dentry *debugfs_create_u32(const char *name, umode_t mode,
                                       struct dentry *parent, u32 *value);
-     struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+     struct dentry *debugfs_create_u64(const char *name, umode_t mode,
                                       struct dentry *parent, u64 *value);
   
   These files support both reading and writing the given value; if a specific
@@@ -67,13 -67,13 +67,13 @@@ file should not be written to, simply s
   values in these files are in decimal; if hexadecimal is more appropriate,
   the following functions can be used instead:
   
-     struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+     struct dentry *debugfs_create_x8(const char *name, umode_t mode,
                                      struct dentry *parent, u8 *value);
-     struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+     struct dentry *debugfs_create_x16(const char *name, umode_t mode,
                                       struct dentry *parent, u16 *value);
-     struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+     struct dentry *debugfs_create_x32(const char *name, umode_t mode,
                                       struct dentry *parent, u32 *value);
-     struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+     struct dentry *debugfs_create_x64(const char *name, umode_t mode,
                                       struct dentry *parent, u64 *value);
   
   These functions are useful as long as the developer knows the size of the
@@@ -81,7 -81,7 +81,7 @@@ value to be exported.  Some types can h
   architectures, though, complicating the situation somewhat.  There is a
   function meant to help out in one special case:
   
-     struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+     struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
                                          struct dentry *parent, 
                                          size_t *value);
   
@@@ -90,22 -90,21 +90,22 @@@ a variable of type size_t
   
   Boolean values can be placed in debugfs with:
   
-     struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+     struct dentry *debugfs_create_bool(const char *name, umode_t mode,
                                        struct dentry *parent, u32 *value);
   
   A read on the resulting file will yield either Y (for non-zero values) or
   N, followed by a newline.  If written to, it will accept either upper- or
   lower-case values, or 1 or 0.  Any other input will be silently ignored.
   
- -Finally, a block of arbitrary binary data can be exported with:
+ +Another option is exporting a block of arbitrary binary data, with
+ +this structure and function:
   
       struct debugfs_blob_wrapper {
         void *data;
         unsigned long size;
       };
   
-     struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+     struct dentry *debugfs_create_blob(const char *name, umode_t mode,
                                        struct dentry *parent,
                                        struct debugfs_blob_wrapper *blob);
   
@@@ -116,35 -115,6 +116,35 @@@ can be used to export binary informatio
   any code which does so in the mainline.  Note that all files created with
   debugfs_create_blob() are read-only.
   
+ +If you want to dump a block of registers (something that happens quite
+ +often during development, even if little such code reaches mainline),
+ +debugfs offers two functions: one to make a registers-only file, and
+ +another to insert a register block in the middle of another sequential
+ +file.
+ +
+ +    struct debugfs_reg32 {
+ +      char *name;
+ +      unsigned long offset;
+ +    };
+ +
+ +    struct debugfs_regset32 {
+ +      struct debugfs_reg32 *regs;
+ +      int nregs;
+ +      void __iomem *base;
+ +    };
+ +
+ +    struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ +                                   struct dentry *parent,
+ +                                   struct debugfs_regset32 *regset);
+ +
+ +    int debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
+ +                       int nregs, void __iomem *base, char *prefix);
+ +
+ +The "base" argument may be 0, but you may want to build the reg32 array
+ +using __stringify, and a number of register names (macros) are actually
+ +byte offsets from the base of the register block.
+ +
+ +
   There are a couple of other directory-oriented helper functions:
   
       struct dentry *debugfs_rename(struct dentry *old_dir, 
diff --combined arch/powerpc/include/asm/spu.h

index fff9213,c526400..93f280e
--- 1/arch/powerpc/include/asm/spu.h
--- 2/arch/powerpc/include/asm/spu.h
+++ b/arch/powerpc/include/asm/spu.h
@@@ -25,7 -25,7 +25,7 @@@
   #ifdef __KERNEL__
   
   #include <linux/workqueue.h>
- -#include <linux/sysdev.h>
+ +#include <linux/device.h>
   #include <linux/mutex.h>
   
   #define LS_SIZE (256 * 1024)
@@@ -166,7 -166,7 +166,7 @@@ struct spu 
         /* beat only */
         u64 shadow_int_mask_RW[3];
   
- -      struct sys_device sysdev;
+ +      struct device dev;
   
         int has_mem_affinity;
         struct list_head aff_list;
@@@ -237,7 -237,7 +237,7 @@@ extern long spu_sys_callback(struct spu
   struct file;
   struct spufs_calls {
         long (*create_thread)(const char __user *name,
-                                       unsigned int flags, mode_t mode,
+                                       unsigned int flags, umode_t mode,
                                         struct file *neighbor);
         long (*spu_run)(struct file *filp, __u32 __user *unpc,
                                                 __u32 __user *ustatus);
@@@ -270,11 -270,11 +270,11 @@@
   int register_spu_syscalls(struct spufs_calls *calls);
   void unregister_spu_syscalls(struct spufs_calls *calls);
   
- -int spu_add_sysdev_attr(struct sysdev_attribute *attr);
- -void spu_remove_sysdev_attr(struct sysdev_attribute *attr);
+ +int spu_add_dev_attr(struct device_attribute *attr);
+ +void spu_remove_dev_attr(struct device_attribute *attr);
   
- -int spu_add_sysdev_attr_group(struct attribute_group *attrs);
- -void spu_remove_sysdev_attr_group(struct attribute_group *attrs);
+ +int spu_add_dev_attr_group(struct attribute_group *attrs);
+ +void spu_remove_dev_attr_group(struct attribute_group *attrs);
   
   int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
                 unsigned long dsisr, unsigned *flt);
diff --combined arch/powerpc/include/asm/types.h

index d82e94e,b15a52e..0abf7f2
--- 1/arch/powerpc/include/asm/types.h
--- 2/arch/powerpc/include/asm/types.h
+++ b/arch/powerpc/include/asm/types.h
@@@ -5,11 -5,8 +5,11 @@@
    * This is here because we used to use l64 for 64bit powerpc
    * and we don't want to impact user mode with our change to ll64
    * in the kernel.
+ + *
+ + * However, some user programs are fine with this.  They can
+ + * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here.
    */
- -#if defined(__powerpc64__) && !defined(__KERNEL__)
+ +#if !defined(__SANE_USERSPACE_TYPES__) && defined(__powerpc64__) && !defined(__KERNEL__)
   # include <asm-generic/int-l64.h>
   #else
   # include <asm-generic/int-ll64.h>
@@@ -30,12 -27,6 +30,6 @@@
    * 2 of the License, or (at your option) any later version.
    */
   
- #ifdef __powerpc64__
- typedef unsigned int umode_t;
- #else
- typedef unsigned short umode_t;
- #endif
- 
   typedef struct {
         __u32 u[4];
   } __attribute__((aligned(16))) __vector128;
diff --combined block/ioctl.c

index d510c2a,91e7b19..4828fa3
--- 1/block/ioctl.c
--- 2/block/ioctl.c
+++ b/block/ioctl.c
@@@ -5,7 -5,7 +5,7 @@@
   #include <linux/blkpg.h>
   #include <linux/hdreg.h>
   #include <linux/backing-dev.h>
- #include <linux/buffer_head.h>
+ #include <linux/fs.h>
   #include <linux/blktrace_api.h>
   #include <asm/uaccess.h>
   
@@@ -180,26 -180,6 +180,26 @@@ int __blkdev_driver_ioctl(struct block_
   EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
   
   /*
+ + * Is it an unrecognized ioctl? The correct returns are either
+ + * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
+ + * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
+ + * code before returning.
+ + *
+ + * Confused drivers sometimes return EINVAL, which is wrong. It
+ + * means "I understood the ioctl command, but the parameters to
+ + * it were wrong".
+ + *
+ + * We should aim to just fix the broken drivers, the EINVAL case
+ + * should go away.
+ + */
+ +static inline int is_unrecognized_ioctl(int ret)
+ +{
+ +      return  ret == -EINVAL ||
+ +              ret == -ENOTTY ||
+ +              ret == -ENOIOCTLCMD;
+ +}
+ +
+ +/*
    * always keep this in sync with compat_blkdev_ioctl()
    */
   int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@@ -216,7 -196,8 +216,7 @@@
                         return -EACCES;
   
                 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- -              /* -EINVAL to handle old uncorrected drivers */
- -              if (ret != -EINVAL && ret != -ENOTTY)
+ +              if (!is_unrecognized_ioctl(ret))
                         return ret;
   
                 fsync_bdev(bdev);
@@@ -225,7 -206,8 +225,7 @@@
   
         case BLKROSET:
                 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- -              /* -EINVAL to handle old uncorrected drivers */
- -              if (ret != -EINVAL && ret != -ENOTTY)
+ +              if (!is_unrecognized_ioctl(ret))
                         return ret;
                 if (!capable(CAP_SYS_ADMIN))
                         return -EACCES;
diff --combined drivers/base/core.c

index 4dac58a,1dfa1d6..4a67cc0
--- 1/drivers/base/core.c
--- 2/drivers/base/core.c
+++ b/drivers/base/core.c
@@@ -118,56 -118,6 +118,56 @@@ static const struct sysfs_ops dev_sysfs
         .store  = dev_attr_store,
   };
   
+ +#define to_ext_attr(x) container_of(x, struct dev_ext_attribute, attr)
+ +
+ +ssize_t device_store_ulong(struct device *dev,
+ +                         struct device_attribute *attr,
+ +                         const char *buf, size_t size)
+ +{
+ +      struct dev_ext_attribute *ea = to_ext_attr(attr);
+ +      char *end;
+ +      unsigned long new = simple_strtoul(buf, &end, 0);
+ +      if (end == buf)
+ +              return -EINVAL;
+ +      *(unsigned long *)(ea->var) = new;
+ +      /* Always return full write size even if we didn't consume all */
+ +      return size;
+ +}
+ +EXPORT_SYMBOL_GPL(device_store_ulong);
+ +
+ +ssize_t device_show_ulong(struct device *dev,
+ +                        struct device_attribute *attr,
+ +                        char *buf)
+ +{
+ +      struct dev_ext_attribute *ea = to_ext_attr(attr);
+ +      return snprintf(buf, PAGE_SIZE, "%lx\n", *(unsigned long *)(ea->var));
+ +}
+ +EXPORT_SYMBOL_GPL(device_show_ulong);
+ +
+ +ssize_t device_store_int(struct device *dev,
+ +                       struct device_attribute *attr,
+ +                       const char *buf, size_t size)
+ +{
+ +      struct dev_ext_attribute *ea = to_ext_attr(attr);
+ +      char *end;
+ +      long new = simple_strtol(buf, &end, 0);
+ +      if (end == buf || new > INT_MAX || new < INT_MIN)
+ +              return -EINVAL;
+ +      *(int *)(ea->var) = new;
+ +      /* Always return full write size even if we didn't consume all */
+ +      return size;
+ +}
+ +EXPORT_SYMBOL_GPL(device_store_int);
+ +
+ +ssize_t device_show_int(struct device *dev,
+ +                      struct device_attribute *attr,
+ +                      char *buf)
+ +{
+ +      struct dev_ext_attribute *ea = to_ext_attr(attr);
+ +
+ +      return snprintf(buf, PAGE_SIZE, "%d\n", *(int *)(ea->var));
+ +}
+ +EXPORT_SYMBOL_GPL(device_show_int);
   
   /**
    *    device_release - free device structure.
@@@ -248,7 -198,7 +248,7 @@@ static int dev_uevent(struct kset *kset
         if (MAJOR(dev->devt)) {
                 const char *tmp;
                 const char *name;
-               mode_t mode = 0;
+               umode_t mode = 0;
   
                 add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
                 add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
@@@ -514,7 -464,7 +514,7 @@@ static ssize_t show_dev(struct device *
   static struct device_attribute devt_attr =
         __ATTR(dev, S_IRUGO, show_dev, NULL);
   
- -/* kset to create /sys/devices/  */
+ +/* /sys/devices/ */
   struct kset *devices_kset;
   
   /**
@@@ -761,10 -711,6 +761,10 @@@ static struct kobject *get_device_paren
                 return k;
         }
   
+ +      /* subsystems can specify a default root directory for their devices */
+ +      if (!parent && dev->bus && dev->bus->dev_root)
+ +              return &dev->bus->dev_root->kobj;
+ +
         if (parent)
                 return &parent->kobj;
         return NULL;
@@@ -785,6 -731,14 +785,6 @@@ static void cleanup_device_parent(struc
         cleanup_glue_dir(dev, dev->kobj.parent);
   }
   
- -static void setup_parent(struct device *dev, struct device *parent)
- -{
- -      struct kobject *kobj;
- -      kobj = get_device_parent(dev, parent);
- -      if (kobj)
- -              dev->kobj.parent = kobj;
- -}
- -
   static int device_add_class_symlinks(struct device *dev)
   {
         int error;
@@@ -937,7 -891,6 +937,7 @@@ int device_private_init(struct device *
   int device_add(struct device *dev)
   {
         struct device *parent = NULL;
+ +      struct kobject *kobj;
         struct class_interface *class_intf;
         int error = -EINVAL;
   
@@@ -961,10 -914,6 +961,10 @@@
                 dev->init_name = NULL;
         }
   
+ +      /* subsystems can specify simple device enumeration */
+ +      if (!dev_name(dev) && dev->bus && dev->bus->dev_name)
+ +              dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);
+ +
         if (!dev_name(dev)) {
                 error = -EINVAL;
                 goto name_error;
@@@ -973,9 -922,7 +973,9 @@@
         pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
   
         parent = get_device(dev->parent);
- -      setup_parent(dev, parent);
+ +      kobj = get_device_parent(dev, parent);
+ +      if (kobj)
+ +              dev->kobj.parent = kobj;
   
         /* use parent numa_node */
         if (parent)
@@@ -1035,17 -982,17 +1035,17 @@@
                                &parent->p->klist_children);
   
         if (dev->class) {
- -              mutex_lock(&dev->class->p->class_mutex);
+ +              mutex_lock(&dev->class->p->mutex);
                 /* tie the class to the device */
                 klist_add_tail(&dev->knode_class,
                                &dev->class->p->klist_devices);
   
                 /* notify any interfaces that the device is here */
                 list_for_each_entry(class_intf,
- -                                  &dev->class->p->class_interfaces, node)
+ +                                  &dev->class->p->interfaces, node)
                         if (class_intf->add_dev)
                                 class_intf->add_dev(dev, class_intf);
- -              mutex_unlock(&dev->class->p->class_mutex);
+ +              mutex_unlock(&dev->class->p->mutex);
         }
   done:
         put_device(dev);
@@@ -1160,15 -1107,15 +1160,15 @@@ void device_del(struct device *dev
         if (dev->class) {
                 device_remove_class_symlinks(dev);
   
- -              mutex_lock(&dev->class->p->class_mutex);
+ +              mutex_lock(&dev->class->p->mutex);
                 /* notify any interfaces that the device is now gone */
                 list_for_each_entry(class_intf,
- -                                  &dev->class->p->class_interfaces, node)
+ +                                  &dev->class->p->interfaces, node)
                         if (class_intf->remove_dev)
                                 class_intf->remove_dev(dev, class_intf);
                 /* remove the device from the class list */
                 klist_del(&dev->knode_class);
- -              mutex_unlock(&dev->class->p->class_mutex);
+ +              mutex_unlock(&dev->class->p->mutex);
         }
         device_remove_file(dev, &uevent_attr);
         device_remove_attrs(dev);
@@@ -1235,7 -1182,7 +1235,7 @@@ static struct device *next_device(struc
    * freed by the caller.
    */
   const char *device_get_devnode(struct device *dev,
-                              mode_t *mode, const char **tmp)
+                              umode_t *mode, const char **tmp)
   {
         char *s;
   
diff --combined drivers/base/devtmpfs.c

index 2bb4bff,393f450..8493536
--- 1/drivers/base/devtmpfs.c
--- 2/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@@ -40,7 -40,7 +40,7 @@@ static struct req 
         struct completion done;
         int err;
         const char *name;
-       mode_t mode;    /* 0 => delete */
+       umode_t mode;   /* 0 => delete */
         struct device *dev;
   } *requests;
   
@@@ -142,7 -142,7 +142,7 @@@ int devtmpfs_delete_node(struct device 
         return req.err;
   }
   
- static int dev_mkdir(const char *name, mode_t mode)
+ static int dev_mkdir(const char *name, umode_t mode)
   {
         struct dentry *dentry;
         struct path path;
@@@ -189,7 -189,7 +189,7 @@@ static int create_path(const char *node
         return err;
   }
   
- static int handle_create(const char *nodename, mode_t mode, struct device *dev)
+ static int handle_create(const char *nodename, umode_t mode, struct device *dev)
   {
         struct dentry *dentry;
         struct path path;
@@@ -378,7 -378,7 +378,7 @@@ int devtmpfs_mount(const char *mntdir
   
   static DECLARE_COMPLETION(setup_done);
   
- static int handle(const char *name, mode_t mode, struct device *dev)
+ static int handle(const char *name, umode_t mode, struct device *dev)
   {
         if (mode)
                 return handle_create(name, mode, dev);
@@@ -413,9 -413,10 +413,9 @@@ static int devtmpfsd(void *p
                         }
                         spin_lock(&req_lock);
                 }
- -              set_current_state(TASK_INTERRUPTIBLE);
+ +              __set_current_state(TASK_INTERRUPTIBLE);
                 spin_unlock(&req_lock);
                 schedule();
- -              __set_current_state(TASK_RUNNING);
         }
         return 0;
   out:
diff --combined drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c

index 5ca7367,f5a24d9..e53365a
--- 1/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
--- 2/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@@ -1092,8 -1092,7 +1092,8 @@@ static int cxgb4vf_change_mtu(struct ne
         return ret;
   }
   
- -static u32 cxgb4vf_fix_features(struct net_device *dev, u32 features)
+ +static netdev_features_t cxgb4vf_fix_features(struct net_device *dev,
+ +      netdev_features_t features)
   {
         /*
          * Since there is no support for separate rx/tx vlan accel
@@@ -1107,11 -1106,10 +1107,11 @@@
         return features;
   }
   
- -static int cxgb4vf_set_features(struct net_device *dev, u32 features)
+ +static int cxgb4vf_set_features(struct net_device *dev,
+ +      netdev_features_t features)
   {
         struct port_info *pi = netdev_priv(dev);
- -      u32 changed = dev->features ^ features;
+ +      netdev_features_t changed = dev->features ^ features;
   
         if (changed & NETIF_F_HW_VLAN_RX)
                 t4vf_set_rxmode(pi->adapter, pi->viid, -1, -1, -1, -1,
@@@ -1205,10 -1203,9 +1205,10 @@@ static void cxgb4vf_get_drvinfo(struct 
   {
         struct adapter *adapter = netdev2adap(dev);
   
- -      strcpy(drvinfo->driver, KBUILD_MODNAME);
- -      strcpy(drvinfo->version, DRV_VERSION);
- -      strcpy(drvinfo->bus_info, pci_name(to_pci_dev(dev->dev.parent)));
+ +      strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver));
+ +      strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
+ +      strlcpy(drvinfo->bus_info, pci_name(to_pci_dev(dev->dev.parent)),
+ +              sizeof(drvinfo->bus_info));
         snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
                  "%u.%u.%u.%u, TP %u.%u.%u.%u",
                  FW_HDR_FW_VER_MAJOR_GET(adapter->params.dev.fwrev),
@@@ -1564,7 -1561,7 +1564,7 @@@ static void cxgb4vf_get_wol(struct net_
    */
   #define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
   
- -static struct ethtool_ops cxgb4vf_ethtool_ops = {
+ +static const struct ethtool_ops cxgb4vf_ethtool_ops = {
         .get_settings           = cxgb4vf_get_settings,
         .get_drvinfo            = cxgb4vf_get_drvinfo,
         .get_msglevel           = cxgb4vf_get_msglevel,
@@@ -2003,7 -2000,7 +2003,7 @@@ static const struct file_operations int
    */
   struct cxgb4vf_debugfs_entry {
         const char *name;               /* name of debugfs node */
-       mode_t mode;                    /* file system mode */
+       umode_t mode;                   /* file system mode */
         const struct file_operations *fops;
   };
   
diff --combined drivers/scsi/cxgbi/libcxgbi.c

index 1d25a87,997fa36..c5360ff
--- 1/drivers/scsi/cxgbi/libcxgbi.c
--- 2/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@@ -472,7 -472,6 +472,7 @@@ static struct cxgbi_sock *cxgbi_check_r
         struct net_device *ndev;
         struct cxgbi_device *cdev;
         struct rtable *rt = NULL;
+ +      struct neighbour *n;
         struct flowi4 fl4;
         struct cxgbi_sock *csk = NULL;
         unsigned int mtu = 0;
@@@ -494,12 -493,7 +494,12 @@@
                 goto err_out;
         }
         dst = &rt->dst;
- -      ndev = dst_get_neighbour(dst)->dev;
+ +      n = dst_get_neighbour_noref(dst);
+ +      if (!n) {
+ +              err = -ENODEV;
+ +              goto rel_rt;
+ +      }
+ +      ndev = n->dev;
   
         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                 pr_info("multi-cast route %pI4, port %u, dev %s.\n",
@@@ -513,7 -507,7 +513,7 @@@
                 ndev = ip_dev_find(&init_net, daddr->sin_addr.s_addr);
                 mtu = ndev->mtu;
                 pr_info("rt dev %s, loopback -> %s, mtu %u.\n",
- -                      dst_get_neighbour(dst)->dev->name, ndev->name, mtu);
+ +                      n->dev->name, ndev->name, mtu);
         }
   
         cdev = cxgbi_device_find_by_netdev(ndev, &port);
@@@ -2575,7 -2569,7 +2575,7 @@@ void cxgbi_iscsi_cleanup(struct iscsi_t
   }
   EXPORT_SYMBOL_GPL(cxgbi_iscsi_cleanup);
   
- mode_t cxgbi_attr_is_visible(int param_type, int param)
+ umode_t cxgbi_attr_is_visible(int param_type, int param)
   {
         switch (param_type) {
         case ISCSI_HOST_PARAM:
diff --combined drivers/staging/iio/adc/ad7192.c

index a6a4a4e,e7bf324..797e65c
--- 1/drivers/staging/iio/adc/ad7192.c
--- 2/drivers/staging/iio/adc/ad7192.c
+++ b/drivers/staging/iio/adc/ad7192.c
@@@ -838,14 -838,14 +838,14 @@@ static struct attribute *ad7192_attribu
         NULL
   };
   
- static mode_t ad7192_attr_is_visible(struct kobject *kobj,
+ static umode_t ad7192_attr_is_visible(struct kobject *kobj,
                                      struct attribute *attr, int n)
   {
         struct device *dev = container_of(kobj, struct device, kobj);
         struct iio_dev *indio_dev = dev_get_drvdata(dev);
         struct ad7192_state *st = iio_priv(indio_dev);
   
-       mode_t mode = attr->mode;
+       umode_t mode = attr->mode;
   
         if ((st->devid != ID_AD7195) &&
                 (attr == &iio_dev_attr_ac_excitation_en.dev_attr.attr))
@@@ -1161,7 -1161,18 +1161,7 @@@ static struct spi_driver ad7192_driver 
         .remove         = __devexit_p(ad7192_remove),
         .id_table       = ad7192_id,
   };
- -
- -static int __init ad7192_init(void)
- -{
- -      return spi_register_driver(&ad7192_driver);
- -}
- -module_init(ad7192_init);
- -
- -static void __exit ad7192_exit(void)
- -{
- -      spi_unregister_driver(&ad7192_driver);
- -}
- -module_exit(ad7192_exit);
+ +module_spi_driver(ad7192_driver);
   
   MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
   MODULE_DESCRIPTION("Analog Devices AD7190, AD7192, AD7195 ADC");
diff --combined drivers/staging/iio/dac/ad5446.c

index ac3bc5f,dc46b6d..ec701e9
--- 1/drivers/staging/iio/dac/ad5446.c
--- 2/drivers/staging/iio/dac/ad5446.c
+++ b/drivers/staging/iio/dac/ad5446.c
@@@ -197,14 -197,14 +197,14 @@@ static struct attribute *ad5446_attribu
         NULL,
   };
   
- static mode_t ad5446_attr_is_visible(struct kobject *kobj,
+ static umode_t ad5446_attr_is_visible(struct kobject *kobj,
                                      struct attribute *attr, int n)
   {
         struct device *dev = container_of(kobj, struct device, kobj);
         struct iio_dev *indio_dev = dev_get_drvdata(dev);
         struct ad5446_state *st = iio_priv(indio_dev);
   
-       mode_t mode = attr->mode;
+       umode_t mode = attr->mode;
   
         if (!st->chip_info->store_pwr_down &&
                 (attr == &iio_dev_attr_out_voltage0_powerdown.dev_attr.attr ||
@@@ -465,7 -465,18 +465,7 @@@ static struct spi_driver ad5446_driver 
         .remove         = __devexit_p(ad5446_remove),
         .id_table       = ad5446_id,
   };
- -
- -static int __init ad5446_init(void)
- -{
- -      return spi_register_driver(&ad5446_driver);
- -}
- -module_init(ad5446_init);
- -
- -static void __exit ad5446_exit(void)
- -{
- -      spi_unregister_driver(&ad5446_driver);
- -}
- -module_exit(ad5446_exit);
+ +module_spi_driver(ad5446_driver);
   
   MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
   MODULE_DESCRIPTION("Analog Devices AD5444/AD5446 DAC");
diff --combined drivers/staging/iio/dds/ad9834.c

index 2b31e35,cc3293a..51fda6f
--- 1/drivers/staging/iio/dds/ad9834.c
--- 2/drivers/staging/iio/dds/ad9834.c
+++ b/drivers/staging/iio/dds/ad9834.c
@@@ -281,14 -281,14 +281,14 @@@ static struct attribute *ad9834_attribu
         NULL,
   };
   
- static mode_t ad9834_attr_is_visible(struct kobject *kobj,
+ static umode_t ad9834_attr_is_visible(struct kobject *kobj,
                                      struct attribute *attr, int n)
   {
         struct device *dev = container_of(kobj, struct device, kobj);
         struct iio_dev *indio_dev = dev_get_drvdata(dev);
         struct ad9834_state *st = iio_priv(indio_dev);
   
-       mode_t mode = attr->mode;
+       umode_t mode = attr->mode;
   
         if (((st->devid == ID_AD9833) || (st->devid == ID_AD9837)) &&
                 ((attr == &iio_dev_attr_dds0_out1_enable.dev_attr.attr) ||
@@@ -446,7 -446,18 +446,7 @@@ static struct spi_driver ad9834_driver 
         .remove         = __devexit_p(ad9834_remove),
         .id_table       = ad9834_id,
   };
- -
- -static int __init ad9834_init(void)
- -{
- -      return spi_register_driver(&ad9834_driver);
- -}
- -module_init(ad9834_init);
- -
- -static void __exit ad9834_exit(void)
- -{
- -      spi_unregister_driver(&ad9834_driver);
- -}
- -module_exit(ad9834_exit);
+ +module_spi_driver(ad9834_driver);
   
   MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
   MODULE_DESCRIPTION("Analog Devices AD9833/AD9834/AD9837/AD9838 DDS");
diff --combined drivers/usb/class/usblp.c

index 81ef2e2,bc5089f..a68c1a6
--- 1/drivers/usb/class/usblp.c
--- 2/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@@ -1045,7 -1045,7 +1045,7 @@@ static const struct file_operations usb
         .llseek =       noop_llseek,
   };
   
- static char *usblp_devnode(struct device *dev, mode_t *mode)
+ static char *usblp_devnode(struct device *dev, umode_t *mode)
   {
         return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
   }
@@@ -1412,7 -1412,18 +1412,7 @@@ static struct usb_driver usblp_driver 
         .supports_autosuspend = 1,
   };
   
- -static int __init usblp_init(void)
- -{
- -      return usb_register(&usblp_driver);
- -}
- -
- -static void __exit usblp_exit(void)
- -{
- -      usb_deregister(&usblp_driver);
- -}
- -
- -module_init(usblp_init);
- -module_exit(usblp_exit);
+ +module_usb_driver(usblp_driver);
   
   MODULE_AUTHOR(DRIVER_AUTHOR);
   MODULE_DESCRIPTION(DRIVER_DESC);
diff --combined drivers/usb/misc/iowarrior.c

index 7676b5b,5bd4b05..2453a39
--- 1/drivers/usb/misc/iowarrior.c
--- 2/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c
@@@ -734,7 -734,7 +734,7 @@@ static const struct file_operations iow
         .llseek = noop_llseek,
   };
   
- static char *iowarrior_devnode(struct device *dev, mode_t *mode)
+ static char *iowarrior_devnode(struct device *dev, umode_t *mode)
   {
         return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
   }
@@@ -927,4 -927,15 +927,4 @@@ static struct usb_driver iowarrior_driv
         .id_table = iowarrior_ids,
   };
   
- -static int __init iowarrior_init(void)
- -{
- -      return usb_register(&iowarrior_driver);
- -}
- -
- -static void __exit iowarrior_exit(void)
- -{
- -      usb_deregister(&iowarrior_driver);
- -}
- -
- -module_init(iowarrior_init);
- -module_exit(iowarrior_exit);
+ +module_usb_driver(iowarrior_driver);
diff --combined drivers/usb/misc/legousbtower.c

index 16937da,94f6566..5752220
--- 1/drivers/usb/misc/legousbtower.c
--- 2/drivers/usb/misc/legousbtower.c
+++ b/drivers/usb/misc/legousbtower.c
@@@ -269,7 -269,7 +269,7 @@@ static const struct file_operations tow
         .llseek =       tower_llseek,
   };
   
- static char *legousbtower_devnode(struct device *dev, mode_t *mode)
+ static char *legousbtower_devnode(struct device *dev, umode_t *mode)
   {
         return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
   }
@@@ -1043,7 -1043,51 +1043,7 @@@ static void tower_disconnect (struct us
         dbg(2, "%s: leave", __func__);
   }
   
- -
- -
- -/**
- - *    lego_usb_tower_init
- - */
- -static int __init lego_usb_tower_init(void)
- -{
- -      int result;
- -      int retval = 0;
- -
- -      dbg(2, "%s: enter", __func__);
- -
- -      /* register this driver with the USB subsystem */
- -      result = usb_register(&tower_driver);
- -      if (result < 0) {
- -              err("usb_register failed for the %s driver. Error number %d", __FILE__, result);
- -              retval = -1;
- -              goto exit;
- -      }
- -
- -      printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_VERSION ":"
- -             DRIVER_DESC "\n");
- -
- -exit:
- -      dbg(2, "%s: leave, return value %d", __func__, retval);
- -
- -      return retval;
- -}
- -
- -
- -/**
- - *    lego_usb_tower_exit
- - */
- -static void __exit lego_usb_tower_exit(void)
- -{
- -      dbg(2, "%s: enter", __func__);
- -
- -      /* deregister this driver with the USB subsystem */
- -      usb_deregister (&tower_driver);
- -
- -      dbg(2, "%s: leave", __func__);
- -}
- -
- -module_init (lego_usb_tower_init);
- -module_exit (lego_usb_tower_exit);
+ +module_usb_driver(tower_driver);
   
   MODULE_AUTHOR(DRIVER_AUTHOR);
   MODULE_DESCRIPTION(DRIVER_DESC);
diff --combined fs/cifs/connect.c

index f3670cf,be1e8f9..4666780
--- 1/fs/cifs/connect.c
--- 2/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@@ -282,7 -282,7 +282,7 @@@ static int coalesce_t2(struct smb_hdr *
         byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
         byte_count += total_in_buf2;
         /* don't allow buffer to overflow */
- -      if (byte_count > CIFSMaxBufSize)
+ +      if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
                 return -ENOBUFS;
         pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
   
@@@ -2122,7 -2122,7 +2122,7 @@@ cifs_get_smb_ses(struct TCP_Server_Inf
                 warned_on_ntlm = true;
                 cERROR(1, "default security mechanism requested.  The default "
                         "security mechanism will be upgraded from ntlm to "
- -                      "ntlmv2 in kernel release 3.2");
+ +                      "ntlmv2 in kernel release 3.3");
         }
         ses->overrideSecFlg = volume_info->secFlg;
   
@@@ -2819,7 -2819,7 +2819,7 @@@ void cifs_setup_cifs_sb(struct smb_vol 
                 cifs_sb->mnt_backupgid = pvolume_info->backupgid;
         cifs_sb->mnt_file_mode = pvolume_info->file_mode;
         cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
-       cFYI(1, "file mode: 0x%x  dir mode: 0x%x",
+       cFYI(1, "file mode: 0x%hx  dir mode: 0x%hx",
                 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
   
         cifs_sb->actimeo = pvolume_info->actimeo;
diff --combined fs/debugfs/file.c

index ea62afa,d501660..f65d445
--- 1/fs/debugfs/file.c
--- 2/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@@ -15,11 -15,9 +15,11 @@@
   
   #include <linux/module.h>
   #include <linux/fs.h>
+ +#include <linux/seq_file.h>
   #include <linux/pagemap.h>
   #include <linux/namei.h>
   #include <linux/debugfs.h>
+ +#include <linux/io.h>
   
   static ssize_t default_read_file(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
@@@ -97,7 -95,7 +97,7 @@@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NUL
    * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
    * code.
    */
- struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u8(const char *name, umode_t mode,
                                  struct dentry *parent, u8 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -149,7 -147,7 +149,7 @@@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NU
    * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
    * code.
    */
- struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u16(const char *name, umode_t mode,
                                   struct dentry *parent, u16 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -201,7 -199,7 +201,7 @@@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NU
    * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
    * code.
    */
- struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u32(const char *name, umode_t mode,
                                  struct dentry *parent, u32 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -254,7 -252,7 +254,7 @@@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NU
    * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
    * code.
    */
- struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u64(const char *name, umode_t mode,
                                  struct dentry *parent, u64 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -300,7 -298,7 +300,7 @@@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debug
    * @value: a pointer to the variable that the file should read to and write
    *         from.
    */
- struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x8(const char *name, umode_t mode,
                                  struct dentry *parent, u8 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -324,7 -322,7 +324,7 @@@ EXPORT_SYMBOL_GPL(debugfs_create_x8)
    * @value: a pointer to the variable that the file should read to and write
    *         from.
    */
- struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x16(const char *name, umode_t mode,
                                  struct dentry *parent, u16 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -348,7 -346,7 +348,7 @@@ EXPORT_SYMBOL_GPL(debugfs_create_x16)
    * @value: a pointer to the variable that the file should read to and write
    *         from.
    */
- struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x32(const char *name, umode_t mode,
                                  struct dentry *parent, u32 *value)
   {
         /* if there are no write bits set, make read only */
@@@ -372,7 -370,7 +372,7 @@@ EXPORT_SYMBOL_GPL(debugfs_create_x32)
    * @value: a pointer to the variable that the file should read to and write
    *         from.
    */
- struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x64(const char *name, umode_t mode,
                                  struct dentry *parent, u64 *value)
   {
         return debugfs_create_file(name, mode, parent, value, &fops_x64);
@@@ -403,7 -401,7 +403,7 @@@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, de
    * @value: a pointer to the variable that the file should read to and write
    *         from.
    */
- struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
                                      struct dentry *parent, size_t *value)
   {
         return debugfs_create_file(name, mode, parent, value, &fops_size_t);
@@@ -475,7 -473,7 +475,7 @@@ static const struct file_operations fop
    * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
    * code.
    */
- struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
                                    struct dentry *parent, u32 *value)
   {
         return debugfs_create_file(name, mode, parent, value, &fops_bool);
@@@ -520,103 -518,10 +520,103 @@@ static const struct file_operations fop
    * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
    * code.
    */
- struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
                                    struct dentry *parent,
                                    struct debugfs_blob_wrapper *blob)
   {
         return debugfs_create_file(name, mode, parent, blob, &fops_blob);
   }
   EXPORT_SYMBOL_GPL(debugfs_create_blob);
+ +
+ +#ifdef CONFIG_HAS_IOMEM
+ +
+ +/*
+ + * The regset32 stuff is used to print 32-bit registers using the
+ + * seq_file utilities. We offer either printing a register set into an
+ + * already-opened seq_file, or creating a debugfs file that only prints a
+ + * regset32.
+ + */
+ +
+ +/**
+ + * debugfs_print_regs32 - use seq_printf to describe a set of registers
+ + * @s: the seq_file structure being used to generate output
+ + * @regs: an array of struct debugfs_reg32 structures
+ + * @nregs: the length of the above array
+ + * @base: the base address to be used in reading the registers
+ + * @prefix: a string to be prefixed to every output line (may be %NULL)
+ + *
+ + * This function outputs a text block describing the current values of
+ + * some 32-bit hardware registers. It is meant to be used within debugfs
+ + * files based on seq_file that need to show registers, intermixed with other
+ + * information. The prefix argument may be used to specify a leading string,
+ + * because some peripherals have several blocks of identical registers,
+ + * for example the configuration of DMA channels.
+ + */
+ +int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
+ +                         int nregs, void __iomem *base, char *prefix)
+ +{
+ +      int i, ret = 0;
+ +
+ +      for (i = 0; i < nregs; i++, regs++) {
+ +              if (prefix)
+ +                      ret += seq_printf(s, "%s", prefix);
+ +              ret += seq_printf(s, "%s = 0x%08x\n", regs->name,
+ +                                readl(base + regs->offset));
+ +      }
+ +      return ret;
+ +}
+ +EXPORT_SYMBOL_GPL(debugfs_print_regs32);
+ +
+ +static int debugfs_show_regset32(struct seq_file *s, void *data)
+ +{
+ +      struct debugfs_regset32 *regset = s->private;
+ +
+ +      debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, "");
+ +      return 0;
+ +}
+ +
+ +static int debugfs_open_regset32(struct inode *inode, struct file *file)
+ +{
+ +      return single_open(file, debugfs_show_regset32, inode->i_private);
+ +}
+ +
+ +static const struct file_operations fops_regset32 = {
+ +      .open =         debugfs_open_regset32,
+ +      .read =         seq_read,
+ +      .llseek =       seq_lseek,
+ +      .release =      single_release,
+ +};
+ +
+ +/**
+ + * debugfs_create_regset32 - create a debugfs file that returns register values
+ + * @name: a pointer to a string containing the name of the file to create.
+ + * @mode: the permission that the file should have (NOTE(review): typed mode_t; sibling debugfs_create_*() helpers take umode_t - confirm)
+ + * @parent: a pointer to the parent dentry for this file.  This should be a
+ + *          directory dentry if set.  If this parameter is %NULL, then the
+ + *          file will be created in the root of the debugfs filesystem.
+ + * @regset: a pointer to a struct debugfs_regset32, which contains a pointer
+ + *          to an array of register definitions, the array size and the base
+ + *          address where the register bank is to be found.
+ + *
+ + * This function creates a file in debugfs with the given name that reports
+ + * the names and values of a set of 32-bit registers. If @mode grants read
+ + * permission, the file can be read from. Writing is not supported.
+ + *
+ + * This function will return a pointer to a dentry if it succeeds.  This
+ + * pointer must be passed to the debugfs_remove() function when the file is
+ + * to be removed (no automatic cleanup happens if your module is unloaded,
+ + * you are responsible here.)  If an error occurs, %NULL will be returned.
+ + *
+ + * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ + * returned.  It is not wise to check for this value, but rather, check for
+ + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ + * code.
+ + */
+ +struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ +                                     struct dentry *parent,
+ +                                     struct debugfs_regset32 *regset)
+ +{
+ +      return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ +}
+ +EXPORT_SYMBOL_GPL(debugfs_create_regset32);
+ +
+ +#endif /* CONFIG_HAS_IOMEM */
diff --combined fs/minix/inode.c

index 4d46a6a,c811c19..fa8b612
--- 1/fs/minix/inode.c
--- 2/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@@ -71,7 -71,6 +71,6 @@@ static struct inode *minix_alloc_inode(
   static void minix_i_callback(struct rcu_head *head)
   {
         struct inode *inode = container_of(head, struct inode, i_rcu);
-       INIT_LIST_HEAD(&inode->i_dentry);
         kmem_cache_free(minix_inode_cachep, minix_i(inode));
   }
   
@@@ -263,6 -262,23 +262,6 @@@ static int minix_fill_super(struct supe
                 goto out_no_root;
         }
   
- -      ret = -ENOMEM;
- -      s->s_root = d_alloc_root(root_inode);
- -      if (!s->s_root)
- -              goto out_iput;
- -
- -      if (!(s->s_flags & MS_RDONLY)) {
- -              if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
- -                      ms->s_state &= ~MINIX_VALID_FS;
- -              mark_buffer_dirty(bh);
- -      }
- -      if (!(sbi->s_mount_state & MINIX_VALID_FS))
- -              printk("MINIX-fs: mounting unchecked file system, "
- -                      "running fsck is recommended\n");
- -      else if (sbi->s_mount_state & MINIX_ERROR_FS)
- -              printk("MINIX-fs: mounting file system with errors, "
- -                      "running fsck is recommended\n");
- -
         /* Apparently minix can create filesystems that allocate more blocks for
          * the bitmaps than needed.  We simply ignore that, but verify it didn't
          * create one with not enough blocks and bail out if so.
@@@ -283,23 -299,6 +282,23 @@@
                 goto out_iput;
         }
   
+ +      ret = -ENOMEM;
+ +      s->s_root = d_alloc_root(root_inode);
+ +      if (!s->s_root)
+ +              goto out_iput;
+ +
+ +      if (!(s->s_flags & MS_RDONLY)) {
+ +              if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
+ +                      ms->s_state &= ~MINIX_VALID_FS;
+ +              mark_buffer_dirty(bh);
+ +      }
+ +      if (!(sbi->s_mount_state & MINIX_VALID_FS))
+ +              printk("MINIX-fs: mounting unchecked file system, "
+ +                      "running fsck is recommended\n");
+ +      else if (sbi->s_mount_state & MINIX_ERROR_FS)
+ +              printk("MINIX-fs: mounting file system with errors, "
+ +                      "running fsck is recommended\n");
+ +
         return 0;
   
   out_iput:
diff --combined include/linux/debugfs.h

index e8c3abc,d1ac841..6169c26
--- 1/include/linux/debugfs.h
--- 2/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@@ -16,7 -16,6 +16,7 @@@
   #define _DEBUGFS_H_
   
   #include <linux/fs.h>
+ +#include <linux/seq_file.h>
   
   #include <linux/types.h>
   
@@@ -27,17 -26,6 +27,17 @@@ struct debugfs_blob_wrapper 
         unsigned long size;
   };
   
+ +struct debugfs_reg32 {
+ +      char *name;
+ +      unsigned long offset;
+ +};
+ +
+ +struct debugfs_regset32 {
+ +      struct debugfs_reg32 *regs;
+ +      int nregs;
+ +      void __iomem *base;
+ +};
+ +
   extern struct dentry *arch_debugfs_dir;
   
   #if defined(CONFIG_DEBUG_FS)
@@@ -46,7 -34,7 +46,7 @@@
   extern const struct file_operations debugfs_file_operations;
   extern const struct inode_operations debugfs_link_operations;
   
- struct dentry *debugfs_create_file(const char *name, mode_t mode,
+ struct dentry *debugfs_create_file(const char *name, umode_t mode,
                                    struct dentry *parent, void *data,
                                    const struct file_operations *fops);
   
@@@ -61,38 -49,31 +61,38 @@@ void debugfs_remove_recursive(struct de
   struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                   struct dentry *new_dir, const char *new_name);
   
- struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u8(const char *name, umode_t mode,
                                  struct dentry *parent, u8 *value);
- struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u16(const char *name, umode_t mode,
                                   struct dentry *parent, u16 *value);
- struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u32(const char *name, umode_t mode,
                                   struct dentry *parent, u32 *value);
- struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+ struct dentry *debugfs_create_u64(const char *name, umode_t mode,
                                   struct dentry *parent, u64 *value);
- struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x8(const char *name, umode_t mode,
                                  struct dentry *parent, u8 *value);
- struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x16(const char *name, umode_t mode,
                                   struct dentry *parent, u16 *value);
- struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x32(const char *name, umode_t mode,
                                   struct dentry *parent, u32 *value);
- struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+ struct dentry *debugfs_create_x64(const char *name, umode_t mode,
                                   struct dentry *parent, u64 *value);
- struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
                                      struct dentry *parent, size_t *value);
- struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
                                   struct dentry *parent, u32 *value);
   
- struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
                                   struct dentry *parent,
                                   struct debugfs_blob_wrapper *blob);
   
+ +struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ +                                   struct dentry *parent,
+ +                                   struct debugfs_regset32 *regset);
+ +
+ +int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
+ +                       int nregs, void __iomem *base, char *prefix);
+ +
   bool debugfs_initialized(void);
   
   #else
@@@ -105,7 -86,7 +105,7 @@@
    * want to duplicate the design decision mistakes of procfs and devfs again.
    */
   
- static inline struct dentry *debugfs_create_file(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
                                         struct dentry *parent, void *data,
                                         const struct file_operations *fops)
   {
@@@ -137,83 -118,76 +137,83 @@@ static inline struct dentry *debugfs_re
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_u8(const char *name, umode_t mode,
                                                struct dentry *parent,
                                                u8 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_u16(const char *name, umode_t mode,
                                                 struct dentry *parent,
                                                 u16 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_u32(const char *name, umode_t mode,
                                                 struct dentry *parent,
                                                 u32 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_u64(const char *name, umode_t mode,
                                                 struct dentry *parent,
                                                 u64 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_x8(const char *name, umode_t mode,
                                                struct dentry *parent,
                                                u8 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_x16(const char *name, umode_t mode,
                                                 struct dentry *parent,
                                                 u16 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_x32(const char *name, umode_t mode,
                                                 struct dentry *parent,
                                                 u32 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
                                      struct dentry *parent,
                                      size_t *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
                                                  struct dentry *parent,
                                                  u32 *value)
   {
         return ERR_PTR(-ENODEV);
   }
   
- static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+ static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
                                   struct dentry *parent,
                                   struct debugfs_blob_wrapper *blob)
   {
         return ERR_PTR(-ENODEV);
   }
   
+ +static inline struct dentry *debugfs_create_regset32(const char *name,
+ +                                 mode_t mode, struct dentry *parent,
+ +                                 struct debugfs_regset32 *regset)
+ +{
+ +      return ERR_PTR(-ENODEV);
+ +}
+ +
   static inline bool debugfs_initialized(void)
   {
         return false;
diff --combined include/linux/device.h

index 96acef8,2fe0005..5b3adb8
--- 1/include/linux/device.h
--- 2/include/linux/device.h
+++ b/include/linux/device.h
@@@ -53,8 -53,6 +53,8 @@@ extern void bus_remove_file(struct bus_
    * struct bus_type - The bus type of the device
    *
    * @name:     The name of the bus.
+ + * @dev_name: Used for subsystems to enumerate devices like ("foo%u", dev->id).
+ + * @dev_root: Default device to use as the parent.
    * @bus_attrs:        Default attributes of the bus.
    * @dev_attrs:        Default attributes of the devices on the bus.
    * @drv_attrs:        Default attributes of the device drivers on the bus.
@@@ -88,8 -86,6 +88,8 @@@
    */
   struct bus_type {
         const char              *name;
+ +      const char              *dev_name;
+ +      struct device           *dev_root;
         struct bus_attribute    *bus_attrs;
         struct device_attribute *dev_attrs;
         struct driver_attribute *drv_attrs;
@@@ -110,30 -106,12 +110,30 @@@
         struct subsys_private *p;
   };
   
- -extern int __must_check bus_register(struct bus_type *bus);
+ +/* This is a #define to keep the compiler from merging different
+ + * instances of the __key variable */
+ +#define bus_register(subsys)                  \
+ +({                                            \
+ +      static struct lock_class_key __key;     \
+ +      __bus_register(subsys, &__key); \
+ +})
+ +extern int __must_check __bus_register(struct bus_type *bus,
+ +                                     struct lock_class_key *key);
   extern void bus_unregister(struct bus_type *bus);
   
   extern int __must_check bus_rescan_devices(struct bus_type *bus);
   
   /* iterator helpers for buses */
+ +struct subsys_dev_iter {
+ +      struct klist_iter               ki;
+ +      const struct device_type        *type;
+ +};
+ +void subsys_dev_iter_init(struct subsys_dev_iter *iter,
+ +                       struct bus_type *subsys,
+ +                       struct device *start,
+ +                       const struct device_type *type);
+ +struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter);
+ +void subsys_dev_iter_exit(struct subsys_dev_iter *iter);
   
   int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
                      int (*fn)(struct device *dev, void *data));
@@@ -143,10 -121,10 +143,10 @@@ struct device *bus_find_device(struct b
   struct device *bus_find_device_by_name(struct bus_type *bus,
                                        struct device *start,
                                        const char *name);
- -
+ +struct device *subsys_find_device_by_id(struct bus_type *bus, unsigned int id,
+ +                                      struct device *hint);
   int bus_for_each_drv(struct bus_type *bus, struct device_driver *start,
                      void *data, int (*fn)(struct device_driver *, void *));
- -
   void bus_sort_breadthfirst(struct bus_type *bus,
                            int (*compare)(const struct device *a,
                                           const struct device *b));
@@@ -278,33 -256,6 +278,33 @@@ struct device *driver_find_device(struc
                                   int (*match)(struct device *dev, void *data));
   
   /**
+ + * struct subsys_interface - interfaces to device functions
+ + * @name:       name of the device function
+ + * @subsys:     subsystem of the devices to attach to
+ + * @node:       the list of functions registered at the subsystem
+ + * @add_dev:    device hookup to device function handler
+ + * @remove_dev: device hookup to device function handler
+ + *
+ + * Simple interfaces attached to a subsystem. Multiple interfaces can
+ + * attach to a subsystem and its devices. Unlike drivers, they do not
+ + * exclusively claim or control devices. Interfaces usually represent
+ + * a specific functionality of a subsystem/class of devices.
+ + */
+ +struct subsys_interface {
+ +      const char *name;
+ +      struct bus_type *subsys;
+ +      struct list_head node;
+ +      int (*add_dev)(struct device *dev, struct subsys_interface *sif);
+ +      int (*remove_dev)(struct device *dev, struct subsys_interface *sif);
+ +};
+ +
+ +int subsys_interface_register(struct subsys_interface *sif);
+ +void subsys_interface_unregister(struct subsys_interface *sif);
+ +
+ +int subsys_system_register(struct bus_type *subsys,
+ +                         const struct attribute_group **groups);
+ +
+ +/**
    * struct class - device classes
    * @name:     Name of the class.
    * @owner:    The module owner.
@@@ -343,7 -294,7 +343,7 @@@ struct class 
         struct kobject                  *dev_kobj;
   
         int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env);
-       char *(*devnode)(struct device *dev, mode_t *mode);
+       char *(*devnode)(struct device *dev, umode_t *mode);
   
         void (*class_release)(struct class *class);
         void (*dev_release)(struct device *dev);
@@@ -472,7 -423,7 +472,7 @@@ struct device_type 
         const char *name;
         const struct attribute_group **groups;
         int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
-       char *(*devnode)(struct device *dev, mode_t *mode);
+       char *(*devnode)(struct device *dev, umode_t *mode);
         void (*release)(struct device *dev);
   
         const struct dev_pm_ops *pm;
@@@ -487,31 -438,11 +487,31 @@@ struct device_attribute 
                          const char *buf, size_t count);
   };
   
- -#define DEVICE_ATTR(_name, _mode, _show, _store) \
- -struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)
+ +struct dev_ext_attribute {
+ +      struct device_attribute attr;
+ +      void *var;
+ +};
+ +
+ +ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr,
+ +                        char *buf);
+ +ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr,
+ +                         const char *buf, size_t count);
+ +ssize_t device_show_int(struct device *dev, struct device_attribute *attr,
+ +                      char *buf);
+ +ssize_t device_store_int(struct device *dev, struct device_attribute *attr,
+ +                       const char *buf, size_t count);
   
- -extern int __must_check device_create_file(struct device *device,
- -                                      const struct device_attribute *entry);
+ +#define DEVICE_ATTR(_name, _mode, _show, _store) \
+ +      struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)
+ +#define DEVICE_ULONG_ATTR(_name, _mode, _var) \
+ +      struct dev_ext_attribute dev_attr_##_name = \
+ +              { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }
+ +#define DEVICE_INT_ATTR(_name, _mode, _var) \
+ +      struct dev_ext_attribute dev_attr_##_name = \
+ +              { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) }
+ +
+ +extern int device_create_file(struct device *device,
+ +                            const struct device_attribute *entry);
   extern void device_remove_file(struct device *dev,
                                const struct device_attribute *attr);
   extern int __must_check device_create_bin_file(struct device *dev,
@@@ -559,9 -490,6 +559,9 @@@ extern int devres_release_group(struct 
   extern void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp);
   extern void devm_kfree(struct device *dev, void *p);
   
+ +void __iomem *devm_request_and_ioremap(struct device *dev,
+ +                      struct resource *res);
+ +
   struct device_dma_parameters {
         /*
          * a low level driver may set these to teach IOMMU code about
@@@ -672,7 -600,6 +672,7 @@@ struct device 
         struct device_node      *of_node; /* associated device tree node */
   
         dev_t                   devt;   /* dev_t, creates the sysfs "dev" */
+ +      u32                     id;     /* device instance */
   
         spinlock_t              devres_lock;
         struct list_head        devres_head;
@@@ -793,7 -720,7 +793,7 @@@ extern int device_rename(struct device 
   extern int device_move(struct device *dev, struct device *new_parent,
                        enum dpm_order dpm_order);
   extern const char *device_get_devnode(struct device *dev,
-                                     mode_t *mode, const char **tmp);
+                                     umode_t *mode, const char **tmp);
   extern void *dev_get_drvdata(const struct device *dev);
   extern int dev_set_drvdata(struct device *dev, void *data);
   
@@@ -997,25 -924,4 +997,25 @@@ extern long sysfs_deprecated
   #define sysfs_deprecated 0
   #endif
   
+ +/**
+ + * module_driver() - Helper macro for drivers that don't do anything
+ + * special in module init/exit. This eliminates a lot of boilerplate.
+ + * Each module may only use this macro once, and calling it replaces
+ + * module_init() and module_exit().
+ + *
+ + * Use this macro to construct bus specific macros for registering
+ + * drivers, and do not use it on its own.
+ + */
+ +#define module_driver(__driver, __register, __unregister) \
+ +static int __init __driver##_init(void) \
+ +{ \
+ +      return __register(&(__driver)); \
+ +} \
+ +module_init(__driver##_init); \
+ +static void __exit __driver##_exit(void) \
+ +{ \
+ +      __unregister(&(__driver)); \
+ +} \
+ +module_exit(__driver##_exit);
+ +
   #endif /* _DEVICE_H_ */
diff --combined include/linux/usb.h

index 5d258c3,a593217..7f8d4d6
--- 1/include/linux/usb.h
--- 2/include/linux/usb.h
+++ b/include/linux/usb.h
@@@ -935,7 -935,7 +935,7 @@@ extern struct bus_type usb_bus_type
    */
   struct usb_class_driver {
         char *name;
-       char *(*devnode)(struct device *dev, mode_t *mode);
+       char *(*devnode)(struct device *dev, umode_t *mode);
         const struct file_operations *fops;
         int minor_base;
   };
@@@ -953,18 -953,6 +953,18 @@@ extern int usb_register_driver(struct u
   
   extern void usb_deregister(struct usb_driver *);
   
+ +/**
+ + * module_usb_driver() - Helper macro for registering a USB driver
+ + * @__usb_driver: usb_driver struct
+ + *
+ + * Helper macro for USB drivers which do not do anything special in module
+ + * init/exit. This eliminates a lot of boilerplate. Each module may only
+ + * use this macro once, and calling it replaces module_init() and module_exit()
+ + */
+ +#define module_usb_driver(__usb_driver) \
+ +      module_driver(__usb_driver, usb_register, \
+ +                     usb_deregister)
+ +
   extern int usb_register_device_driver(struct usb_device_driver *,
                         struct module *);
   extern void usb_deregister_device_driver(struct usb_device_driver *);
diff --combined kernel/acct.c

index 203dfea,9663eb8..02e6167
--- 1/kernel/acct.c
--- 2/kernel/acct.c
+++ b/kernel/acct.c
@@@ -84,11 -84,10 +84,10 @@@ static void do_acct_process(struct bsd_
    * the cache line to have the data after getting the lock.
    */
   struct bsd_acct_struct {
-       volatile int            active;
-       volatile int            needcheck;
+       int                     active;
+       unsigned long           needcheck;
         struct file             *file;
         struct pid_namespace    *ns;
-       struct timer_list       timer;
         struct list_head        list;
   };
   
@@@ -96,15 -95,6 +95,6 @@@ static DEFINE_SPINLOCK(acct_lock)
   static LIST_HEAD(acct_list);
   
   /*
-  * Called whenever the timer says to check the free space.
-  */
- static void acct_timeout(unsigned long x)
- {
-       struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
-       acct->needcheck = 1;
- }
- 
- /*
    * Check the amount of free space and suspend/resume accordingly.
    */
   static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@@ -112,12 -102,12 +102,12 @@@
         struct kstatfs sbuf;
         int res;
         int act;
-       sector_t resume;
-       sector_t suspend;
+       u64 resume;
+       u64 suspend;
   
         spin_lock(&acct_lock);
         res = acct->active;
-       if (!file || !acct->needcheck)
+       if (!file || time_is_after_jiffies(acct->needcheck)) /* next check not due yet */
                 goto out;
         spin_unlock(&acct_lock);
   
@@@ -127,8 -117,8 +117,8 @@@
         suspend = sbuf.f_blocks * SUSPEND;
         resume = sbuf.f_blocks * RESUME;
   
-       sector_div(suspend, 100);
-       sector_div(resume, 100);
+       do_div(suspend, 100);
+       do_div(resume, 100);
   
         if (sbuf.f_bavail <= suspend)
                 act = -1;
@@@ -160,10 -150,7 +150,7 @@@
                 }
         }
   
-       del_timer(&acct->timer);
-       acct->needcheck = 0;
-       acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-       add_timer(&acct->timer);
+       acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
         res = acct->active;
   out:
         spin_unlock(&acct_lock);
@@@ -185,9 -172,7 +172,7 @@@ static void acct_file_reopen(struct bsd
         if (acct->file) {
                 old_acct = acct->file;
                 old_ns = acct->ns;
-               del_timer(&acct->timer);
                 acct->active = 0;
-               acct->needcheck = 0;
                 acct->file = NULL;
                 acct->ns = NULL;
                 list_del(&acct->list);
@@@ -195,13 -180,9 +180,9 @@@
         if (file) {
                 acct->file = file;
                 acct->ns = ns;
-               acct->needcheck = 0;
+               acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
                 acct->active = 1;
                 list_add(&acct->list, &acct_list);
-               /* It's been deleted if it was used before so this is safe */
-               setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
-               acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-               add_timer(&acct->timer);
         }
         if (old_acct) {
                 mnt_unpin(old_acct->f_path.mnt);
@@@ -334,7 -315,7 +315,7 @@@ void acct_auto_close(struct super_bloc
         spin_lock(&acct_lock);
   restart:
         list_for_each_entry(acct, &acct_list, list)
-               if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+               if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
                         acct_file_reopen(acct, NULL, NULL);
                         goto restart;
                 }
@@@ -348,7 -329,6 +329,6 @@@ void acct_exit_ns(struct pid_namespace 
         if (acct == NULL)
                 return;
   
-       del_timer_sync(&acct->timer);
         spin_lock(&acct_lock);
         if (acct->file != NULL)
                 acct_file_reopen(acct, NULL, NULL);
@@@ -498,7 -478,7 +478,7 @@@ static void do_acct_process(struct bsd_
          * Fill the accounting struct with the needed info as recorded
          * by the different kernel functions.
          */
-       memset((caddr_t)&ac, 0, sizeof(acct_t));
+       memset(&ac, 0, sizeof(acct_t));
   
         ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
         strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@@ -613,8 -593,8 +593,8 @@@ void acct_collect(long exitcode, int gr
                 pacct->ac_flag |= ACORE;
         if (current->flags & PF_SIGNALED)
                 pacct->ac_flag |= AXSIG;
- -      pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
- -      pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ +      pacct->ac_utime += current->utime;
+ +      pacct->ac_stime += current->stime;
         pacct->ac_minflt += current->min_flt;
         pacct->ac_majflt += current->maj_flt;
         spin_unlock_irq(&current->sighand->siglock);
diff --combined kernel/sched/core.c

index 2a4590f,e64f457..0ac0f81
--- 1/kernel/sched/core.c
--- 2/kernel/sched.c
+++ b/kernel/sched/core.c
@@@ -1,5 -1,5 +1,5 @@@
   /*
- - *  kernel/sched.c
+ + *  kernel/sched/core.c
    *
    *  Kernel scheduler and related syscalls
    *
@@@ -56,6 -56,7 +56,6 @@@
   #include <linux/percpu.h>
   #include <linux/proc_fs.h>
   #include <linux/seq_file.h>
- -#include <linux/stop_machine.h>
   #include <linux/sysctl.h>
   #include <linux/syscalls.h>
   #include <linux/times.h>
@@@ -74,17 -75,129 +74,17 @@@
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
- -#include <asm/mutex.h>
   #ifdef CONFIG_PARAVIRT
   #include <asm/paravirt.h>
   #endif
   
- -#include "sched_cpupri.h"
- -#include "workqueue_sched.h"
- -#include "sched_autogroup.h"
+ +#include "sched.h"
+ +#include "../workqueue_sched.h"
   
   #define CREATE_TRACE_POINTS
   #include <trace/events/sched.h>
   
- -/*
- - * Convert user-nice values [ -20 ... 0 ... 19 ]
- - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- - * and back.
- - */
- -#define NICE_TO_PRIO(nice)    (MAX_RT_PRIO + (nice) + 20)
- -#define PRIO_TO_NICE(prio)    ((prio) - MAX_RT_PRIO - 20)
- -#define TASK_NICE(p)          PRIO_TO_NICE((p)->static_prio)
- -
- -/*
- - * 'User priority' is the nice value converted to something we
- - * can work with better when scaling various scheduler parameters,
- - * it's a [ 0 ... 39 ] range.
- - */
- -#define USER_PRIO(p)          ((p)-MAX_RT_PRIO)
- -#define TASK_USER_PRIO(p)     USER_PRIO((p)->static_prio)
- -#define MAX_USER_PRIO         (USER_PRIO(MAX_PRIO))
- -
- -/*
- - * Helpers for converting nanosecond timing to jiffy resolution
- - */
- -#define NS_TO_JIFFIES(TIME)   ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
- -
- -#define NICE_0_LOAD           SCHED_LOAD_SCALE
- -#define NICE_0_SHIFT          SCHED_LOAD_SHIFT
- -
- -/*
- - * These are the 'tuning knobs' of the scheduler:
- - *
- - * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- - * Timeslices get refilled after they expire.
- - */
- -#define DEF_TIMESLICE         (100 * HZ / 1000)
- -
- -/*
- - * single value that denotes runtime == period, ie unlimited time.
- - */
- -#define RUNTIME_INF   ((u64)~0ULL)
- -
- -static inline int rt_policy(int policy)
- -{
- -      if (policy == SCHED_FIFO || policy == SCHED_RR)
- -              return 1;
- -      return 0;
- -}
- -
- -static inline int task_has_rt_policy(struct task_struct *p)
- -{
- -      return rt_policy(p->policy);
- -}
- -
- -/*
- - * This is the priority-queue data structure of the RT scheduling class:
- - */
- -struct rt_prio_array {
- -      DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
- -      struct list_head queue[MAX_RT_PRIO];
- -};
- -
- -struct rt_bandwidth {
- -      /* nests inside the rq lock: */
- -      raw_spinlock_t          rt_runtime_lock;
- -      ktime_t                 rt_period;
- -      u64                     rt_runtime;
- -      struct hrtimer          rt_period_timer;
- -};
- -
- -static struct rt_bandwidth def_rt_bandwidth;
- -
- -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
- -
- -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
- -{
- -      struct rt_bandwidth *rt_b =
- -              container_of(timer, struct rt_bandwidth, rt_period_timer);
- -      ktime_t now;
- -      int overrun;
- -      int idle = 0;
- -
- -      for (;;) {
- -              now = hrtimer_cb_get_time(timer);
- -              overrun = hrtimer_forward(timer, now, rt_b->rt_period);
- -
- -              if (!overrun)
- -                      break;
- -
- -              idle = do_sched_rt_period_timer(rt_b, overrun);
- -      }
- -
- -      return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
- -}
- -
- -static
- -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
- -{
- -      rt_b->rt_period = ns_to_ktime(period);
- -      rt_b->rt_runtime = runtime;
- -
- -      raw_spin_lock_init(&rt_b->rt_runtime_lock);
- -
- -      hrtimer_init(&rt_b->rt_period_timer,
- -                      CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- -      rt_b->rt_period_timer.function = sched_rt_period_timer;
- -}
- -
- -static inline int rt_bandwidth_enabled(void)
- -{
- -      return sysctl_sched_rt_runtime >= 0;
- -}
- -
- -static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+ +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
   {
         unsigned long delta;
         ktime_t soft, hard, now;
@@@ -104,12 -217,580 +104,12 @@@
         }
   }
   
- -static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
- -{
- -      if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- -              return;
- -
- -      if (hrtimer_active(&rt_b->rt_period_timer))
- -              return;
- -
- -      raw_spin_lock(&rt_b->rt_runtime_lock);
- -      start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
- -      raw_spin_unlock(&rt_b->rt_runtime_lock);
- -}
- -
- -#ifdef CONFIG_RT_GROUP_SCHED
- -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
- -{
- -      hrtimer_cancel(&rt_b->rt_period_timer);
- -}
- -#endif
- -
- -/*
- - * sched_domains_mutex serializes calls to init_sched_domains,
- - * detach_destroy_domains and partition_sched_domains.
- - */
- -static DEFINE_MUTEX(sched_domains_mutex);
- -
- -#ifdef CONFIG_CGROUP_SCHED
- -
- -#include <linux/cgroup.h>
- -
- -struct cfs_rq;
- -
- -static LIST_HEAD(task_groups);
- -
- -struct cfs_bandwidth {
- -#ifdef CONFIG_CFS_BANDWIDTH
- -      raw_spinlock_t lock;
- -      ktime_t period;
- -      u64 quota, runtime;
- -      s64 hierarchal_quota;
- -      u64 runtime_expires;
- -
- -      int idle, timer_active;
- -      struct hrtimer period_timer, slack_timer;
- -      struct list_head throttled_cfs_rq;
- -
- -      /* statistics */
- -      int nr_periods, nr_throttled;
- -      u64 throttled_time;
- -#endif
- -};
- -
- -/* task group related information */
- -struct task_group {
- -      struct cgroup_subsys_state css;
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      /* schedulable entities of this group on each cpu */
- -      struct sched_entity **se;
- -      /* runqueue "owned" by this group on each cpu */
- -      struct cfs_rq **cfs_rq;
- -      unsigned long shares;
- -
- -      atomic_t load_weight;
- -#endif
- -
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      struct sched_rt_entity **rt_se;
- -      struct rt_rq **rt_rq;
- -
- -      struct rt_bandwidth rt_bandwidth;
- -#endif
- -
- -      struct rcu_head rcu;
- -      struct list_head list;
- -
- -      struct task_group *parent;
- -      struct list_head siblings;
- -      struct list_head children;
- -
- -#ifdef CONFIG_SCHED_AUTOGROUP
- -      struct autogroup *autogroup;
- -#endif
- -
- -      struct cfs_bandwidth cfs_bandwidth;
- -};
- -
- -/* task_group_lock serializes the addition/removal of task groups */
- -static DEFINE_SPINLOCK(task_group_lock);
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -
- -# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
- -
- -/*
- - * A weight of 0 or 1 can cause arithmetics problems.
- - * A weight of a cfs_rq is the sum of weights of which entities
- - * are queued on this cfs_rq, so a weight of a entity should not be
- - * too large, so as the shares value of a task group.
- - * (The default weight is 1024 - so there's no practical
- - *  limitation from this.)
- - */
- -#define MIN_SHARES    (1UL <<  1)
- -#define MAX_SHARES    (1UL << 18)
- -
- -static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
- -#endif
- -
- -/* Default task group.
- - *    Every task in system belong to this group at bootup.
- - */
- -struct task_group root_task_group;
- -
- -#endif        /* CONFIG_CGROUP_SCHED */
- -
- -/* CFS-related fields in a runqueue */
- -struct cfs_rq {
- -      struct load_weight load;
- -      unsigned long nr_running, h_nr_running;
- -
- -      u64 exec_clock;
- -      u64 min_vruntime;
- -#ifndef CONFIG_64BIT
- -      u64 min_vruntime_copy;
- -#endif
- -
- -      struct rb_root tasks_timeline;
- -      struct rb_node *rb_leftmost;
- -
- -      struct list_head tasks;
- -      struct list_head *balance_iterator;
- -
- -      /*
- -       * 'curr' points to currently running entity on this cfs_rq.
- -       * It is set to NULL otherwise (i.e when none are currently running).
- -       */
- -      struct sched_entity *curr, *next, *last, *skip;
- -
- -#ifdef        CONFIG_SCHED_DEBUG
- -      unsigned int nr_spread_over;
- -#endif
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
- -
- -      /*
- -       * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- -       * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- -       * (like users, containers etc.)
- -       *
- -       * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- -       * list is used during load balance.
- -       */
- -      int on_list;
- -      struct list_head leaf_cfs_rq_list;
- -      struct task_group *tg;  /* group that "owns" this runqueue */
- -
- -#ifdef CONFIG_SMP
- -      /*
- -       * the part of load.weight contributed by tasks
- -       */
- -      unsigned long task_weight;
- -
- -      /*
- -       *   h_load = weight * f(tg)
- -       *
- -       * Where f(tg) is the recursive weight fraction assigned to
- -       * this group.
- -       */
- -      unsigned long h_load;
- -
- -      /*
- -       * Maintaining per-cpu shares distribution for group scheduling
- -       *
- -       * load_stamp is the last time we updated the load average
- -       * load_last is the last time we updated the load average and saw load
- -       * load_unacc_exec_time is currently unaccounted execution time
- -       */
- -      u64 load_avg;
- -      u64 load_period;
- -      u64 load_stamp, load_last, load_unacc_exec_time;
- -
- -      unsigned long load_contribution;
- -#endif
- -#ifdef CONFIG_CFS_BANDWIDTH
- -      int runtime_enabled;
- -      u64 runtime_expires;
- -      s64 runtime_remaining;
- -
- -      u64 throttled_timestamp;
- -      int throttled, throttle_count;
- -      struct list_head throttled_list;
- -#endif
- -#endif
- -};
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -#ifdef CONFIG_CFS_BANDWIDTH
- -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
- -{
- -      return &tg->cfs_bandwidth;
- -}
- -
- -static inline u64 default_cfs_period(void);
- -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
- -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
- -
- -static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
- -{
- -      struct cfs_bandwidth *cfs_b =
- -              container_of(timer, struct cfs_bandwidth, slack_timer);
- -      do_sched_cfs_slack_timer(cfs_b);
- -
- -      return HRTIMER_NORESTART;
- -}
- -
- -static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
- -{
- -      struct cfs_bandwidth *cfs_b =
- -              container_of(timer, struct cfs_bandwidth, period_timer);
- -      ktime_t now;
- -      int overrun;
- -      int idle = 0;
- -
- -      for (;;) {
- -              now = hrtimer_cb_get_time(timer);
- -              overrun = hrtimer_forward(timer, now, cfs_b->period);
- -
- -              if (!overrun)
- -                      break;
- -
- -              idle = do_sched_cfs_period_timer(cfs_b, overrun);
- -      }
- -
- -      return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
- -}
- -
- -static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
- -{
- -      raw_spin_lock_init(&cfs_b->lock);
- -      cfs_b->runtime = 0;
- -      cfs_b->quota = RUNTIME_INF;
- -      cfs_b->period = ns_to_ktime(default_cfs_period());
- -
- -      INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- -      hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- -      cfs_b->period_timer.function = sched_cfs_period_timer;
- -      hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- -      cfs_b->slack_timer.function = sched_cfs_slack_timer;
- -}
- -
- -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
- -{
- -      cfs_rq->runtime_enabled = 0;
- -      INIT_LIST_HEAD(&cfs_rq->throttled_list);
- -}
- -
- -/* requires cfs_b->lock, may release to reprogram timer */
- -static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
- -{
- -      /*
- -       * The timer may be active because we're trying to set a new bandwidth
- -       * period or because we're racing with the tear-down path
- -       * (timer_active==0 becomes visible before the hrtimer call-back
- -       * terminates).  In either case we ensure that it's re-programmed
- -       */
- -      while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
- -              raw_spin_unlock(&cfs_b->lock);
- -              /* ensure cfs_b->lock is available while we wait */
- -              hrtimer_cancel(&cfs_b->period_timer);
- -
- -              raw_spin_lock(&cfs_b->lock);
- -              /* if someone else restarted the timer then we're done */
- -              if (cfs_b->timer_active)
- -                      return;
- -      }
- -
- -      cfs_b->timer_active = 1;
- -      start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
- -}
- -
- -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
- -{
- -      hrtimer_cancel(&cfs_b->period_timer);
- -      hrtimer_cancel(&cfs_b->slack_timer);
- -}
- -#else
- -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
- -static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
- -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
- -
- -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
- -{
- -      return NULL;
- -}
- -#endif /* CONFIG_CFS_BANDWIDTH */
- -#endif /* CONFIG_FAIR_GROUP_SCHED */
- -
- -/* Real-Time classes' related field in a runqueue: */
- -struct rt_rq {
- -      struct rt_prio_array active;
- -      unsigned long rt_nr_running;
- -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -      struct {
- -              int curr; /* highest queued rt task prio */
- -#ifdef CONFIG_SMP
- -              int next; /* next highest */
- -#endif
- -      } highest_prio;
- -#endif
- -#ifdef CONFIG_SMP
- -      unsigned long rt_nr_migratory;
- -      unsigned long rt_nr_total;
- -      int overloaded;
- -      struct plist_head pushable_tasks;
- -#endif
- -      int rt_throttled;
- -      u64 rt_time;
- -      u64 rt_runtime;
- -      /* Nests inside the rq lock: */
- -      raw_spinlock_t rt_runtime_lock;
- -
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      unsigned long rt_nr_boosted;
- -
- -      struct rq *rq;
- -      struct list_head leaf_rt_rq_list;
- -      struct task_group *tg;
- -#endif
- -};
- -
- -#ifdef CONFIG_SMP
- -
- -/*
- - * We add the notion of a root-domain which will be used to define per-domain
- - * variables. Each exclusive cpuset essentially defines an island domain by
- - * fully partitioning the member cpus from any other cpuset. Whenever a new
- - * exclusive cpuset is created, we also create and attach a new root-domain
- - * object.
- - *
- - */
- -struct root_domain {
- -      atomic_t refcount;
- -      atomic_t rto_count;
- -      struct rcu_head rcu;
- -      cpumask_var_t span;
- -      cpumask_var_t online;
- -
- -      /*
- -       * The "RT overload" flag: it gets set if a CPU has more than
- -       * one runnable RT task.
- -       */
- -      cpumask_var_t rto_mask;
- -      struct cpupri cpupri;
- -};
- -
- -/*
- - * By default the system creates a single root-domain with all cpus as
- - * members (mimicking the global state we have today).
- - */
- -static struct root_domain def_root_domain;
- -
- -#endif /* CONFIG_SMP */
- -
- -/*
- - * This is the main, per-CPU runqueue data structure.
- - *
- - * Locking rule: those places that want to lock multiple runqueues
- - * (such as the load balancing or the thread migration code), lock
- - * acquire operations must be ordered by ascending &runqueue.
- - */
- -struct rq {
- -      /* runqueue lock: */
- -      raw_spinlock_t lock;
- -
- -      /*
- -       * nr_running and cpu_load should be in the same cacheline because
- -       * remote CPUs use both these fields when doing load calculation.
- -       */
- -      unsigned long nr_running;
- -      #define CPU_LOAD_IDX_MAX 5
- -      unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- -      unsigned long last_load_update_tick;
- -#ifdef CONFIG_NO_HZ
- -      u64 nohz_stamp;
- -      unsigned char nohz_balance_kick;
- -#endif
- -      int skip_clock_update;
- -
- -      /* capture load from *all* tasks on this cpu: */
- -      struct load_weight load;
- -      unsigned long nr_load_updates;
- -      u64 nr_switches;
- -
- -      struct cfs_rq cfs;
- -      struct rt_rq rt;
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      /* list of leaf cfs_rq on this cpu: */
- -      struct list_head leaf_cfs_rq_list;
- -#endif
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      struct list_head leaf_rt_rq_list;
- -#endif
- -
- -      /*
- -       * This is part of a global counter where only the total sum
- -       * over all CPUs matters. A task can increase this counter on
- -       * one CPU and if it got migrated afterwards it may decrease
- -       * it on another CPU. Always updated under the runqueue lock:
- -       */
- -      unsigned long nr_uninterruptible;
- -
- -      struct task_struct *curr, *idle, *stop;
- -      unsigned long next_balance;
- -      struct mm_struct *prev_mm;
- -
- -      u64 clock;
- -      u64 clock_task;
- -
- -      atomic_t nr_iowait;
- -
- -#ifdef CONFIG_SMP
- -      struct root_domain *rd;
- -      struct sched_domain *sd;
- -
- -      unsigned long cpu_power;
- -
- -      unsigned char idle_balance;
- -      /* For active balancing */
- -      int post_schedule;
- -      int active_balance;
- -      int push_cpu;
- -      struct cpu_stop_work active_balance_work;
- -      /* cpu of this runqueue: */
- -      int cpu;
- -      int online;
- -
- -      u64 rt_avg;
- -      u64 age_stamp;
- -      u64 idle_stamp;
- -      u64 avg_idle;
- -#endif
- -
- -#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- -      u64 prev_irq_time;
- -#endif
- -#ifdef CONFIG_PARAVIRT
- -      u64 prev_steal_time;
- -#endif
- -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- -      u64 prev_steal_time_rq;
- -#endif
- -
- -      /* calc_load related fields */
- -      unsigned long calc_load_update;
- -      long calc_load_active;
- -
- -#ifdef CONFIG_SCHED_HRTICK
- -#ifdef CONFIG_SMP
- -      int hrtick_csd_pending;
- -      struct call_single_data hrtick_csd;
- -#endif
- -      struct hrtimer hrtick_timer;
- -#endif
- -
- -#ifdef CONFIG_SCHEDSTATS
- -      /* latency stats */
- -      struct sched_info rq_sched_info;
- -      unsigned long long rq_cpu_time;
- -      /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
- -
- -      /* sys_sched_yield() stats */
- -      unsigned int yld_count;
- -
- -      /* schedule() stats */
- -      unsigned int sched_switch;
- -      unsigned int sched_count;
- -      unsigned int sched_goidle;
- -
- -      /* try_to_wake_up() stats */
- -      unsigned int ttwu_count;
- -      unsigned int ttwu_local;
- -#endif
- -
- -#ifdef CONFIG_SMP
- -      struct llist_head wake_list;
- -#endif
- -};
- -
- -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
- -
- -
- -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
- -
- -static inline int cpu_of(struct rq *rq)
- -{
- -#ifdef CONFIG_SMP
- -      return rq->cpu;
- -#else
- -      return 0;
- -#endif
- -}
- -
- -#define rcu_dereference_check_sched_domain(p) \
- -      rcu_dereference_check((p), \
- -                            lockdep_is_held(&sched_domains_mutex))
- -
- -/*
- - * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- - * See detach_destroy_domains: synchronize_sched for details.
- - *
- - * The domain tree of any CPU may only be accessed from within
- - * preempt-disabled sections.
- - */
- -#define for_each_domain(cpu, __sd) \
- -      for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
- -
- -#define cpu_rq(cpu)           (&per_cpu(runqueues, (cpu)))
- -#define this_rq()             (&__get_cpu_var(runqueues))
- -#define task_rq(p)            cpu_rq(task_cpu(p))
- -#define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
- -#define raw_rq()              (&__raw_get_cpu_var(runqueues))
- -
- -#ifdef CONFIG_CGROUP_SCHED
- -
- -/*
- - * Return the group to which this tasks belongs.
- - *
- - * We use task_subsys_state_check() and extend the RCU verification with
- - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- - * task it moves into the cgroup. Therefore by holding either of those locks,
- - * we pin the task to the current cgroup.
- - */
- -static inline struct task_group *task_group(struct task_struct *p)
- -{
- -      struct task_group *tg;
- -      struct cgroup_subsys_state *css;
- -
- -      css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- -                      lockdep_is_held(&p->pi_lock) ||
- -                      lockdep_is_held(&task_rq(p)->lock));
- -      tg = container_of(css, struct task_group, css);
- -
- -      return autogroup_task_group(p, tg);
- -}
- -
- -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
- -static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
- -{
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -      p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
- -      p->se.parent = task_group(p)->se[cpu];
- -#endif
- -
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
- -      p->rt.parent = task_group(p)->rt_se[cpu];
- -#endif
- -}
- -
- -#else /* CONFIG_CGROUP_SCHED */
- -
- -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
- -static inline struct task_group *task_group(struct task_struct *p)
- -{
- -      return NULL;
- -}
- -
- -#endif /* CONFIG_CGROUP_SCHED */
+ +DEFINE_MUTEX(sched_domains_mutex);
+ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
   
   static void update_rq_clock_task(struct rq *rq, s64 delta);
   
- -static void update_rq_clock(struct rq *rq)
+ +void update_rq_clock(struct rq *rq)
   {
         s64 delta;
   
@@@ -122,14 -803,44 +122,14 @@@
   }
   
   /*
- - * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
- - */
- -#ifdef CONFIG_SCHED_DEBUG
- -# define const_debug __read_mostly
- -#else
- -# define const_debug static const
- -#endif
- -
- -/**
- - * runqueue_is_locked - Returns true if the current cpu runqueue is locked
- - * @cpu: the processor in question.
- - *
- - * This interface allows printk to be called with the runqueue lock
- - * held and know whether or not it is OK to wake up the klogd.
- - */
- -int runqueue_is_locked(int cpu)
- -{
- -      return raw_spin_is_locked(&cpu_rq(cpu)->lock);
- -}
- -
- -/*
    * Debugging: various feature bits
    */
   
   #define SCHED_FEAT(name, enabled)     \
- -      __SCHED_FEAT_##name ,
- -
- -enum {
- -#include "sched_features.h"
- -};
- -
- -#undef SCHED_FEAT
- -
- -#define SCHED_FEAT(name, enabled)     \
         (1UL << __SCHED_FEAT_##name) * enabled |
   
   const_debug unsigned int sysctl_sched_features =
- -#include "sched_features.h"
+ +#include "features.h"
         0;
   
   #undef SCHED_FEAT
@@@ -139,7 -850,7 +139,7 @@@
         #name ,
   
   static __read_mostly char *sched_feat_names[] = {
- -#include "sched_features.h"
+ +#include "features.h"
         NULL
   };
   
@@@ -149,7 -860,7 +149,7 @@@ static int sched_feat_show(struct seq_f
   {
         int i;
   
- -      for (i = 0; sched_feat_names[i]; i++) {
+ +      for (i = 0; i < __SCHED_FEAT_NR; i++) {
                 if (!(sysctl_sched_features & (1UL << i)))
                         seq_puts(m, "NO_");
                 seq_printf(m, "%s ", sched_feat_names[i]);
@@@ -159,36 -870,6 +159,36 @@@
         return 0;
   }
   
+ +#ifdef HAVE_JUMP_LABEL
+ +
+ +#define jump_label_key__true  jump_label_key_enabled
+ +#define jump_label_key__false jump_label_key_disabled
+ +
+ +#define SCHED_FEAT(name, enabled)     \
+ +      jump_label_key__##enabled ,
+ +
+ +struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+ +#include "features.h"
+ +};
+ +
+ +#undef SCHED_FEAT
+ +
+ +static void sched_feat_disable(int i)
+ +{
+ +      if (jump_label_enabled(&sched_feat_keys[i]))
+ +              jump_label_dec(&sched_feat_keys[i]);
+ +}
+ +
+ +static void sched_feat_enable(int i)
+ +{
+ +      if (!jump_label_enabled(&sched_feat_keys[i]))
+ +              jump_label_inc(&sched_feat_keys[i]);
+ +}
+ +#else
+ +static void sched_feat_disable(int i) { } /* no jump labels: feature bitmask only */
+ +static void sched_feat_enable(int i) { }  /* no jump labels: feature bitmask only */
+ +#endif /* HAVE_JUMP_LABEL */
+ +
   static ssize_t
   sched_feat_write(struct file *filp, const char __user *ubuf,
                 size_t cnt, loff_t *ppos)
@@@ -212,20 -893,17 +212,20 @@@
                 cmp += 3;
         }
   
- -      for (i = 0; sched_feat_names[i]; i++) {
+ +      for (i = 0; i < __SCHED_FEAT_NR; i++) {
                 if (strcmp(cmp, sched_feat_names[i]) == 0) {
- -                      if (neg)
+ +                      if (neg) {
                                 sysctl_sched_features &= ~(1UL << i);
- -                      else
+ +                              sched_feat_disable(i);
+ +                      } else {
                                 sysctl_sched_features |= (1UL << i);
+ +                              sched_feat_enable(i);
+ +                      }
                         break;
                 }
         }
   
- -      if (!sched_feat_names[i])
+ +      if (i == __SCHED_FEAT_NR)
                 return -EINVAL;
   
         *ppos += cnt;
@@@ -254,7 -932,10 +254,7 @@@ static __init int sched_init_debug(void
         return 0;
   }
   late_initcall(sched_init_debug);
- -
- -#endif
- -
- -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+ +#endif /* CONFIG_SCHED_DEBUG */
   
   /*
    * Number of tasks to iterate in a single balance run.
@@@ -276,7 -957,7 +276,7 @@@ const_debug unsigned int sysctl_sched_t
    */
   unsigned int sysctl_sched_rt_period = 1000000;
   
- -static __read_mostly int scheduler_running;
+ +__read_mostly int scheduler_running;
   
   /*
    * part of the period that we allow rt tasks to run in us.
@@@ -284,7 -965,112 +284,7 @@@
    */
   int sysctl_sched_rt_runtime = 950000;
   
- -static inline u64 global_rt_period(void)
- -{
- -      return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
- -}
- -
- -static inline u64 global_rt_runtime(void)
- -{
- -      if (sysctl_sched_rt_runtime < 0)
- -              return RUNTIME_INF;
   
- -      return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
- -}
- -
- -#ifndef prepare_arch_switch
- -# define prepare_arch_switch(next)    do { } while (0)
- -#endif
- -#ifndef finish_arch_switch
- -# define finish_arch_switch(prev)     do { } while (0)
- -#endif
- -
- -static inline int task_current(struct rq *rq, struct task_struct *p)
- -{
- -      return rq->curr == p;
- -}
- -
- -static inline int task_running(struct rq *rq, struct task_struct *p)
- -{
- -#ifdef CONFIG_SMP
- -      return p->on_cpu;
- -#else
- -      return task_current(rq, p);
- -#endif
- -}
- -
- -#ifndef __ARCH_WANT_UNLOCKED_CTXSW
- -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
- -{
- -#ifdef CONFIG_SMP
- -      /*
- -       * We can optimise this out completely for !SMP, because the
- -       * SMP rebalancing from interrupt is the only thing that cares
- -       * here.
- -       */
- -      next->on_cpu = 1;
- -#endif
- -}
- -
- -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
- -{
- -#ifdef CONFIG_SMP
- -      /*
- -       * After ->on_cpu is cleared, the task can be moved to a different CPU.
- -       * We must ensure this doesn't happen until the switch is completely
- -       * finished.
- -       */
- -      smp_wmb();
- -      prev->on_cpu = 0;
- -#endif
- -#ifdef CONFIG_DEBUG_SPINLOCK
- -      /* this is a valid case when another task releases the spinlock */
- -      rq->lock.owner = current;
- -#endif
- -      /*
- -       * If we are tracking spinlock dependencies then we have to
- -       * fix up the runqueue lock - which gets 'carried over' from
- -       * prev into current:
- -       */
- -      spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- -
- -      raw_spin_unlock_irq(&rq->lock);
- -}
- -
- -#else /* __ARCH_WANT_UNLOCKED_CTXSW */
- -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
- -{
- -#ifdef CONFIG_SMP
- -      /*
- -       * We can optimise this out completely for !SMP, because the
- -       * SMP rebalancing from interrupt is the only thing that cares
- -       * here.
- -       */
- -      next->on_cpu = 1;
- -#endif
- -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- -      raw_spin_unlock_irq(&rq->lock);
- -#else
- -      raw_spin_unlock(&rq->lock);
- -#endif
- -}
- -
- -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
- -{
- -#ifdef CONFIG_SMP
- -      /*
- -       * After ->on_cpu is cleared, the task can be moved to a different CPU.
- -       * We must ensure this doesn't happen until the switch is completely
- -       * finished.
- -       */
- -      smp_wmb();
- -      prev->on_cpu = 0;
- -#endif
- -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- -      local_irq_enable();
- -#endif
- -}
- -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
   
   /*
    * __task_rq_lock - lock the rq @p resides on.
@@@ -367,6 -1153,20 +367,6 @@@ static struct rq *this_rq_lock(void
    * rq->lock.
    */
   
- -/*
- - * Use hrtick when:
- - *  - enabled by features
- - *  - hrtimer is actually high res
- - */
- -static inline int hrtick_enabled(struct rq *rq)
- -{
- -      if (!sched_feat(HRTICK))
- -              return 0;
- -      if (!cpu_active(cpu_of(rq)))
- -              return 0;
- -      return hrtimer_is_hres_active(&rq->hrtick_timer);
- -}
- -
   static void hrtick_clear(struct rq *rq)
   {
         if (hrtimer_active(&rq->hrtick_timer))
@@@ -410,7 -1210,7 +410,7 @@@ static void __hrtick_start(void *arg
    *
    * called with rq->lock held and irqs disabled
    */
- -static void hrtick_start(struct rq *rq, u64 delay)
+ +void hrtick_start(struct rq *rq, u64 delay)
   {
         struct hrtimer *timer = &rq->hrtick_timer;
         ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@@ -454,7 -1254,7 +454,7 @@@ static __init void init_hrtick(void
    *
    * called with rq->lock held and irqs disabled
    */
- -static void hrtick_start(struct rq *rq, u64 delay)
+ +void hrtick_start(struct rq *rq, u64 delay)
   {
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                         HRTIMER_MODE_REL_PINNED, 0);
@@@ -505,7 -1305,7 +505,7 @@@ static inline void init_hrtick(void
   #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
   #endif
   
- -static void resched_task(struct task_struct *p)
+ +void resched_task(struct task_struct *p)
   {
         int cpu;
   
@@@ -526,7 -1326,7 +526,7 @@@
                 smp_send_reschedule(cpu);
   }
   
- -static void resched_cpu(int cpu)
+ +void resched_cpu(int cpu)
   {
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
@@@ -605,54 -1405,228 +605,54 @@@ void wake_up_idle_cpu(int cpu
                 smp_send_reschedule(cpu);
   }
   
- -static inline bool got_nohz_idle_kick(void)
- -{
- -      return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
- -}
- -
- -#else /* CONFIG_NO_HZ */
- -
- -static inline bool got_nohz_idle_kick(void)
- -{
- -      return false;
- -}
- -
- -#endif /* CONFIG_NO_HZ */
- -
- -static u64 sched_avg_period(void)
- -{
- -      return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
- -}
- -
- -static void sched_avg_update(struct rq *rq)
- -{
- -      s64 period = sched_avg_period();
- -
- -      while ((s64)(rq->clock - rq->age_stamp) > period) {
- -              /*
- -               * Inline assembly required to prevent the compiler
- -               * optimising this loop into a divmod call.
- -               * See __iter_div_u64_rem() for another example of this.
- -               */
- -              asm("" : "+rm" (rq->age_stamp));
- -              rq->age_stamp += period;
- -              rq->rt_avg /= 2;
- -      }
- -}
- -
- -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
- -{
- -      rq->rt_avg += rt_delta;
- -      sched_avg_update(rq);
- -}
- -
- -#else /* !CONFIG_SMP */
- -static void resched_task(struct task_struct *p)
- -{
- -      assert_raw_spin_locked(&task_rq(p)->lock);
- -      set_tsk_need_resched(p);
- -}
- -
- -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
- -{
- -}
- -
- -static void sched_avg_update(struct rq *rq)
- -{
- -}
- -#endif /* CONFIG_SMP */
- -
- -#if BITS_PER_LONG == 32
- -# define WMULT_CONST  (~0UL)
- -#else
- -# define WMULT_CONST  (1UL << 32)
- -#endif
- -
- -#define WMULT_SHIFT   32
- -
- -/*
- - * Shift right and round:
- - */
- -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
- -
- -/*
- - * delta *= weight / lw
- - */
- -static unsigned long
- -calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- -              struct load_weight *lw)
- -{
- -      u64 tmp;
- -
- -      /*
- -       * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- -       * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- -       * 2^SCHED_LOAD_RESOLUTION.
- -       */
- -      if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- -              tmp = (u64)delta_exec * scale_load_down(weight);
- -      else
- -              tmp = (u64)delta_exec;
- -
- -      if (!lw->inv_weight) {
- -              unsigned long w = scale_load_down(lw->weight);
- -
- -              if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- -                      lw->inv_weight = 1;
- -              else if (unlikely(!w))
- -                      lw->inv_weight = WMULT_CONST;
- -              else
- -                      lw->inv_weight = WMULT_CONST / w;
- -      }
- -
- -      /*
- -       * Check whether we'd overflow the 64-bit multiplication:
- -       */
- -      if (unlikely(tmp > WMULT_CONST))
- -              tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- -                      WMULT_SHIFT/2);
- -      else
- -              tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
- -
- -      return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
- -}
- -
- -static inline void update_load_add(struct load_weight *lw, unsigned long inc)
- -{
- -      lw->weight += inc;
- -      lw->inv_weight = 0;
- -}
- -
- -static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
- -{
- -      lw->weight -= dec;
- -      lw->inv_weight = 0;
- -}
- -
- -static inline void update_load_set(struct load_weight *lw, unsigned long w)
+ +static inline bool got_nohz_idle_kick(void)
   {
- -      lw->weight = w;
- -      lw->inv_weight = 0;
+ +      int cpu = smp_processor_id();
+ +      return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
   }
   
- -/*
- - * To aid in avoiding the subversion of "niceness" due to uneven distribution
- - * of tasks with abnormal "nice" values across CPUs the contribution that
- - * each task makes to its run queue's load is weighted according to its
- - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
- - * scaled version of the new time slice allocation that they receive on time
- - * slice expiry etc.
- - */
- -
- -#define WEIGHT_IDLEPRIO                3
- -#define WMULT_IDLEPRIO         1431655765
- -
- -/*
- - * Nice levels are multiplicative, with a gentle 10% change for every
- - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- - * nice 1, it will get ~10% less CPU time than another CPU-bound task
- - * that remained on nice 0.
- - *
- - * The "10% effect" is relative and cumulative: from _any_ nice level,
- - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- - * If a task goes up by ~10% and another task goes down by ~10% then
- - * the relative distance between them is ~25%.)
- - */
- -static const int prio_to_weight[40] = {
- - /* -20 */     88761,     71755,     56483,     46273,     36291,
- - /* -15 */     29154,     23254,     18705,     14949,     11916,
- - /* -10 */      9548,      7620,      6100,      4904,      3906,
- - /*  -5 */      3121,      2501,      1991,      1586,      1277,
- - /*   0 */      1024,       820,       655,       526,       423,
- - /*   5 */       335,       272,       215,       172,       137,
- - /*  10 */       110,        87,        70,        56,        45,
- - /*  15 */        36,        29,        23,        18,        15,
- -};
- -
- -/*
- - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- - *
- - * In cases where the weight does not change often, we can use the
- - * precalculated inverse to speed up arithmetics by turning divisions
- - * into multiplications:
- - */
- -static const u32 prio_to_wmult[40] = {
- - /* -20 */     48388,     59856,     76040,     92818,    118348,
- - /* -15 */    147320,    184698,    229616,    287308,    360437,
- - /* -10 */    449829,    563644,    704093,    875809,   1099582,
- - /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
- - /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
- - /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
- - /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
- - /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
- -};
- -
- -/* Time spent by the tasks of the cpu accounting group executing in ... */
- -enum cpuacct_stat_index {
- -      CPUACCT_STAT_USER,      /* ... user mode */
- -      CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+ +#else /* CONFIG_NO_HZ */
   
- -      CPUACCT_STAT_NSTATS,
- -};
+ +/* !CONFIG_NO_HZ variant: always returns false. */
+ +static inline bool got_nohz_idle_kick(void)
+ +{
+ +      return false;
+ +}
   
- -#ifdef CONFIG_CGROUP_CPUACCT
- -static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
- -static void cpuacct_update_stats(struct task_struct *tsk,
- -              enum cpuacct_stat_index idx, cputime_t val);
- -#else
- -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
- -static inline void cpuacct_update_stats(struct task_struct *tsk,
- -              enum cpuacct_stat_index idx, cputime_t val) {}
- -#endif
+ +#endif /* CONFIG_NO_HZ */
   
- -static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+ +void sched_avg_update(struct rq *rq)
   {
- -      update_load_add(&rq->load, load);
+ +      s64 period = sched_avg_period();
+ +
+ +      while ((s64)(rq->clock - rq->age_stamp) > period) {
+ +              /*
+ +               * Inline assembly required to prevent the compiler
+ +               * optimising this loop into a divmod call.
+ +               * See __iter_div_u64_rem() for another example of this.
+ +               */
+ +              asm("" : "+rm" (rq->age_stamp));
+ +              rq->age_stamp += period;
+ +              rq->rt_avg /= 2;
+ +      }
   }
   
- -static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+ +#else /* !CONFIG_SMP */
+ +void resched_task(struct task_struct *p)
   {
- -      update_load_sub(&rq->load, load);
+ +      assert_raw_spin_locked(&task_rq(p)->lock);
+ +      set_tsk_need_resched(p);
   }
+ +#endif /* CONFIG_SMP */
   
   #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
                         (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
- -typedef int (*tg_visitor)(struct task_group *, void *);
- -
   /*
    * Iterate task_group tree rooted at *from, calling @down when first entering a
    * node and @up when leaving it for the final time.
    *
    * Caller must hold rcu_lock or sufficient equivalent.
    */
- -static int walk_tg_tree_from(struct task_group *from,
+ +int walk_tg_tree_from(struct task_group *from,
                              tg_visitor down, tg_visitor up, void *data)
   {
         struct task_group *parent, *child;
@@@ -683,13 -1657,270 +683,13 @@@ out
         return ret;
   }
   
- -/*
- - * Iterate the full tree, calling @down when first entering a node and @up when
- - * leaving it for the final time.
- - *
- - * Caller must hold rcu_lock or sufficient equivalent.
- - */
- -
- -static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
- -{
- -      return walk_tg_tree_from(&root_task_group, down, up, data);
- -}
- -
- -static int tg_nop(struct task_group *tg, void *data)
- -{
- -      return 0;
- -}
- -#endif
- -
- -#ifdef CONFIG_SMP
- -/* Used instead of source_load when we know the type == 0 */
- -static unsigned long weighted_cpuload(const int cpu)
- -{
- -      return cpu_rq(cpu)->load.weight;
- -}
- -
- -/*
- - * Return a low guess at the load of a migration-source cpu weighted
- - * according to the scheduling class and "nice" value.
- - *
- - * We want to under-estimate the load of migration sources, to
- - * balance conservatively.
- - */
- -static unsigned long source_load(int cpu, int type)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -      unsigned long total = weighted_cpuload(cpu);
- -
- -      if (type == 0 || !sched_feat(LB_BIAS))
- -              return total;
- -
- -      return min(rq->cpu_load[type-1], total);
- -}
- -
- -/*
- - * Return a high guess at the load of a migration-target cpu weighted
- - * according to the scheduling class and "nice" value.
- - */
- -static unsigned long target_load(int cpu, int type)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -      unsigned long total = weighted_cpuload(cpu);
- -
- -      if (type == 0 || !sched_feat(LB_BIAS))
- -              return total;
- -
- -      return max(rq->cpu_load[type-1], total);
- -}
- -
- -static unsigned long power_of(int cpu)
- -{
- -      return cpu_rq(cpu)->cpu_power;
- -}
- -
- -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- -
- -static unsigned long cpu_avg_load_per_task(int cpu)
+ +int tg_nop(struct task_group *tg, void *data)
   {
- -      struct rq *rq = cpu_rq(cpu);
- -      unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
- -
- -      if (nr_running)
- -              return rq->load.weight / nr_running;
- -
         return 0;
   }
- -
- -#ifdef CONFIG_PREEMPT
- -
- -static void double_rq_lock(struct rq *rq1, struct rq *rq2);
- -
- -/*
- - * fair double_lock_balance: Safely acquires both rq->locks in a fair
- - * way at the expense of forcing extra atomic operations in all
- - * invocations.  This assures that the double_lock is acquired using the
- - * same underlying policy as the spinlock_t on this architecture, which
- - * reduces latency compared to the unfair variant below.  However, it
- - * also adds more overhead and therefore may reduce throughput.
- - */
- -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
- -      __releases(this_rq->lock)
- -      __acquires(busiest->lock)
- -      __acquires(this_rq->lock)
- -{
- -      raw_spin_unlock(&this_rq->lock);
- -      double_rq_lock(this_rq, busiest);
- -
- -      return 1;
- -}
- -
- -#else
- -/*
- - * Unfair double_lock_balance: Optimizes throughput at the expense of
- - * latency by eliminating extra atomic operations when the locks are
- - * already in proper order on entry.  This favors lower cpu-ids and will
- - * grant the double lock to lower cpus over higher ids under contention,
- - * regardless of entry order into the function.
- - */
- -static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
- -      __releases(this_rq->lock)
- -      __acquires(busiest->lock)
- -      __acquires(this_rq->lock)
- -{
- -      int ret = 0;
- -
- -      if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- -              if (busiest < this_rq) {
- -                      raw_spin_unlock(&this_rq->lock);
- -                      raw_spin_lock(&busiest->lock);
- -                      raw_spin_lock_nested(&this_rq->lock,
- -                                            SINGLE_DEPTH_NESTING);
- -                      ret = 1;
- -              } else
- -                      raw_spin_lock_nested(&busiest->lock,
- -                                            SINGLE_DEPTH_NESTING);
- -      }
- -      return ret;
- -}
- -
- -#endif /* CONFIG_PREEMPT */
- -
- -/*
- - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- - */
- -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
- -{
- -      if (unlikely(!irqs_disabled())) {
- -              /* printk() doesn't work good under rq->lock */
- -              raw_spin_unlock(&this_rq->lock);
- -              BUG_ON(1);
- -      }
- -
- -      return _double_lock_balance(this_rq, busiest);
- -}
- -
- -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
- -      __releases(busiest->lock)
- -{
- -      raw_spin_unlock(&busiest->lock);
- -      lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
- -}
- -
- -/*
- - * double_rq_lock - safely lock two runqueues
- - *
- - * Note this does not disable interrupts like task_rq_lock,
- - * you need to do so manually before calling.
- - */
- -static void double_rq_lock(struct rq *rq1, struct rq *rq2)
- -      __acquires(rq1->lock)
- -      __acquires(rq2->lock)
- -{
- -      BUG_ON(!irqs_disabled());
- -      if (rq1 == rq2) {
- -              raw_spin_lock(&rq1->lock);
- -              __acquire(rq2->lock);   /* Fake it out ;) */
- -      } else {
- -              if (rq1 < rq2) {
- -                      raw_spin_lock(&rq1->lock);
- -                      raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- -              } else {
- -                      raw_spin_lock(&rq2->lock);
- -                      raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- -              }
- -      }
- -}
- -
- -/*
- - * double_rq_unlock - safely unlock two runqueues
- - *
- - * Note this does not restore interrupts like task_rq_unlock,
- - * you need to do so manually after calling.
- - */
- -static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- -      __releases(rq1->lock)
- -      __releases(rq2->lock)
- -{
- -      raw_spin_unlock(&rq1->lock);
- -      if (rq1 != rq2)
- -              raw_spin_unlock(&rq2->lock);
- -      else
- -              __release(rq2->lock);
- -}
- -
- -#else /* CONFIG_SMP */
- -
- -/*
- - * double_rq_lock - safely lock two runqueues
- - *
- - * Note this does not disable interrupts like task_rq_lock,
- - * you need to do so manually before calling.
- - */
- -static void double_rq_lock(struct rq *rq1, struct rq *rq2)
- -      __acquires(rq1->lock)
- -      __acquires(rq2->lock)
- -{
- -      BUG_ON(!irqs_disabled());
- -      BUG_ON(rq1 != rq2);
- -      raw_spin_lock(&rq1->lock);
- -      __acquire(rq2->lock);   /* Fake it out ;) */
- -}
- -
- -/*
- - * double_rq_unlock - safely unlock two runqueues
- - *
- - * Note this does not restore interrupts like task_rq_unlock,
- - * you need to do so manually after calling.
- - */
- -static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- -      __releases(rq1->lock)
- -      __releases(rq2->lock)
- -{
- -      BUG_ON(rq1 != rq2);
- -      raw_spin_unlock(&rq1->lock);
- -      __release(rq2->lock);
- -}
- -
- -#endif
- -
- -static void calc_load_account_idle(struct rq *this_rq);
- -static void update_sysctl(void);
- -static int get_update_sysctl_factor(void);
- -static void update_cpu_load(struct rq *this_rq);
- -
- -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
- -{
- -      set_task_rq(p, cpu);
- -#ifdef CONFIG_SMP
- -      /*
- -       * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
- -       * successfully executed on another CPU. We must ensure that updates of
- -       * per-task data have been completed by this moment.
- -       */
- -      smp_wmb();
- -      task_thread_info(p)->cpu = cpu;
   #endif
- -}
- -
- -static const struct sched_class rt_sched_class;
- -
- -#define sched_class_highest (&stop_sched_class)
- -#define for_each_class(class) \
- -   for (class = sched_class_highest; class; class = class->next)
- -
- -#include "sched_stats.h"
- -
- -static void inc_nr_running(struct rq *rq)
- -{
- -      rq->nr_running++;
- -}
   
- -static void dec_nr_running(struct rq *rq)
- -{
- -      rq->nr_running--;
- -}
+ +void update_cpu_load(struct rq *this_rq);
   
   static void set_load_weight(struct task_struct *p)
   {
@@@ -726,7 -1957,7 +726,7 @@@ static void dequeue_task(struct rq *rq
   /*
    * activate_task - move a task to the runqueue.
    */
- -static void activate_task(struct rq *rq, struct task_struct *p, int flags)
+ +void activate_task(struct rq *rq, struct task_struct *p, int flags)
   {
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible--;
@@@ -737,7 -1968,7 +737,7 @@@
   /*
    * deactivate_task - remove a task from the runqueue.
    */
- -static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+ +void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
   {
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible++;
@@@ -928,14 -2159,14 +928,14 @@@ static void update_rq_clock_task(struc
   #ifdef CONFIG_IRQ_TIME_ACCOUNTING
   static int irqtime_account_hi_update(void)
   {
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ +      u64 *cpustat = kcpustat_this_cpu->cpustat;
         unsigned long flags;
         u64 latest_ns;
         int ret = 0;
   
         local_irq_save(flags);
         latest_ns = this_cpu_read(cpu_hardirq_time);
- -      if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ +      if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
                 ret = 1;
         local_irq_restore(flags);
         return ret;
@@@ -943,14 -2174,14 +943,14 @@@
   
   static int irqtime_account_si_update(void)
   {
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ +      u64 *cpustat = kcpustat_this_cpu->cpustat;
         unsigned long flags;
         u64 latest_ns;
         int ret = 0;
   
         local_irq_save(flags);
         latest_ns = this_cpu_read(cpu_softirq_time);
- -      if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ +      if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
                 ret = 1;
         local_irq_restore(flags);
         return ret;
@@@ -962,6 -2193,15 +962,6 @@@
   
   #endif
   
- -#include "sched_idletask.c"
- -#include "sched_fair.c"
- -#include "sched_rt.c"
- -#include "sched_autogroup.c"
- -#include "sched_stoptask.c"
- -#ifdef CONFIG_SCHED_DEBUG
- -# include "sched_debug.c"
- -#endif
- -
   void sched_set_stop_task(int cpu, struct task_struct *stop)
   {
         struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@@ -1059,7 -2299,7 +1059,7 @@@ static inline void check_class_changed(
                 p->sched_class->prio_changed(rq, p, oldprio);
   }
   
- -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+ +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
   {
         const struct sched_class *class;
   
@@@ -1085,6 -2325,38 +1085,6 @@@
   }
   
   #ifdef CONFIG_SMP
- -/*
- - * Is this task likely cache-hot:
- - */
- -static int
- -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
- -{
- -      s64 delta;
- -
- -      if (p->sched_class != &fair_sched_class)
- -              return 0;
- -
- -      if (unlikely(p->policy == SCHED_IDLE))
- -              return 0;
- -
- -      /*
- -       * Buddy candidates are cache hot:
- -       */
- -      if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
- -                      (&p->se == cfs_rq_of(&p->se)->next ||
- -                       &p->se == cfs_rq_of(&p->se)->last))
- -              return 1;
- -
- -      if (sysctl_sched_migration_cost == -1)
- -              return 1;
- -      if (sysctl_sched_migration_cost == 0)
- -              return 0;
- -
- -      delta = now - p->se.exec_start;
- -
- -      return delta < (s64)sysctl_sched_migration_cost;
- -}
- -
   void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
   {
   #ifdef CONFIG_SCHED_DEBUG
@@@ -1511,11 -2783,6 +1511,11 @@@ static int ttwu_activate_remote(struct 
   
   }
   #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+ +
+ +/*
+ + * Return nonzero when @this_cpu and @that_cpu have the same per-cpu
+ + * sd_llc_id value, zero otherwise.
+ + * NOTE(review): sd_llc_id presumably identifies the last-level-cache
+ + * domain of a CPU - confirm against its definition.
+ + */
+ +static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+ +{
+ +      return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+ +}
   #endif /* CONFIG_SMP */
   
   static void ttwu_queue(struct task_struct *p, int cpu)
@@@ -1523,7 -2790,7 +1523,7 @@@
         struct rq *rq = cpu_rq(cpu);
   
   #if defined(CONFIG_SMP)
- -      if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ +      if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
                 ttwu_queue_remote(p, cpu);
                 return;
@@@ -1937,7 -3204,6 +1937,7 @@@ static void finish_task_switch(struct r
         local_irq_enable();
   #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
+ +      trace_sched_stat_sleeptime(current, rq->clock);
   
         fire_sched_in_preempt_notifiers(current);
         if (mm)
@@@ -2173,7 -3439,7 +2173,7 @@@ calc_load(unsigned long load, unsigned 
    */
   static atomic_long_t calc_load_tasks_idle;
   
- -static void calc_load_account_idle(struct rq *this_rq)
+ +void calc_load_account_idle(struct rq *this_rq)
   {
         long delta;
   
@@@ -2317,7 -3583,7 +2317,7 @@@ static void calc_global_nohz(unsigned l
          */
   }
   #else
- -static void calc_load_account_idle(struct rq *this_rq)
+ +void calc_load_account_idle(struct rq *this_rq)
   {
   }
   
@@@ -2460,7 -3726,7 +2460,7 @@@ decay_load_missed(unsigned long load, u
    * scheduler tick (TICK_NSEC). With tickless idle this will not be called
    * every tick. We fix it up based on jiffies.
    */
- -static void update_cpu_load(struct rq *this_rq)
+ +void update_cpu_load(struct rq *this_rq)
   {
         unsigned long this_load = this_rq->load.weight;
         unsigned long curr_jiffies = jiffies;
@@@ -2538,10 -3804,8 +2538,10 @@@ unlock
   #endif
   
   DEFINE_PER_CPU(struct kernel_stat, kstat);
+ +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
   
   EXPORT_PER_CPU_SYMBOL(kstat);
+ +EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
   
   /*
    * Return any ns on the sched_clock that have not yet been accounted in
@@@ -2594,42 -3858,6 +2594,42 @@@ unsigned long long task_sched_runtime(s
         return ns;
   }
   
+ +#ifdef CONFIG_CGROUP_CPUACCT
+ +struct cgroup_subsys cpuacct_subsys;
+ +struct cpuacct root_cpuacct;
+ +#endif
+ +
+ +/*
+ + * Add @tmp to slot @index of this CPU's kernel_cpustat.cpustat[].
+ + *
+ + * With CONFIG_CGROUP_CPUACCT, and only while cpuacct_subsys.active is
+ + * set, the same amount is also added to the per-cpu cpustat[] of every
+ + * cpuacct group reached by starting at task_ca(@p) and following
+ + * parent_ca(), stopping before root_cpuacct (or at a NULL parent).
+ + */
+ +static inline void task_group_account_field(struct task_struct *p, int index,
+ +                                          u64 tmp)
+ +{
+ +#ifdef CONFIG_CGROUP_CPUACCT
+ +      struct kernel_cpustat *kcpustat;
+ +      struct cpuacct *ca;
+ +#endif
+ +      /*
+ +       * Since all updates are sure to touch the root cgroup, we
+ +       * get ourselves ahead and touch it first. If the root cgroup
+ +       * is the only cgroup, then nothing else should be necessary.
+ +       */
+ +      __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+ +
+ +#ifdef CONFIG_CGROUP_CPUACCT
+ +      if (unlikely(!cpuacct_subsys.active))
+ +              return;
+ +
+ +      /* The walk up the cpuacct hierarchy runs under rcu_read_lock(). */
+ +      rcu_read_lock();
+ +      ca = task_ca(p);
+ +      while (ca && (ca != &root_cpuacct)) {
+ +              kcpustat = this_cpu_ptr(ca->cpustat);
+ +              kcpustat->cpustat[index] += tmp;
+ +              ca = parent_ca(ca);
+ +      }
+ +      rcu_read_unlock();
+ +#endif
+ +}
+ +
+ +
   /*
    * Account user cpu time to a process.
    * @p: the process that the cpu time gets accounted to
@@@ -2639,18 -3867,22 +2639,18 @@@
   void account_user_time(struct task_struct *p, cputime_t cputime,
                        cputime_t cputime_scaled)
   {
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- -      cputime64_t tmp;
+ +      int index;
   
         /* Add user time to process. */
- -      p->utime = cputime_add(p->utime, cputime);
- -      p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+ +      p->utime += cputime;
+ +      p->utimescaled += cputime_scaled;
         account_group_user_time(p, cputime);
   
+ +      index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+ +
         /* Add user time to cpustat. */
- -      tmp = cputime_to_cputime64(cputime);
- -      if (TASK_NICE(p) > 0)
- -              cpustat->nice = cputime64_add(cpustat->nice, tmp);
- -      else
- -              cpustat->user = cputime64_add(cpustat->user, tmp);
+ +      task_group_account_field(p, index, (__force u64) cputime);
   
- -      cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
   }
@@@ -2664,21 -3896,24 +2664,21 @@@
   static void account_guest_time(struct task_struct *p, cputime_t cputime,
                                cputime_t cputime_scaled)
   {
- -      cputime64_t tmp;
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- -
- -      tmp = cputime_to_cputime64(cputime);
+ +      u64 *cpustat = kcpustat_this_cpu->cpustat;
   
         /* Add guest time to process. */
- -      p->utime = cputime_add(p->utime, cputime);
- -      p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+ +      p->utime += cputime;
+ +      p->utimescaled += cputime_scaled;
         account_group_user_time(p, cputime);
- -      p->gtime = cputime_add(p->gtime, cputime);
+ +      p->gtime += cputime;
   
         /* Add guest time to cpustat. */
         if (TASK_NICE(p) > 0) {
- -              cpustat->nice = cputime64_add(cpustat->nice, tmp);
- -              cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+ +              cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ +              cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
         } else {
- -              cpustat->user = cputime64_add(cpustat->user, tmp);
- -              cpustat->guest = cputime64_add(cpustat->guest, tmp);
+ +              cpustat[CPUTIME_USER] += (__force u64) cputime;
+ +              cpustat[CPUTIME_GUEST] += (__force u64) cputime;
         }
   }
   
@@@ -2691,15 -3926,18 +2691,15 @@@
    */
   static inline
   void __account_system_time(struct task_struct *p, cputime_t cputime,
- -                      cputime_t cputime_scaled, cputime64_t *target_cputime64)
+ +                      cputime_t cputime_scaled, int index)
   {
- -      cputime64_t tmp = cputime_to_cputime64(cputime);
- -
         /* Add system time to process. */
- -      p->stime = cputime_add(p->stime, cputime);
- -      p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ +      p->stime += cputime;
+ +      p->stimescaled += cputime_scaled;
         account_group_system_time(p, cputime);
   
         /* Add system time to cpustat. */
- -      *target_cputime64 = cputime64_add(*target_cputime64, tmp);
- -      cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ +      task_group_account_field(p, index, (__force u64) cputime);
   
         /* Account for system time used */
         acct_update_integrals(p);
@@@ -2715,7 -3953,8 +2715,7 @@@
   void account_system_time(struct task_struct *p, int hardirq_offset,
                          cputime_t cputime, cputime_t cputime_scaled)
   {
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- -      cputime64_t *target_cputime64;
+ +      int index;
   
         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                 account_guest_time(p, cputime, cputime_scaled);
@@@ -2723,13 -3962,13 +2723,13 @@@
         }
   
         if (hardirq_count() - hardirq_offset)
- -              target_cputime64 = &cpustat->irq;
+ +              index = CPUTIME_IRQ;
         else if (in_serving_softirq())
- -              target_cputime64 = &cpustat->softirq;
+ +              index = CPUTIME_SOFTIRQ;
         else
- -              target_cputime64 = &cpustat->system;
+ +              index = CPUTIME_SYSTEM;
   
- -      __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+ +      __account_system_time(p, cputime, cputime_scaled, index);
   }
   
   /*
@@@ -2738,9 -3977,10 +2738,9 @@@
    */
   void account_steal_time(cputime_t cputime)
   {
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- -      cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ +      u64 *cpustat = kcpustat_this_cpu->cpustat;
   
- -      cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+ +      cpustat[CPUTIME_STEAL] += (__force u64) cputime;
   }
   
   /*
@@@ -2749,13 -3989,14 +2749,13 @@@
    */
   void account_idle_time(cputime_t cputime)
   {
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- -      cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ +      u64 *cpustat = kcpustat_this_cpu->cpustat;
         struct rq *rq = this_rq();
   
         if (atomic_read(&rq->nr_iowait) > 0)
- -              cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+ +              cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
         else
- -              cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ +              cpustat[CPUTIME_IDLE] += (__force u64) cputime;
   }
   
   static __always_inline bool steal_account_process_tick(void)
@@@ -2805,15 -4046,16 +2805,15 @@@ static void irqtime_account_process_tic
                                                 struct rq *rq)
   {
         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- -      cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
- -      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ +      u64 *cpustat = kcpustat_this_cpu->cpustat;
   
         if (steal_account_process_tick())
                 return;
   
         if (irqtime_account_hi_update()) {
- -              cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ +              cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
         } else if (irqtime_account_si_update()) {
- -              cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ +              cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
         } else if (this_cpu_ksoftirqd() == p) {
                 /*
                  * ksoftirqd time do not get accounted in cpu_softirq_time.
@@@ -2821,7 -4063,7 +2821,7 @@@
                  * Also, p->stime needs to be updated for ksoftirqd.
                  */
                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- -                                      &cpustat->softirq);
+ +                                      CPUTIME_SOFTIRQ);
         } else if (user_tick) {
                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
         } else if (p == rq->idle) {
@@@ -2830,7 -4072,7 +2830,7 @@@
                 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
         } else {
                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- -                                      &cpustat->system);
+ +                                      CPUTIME_SYSTEM);
         }
   }
   
@@@ -2929,7 -4171,7 +2929,7 @@@ void thread_group_times(struct task_str
   
   void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
   {
- -      cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
+ +      cputime_t rtime, utime = p->utime, total = utime + p->stime;
   
         /*
          * Use CFS's precise accounting:
@@@ -2937,11 -4179,11 +2937,11 @@@
         rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
   
         if (total) {
- -              u64 temp = rtime;
+ +              u64 temp = (__force u64) rtime;
   
- -              temp *= utime;
- -              do_div(temp, total);
- -              utime = (cputime_t)temp;
+ +              temp *= (__force u64) utime;
+ +              do_div(temp, (__force u32) total);
+ +              utime = (__force cputime_t) temp;
         } else
                 utime = rtime;
   
@@@ -2949,7 -4191,7 +2949,7 @@@
          * Compare with previous values, to keep monotonicity:
          */
         p->prev_utime = max(p->prev_utime, utime);
- -      p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
+ +      p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
   
         *ut = p->prev_utime;
         *st = p->prev_stime;
@@@ -2966,20 -4208,21 +2966,20 @@@ void thread_group_times(struct task_str
   
         thread_group_cputime(p, &cputime);
   
- -      total = cputime_add(cputime.utime, cputime.stime);
+ +      total = cputime.utime + cputime.stime;
         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
   
         if (total) {
- -              u64 temp = rtime;
+ +              u64 temp = (__force u64) rtime;
   
- -              temp *= cputime.utime;
- -              do_div(temp, total);
- -              utime = (cputime_t)temp;
+ +              temp *= (__force u64) cputime.utime;
+ +              do_div(temp, (__force u32) total);
+ +              utime = (__force cputime_t) temp;
         } else
                 utime = rtime;
   
         sig->prev_utime = max(sig->prev_utime, utime);
- -      sig->prev_stime = max(sig->prev_stime,
- -                            cputime_sub(rtime, sig->prev_utime));
+ +      sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
   
         *ut = sig->prev_utime;
         *st = sig->prev_stime;
@@@ -3078,9 -4321,6 +3078,9 @@@ static noinline void __schedule_bug(str
   {
         struct pt_regs *regs = get_irq_regs();
   
+ +      if (oops_in_progress)
+ +              return;
+ +
         printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
                 prev->comm, prev->pid, preempt_count());
   
@@@ -4612,13 -5852,6 +4612,13 @@@ again
                  */
                 if (preempt && rq != p_rq)
                         resched_task(p_rq->curr);
+ +      } else {
+ +              /*
+ +               * We might have set it in task_yield_fair(), but are
+ +               * not going to schedule(), so don't want to skip
+ +               * the next update.
+ +               */
+ +              rq->skip_clock_update = 0;
         }
   
   out:
@@@ -4786,7 -6019,7 +4786,7 @@@ void sched_show_task(struct task_struc
         free = stack_not_used(p);
   #endif
         printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- -              task_pid_nr(p), task_pid_nr(p->real_parent),
+ +              task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
                 (unsigned long)task_thread_info(p)->flags);
   
         show_stack(p, NULL);
@@@ -4885,6 -6118,53 +4885,6 @@@ void __cpuinit init_idle(struct task_st
   #endif
   }
   
- -/*
- - * Increase the granularity value when there are more CPUs,
- - * because with more CPUs the 'effective latency' as visible
- - * to users decreases. But the relationship is not linear,
- - * so pick a second-best guess by going with the log2 of the
- - * number of CPUs.
- - *
- - * This idea comes from the SD scheduler of Con Kolivas:
- - */
- -static int get_update_sysctl_factor(void)
- -{
- -      unsigned int cpus = min_t(int, num_online_cpus(), 8);
- -      unsigned int factor;
- -
- -      switch (sysctl_sched_tunable_scaling) {
- -      case SCHED_TUNABLESCALING_NONE:
- -              factor = 1;
- -              break;
- -      case SCHED_TUNABLESCALING_LINEAR:
- -              factor = cpus;
- -              break;
- -      case SCHED_TUNABLESCALING_LOG:
- -      default:
- -              factor = 1 + ilog2(cpus);
- -              break;
- -      }
- -
- -      return factor;
- -}
- -
- -static void update_sysctl(void)
- -{
- -      unsigned int factor = get_update_sysctl_factor();
- -
- -#define SET_SYSCTL(name) \
- -      (sysctl_##name = (factor) * normalized_sysctl_##name)
- -      SET_SYSCTL(sched_min_granularity);
- -      SET_SYSCTL(sched_latency);
- -      SET_SYSCTL(sched_wakeup_granularity);
- -#undef SET_SYSCTL
- -}
- -
- -static inline void sched_init_granularity(void)
- -{
- -      update_sysctl();
- -}
- -
   #ifdef CONFIG_SMP
   void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
   {
@@@ -5062,14 -6342,38 +5062,14 @@@ static void migrate_nr_uninterruptible(
         rq_src->nr_uninterruptible = 0;
   }
   
- -/*
- - * remove the tasks which were accounted by rq from calc_load_tasks.
- - */
- -static void calc_global_load_remove(struct rq *rq)
- -{
- -      atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- -      rq->calc_load_active = 0;
- -}
- -
- -#ifdef CONFIG_CFS_BANDWIDTH
- -static void unthrottle_offline_cfs_rqs(struct rq *rq)
- -{
- -      struct cfs_rq *cfs_rq;
- -
- -      for_each_leaf_cfs_rq(rq, cfs_rq) {
- -              struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- -
- -              if (!cfs_rq->runtime_enabled)
- -                      continue;
- -
- -              /*
- -               * clock_task is not advancing so we just need to make sure
- -               * there's some valid quota amount
- -               */
- -              cfs_rq->runtime_remaining = cfs_b->quota;
- -              if (cfs_rq_throttled(cfs_rq))
- -                      unthrottle_cfs_rq(cfs_rq);
- -      }
+ +/*
+ + * remove the tasks which were accounted by rq from calc_load_tasks.
+ + */
+ +static void calc_global_load_remove(struct rq *rq)
+ +{
+ +      atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ +      rq->calc_load_active = 0;
   }
- -#else
- -static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
- -#endif
   
   /*
    * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -5176,7 -6480,7 +5176,7 @@@ static void sd_free_ctl_entry(struct ct
   static void
   set_table_entry(struct ctl_table *entry,
                 const char *procname, void *data, int maxlen,
-               mode_t mode, proc_handler *proc_handler)
+               umode_t mode, proc_handler *proc_handler)
   {
         entry->procname = procname;
         entry->data = data;
@@@ -5676,12 -6980,6 +5676,12 @@@ out
         return -ENOMEM;
   }
   
+ +/*
+ + * By default the system creates a single root-domain with all cpus as
+ + * members (mimicking the global state we have today).
+ + */
+ +struct root_domain def_root_domain;
+ +
   static void init_defrootdomain(void)
   {
         init_rootdomain(&def_root_domain);
@@@ -5753,31 -7051,6 +5753,31 @@@ static void destroy_sched_domains(struc
   }
   
   /*
+ + * Keep a special pointer to the highest sched_domain that has
+ + * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
+ + * allows us to avoid some pointer chasing in select_idle_sibling().
+ + *
+ + * Also keep a unique ID per domain (we use the first cpu number in
+ + * the cpumask of the domain), this allows us to quickly tell if
+ + * two cpus are in the same cache domain, see ttwu_share_cache().
+ + */
+ +DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+ +DEFINE_PER_CPU(int, sd_llc_id);
+ +
+ +static void update_top_cache_domain(int cpu)
+ +{
+ +      struct sched_domain *sd;
+ +      int id = cpu;
+ +
+ +      sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ +      if (sd)
+ +              id = cpumask_first(sched_domain_span(sd));
+ +
+ +      rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ +      per_cpu(sd_llc_id, cpu) = id;
+ +}
+ +
+ +/*
    * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
    * hold the hotplug lock.
    */
@@@ -5816,8 -7089,6 +5816,8 @@@ cpu_attach_domain(struct sched_domain *
         tmp = rq->sd;
         rcu_assign_pointer(rq->sd, sd);
         destroy_sched_domains(tmp, cpu);
+ +
+ +      update_top_cache_domain(cpu);
   }
   
   /* cpus with isolated domains */
@@@ -5977,7 -7248,7 +5977,7 @@@ build_overlap_sched_groups(struct sched
                         continue;
   
                 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- -                              GFP_KERNEL, cpu_to_node(i));
+ +                              GFP_KERNEL, cpu_to_node(cpu));
   
                 if (!sg)
                         goto fail;
@@@ -6115,12 -7386,6 +6115,12 @@@ static void init_sched_groups_power(in
                 return;
   
         update_group_power(sd, cpu);
+ +      atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+ +}
+ +
+ +int __weak arch_sd_sibling_asym_packing(void)
+ +{
+ +       return 0*SD_ASYM_PACKING;
   }
   
   /*
@@@ -6675,52 -7940,54 +6675,52 @@@ static ssize_t sched_power_savings_stor
   }
   
   #ifdef CONFIG_SCHED_MC
- -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
- -                                         struct sysdev_class_attribute *attr,
- -                                         char *page)
+ +static ssize_t sched_mc_power_savings_show(struct device *dev,
+ +                                         struct device_attribute *attr,
+ +                                         char *buf)
   {
- -      return sprintf(page, "%u\n", sched_mc_power_savings);
+ +      return sprintf(buf, "%u\n", sched_mc_power_savings);
   }
- -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
- -                                          struct sysdev_class_attribute *attr,
+ +static ssize_t sched_mc_power_savings_store(struct device *dev,
+ +                                          struct device_attribute *attr,
                                             const char *buf, size_t count)
   {
         return sched_power_savings_store(buf, count, 0);
   }
- -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
- -                       sched_mc_power_savings_show,
- -                       sched_mc_power_savings_store);
+ +static DEVICE_ATTR(sched_mc_power_savings, 0644,
+ +                 sched_mc_power_savings_show,
+ +                 sched_mc_power_savings_store);
   #endif
   
   #ifdef CONFIG_SCHED_SMT
- -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
- -                                          struct sysdev_class_attribute *attr,
- -                                          char *page)
+ +static ssize_t sched_smt_power_savings_show(struct device *dev,
+ +                                          struct device_attribute *attr,
+ +                                          char *buf)
   {
- -      return sprintf(page, "%u\n", sched_smt_power_savings);
+ +      return sprintf(buf, "%u\n", sched_smt_power_savings);
   }
- -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
- -                                           struct sysdev_class_attribute *attr,
+ +static ssize_t sched_smt_power_savings_store(struct device *dev,
+ +                                          struct device_attribute *attr,
                                              const char *buf, size_t count)
   {
         return sched_power_savings_store(buf, count, 1);
   }
- -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+ +static DEVICE_ATTR(sched_smt_power_savings, 0644,
                    sched_smt_power_savings_show,
                    sched_smt_power_savings_store);
   #endif
   
- -int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+ +int __init sched_create_sysfs_power_savings_entries(struct device *dev)
   {
         int err = 0;
   
   #ifdef CONFIG_SCHED_SMT
         if (smt_capable())
- -              err = sysfs_create_file(&cls->kset.kobj,
- -                                      &attr_sched_smt_power_savings.attr);
+ +              err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
   #endif
   #ifdef CONFIG_SCHED_MC
         if (!err && mc_capable())
- -              err = sysfs_create_file(&cls->kset.kobj,
- -                                      &attr_sched_mc_power_savings.attr);
+ +              err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
   #endif
         return err;
   }
@@@ -6756,6 -8023,29 +6756,6 @@@ static int cpuset_cpu_inactive(struct n
         }
   }
   
- -static int update_runtime(struct notifier_block *nfb,
- -                              unsigned long action, void *hcpu)
- -{
- -      int cpu = (int)(long)hcpu;
- -
- -      switch (action) {
- -      case CPU_DOWN_PREPARE:
- -      case CPU_DOWN_PREPARE_FROZEN:
- -              disable_runtime(cpu_rq(cpu));
- -              return NOTIFY_OK;
- -
- -      case CPU_DOWN_FAILED:
- -      case CPU_DOWN_FAILED_FROZEN:
- -      case CPU_ONLINE:
- -      case CPU_ONLINE_FROZEN:
- -              enable_runtime(cpu_rq(cpu));
- -              return NOTIFY_OK;
- -
- -      default:
- -              return NOTIFY_DONE;
- -      }
- -}
- -
   void __init sched_init_smp(void)
   {
         cpumask_var_t non_isolated_cpus;
@@@ -6804,11 -8094,104 +6804,11 @@@ int in_sched_functions(unsigned long ad
                 && addr < (unsigned long)__sched_text_end);
   }
   
- -static void init_cfs_rq(struct cfs_rq *cfs_rq)
- -{
- -      cfs_rq->tasks_timeline = RB_ROOT;
- -      INIT_LIST_HEAD(&cfs_rq->tasks);
- -      cfs_rq->min_vruntime = (u64)(-(1LL << 20));
- -#ifndef CONFIG_64BIT
- -      cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
- -#endif
- -}
- -
- -static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
- -{
- -      struct rt_prio_array *array;
- -      int i;
- -
- -      array = &rt_rq->active;
- -      for (i = 0; i < MAX_RT_PRIO; i++) {
- -              INIT_LIST_HEAD(array->queue + i);
- -              __clear_bit(i, array->bitmap);
- -      }
- -      /* delimiter for bitsearch: */
- -      __set_bit(MAX_RT_PRIO, array->bitmap);
- -
- -#if defined CONFIG_SMP
- -      rt_rq->highest_prio.curr = MAX_RT_PRIO;
- -      rt_rq->highest_prio.next = MAX_RT_PRIO;
- -      rt_rq->rt_nr_migratory = 0;
- -      rt_rq->overloaded = 0;
- -      plist_head_init(&rt_rq->pushable_tasks);
- -#endif
- -
- -      rt_rq->rt_time = 0;
- -      rt_rq->rt_throttled = 0;
- -      rt_rq->rt_runtime = 0;
- -      raw_spin_lock_init(&rt_rq->rt_runtime_lock);
- -}
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- -                              struct sched_entity *se, int cpu,
- -                              struct sched_entity *parent)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -
- -      cfs_rq->tg = tg;
- -      cfs_rq->rq = rq;
- -#ifdef CONFIG_SMP
- -      /* allow initial update_cfs_load() to truncate */
- -      cfs_rq->load_stamp = 1;
- -#endif
- -      init_cfs_rq_runtime(cfs_rq);
- -
- -      tg->cfs_rq[cpu] = cfs_rq;
- -      tg->se[cpu] = se;
- -
- -      /* se could be NULL for root_task_group */
- -      if (!se)
- -              return;
- -
- -      if (!parent)
- -              se->cfs_rq = &rq->cfs;
- -      else
- -              se->cfs_rq = parent->my_q;
- -
- -      se->my_q = cfs_rq;
- -      update_load_set(&se->load, 0);
- -      se->parent = parent;
- -}
+ +#ifdef CONFIG_CGROUP_SCHED
+ +struct task_group root_task_group;
   #endif
   
- -#ifdef CONFIG_RT_GROUP_SCHED
- -static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- -              struct sched_rt_entity *rt_se, int cpu,
- -              struct sched_rt_entity *parent)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -
- -      rt_rq->highest_prio.curr = MAX_RT_PRIO;
- -      rt_rq->rt_nr_boosted = 0;
- -      rt_rq->rq = rq;
- -      rt_rq->tg = tg;
- -
- -      tg->rt_rq[cpu] = rt_rq;
- -      tg->rt_se[cpu] = rt_se;
- -
- -      if (!rt_se)
- -              return;
- -
- -      if (!parent)
- -              rt_se->rt_rq = &rq->rt;
- -      else
- -              rt_se->rt_rq = parent->my_q;
- -
- -      rt_se->my_q = rt_rq;
- -      rt_se->parent = parent;
- -      INIT_LIST_HEAD(&rt_se->run_list);
- -}
- -#endif
+ +DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
   
   void __init sched_init(void)
   {
@@@ -6866,17 -8249,9 +6866,17 @@@
   #ifdef CONFIG_CGROUP_SCHED
         list_add(&root_task_group.list, &task_groups);
         INIT_LIST_HEAD(&root_task_group.children);
+ +      INIT_LIST_HEAD(&root_task_group.siblings);
         autogroup_init(&init_task);
+ +
   #endif /* CONFIG_CGROUP_SCHED */
   
+ +#ifdef CONFIG_CGROUP_CPUACCT
+ +      root_cpuacct.cpustat = &kernel_cpustat;
+ +      root_cpuacct.cpuusage = alloc_percpu(u64);
+ +      /* Too early, not expected to fail */
+ +      BUG_ON(!root_cpuacct.cpuusage);
+ +#endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
   
@@@ -6888,7 -8263,7 +6888,7 @@@
                 init_cfs_rq(&rq->cfs);
                 init_rt_rq(&rq->rt, rq);
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -              root_task_group.shares = root_task_group_load;
+ +              root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                 /*
                  * How much cpu bandwidth does root_task_group get?
@@@ -6938,7 -8313,7 +6938,7 @@@
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq_attach_root(rq, &def_root_domain);
   #ifdef CONFIG_NO_HZ
- -              rq->nohz_balance_kick = 0;
+ +              rq->nohz_flags = 0;
   #endif
   #endif
                 init_rq_hrtick(rq);
@@@ -6951,6 -8326,10 +6951,6 @@@
         INIT_HLIST_HEAD(&init_task.preempt_notifiers);
   #endif
   
- -#ifdef CONFIG_SMP
- -      open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
- -#endif
- -
   #ifdef CONFIG_RT_MUTEXES
         plist_head_init(&init_task.pi_waiters);
   #endif
@@@ -6978,11 -8357,17 +6978,11 @@@
   
   #ifdef CONFIG_SMP
         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
- -#ifdef CONFIG_NO_HZ
- -      zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- -      alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
- -      atomic_set(&nohz.load_balancer, nr_cpu_ids);
- -      atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
- -      atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
- -#endif
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
- -#endif /* SMP */
+ +#endif
+ +      init_sched_fair_class();
   
         scheduler_running = 1;
   }
@@@ -7134,14 -8519,169 +7134,14 @@@ void set_curr_task(int cpu, struct task
   
   #endif
   
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- -static void free_fair_sched_group(struct task_group *tg)
- -{
- -      int i;
- -
- -      destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
- -
- -      for_each_possible_cpu(i) {
- -              if (tg->cfs_rq)
- -                      kfree(tg->cfs_rq[i]);
- -              if (tg->se)
- -                      kfree(tg->se[i]);
- -      }
- -
- -      kfree(tg->cfs_rq);
- -      kfree(tg->se);
- -}
- -
- -static
- -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
- -{
- -      struct cfs_rq *cfs_rq;
- -      struct sched_entity *se;
- -      int i;
- -
- -      tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
- -      if (!tg->cfs_rq)
- -              goto err;
- -      tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
- -      if (!tg->se)
- -              goto err;
- -
- -      tg->shares = NICE_0_LOAD;
- -
- -      init_cfs_bandwidth(tg_cfs_bandwidth(tg));
- -
- -      for_each_possible_cpu(i) {
- -              cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
- -                                    GFP_KERNEL, cpu_to_node(i));
- -              if (!cfs_rq)
- -                      goto err;
- -
- -              se = kzalloc_node(sizeof(struct sched_entity),
- -                                GFP_KERNEL, cpu_to_node(i));
- -              if (!se)
- -                      goto err_free_rq;
- -
- -              init_cfs_rq(cfs_rq);
- -              init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
- -      }
- -
- -      return 1;
- -
- -err_free_rq:
- -      kfree(cfs_rq);
- -err:
- -      return 0;
- -}
- -
- -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -      unsigned long flags;
- -
- -      /*
- -      * Only empty task groups can be destroyed; so we can speculatively
- -      * check on_list without danger of it being re-added.
- -      */
- -      if (!tg->cfs_rq[cpu]->on_list)
- -              return;
- -
- -      raw_spin_lock_irqsave(&rq->lock, flags);
- -      list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- -      raw_spin_unlock_irqrestore(&rq->lock, flags);
- -}
- -#else /* !CONFIG_FAIR_GROUP_SCHED */
- -static inline void free_fair_sched_group(struct task_group *tg)
- -{
- -}
- -
- -static inline
- -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
- -{
- -      return 1;
- -}
- -
- -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
- -{
- -}
- -#endif /* CONFIG_FAIR_GROUP_SCHED */
- -
   #ifdef CONFIG_RT_GROUP_SCHED
- -static void free_rt_sched_group(struct task_group *tg)
- -{
- -      int i;
- -
- -      if (tg->rt_se)
- -              destroy_rt_bandwidth(&tg->rt_bandwidth);
- -
- -      for_each_possible_cpu(i) {
- -              if (tg->rt_rq)
- -                      kfree(tg->rt_rq[i]);
- -              if (tg->rt_se)
- -                      kfree(tg->rt_se[i]);
- -      }
- -
- -      kfree(tg->rt_rq);
- -      kfree(tg->rt_se);
- -}
- -
- -static
- -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
- -{
- -      struct rt_rq *rt_rq;
- -      struct sched_rt_entity *rt_se;
- -      int i;
- -
- -      tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
- -      if (!tg->rt_rq)
- -              goto err;
- -      tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
- -      if (!tg->rt_se)
- -              goto err;
- -
- -      init_rt_bandwidth(&tg->rt_bandwidth,
- -                      ktime_to_ns(def_rt_bandwidth.rt_period), 0);
- -
- -      for_each_possible_cpu(i) {
- -              rt_rq = kzalloc_node(sizeof(struct rt_rq),
- -                                   GFP_KERNEL, cpu_to_node(i));
- -              if (!rt_rq)
- -                      goto err;
- -
- -              rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
- -                                   GFP_KERNEL, cpu_to_node(i));
- -              if (!rt_se)
- -                      goto err_free_rq;
- -
- -              init_rt_rq(rt_rq, cpu_rq(i));
- -              rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- -              init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
- -      }
- -
- -      return 1;
- -
- -err_free_rq:
- -      kfree(rt_rq);
- -err:
- -      return 0;
- -}
   #else /* !CONFIG_RT_GROUP_SCHED */
- -static inline void free_rt_sched_group(struct task_group *tg)
- -{
- -}
- -
- -static inline
- -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
- -{
- -      return 1;
- -}
   #endif /* CONFIG_RT_GROUP_SCHED */
   
   #ifdef CONFIG_CGROUP_SCHED
+ +/* task_group_lock serializes the addition/removal of task groups */
+ +static DEFINE_SPINLOCK(task_group_lock);
+ +
   static void free_sched_group(struct task_group *tg)
   {
         free_fair_sched_group(tg);
@@@ -7247,6 -8787,47 +7247,6 @@@ void sched_move_task(struct task_struc
   #endif /* CONFIG_CGROUP_SCHED */
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -static DEFINE_MUTEX(shares_mutex);
- -
- -int sched_group_set_shares(struct task_group *tg, unsigned long shares)
- -{
- -      int i;
- -      unsigned long flags;
- -
- -      /*
- -       * We can't change the weight of the root cgroup.
- -       */
- -      if (!tg->se[0])
- -              return -EINVAL;
- -
- -      shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- -
- -      mutex_lock(&shares_mutex);
- -      if (tg->shares == shares)
- -              goto done;
- -
- -      tg->shares = shares;
- -      for_each_possible_cpu(i) {
- -              struct rq *rq = cpu_rq(i);
- -              struct sched_entity *se;
- -
- -              se = tg->se[i];
- -              /* Propagate contribution to hierarchy */
- -              raw_spin_lock_irqsave(&rq->lock, flags);
- -              for_each_sched_entity(se)
- -                      update_cfs_shares(group_cfs_rq(se));
- -              raw_spin_unlock_irqrestore(&rq->lock, flags);
- -      }
- -
- -done:
- -      mutex_unlock(&shares_mutex);
- -      return 0;
- -}
- -
- -unsigned long sched_group_shares(struct task_group *tg)
- -{
- -      return tg->shares;
- -}
   #endif
   
   #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@@ -7271,7 -8852,7 +7271,7 @@@ static inline int tg_has_rt_tasks(struc
         struct task_struct *g, *p;
   
         do_each_thread(g, p) {
- -              if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ +              if (rt_task(p) && task_rq(p)->rt.tg == tg)
                         return 1;
         } while_each_thread(g, p);
   
@@@ -7622,8 -9203,8 +7622,8 @@@ static int __cfs_schedulable(struct tas
   
   static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
   {
- -      int i, ret = 0, runtime_enabled;
- -      struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ +      int i, ret = 0, runtime_enabled, runtime_was_enabled;
+ +      struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   
         if (tg == &root_task_group)
                 return -EINVAL;
@@@ -7650,8 -9231,6 +7650,8 @@@
                 goto out_unlock;
   
         runtime_enabled = quota != RUNTIME_INF;
+ +      runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+ +      account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
         raw_spin_lock_irq(&cfs_b->lock);
         cfs_b->period = ns_to_ktime(period);
         cfs_b->quota = quota;
@@@ -7667,13 -9246,13 +7667,13 @@@
   
         for_each_possible_cpu(i) {
                 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
- -              struct rq *rq = rq_of(cfs_rq);
+ +              struct rq *rq = cfs_rq->rq;
   
                 raw_spin_lock_irq(&rq->lock);
                 cfs_rq->runtime_enabled = runtime_enabled;
                 cfs_rq->runtime_remaining = 0;
   
- -              if (cfs_rq_throttled(cfs_rq))
+ +              if (cfs_rq->throttled)
                         unthrottle_cfs_rq(cfs_rq);
                 raw_spin_unlock_irq(&rq->lock);
         }
@@@ -7687,7 -9266,7 +7687,7 @@@ int tg_set_cfs_quota(struct task_group 
   {
         u64 quota, period;
   
- -      period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+ +      period = ktime_to_ns(tg->cfs_bandwidth.period);
         if (cfs_quota_us < 0)
                 quota = RUNTIME_INF;
         else
@@@ -7700,10 -9279,10 +7700,10 @@@ long tg_get_cfs_quota(struct task_grou
   {
         u64 quota_us;
   
- -      if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+ +      if (tg->cfs_bandwidth.quota == RUNTIME_INF)
                 return -1;
   
- -      quota_us = tg_cfs_bandwidth(tg)->quota;
+ +      quota_us = tg->cfs_bandwidth.quota;
         do_div(quota_us, NSEC_PER_USEC);
   
         return quota_us;
@@@ -7714,7 -9293,10 +7714,7 @@@ int tg_set_cfs_period(struct task_grou
         u64 quota, period;
   
         period = (u64)cfs_period_us * NSEC_PER_USEC;
- -      quota = tg_cfs_bandwidth(tg)->quota;
- -
- -      if (period <= 0)
- -              return -EINVAL;
+ +      quota = tg->cfs_bandwidth.quota;
   
         return tg_set_cfs_bandwidth(tg, period, quota);
   }
@@@ -7723,7 -9305,7 +7723,7 @@@ long tg_get_cfs_period(struct task_grou
   {
         u64 cfs_period_us;
   
- -      cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+ +      cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
         do_div(cfs_period_us, NSEC_PER_USEC);
   
         return cfs_period_us;
@@@ -7783,13 -9365,13 +7783,13 @@@ static u64 normalize_cfs_quota(struct t
   static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
   {
         struct cfs_schedulable_data *d = data;
- -      struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ +      struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
         s64 quota = 0, parent_quota = -1;
   
         if (!tg->parent) {
                 quota = RUNTIME_INF;
         } else {
- -              struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+ +              struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
   
                 quota = normalize_cfs_quota(tg, d);
                 parent_quota = parent_b->hierarchal_quota;
@@@ -7833,7 -9415,7 +7833,7 @@@ static int cpu_stats_show(struct cgrou
                 struct cgroup_map_cb *cb)
   {
         struct task_group *tg = cgroup_tg(cgrp);
- -      struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ +      struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   
         cb->fill(cb, "nr_periods", cfs_b->nr_periods);
         cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@@ -7934,16 -9516,38 +7934,16 @@@ struct cgroup_subsys cpu_cgroup_subsys 
    * (balbir@in.ibm.com).
    */
   
- -/* track cpu usage of a group of tasks and its child groups */
- -struct cpuacct {
- -      struct cgroup_subsys_state css;
- -      /* cpuusage holds pointer to a u64-type object on every cpu */
- -      u64 __percpu *cpuusage;
- -      struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
- -      struct cpuacct *parent;
- -};
- -
- -struct cgroup_subsys cpuacct_subsys;
- -
- -/* return cpu accounting group corresponding to this container */
- -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
- -{
- -      return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- -                          struct cpuacct, css);
- -}
- -
- -/* return cpu accounting group to which this task belongs */
- -static inline struct cpuacct *task_ca(struct task_struct *tsk)
- -{
- -      return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- -                          struct cpuacct, css);
- -}
- -
   /* create a new cpu accounting group */
   static struct cgroup_subsys_state *cpuacct_create(
         struct cgroup_subsys *ss, struct cgroup *cgrp)
   {
- -      struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- -      int i;
+ +      struct cpuacct *ca;
   
+ +      if (!cgrp->parent)
+ +              return &root_cpuacct.css;
+ +
+ +      ca = kzalloc(sizeof(*ca), GFP_KERNEL);
         if (!ca)
                 goto out;
   
@@@ -7951,13 -9555,18 +7951,13 @@@
         if (!ca->cpuusage)
                 goto out_free_ca;
   
- -      for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- -              if (percpu_counter_init(&ca->cpustat[i], 0))
- -                      goto out_free_counters;
- -
- -      if (cgrp->parent)
- -              ca->parent = cgroup_ca(cgrp->parent);
+ +      ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ +      if (!ca->cpustat)
+ +              goto out_free_cpuusage;
   
         return &ca->css;
   
- -out_free_counters:
- -      while (--i >= 0)
- -              percpu_counter_destroy(&ca->cpustat[i]);
+ +out_free_cpuusage:
         free_percpu(ca->cpuusage);
   out_free_ca:
         kfree(ca);
@@@ -7970,8 -9579,10 +7970,8 @@@ static voi
   cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
   {
         struct cpuacct *ca = cgroup_ca(cgrp);
- -      int i;
   
- -      for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- -              percpu_counter_destroy(&ca->cpustat[i]);
+ +      free_percpu(ca->cpustat);
         free_percpu(ca->cpuusage);
         kfree(ca);
   }
@@@ -8064,31 -9675,16 +8064,31 @@@ static const char *cpuacct_stat_desc[] 
   };
   
   static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- -              struct cgroup_map_cb *cb)
+ +                            struct cgroup_map_cb *cb)
   {
         struct cpuacct *ca = cgroup_ca(cgrp);
- -      int i;
+ +      int cpu;
+ +      s64 val = 0;
+ +
+ +      for_each_online_cpu(cpu) {
+ +              struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ +              val += kcpustat->cpustat[CPUTIME_USER];
+ +              val += kcpustat->cpustat[CPUTIME_NICE];
+ +      }
+ +      val = cputime64_to_clock_t(val);
+ +      cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
   
- -      for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
- -              s64 val = percpu_counter_read(&ca->cpustat[i]);
- -              val = cputime64_to_clock_t(val);
- -              cb->fill(cb, cpuacct_stat_desc[i], val);
+ +      val = 0;
+ +      for_each_online_cpu(cpu) {
+ +              struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ +              val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ +              val += kcpustat->cpustat[CPUTIME_IRQ];
+ +              val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
         }
+ +
+ +      val = cputime64_to_clock_t(val);
+ +      cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+ +
         return 0;
   }
   
@@@ -8118,7 -9714,7 +8118,7 @@@ static int cpuacct_populate(struct cgro
    *
    * called with rq->lock held.
    */
- -static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+ +void cpuacct_charge(struct task_struct *tsk, u64 cputime)
   {
         struct cpuacct *ca;
         int cpu;
@@@ -8132,7 -9728,7 +8132,7 @@@
   
         ca = task_ca(tsk);
   
- -      for (; ca; ca = ca->parent) {
+ +      for (; ca; ca = parent_ca(ca)) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
@@@ -8140,6 -9736,45 +8140,6 @@@
         rcu_read_unlock();
   }
   
- -/*
- - * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- - * in cputime_t units. As a result, cpuacct_update_stats calls
- - * percpu_counter_add with values large enough to always overflow the
- - * per cpu batch limit causing bad SMP scalability.
- - *
- - * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- - * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- - * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- - */
- -#ifdef CONFIG_SMP
- -#define CPUACCT_BATCH \
- -      min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
- -#else
- -#define CPUACCT_BATCH 0
- -#endif
- -
- -/*
- - * Charge the system/user time to the task's accounting group.
- - */
- -static void cpuacct_update_stats(struct task_struct *tsk,
- -              enum cpuacct_stat_index idx, cputime_t val)
- -{
- -      struct cpuacct *ca;
- -      int batch = CPUACCT_BATCH;
- -
- -      if (unlikely(!cpuacct_subsys.active))
- -              return;
- -
- -      rcu_read_lock();
- -      ca = task_ca(tsk);
- -
- -      do {
- -              __percpu_counter_add(&ca->cpustat[idx], val, batch);
- -              ca = ca->parent;
- -      } while (ca);
- -      rcu_read_unlock();
- -}
- -
   struct cgroup_subsys cpuacct_subsys = {
         .name = "cpuacct",
         .create = cpuacct_create,
diff --combined kernel/trace/trace.c

index 91dc4bc,660b069..a3f1bc5
--- 1/kernel/trace/trace.c
--- 2/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@@ -338,8 -338,7 +338,8 @@@ static DECLARE_WAIT_QUEUE_HEAD(trace_wa
   /* trace_flags holds trace_options default values */
   unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
         TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- -      TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
+ +      TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
+ +      TRACE_ITER_IRQ_INFO;
   
   static int trace_stop_count;
   static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@@ -427,7 -426,6 +427,7 @@@ static const char *trace_options[] = 
         "record-cmd",
         "overwrite",
         "disable_on_free",
+ +      "irq-info",
         NULL
   };
   
@@@ -1845,33 -1843,6 +1845,33 @@@ static void s_stop(struct seq_file *m, 
         trace_event_read_unlock();
   }
   
+ +static void
+ +get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
+ +{
+ +      unsigned long count;
+ +      int cpu;
+ +
+ +      *total = 0;
+ +      *entries = 0;
+ +
+ +      for_each_tracing_cpu(cpu) {
+ +              count = ring_buffer_entries_cpu(tr->buffer, cpu);
+ +              /*
+ +               * If this buffer has skipped entries, then we hold all
+ +               * entries for the trace and we need to ignore the
+ +               * ones before the time stamp.
+ +               */
+ +              if (tr->data[cpu]->skipped_entries) {
+ +                      count -= tr->data[cpu]->skipped_entries;
+ +                      /* total is the same as the entries */
+ +                      *total += count;
+ +              } else
+ +                      *total += count +
+ +                              ring_buffer_overrun_cpu(tr->buffer, cpu);
+ +              *entries += count;
+ +      }
+ +}
+ +
   static void print_lat_help_header(struct seq_file *m)
   {
         seq_puts(m, "#                  _------=> CPU#            \n");
@@@ -1884,35 -1855,12 +1884,35 @@@
         seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");
   }
   
- -static void print_func_help_header(struct seq_file *m)
+ +static void print_event_info(struct trace_array *tr, struct seq_file *m)
+ +{
+ +      unsigned long total;
+ +      unsigned long entries;
+ +
+ +      get_total_entries(tr, &total, &entries);
+ +      seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",
+ +                 entries, total, num_online_cpus());
+ +      seq_puts(m, "#\n");
+ +}
+ +
+ +static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
   {
- -      seq_puts(m, "#           TASK-PID    CPU#    TIMESTAMP  FUNCTION\n");
+ +      print_event_info(tr, m);
+ +      seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");
         seq_puts(m, "#              | |       |          |         |\n");
   }
   
+ +static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
+ +{
+ +      print_event_info(tr, m);
+ +      seq_puts(m, "#                              _-----=> irqs-off\n");
+ +      seq_puts(m, "#                             / _----=> need-resched\n");
+ +      seq_puts(m, "#                            | / _---=> hardirq/softirq\n");
+ +      seq_puts(m, "#                            || / _--=> preempt-depth\n");
+ +      seq_puts(m, "#                            ||| /     delay\n");
+ +      seq_puts(m, "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n");
+ +      seq_puts(m, "#              | |       |   ||||       |         |\n");
+ +}
   
   void
   print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@@ -1921,14 -1869,32 +1921,14 @@@
         struct trace_array *tr = iter->tr;
         struct trace_array_cpu *data = tr->data[tr->cpu];
         struct tracer *type = current_trace;
- -      unsigned long entries = 0;
- -      unsigned long total = 0;
- -      unsigned long count;
+ +      unsigned long entries;
+ +      unsigned long total;
         const char *name = "preemption";
- -      int cpu;
   
         if (type)
                 name = type->name;
   
- -
- -      for_each_tracing_cpu(cpu) {
- -              count = ring_buffer_entries_cpu(tr->buffer, cpu);
- -              /*
- -               * If this buffer has skipped entries, then we hold all
- -               * entries for the trace and we need to ignore the
- -               * ones before the time stamp.
- -               */
- -              if (tr->data[cpu]->skipped_entries) {
- -                      count -= tr->data[cpu]->skipped_entries;
- -                      /* total is the same as the entries */
- -                      total += count;
- -              } else
- -                      total += count +
- -                              ring_buffer_overrun_cpu(tr->buffer, cpu);
- -              entries += count;
- -      }
+ +      get_total_entries(tr, &total, &entries);
   
         seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
                    name, UTS_RELEASE);
@@@ -2174,21 -2140,6 +2174,21 @@@ enum print_line_t print_trace_line(stru
         return print_trace_fmt(iter);
   }
   
+ +void trace_latency_header(struct seq_file *m)
+ +{
+ +      struct trace_iterator *iter = m->private;
+ +
+ +      /* print nothing if the buffers are empty */
+ +      if (trace_empty(iter))
+ +              return;
+ +
+ +      if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ +              print_trace_header(m, iter);
+ +
+ +      if (!(trace_flags & TRACE_ITER_VERBOSE))
+ +              print_lat_help_header(m);
+ +}
+ +
   void trace_default_header(struct seq_file *m)
   {
         struct trace_iterator *iter = m->private;
@@@ -2204,12 -2155,8 +2204,12 @@@
                 if (!(trace_flags & TRACE_ITER_VERBOSE))
                         print_lat_help_header(m);
         } else {
- -              if (!(trace_flags & TRACE_ITER_VERBOSE))
- -                      print_func_help_header(m);
+ +              if (!(trace_flags & TRACE_ITER_VERBOSE)) {
+ +                      if (trace_flags & TRACE_ITER_IRQ_INFO)
+ +                              print_func_help_header_irq(iter->tr, m);
+ +                      else
+ +                              print_func_help_header(iter->tr, m);
+ +              }
         }
   }
   
@@@ -4438,7 -4385,7 +4438,7 @@@ static const struct file_operations tra
   };
   
   struct dentry *trace_create_file(const char *name,
-                                mode_t mode,
+                                umode_t mode,
                                  struct dentry *parent,
                                  void *data,
                                  const struct file_operations *fops)
@@@ -4828,7 -4775,6 +4828,7 @@@ void ftrace_dump(enum ftrace_dump_mode 
   {
         __ftrace_dump(true, oops_dump_mode);
   }
+ +EXPORT_SYMBOL_GPL(ftrace_dump);
   
   __init static int tracer_alloc_buffers(void)
   {
diff --combined kernel/trace/trace.h

index 2c26574,0154c0b..b93ecba
--- 1/kernel/trace/trace.h
--- 2/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@@ -312,7 -312,7 +312,7 @@@ void tracing_reset_current(int cpu)
   void tracing_reset_current_online_cpus(void);
   int tracing_open_generic(struct inode *inode, struct file *filp);
   struct dentry *trace_create_file(const char *name,
-                                mode_t mode,
+                                umode_t mode,
                                  struct dentry *parent,
                                  void *data,
                                  const struct file_operations *fops);
@@@ -370,7 -370,6 +370,7 @@@ void trace_graph_function(struct trace_
                     unsigned long ip,
                     unsigned long parent_ip,
                     unsigned long flags, int pc);
+ +void trace_latency_header(struct seq_file *m);
   void trace_default_header(struct seq_file *m);
   void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
   int trace_empty(struct trace_iterator *iter);
@@@ -655,7 -654,6 +655,7 @@@ enum trace_iterator_flags 
         TRACE_ITER_RECORD_CMD           = 0x100000,
         TRACE_ITER_OVERWRITE            = 0x200000,
         TRACE_ITER_STOP_ON_FREE         = 0x400000,
+ +      TRACE_ITER_IRQ_INFO             = 0x800000,
   };
   
   /*
diff --combined mm/page_alloc.c

index bdc804c,99930ec..f24bc1c
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -181,17 -181,39 +181,17 @@@ static unsigned long __meminitdata nr_k
   static unsigned long __meminitdata nr_all_pages;
   static unsigned long __meminitdata dma_reserve;
   
- -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- -  /*
- -   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
- -   * ranges of memory (RAM) that may be registered with add_active_range().
- -   * Ranges passed to add_active_range() will be merged if possible
- -   * so the number of times add_active_range() can be called is
- -   * related to the number of nodes and the number of holes
- -   */
- -  #ifdef CONFIG_MAX_ACTIVE_REGIONS
- -    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
- -    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
- -  #else
- -    #if MAX_NUMNODES >= 32
- -      /* If there can be many nodes, allow up to 50 holes per node */
- -      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
- -    #else
- -      /* By default, allow up to 256 distinct regions */
- -      #define MAX_ACTIVE_REGIONS 256
- -    #endif
- -  #endif
- -
- -  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
- -  static int __meminitdata nr_nodemap_entries;
- -  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- -  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
- -  static unsigned long __initdata required_kernelcore;
- -  static unsigned long __initdata required_movablecore;
- -  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
- -
- -  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
- -  int movable_zone;
- -  EXPORT_SYMBOL(movable_zone);
- -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ +static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+ +static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+ +static unsigned long __initdata required_kernelcore;
+ +static unsigned long __initdata required_movablecore;
+ +static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+ +
+ +/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+ +int movable_zone;
+ +EXPORT_SYMBOL(movable_zone);
+ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
   
   #if MAX_NUMNODES > 1
   int nr_node_ids __read_mostly = MAX_NUMNODES;
@@@ -684,10 -706,10 +684,10 @@@ void __meminit __free_pages_bootmem(str
                 int loop;
   
                 prefetchw(page);
- -              for (loop = 0; loop < BITS_PER_LONG; loop++) {
+ +              for (loop = 0; loop < (1 << order); loop++) {
                         struct page *p = &page[loop];
   
- -                      if (loop + 1 < BITS_PER_LONG)
+ +                      if (loop + 1 < (1 << order))
                                 prefetchw(p + 1);
                         __ClearPageReserved(p);
                         set_page_count(p, 0);
@@@ -1386,7 -1408,7 +1386,7 @@@ static int should_fail_alloc_page(gfp_
   
   static int __init fail_page_alloc_debugfs(void)
   {
-       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+       umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
         struct dentry *dir;
   
         dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@@ -3715,7 -3737,35 +3715,7 @@@ __meminit int init_currently_empty_zone
         return 0;
   }
   
- -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- -/*
- - * Basic iterator support. Return the first range of PFNs for a node
- - * Note: nid == MAX_NUMNODES returns first region regardless of node
- - */
- -static int __meminit first_active_region_index_in_nid(int nid)
- -{
- -      int i;
- -
- -      for (i = 0; i < nr_nodemap_entries; i++)
- -              if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
- -                      return i;
- -
- -      return -1;
- -}
- -
- -/*
- - * Basic iterator support. Return the next active range of PFNs for a node
- - * Note: nid == MAX_NUMNODES returns next region regardless of node
- - */
- -static int __meminit next_active_region_index_in_nid(int index, int nid)
- -{
- -      for (index = index + 1; index < nr_nodemap_entries; index++)
- -              if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
- -                      return index;
- -
- -      return -1;
- -}
- -
+ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
   #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
   /*
    * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@@ -3725,12 -3775,15 +3725,12 @@@
    */
   int __meminit __early_pfn_to_nid(unsigned long pfn)
   {
- -      int i;
- -
- -      for (i = 0; i < nr_nodemap_entries; i++) {
- -              unsigned long start_pfn = early_node_map[i].start_pfn;
- -              unsigned long end_pfn = early_node_map[i].end_pfn;
+ +      unsigned long start_pfn, end_pfn;
+ +      int i, nid;
   
+ +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                 if (start_pfn <= pfn && pfn < end_pfn)
- -                      return early_node_map[i].nid;
- -      }
+ +                      return nid;
         /* This is a memory hole */
         return -1;
   }
@@@ -3759,6 -3812,11 +3759,6 @@@ bool __meminit early_pfn_in_nid(unsigne
   }
   #endif
   
- -/* Basic iterator support to walk early_node_map[] */
- -#define for_each_active_range_index_in_nid(i, nid) \
- -      for (i = first_active_region_index_in_nid(nid); i != -1; \
- -                              i = next_active_region_index_in_nid(i, nid))
- -
   /**
    * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
    * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@@ -3768,34 -3826,122 +3768,34 @@@
    * add_active_ranges() contain no holes and may be freed, this
    * this function may be used instead of calling free_bootmem() manually.
    */
- -void __init free_bootmem_with_active_regions(int nid,
- -                                              unsigned long max_low_pfn)
- -{
- -      int i;
- -
- -      for_each_active_range_index_in_nid(i, nid) {
- -              unsigned long size_pages = 0;
- -              unsigned long end_pfn = early_node_map[i].end_pfn;
- -
- -              if (early_node_map[i].start_pfn >= max_low_pfn)
- -                      continue;
- -
- -              if (end_pfn > max_low_pfn)
- -                      end_pfn = max_low_pfn;
- -
- -              size_pages = end_pfn - early_node_map[i].start_pfn;
- -              free_bootmem_node(NODE_DATA(early_node_map[i].nid),
- -                              PFN_PHYS(early_node_map[i].start_pfn),
- -                              size_pages << PAGE_SHIFT);
- -      }
- -}
- -
- -#ifdef CONFIG_HAVE_MEMBLOCK
- -/*
- - * Basic iterator support. Return the last range of PFNs for a node
- - * Note: nid == MAX_NUMNODES returns last region regardless of node
- - */
- -static int __meminit last_active_region_index_in_nid(int nid)
+ +void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
   {
- -      int i;
- -
- -      for (i = nr_nodemap_entries - 1; i >= 0; i--)
- -              if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
- -                      return i;
- -
- -      return -1;
- -}
- -
- -/*
- - * Basic iterator support. Return the previous active range of PFNs for a node
- - * Note: nid == MAX_NUMNODES returns next region regardless of node
- - */
- -static int __meminit previous_active_region_index_in_nid(int index, int nid)
- -{
- -      for (index = index - 1; index >= 0; index--)
- -              if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
- -                      return index;
- -
- -      return -1;
- -}
- -
- -#define for_each_active_range_index_in_nid_reverse(i, nid) \
- -      for (i = last_active_region_index_in_nid(nid); i != -1; \
- -                              i = previous_active_region_index_in_nid(i, nid))
- -
- -u64 __init find_memory_core_early(int nid, u64 size, u64 align,
- -                                      u64 goal, u64 limit)
- -{
- -      int i;
- -
- -      /* Need to go over early_node_map to find out good range for node */
- -      for_each_active_range_index_in_nid_reverse(i, nid) {
- -              u64 addr;
- -              u64 ei_start, ei_last;
- -              u64 final_start, final_end;
- -
- -              ei_last = early_node_map[i].end_pfn;
- -              ei_last <<= PAGE_SHIFT;
- -              ei_start = early_node_map[i].start_pfn;
- -              ei_start <<= PAGE_SHIFT;
- -
- -              final_start = max(ei_start, goal);
- -              final_end = min(ei_last, limit);
- -
- -              if (final_start >= final_end)
- -                      continue;
- -
- -              addr = memblock_find_in_range(final_start, final_end, size, align);
+ +      unsigned long start_pfn, end_pfn;
+ +      int i, this_nid;
   
- -              if (addr == MEMBLOCK_ERROR)
- -                      continue;
+ +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
+ +              start_pfn = min(start_pfn, max_low_pfn);
+ +              end_pfn = min(end_pfn, max_low_pfn);
   
- -              return addr;
+ +              if (start_pfn < end_pfn)
+ +                      free_bootmem_node(NODE_DATA(this_nid),
+ +                                        PFN_PHYS(start_pfn),
+ +                                        (end_pfn - start_pfn) << PAGE_SHIFT);
         }
- -
- -      return MEMBLOCK_ERROR;
   }
- -#endif
   
   int __init add_from_early_node_map(struct range *range, int az,
                                    int nr_range, int nid)
   {
+ +      unsigned long start_pfn, end_pfn;
         int i;
- -      u64 start, end;
   
         /* need to go over early_node_map to find out good range for node */
- -      for_each_active_range_index_in_nid(i, nid) {
- -              start = early_node_map[i].start_pfn;
- -              end = early_node_map[i].end_pfn;
- -              nr_range = add_range(range, az, nr_range, start, end);
- -      }
+ +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
+ +              nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
         return nr_range;
   }
   
- -void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
- -{
- -      int i;
- -      int ret;
- -
- -      for_each_active_range_index_in_nid(i, nid) {
- -              ret = work_fn(early_node_map[i].start_pfn,
- -                            early_node_map[i].end_pfn, data);
- -              if (ret)
- -                      break;
- -      }
- -}
   /**
    * sparse_memory_present_with_active_regions - Call memory_present for each active range
    * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@@ -3806,11 -3952,12 +3806,11 @@@
    */
   void __init sparse_memory_present_with_active_regions(int nid)
   {
- -      int i;
+ +      unsigned long start_pfn, end_pfn;
+ +      int i, this_nid;
   
- -      for_each_active_range_index_in_nid(i, nid)
- -              memory_present(early_node_map[i].nid,
- -                              early_node_map[i].start_pfn,
- -                              early_node_map[i].end_pfn);
+ +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
+ +              memory_present(this_nid, start_pfn, end_pfn);
   }
   
   /**
@@@ -3827,15 -3974,13 +3827,15 @@@
   void __meminit get_pfn_range_for_nid(unsigned int nid,
                         unsigned long *start_pfn, unsigned long *end_pfn)
   {
+ +      unsigned long this_start_pfn, this_end_pfn;
         int i;
+ +
         *start_pfn = -1UL;
         *end_pfn = 0;
   
- -      for_each_active_range_index_in_nid(i, nid) {
- -              *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
- -              *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ +      for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+ +              *start_pfn = min(*start_pfn, this_start_pfn);
+ +              *end_pfn = max(*end_pfn, this_end_pfn);
         }
   
         if (*start_pfn == -1UL)
@@@ -3938,16 -4083,46 +3938,16 @@@ unsigned long __meminit __absent_pages_
                                 unsigned long range_start_pfn,
                                 unsigned long range_end_pfn)
   {
- -      int i = 0;
- -      unsigned long prev_end_pfn = 0, hole_pages = 0;
- -      unsigned long start_pfn;
- -
- -      /* Find the end_pfn of the first active range of pfns in the node */
- -      i = first_active_region_index_in_nid(nid);
- -      if (i == -1)
- -              return 0;
- -
- -      prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
- -
- -      /* Account for ranges before physical memory on this node */
- -      if (early_node_map[i].start_pfn > range_start_pfn)
- -              hole_pages = prev_end_pfn - range_start_pfn;
- -
- -      /* Find all holes for the zone within the node */
- -      for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
- -
- -              /* No need to continue if prev_end_pfn is outside the zone */
- -              if (prev_end_pfn >= range_end_pfn)
- -                      break;
- -
- -              /* Make sure the end of the zone is not within the hole */
- -              start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
- -              prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+ +      unsigned long nr_absent = range_end_pfn - range_start_pfn;
+ +      unsigned long start_pfn, end_pfn;
+ +      int i;
   
- -              /* Update the hole size cound and move on */
- -              if (start_pfn > range_start_pfn) {
- -                      BUG_ON(prev_end_pfn > start_pfn);
- -                      hole_pages += start_pfn - prev_end_pfn;
- -              }
- -              prev_end_pfn = early_node_map[i].end_pfn;
+ +      for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ +              start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ +              end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ +              nr_absent -= end_pfn - start_pfn;
         }
- -
- -      /* Account for ranges past physical memory on this node */
- -      if (range_end_pfn > prev_end_pfn)
- -              hole_pages += range_end_pfn -
- -                              max(range_start_pfn, prev_end_pfn);
- -
- -      return hole_pages;
+ +      return nr_absent;
   }
   
   /**
@@@ -3968,14 -4143,14 +3968,14 @@@ static unsigned long __meminit zone_abs
                                         unsigned long zone_type,
                                         unsigned long *ignored)
   {
+ +      unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ +      unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
         unsigned long node_start_pfn, node_end_pfn;
         unsigned long zone_start_pfn, zone_end_pfn;
   
         get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
- -      zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
- -                                                      node_start_pfn);
- -      zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
- -                                                      node_end_pfn);
+ +      zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ +      zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
   
         adjust_zone_range_for_zone_movable(nid, zone_type,
                         node_start_pfn, node_end_pfn,
@@@ -3983,7 -4158,7 +3983,7 @@@
         return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
   }
   
- -#else
+ +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
   static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                         unsigned long zone_type,
                                         unsigned long *zones_size)
@@@ -4001,7 -4176,7 +4001,7 @@@ static inline unsigned long __meminit z
         return zholes_size[zone_type];
   }
   
- -#endif
+ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
   
   static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long *zholes_size)
@@@ -4224,10 -4399,10 +4224,10 @@@ static void __init_refok alloc_node_mem
          */
         if (pgdat == NODE_DATA(0)) {
                 mem_map = NODE_DATA(0)->node_mem_map;
- -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
                         mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
- -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
         }
   #endif
   #endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@@ -4252,7 -4427,7 +4252,7 @@@ void __paginginit free_area_init_node(i
         free_area_init_core(pgdat, zones_size, zholes_size);
   }
   
- -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
   
   #if MAX_NUMNODES > 1
   /*
@@@ -4274,6 -4449,170 +4274,6 @@@ static inline void setup_nr_node_ids(vo
   #endif
   
   /**
- - * add_active_range - Register a range of PFNs backed by physical memory
- - * @nid: The node ID the range resides on
- - * @start_pfn: The start PFN of the available physical memory
- - * @end_pfn: The end PFN of the available physical memory
- - *
- - * These ranges are stored in an early_node_map[] and later used by
- - * free_area_init_nodes() to calculate zone sizes and holes. If the
- - * range spans a memory hole, it is up to the architecture to ensure
- - * the memory is not freed by the bootmem allocator. If possible
- - * the range being registered will be merged with existing ranges.
- - */
- -void __init add_active_range(unsigned int nid, unsigned long start_pfn,
- -                                              unsigned long end_pfn)
- -{
- -      int i;
- -
- -      mminit_dprintk(MMINIT_TRACE, "memory_register",
- -                      "Entering add_active_range(%d, %#lx, %#lx) "
- -                      "%d entries of %d used\n",
- -                      nid, start_pfn, end_pfn,
- -                      nr_nodemap_entries, MAX_ACTIVE_REGIONS);
- -
- -      mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
- -
- -      /* Merge with existing active regions if possible */
- -      for (i = 0; i < nr_nodemap_entries; i++) {
- -              if (early_node_map[i].nid != nid)
- -                      continue;
- -
- -              /* Skip if an existing region covers this new one */
- -              if (start_pfn >= early_node_map[i].start_pfn &&
- -                              end_pfn <= early_node_map[i].end_pfn)
- -                      return;
- -
- -              /* Merge forward if suitable */
- -              if (start_pfn <= early_node_map[i].end_pfn &&
- -                              end_pfn > early_node_map[i].end_pfn) {
- -                      early_node_map[i].end_pfn = end_pfn;
- -                      return;
- -              }
- -
- -              /* Merge backward if suitable */
- -              if (start_pfn < early_node_map[i].start_pfn &&
- -                              end_pfn >= early_node_map[i].start_pfn) {
- -                      early_node_map[i].start_pfn = start_pfn;
- -                      return;
- -              }
- -      }
- -
- -      /* Check that early_node_map is large enough */
- -      if (i >= MAX_ACTIVE_REGIONS) {
- -              printk(KERN_CRIT "More than %d memory regions, truncating\n",
- -                                                      MAX_ACTIVE_REGIONS);
- -              return;
- -      }
- -
- -      early_node_map[i].nid = nid;
- -      early_node_map[i].start_pfn = start_pfn;
- -      early_node_map[i].end_pfn = end_pfn;
- -      nr_nodemap_entries = i + 1;
- -}
- -
- -/**
- - * remove_active_range - Shrink an existing registered range of PFNs
- - * @nid: The node id the range is on that should be shrunk
- - * @start_pfn: The new PFN of the range
- - * @end_pfn: The new PFN of the range
- - *
- - * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- - * The map is kept near the end physical page range that has already been
- - * registered. This function allows an arch to shrink an existing registered
- - * range.
- - */
- -void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
- -                              unsigned long end_pfn)
- -{
- -      int i, j;
- -      int removed = 0;
- -
- -      printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
- -                        nid, start_pfn, end_pfn);
- -
- -      /* Find the old active region end and shrink */
- -      for_each_active_range_index_in_nid(i, nid) {
- -              if (early_node_map[i].start_pfn >= start_pfn &&
- -                  early_node_map[i].end_pfn <= end_pfn) {
- -                      /* clear it */
- -                      early_node_map[i].start_pfn = 0;
- -                      early_node_map[i].end_pfn = 0;
- -                      removed = 1;
- -                      continue;
- -              }
- -              if (early_node_map[i].start_pfn < start_pfn &&
- -                  early_node_map[i].end_pfn > start_pfn) {
- -                      unsigned long temp_end_pfn = early_node_map[i].end_pfn;
- -                      early_node_map[i].end_pfn = start_pfn;
- -                      if (temp_end_pfn > end_pfn)
- -                              add_active_range(nid, end_pfn, temp_end_pfn);
- -                      continue;
- -              }
- -              if (early_node_map[i].start_pfn >= start_pfn &&
- -                  early_node_map[i].end_pfn > end_pfn &&
- -                  early_node_map[i].start_pfn < end_pfn) {
- -                      early_node_map[i].start_pfn = end_pfn;
- -                      continue;
- -              }
- -      }
- -
- -      if (!removed)
- -              return;
- -
- -      /* remove the blank ones */
- -      for (i = nr_nodemap_entries - 1; i > 0; i--) {
- -              if (early_node_map[i].nid != nid)
- -                      continue;
- -              if (early_node_map[i].end_pfn)
- -                      continue;
- -              /* we found it, get rid of it */
- -              for (j = i; j < nr_nodemap_entries - 1; j++)
- -                      memcpy(&early_node_map[j], &early_node_map[j+1],
- -                              sizeof(early_node_map[j]));
- -              j = nr_nodemap_entries - 1;
- -              memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
- -              nr_nodemap_entries--;
- -      }
- -}
- -
- -/**
- - * remove_all_active_ranges - Remove all currently registered regions
- - *
- - * During discovery, it may be found that a table like SRAT is invalid
- - * and an alternative discovery method must be used. This function removes
- - * all currently registered regions.
- - */
- -void __init remove_all_active_ranges(void)
- -{
- -      memset(early_node_map, 0, sizeof(early_node_map));
- -      nr_nodemap_entries = 0;
- -}
- -
- -/* Compare two active node_active_regions */
- -static int __init cmp_node_active_region(const void *a, const void *b)
- -{
- -      struct node_active_region *arange = (struct node_active_region *)a;
- -      struct node_active_region *brange = (struct node_active_region *)b;
- -
- -      /* Done this way to avoid overflows */
- -      if (arange->start_pfn > brange->start_pfn)
- -              return 1;
- -      if (arange->start_pfn < brange->start_pfn)
- -              return -1;
- -
- -      return 0;
- -}
- -
- -/* sort the node_map by start_pfn */
- -void __init sort_node_map(void)
- -{
- -      sort(early_node_map, (size_t)nr_nodemap_entries,
- -                      sizeof(struct node_active_region),
- -                      cmp_node_active_region, NULL);
- -}
- -
- -/**
    * node_map_pfn_alignment - determine the maximum internode alignment
    *
    * This function should be called after node map is populated and sorted.
@@@ -4295,11 -4634,15 +4295,11 @@@
   unsigned long __init node_map_pfn_alignment(void)
   {
         unsigned long accl_mask = 0, last_end = 0;
+ +      unsigned long start, end, mask;
         int last_nid = -1;
- -      int i;
- -
- -      for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
- -              int nid = early_node_map[i].nid;
- -              unsigned long start = early_node_map[i].start_pfn;
- -              unsigned long end = early_node_map[i].end_pfn;
- -              unsigned long mask;
+ +      int i, nid;
   
+ +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
                 if (!start || last_nid < 0 || last_nid == nid) {
                         last_nid = nid;
                         last_end = end;
@@@ -4326,12 -4669,12 +4326,12 @@@
   /* Find the lowest pfn for a node */
   static unsigned long __init find_min_pfn_for_node(int nid)
   {
- -      int i;
         unsigned long min_pfn = ULONG_MAX;
+ +      unsigned long start_pfn;
+ +      int i;
   
- -      /* Assuming a sorted map, the first range found has the starting pfn */
- -      for_each_active_range_index_in_nid(i, nid)
- -              min_pfn = min(min_pfn, early_node_map[i].start_pfn);
+ +      for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
+ +              min_pfn = min(min_pfn, start_pfn);
   
         if (min_pfn == ULONG_MAX) {
                 printk(KERN_WARNING
@@@ -4360,16 -4703,15 +4360,16 @@@ unsigned long __init find_min_pfn_with_
    */
   static unsigned long __init early_calculate_totalpages(void)
   {
- -      int i;
         unsigned long totalpages = 0;
+ +      unsigned long start_pfn, end_pfn;
+ +      int i, nid;
+ +
+ +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ +              unsigned long pages = end_pfn - start_pfn;
   
- -      for (i = 0; i < nr_nodemap_entries; i++) {
- -              unsigned long pages = early_node_map[i].end_pfn -
- -                                              early_node_map[i].start_pfn;
                 totalpages += pages;
                 if (pages)
- -                      node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
+ +                      node_set_state(nid, N_HIGH_MEMORY);
         }
         return totalpages;
   }
@@@ -4424,8 -4766,6 +4424,8 @@@ restart
         /* Spread kernelcore memory as evenly as possible throughout nodes */
         kernelcore_node = required_kernelcore / usable_nodes;
         for_each_node_state(nid, N_HIGH_MEMORY) {
+ +              unsigned long start_pfn, end_pfn;
+ +
                 /*
                  * Recalculate kernelcore_node if the division per node
                  * now exceeds what is necessary to satisfy the requested
@@@ -4442,10 -4782,13 +4442,10 @@@
                 kernelcore_remaining = kernelcore_node;
   
                 /* Go through each range of PFNs within this node */
- -              for_each_active_range_index_in_nid(i, nid) {
- -                      unsigned long start_pfn, end_pfn;
+ +              for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
                         unsigned long size_pages;
   
- -                      start_pfn = max(early_node_map[i].start_pfn,
- -                                              zone_movable_pfn[nid]);
- -                      end_pfn = early_node_map[i].end_pfn;
+ +                      start_pfn = max(start_pfn, zone_movable_pfn[nid]);
                         if (start_pfn >= end_pfn)
                                 continue;
   
@@@ -4547,8 -4890,11 +4547,8 @@@ static void check_for_regular_memory(pg
    */
   void __init free_area_init_nodes(unsigned long *max_zone_pfn)
   {
- -      unsigned long nid;
- -      int i;
- -
- -      /* Sort early_node_map as initialisation assumes it is sorted */
- -      sort_node_map();
+ +      unsigned long start_pfn, end_pfn;
+ +      int i, nid;
   
         /* Record where the zone boundaries are */
         memset(arch_zone_lowest_possible_pfn, 0,
@@@ -4595,9 -4941,11 +4595,9 @@@
         }
   
         /* Print out the early_node_map[] */
- -      printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
- -      for (i = 0; i < nr_nodemap_entries; i++)
- -              printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
- -                                              early_node_map[i].start_pfn,
- -                                              early_node_map[i].end_pfn);
+ +      printk("Early memory PFN ranges\n");
+ +      for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ +              printk("  %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
   
         /* Initialise every node */
         mminit_verify_pageflags_layout();
@@@ -4650,7 -4998,7 +4650,7 @@@ static int __init cmdline_parse_movable
   early_param("kernelcore", cmdline_parse_kernelcore);
   early_param("movablecore", cmdline_parse_movablecore);
   
- -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
   
   /**
    * set_dma_reserve - set the specified number of pages reserved in the first zone
diff --combined net/unix/af_unix.c

index 7cc3d7b,412a99f..aad8fb6
--- 1/net/unix/af_unix.c
--- 2/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@@ -115,10 -115,8 +115,10 @@@
   #include <net/checksum.h>
   #include <linux/security.h>
   
- -static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
- -static DEFINE_SPINLOCK(unix_table_lock);
+ +struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+ +EXPORT_SYMBOL_GPL(unix_socket_table);
+ +DEFINE_SPINLOCK(unix_table_lock);
+ +EXPORT_SYMBOL_GPL(unix_table_lock);
   static atomic_long_t unix_nr_socks;
   
   #define unix_sockets_unbound  (&unix_socket_table[UNIX_HASH_SIZE])
@@@ -174,7 -172,7 +174,7 @@@ static inline int unix_recvq_full(struc
         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
   }
   
- -static struct sock *unix_peer_get(struct sock *s)
+ +struct sock *unix_peer_get(struct sock *s)
   {
         struct sock *peer;
   
@@@ -185,7 -183,6 +185,7 @@@
         unix_state_unlock(s);
         return peer;
   }
+ +EXPORT_SYMBOL_GPL(unix_peer_get);
   
   static inline void unix_release_addr(struct unix_address *addr)
   {
@@@ -850,7 -847,7 +850,7 @@@ static int unix_bind(struct socket *soc
         atomic_set(&addr->refcnt, 1);
   
         if (sun_path[0]) {
-               unsigned int mode;
+               umode_t mode;
                 err = 0;
                 /*
                  * Get the parent directory, calculate the hash for last
@@@ -2065,36 -2062,6 +2065,36 @@@ static int unix_shutdown(struct socket 
         return 0;
   }
   
+ +long unix_inq_len(struct sock *sk)
+ +{
+ +      struct sk_buff *skb;
+ +      long amount = 0;
+ +
+ +      if (sk->sk_state == TCP_LISTEN)
+ +              return -EINVAL;
+ +
+ +      spin_lock(&sk->sk_receive_queue.lock);
+ +      if (sk->sk_type == SOCK_STREAM ||
+ +          sk->sk_type == SOCK_SEQPACKET) {
+ +              skb_queue_walk(&sk->sk_receive_queue, skb)
+ +                      amount += skb->len;
+ +      } else {
+ +              skb = skb_peek(&sk->sk_receive_queue);
+ +              if (skb)
+ +                      amount = skb->len;
+ +      }
+ +      spin_unlock(&sk->sk_receive_queue.lock);
+ +
+ +      return amount;
+ +}
+ +EXPORT_SYMBOL_GPL(unix_inq_len);
+ +
+ +long unix_outq_len(struct sock *sk)
+ +{
+ +      return sk_wmem_alloc_get(sk);
+ +}
+ +EXPORT_SYMBOL_GPL(unix_outq_len);
+ +
   static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
   {
         struct sock *sk = sock->sk;
@@@ -2103,16 -2070,33 +2103,16 @@@
   
         switch (cmd) {
         case SIOCOUTQ:
- -              amount = sk_wmem_alloc_get(sk);
+ +              amount = unix_outq_len(sk);
                 err = put_user(amount, (int __user *)arg);
                 break;
         case SIOCINQ:
- -              {
- -                      struct sk_buff *skb;
- -
- -                      if (sk->sk_state == TCP_LISTEN) {
- -                              err = -EINVAL;
- -                              break;
- -                      }
- -
- -                      spin_lock(&sk->sk_receive_queue.lock);
- -                      if (sk->sk_type == SOCK_STREAM ||
- -                          sk->sk_type == SOCK_SEQPACKET) {
- -                              skb_queue_walk(&sk->sk_receive_queue, skb)
- -                                      amount += skb->len;
- -                      } else {
- -                              skb = skb_peek(&sk->sk_receive_queue);
- -                              if (skb)
- -                                      amount = skb->len;
- -                      }
- -                      spin_unlock(&sk->sk_receive_queue.lock);
+ +              amount = unix_inq_len(sk);
+ +              if (amount < 0)
+ +                      err = amount;
+ +              else
                         err = put_user(amount, (int __user *)arg);
- -                      break;
- -              }
- -
+ +              break;
         default:
                 err = -ENOIOCTLCMD;
                 break;
diff --combined security/selinux/hooks.c

index 86305c2,57546cf..7cd4c3a
--- 1/security/selinux/hooks.c
--- 2/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@@ -1090,7 -1090,7 +1090,7 @@@ static inline u16 socket_type_to_securi
                         return SECCLASS_NETLINK_ROUTE_SOCKET;
                 case NETLINK_FIREWALL:
                         return SECCLASS_NETLINK_FIREWALL_SOCKET;
- -              case NETLINK_INET_DIAG:
+ +              case NETLINK_SOCK_DIAG:
                         return SECCLASS_NETLINK_TCPDIAG_SOCKET;
                 case NETLINK_NFLOG:
                         return SECCLASS_NETLINK_NFLOG_SOCKET;
@@@ -1740,7 -1740,7 +1740,7 @@@ static inline u32 file_mask_to_av(int m
   {
         u32 av = 0;
   
-       if ((mode & S_IFMT) != S_IFDIR) {
+       if (!S_ISDIR(mode)) {
                 if (mask & MAY_EXEC)
                         av |= FILE__EXECUTE;
                 if (mask & MAY_READ)
@@@ -2507,7 -2507,7 +2507,7 @@@ static int selinux_mount(char *dev_name
         const struct cred *cred = current_cred();
   
         if (flags & MS_REMOUNT)
-               return superblock_has_perm(cred, path->mnt->mnt_sb,
+               return superblock_has_perm(cred, path->dentry->d_sb,
                                            FILESYSTEM__REMOUNT, NULL);
         else
                 return path_has_perm(cred, path, FILE__MOUNTON);
@@@ -2598,7 -2598,7 +2598,7 @@@ static int selinux_inode_init_security(
         return 0;
   }
   
- static int selinux_inode_create(struct inode *dir, struct dentry *dentry, int mask)
+ static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
   {
         return may_create(dir, dentry, SECCLASS_FILE);
   }
@@@ -2618,7 -2618,7 +2618,7 @@@ static int selinux_inode_symlink(struc
         return may_create(dir, dentry, SECCLASS_LNK_FILE);
   }
   
- static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, int mask)
+ static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask)
   {
         return may_create(dir, dentry, SECCLASS_DIR);
   }
@@@ -2628,7 -2628,7 +2628,7 @@@ static int selinux_inode_rmdir(struct i
         return may_link(dir, dentry, MAY_RMDIR);
   }
   
- static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+ static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
   {
         return may_create(dir, dentry, inode_mode_to_security_class(mode));
   }
@@@ -3561,20 -3561,19 +3561,20 @@@ static int selinux_parse_skb_ipv6(struc
         u8 nexthdr;
         int ret = -EINVAL, offset;
         struct ipv6hdr _ipv6h, *ip6;
+ +      __be16 frag_off;
   
         offset = skb_network_offset(skb);
         ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
         if (ip6 == NULL)
                 goto out;
   
- -      ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr);
- -      ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr);
+ +      ad->u.net.v6info.saddr = ip6->saddr;
+ +      ad->u.net.v6info.daddr = ip6->daddr;
         ret = 0;
   
         nexthdr = ip6->nexthdr;
         offset += sizeof(_ipv6h);
- -      offset = ipv6_skip_exthdr(skb, offset, &nexthdr);
+ +      offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
         if (offset < 0)
                 goto out;
   
@@@ -3872,7 -3871,7 +3872,7 @@@ static int selinux_socket_bind(struct s
                 if (family == PF_INET)
                         ad.u.net.v4info.saddr = addr4->sin_addr.s_addr;
                 else
- -                      ipv6_addr_copy(&ad.u.net.v6info.saddr, &addr6->sin6_addr);
+ +                      ad.u.net.v6info.saddr = addr6->sin6_addr;
   
                 err = avc_has_perm(sksec->sid, sid,
                                    sksec->sclass, node_perm, &ad);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 8 Jan 2012 20:19:57 +0000 (12:19 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 8 Jan 2012 20:19:57 +0000 (12:19 -0800)
		1	2
Documentation/filesystems/debugfs.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/include/asm/spu.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/include/asm/types.h	patch \|	diff1 \|	diff2 \|	blob \| history
block/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/base/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/base/devtmpfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/cxgbi/libcxgbi.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/staging/iio/adc/ad7192.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/staging/iio/dac/ad5446.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/staging/iio/dds/ad9834.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/usb/class/usblp.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/usb/misc/iowarrior.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/usb/misc/legousbtower.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/cifs/connect.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/debugfs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/minix/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/debugfs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/device.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/usb.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/acct.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/unix/af_unix.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/selinux/hooks.c	patch \|	diff1 \|	diff2 \|	blob \| history