Merge branch 'akpm' (patches from Andrew)
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 6 Mar 2019 18:31:36 +0000 (10:31 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 6 Mar 2019 18:31:36 +0000 (10:31 -0800)
Merge misc updates from Andrew Morton:

 - a few misc things

 - ocfs2 updates

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (159 commits)
  tools/testing/selftests/proc/proc-self-syscall.c: remove duplicate include
  proc: more robust bulk read test
  proc: test /proc/*/maps, smaps, smaps_rollup, statm
  proc: use seq_puts() everywhere
  proc: read kernel cpu stat pointer once
  proc: remove unused argument in proc_pid_lookup()
  fs/proc/thread_self.c: code cleanup for proc_setup_thread_self()
  fs/proc/self.c: code cleanup for proc_setup_self()
  proc: return exit code 4 for skipped tests
  mm,mremap: bail out earlier in mremap_to under map pressure
  mm/sparse: fix a bad comparison
  mm/memory.c: do_fault: avoid usage of stale vm_area_struct
  writeback: fix inode cgroup switching comment
  mm/huge_memory.c: fix "orig_pud" set but not used
  mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC
  mm/memcontrol.c: fix bad line in comment
  mm/cma.c: cma_declare_contiguous: correct err handling
  mm/page_ext.c: fix an imbalance with kmemleak
  mm/compaction: pass pgdat to too_many_isolated() instead of zone
  mm: remove zone_lru_lock() function, access ->lru_lock directly
  ...

MAINTAINERS
fs/proc/stat.c
fs/proc/task_nommu.c
include/linux/sched.h
init/init_task.c
kernel/kthread.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sysctl.c

diff --combined MAINTAINERS
@@@ -1036,26 -1036,26 +1036,26 @@@ F:   drivers/net/appletalk
  F:    net/appletalk/
  
  APPLIED MICRO (APM) X-GENE DEVICE TREE SUPPORT
 -M:    Duc Dang <dhdang@apm.com>
 +M:    Khuong Dinh <khuong@os.amperecomputing.com>
  S:    Supported
  F:    arch/arm64/boot/dts/apm/
  
  APPLIED MICRO (APM) X-GENE SOC EDAC
 -M:    Loc Ho <lho@apm.com>
 +M:    Khuong Dinh <khuong@os.amperecomputing.com>
  S:    Supported
  F:    drivers/edac/xgene_edac.c
  F:    Documentation/devicetree/bindings/edac/apm-xgene-edac.txt
  
  APPLIED MICRO (APM) X-GENE SOC ETHERNET (V2) DRIVER
 -M:    Iyappan Subramanian <isubramanian@apm.com>
 -M:    Keyur Chudgar <kchudgar@apm.com>
 +M:    Iyappan Subramanian <iyappan@os.amperecomputing.com>
 +M:    Keyur Chudgar <keyur@os.amperecomputing.com>
  S:    Supported
  F:    drivers/net/ethernet/apm/xgene-v2/
  
  APPLIED MICRO (APM) X-GENE SOC ETHERNET DRIVER
 -M:    Iyappan Subramanian <isubramanian@apm.com>
 -M:    Keyur Chudgar <kchudgar@apm.com>
 -M:    Quan Nguyen <qnguyen@apm.com>
 +M:    Iyappan Subramanian <iyappan@os.amperecomputing.com>
 +M:    Keyur Chudgar <keyur@os.amperecomputing.com>
 +M:    Quan Nguyen <quan@os.amperecomputing.com>
  S:    Supported
  F:    drivers/net/ethernet/apm/xgene/
  F:    drivers/net/phy/mdio-xgene.c
@@@ -1063,7 -1063,7 +1063,7 @@@ F:      Documentation/devicetree/bindings/ne
  F:    Documentation/devicetree/bindings/net/apm-xgene-mdio.txt
  
  APPLIED MICRO (APM) X-GENE SOC PMU
 -M:    Tai Nguyen <ttnguyen@apm.com>
 +M:    Khuong Dinh <khuong@os.amperecomputing.com>
  S:    Supported
  F:    drivers/perf/xgene_pmu.c
  F:    Documentation/perf/xgene-pmu.txt
@@@ -1371,13 -1371,6 +1371,13 @@@ F:    arch/arm/mach-aspeed
  F:    arch/arm/boot/dts/aspeed-*
  N:    aspeed
  
 +ARM/BITMAIN ARCHITECTURE
 +M:    Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +S:    Maintained
 +F:    arch/arm64/boot/dts/bitmain/
 +F:    Documentation/devicetree/bindings/arm/bitmain.yaml
 +
  ARM/CALXEDA HIGHBANK ARCHITECTURE
  M:    Rob Herring <robh@kernel.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1537,14 -1530,21 +1537,14 @@@ ARM/FREESCALE IMX / MXC ARM ARCHITECTUR
  M:    Shawn Guo <shawnguo@kernel.org>
  M:    Sascha Hauer <s.hauer@pengutronix.de>
  R:    Pengutronix Kernel Team <kernel@pengutronix.de>
 -R:    Fabio Estevam <fabio.estevam@nxp.com>
 +R:    Fabio Estevam <festevam@gmail.com>
  R:    NXP Linux Team <linux-imx@nxp.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git
 -F:    arch/arm/mach-imx/
 -F:    arch/arm/mach-mxs/
 -F:    arch/arm/boot/dts/imx*
 -F:    arch/arm/configs/imx*_defconfig
 -F:    arch/arm64/boot/dts/freescale/imx*
 -F:    drivers/clk/imx/
 -F:    drivers/firmware/imx/
 -F:    drivers/soc/imx/
 -F:    include/linux/firmware/imx/
 -F:    include/soc/imx/
 +N:    imx
 +N:    mxs
 +X:    drivers/media/i2c/
  
  ARM/FREESCALE VYBRID ARM ARCHITECTURE
  M:    Shawn Guo <shawnguo@kernel.org>
@@@ -1947,37 -1947,19 +1947,37 @@@ M:   David Brown <david.brown@linaro.org
  L:    linux-arm-msm@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/soc/qcom/
 +F:    Documentation/devicetree/bindings/*/qcom*
  F:    arch/arm/boot/dts/qcom-*.dts
  F:    arch/arm/boot/dts/qcom-*.dtsi
  F:    arch/arm/mach-qcom/
 -F:    arch/arm64/boot/dts/qcom/*
 +F:    arch/arm64/boot/dts/qcom/
 +F:    drivers/*/qcom/
 +F:    drivers/*/qcom*
 +F:    drivers/*/*/qcom/
 +F:    drivers/*/*/qcom*
 +F:    drivers/*/pm8???-*
 +F:    drivers/bluetooth/btqcomsmd.c
 +F:    drivers/clocksource/timer-qcom.c
 +F:    drivers/extcon/extcon-qcom*
 +F:    drivers/iommu/msm*
  F:    drivers/i2c/busses/i2c-qup.c
 -F:    drivers/clk/qcom/
 -F:    drivers/dma/qcom/
 -F:    drivers/soc/qcom/
 +F:    drivers/i2c/busses/i2c-qcom-geni.c
 +F:    drivers/mfd/ssbi.c
 +F:    drivers/mmc/host/mmci_qcom*
 +F:    drivers/mmc/host/sdhci_msm.c
 +F:    drivers/pci/controller/dwc/pcie-qcom.c
 +F:    drivers/phy/qualcomm/
 +F:    drivers/power/*/msm*
 +F:    drivers/reset/reset-qcom-*
 +F:    drivers/scsi/ufs/ufs-qcom.*
  F:    drivers/spi/spi-qup.c
 +F:    drivers/spi/spi-geni-qcom.c
 +F:    drivers/spi/spi-qcom-qspi.c
  F:    drivers/tty/serial/msm_serial.c
 -F:    drivers/*/pm8???-*
 -F:    drivers/mfd/ssbi.c
 -F:    drivers/firmware/qcom_scm*
 +F:    drivers/usb/dwc3/dwc3-qcom.c
 +F:    include/dt-bindings/*/qcom*
 +F:    include/linux/*/qcom*
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/agross/linux.git
  
  ARM/RADISYS ENP2611 MACHINE SUPPORT
@@@ -2014,7 -1996,7 +2014,7 @@@ Q:      http://patchwork.kernel.org/project/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
  S:    Supported
  F:    arch/arm64/boot/dts/renesas/
 -F:    Documentation/devicetree/bindings/arm/shmobile.txt
 +F:    Documentation/devicetree/bindings/arm/renesas.yaml
  F:    drivers/soc/renesas/
  F:    include/linux/soc/renesas/
  
@@@ -2126,8 -2108,6 +2126,8 @@@ Q:      http://patchwork.kernel.org/project/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/horms/renesas.git next
  S:    Supported
  F:    arch/arm/boot/dts/emev2*
 +F:    arch/arm/boot/dts/gr-peach*
 +F:    arch/arm/boot/dts/iwg20d-q7*
  F:    arch/arm/boot/dts/r7s*
  F:    arch/arm/boot/dts/r8a*
  F:    arch/arm/boot/dts/r9a*
@@@ -2135,7 -2115,7 +2135,7 @@@ F:      arch/arm/boot/dts/sh
  F:    arch/arm/configs/shmobile_defconfig
  F:    arch/arm/include/debug/renesas-scif.S
  F:    arch/arm/mach-shmobile/
 -F:    Documentation/devicetree/bindings/arm/shmobile.txt
 +F:    Documentation/devicetree/bindings/arm/renesas.yaml
  F:    drivers/soc/renesas/
  F:    include/linux/soc/renesas/
  
@@@ -2628,7 -2608,6 +2628,7 @@@ L:      linux-kernel@vger.kernel.or
  S:    Maintained
  F:    arch/*/include/asm/atomic*.h
  F:    include/*/atomic*.h
 +F:    scripts/atomic/
  
  ATTO EXPRESSSAS SAS/SATA RAID SCSI DRIVER
  M:    Bradley Grove <linuxdrivers@attotech.com>
@@@ -9835,6 -9814,14 +9835,14 @@@ F:    kernel/sched/membarrier.
  F:    include/uapi/linux/membarrier.h
  F:    arch/powerpc/include/asm/membarrier.h
  
+ MEMBLOCK
+ M:    Mike Rapoport <rppt@linux.ibm.com>
+ L:    linux-mm@kvack.org
+ S:    Maintained
+ F:    include/linux/memblock.h
+ F:    mm/memblock.c
+ F:    Documentation/core-api/boot-time-mm.rst
+ 
  MEMORY MANAGEMENT
  L:    linux-mm@kvack.org
  W:    http://www.linux-mm.org
@@@ -9908,11 -9895,6 +9916,11 @@@ S:    Maintaine
  F:    drivers/mtd/nand/raw/meson_*
  F:    Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt
  
 +METHODE UDPU SUPPORT
 +M:    Vladimir Vid <vladimir.vid@sartura.hr>
 +S:    Maintained
 +F:    arch/arm64/boot/dts/marvell/armada-3720-uDPU.dts
 +
  MICROBLAZE ARCHITECTURE
  M:    Michal Simek <monstr@monstr.eu>
  W:    http://www.monstr.eu/fdt/
@@@ -10853,12 -10835,6 +10861,12 @@@ F: drivers/power/supply/bq27xxx_battery
  F:    drivers/power/supply/isp1704_charger.c
  F:    drivers/power/supply/rx51_battery.c
  
 +NOLIBC HEADER FILE
 +M:    Willy Tarreau <w@1wt.eu>
 +S:    Maintained
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/wtarreau/nolibc.git
 +F:    tools/include/nolibc/
 +
  NTB AMD DRIVER
  M:    Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
  L:    linux-ntb@googlegroups.com
@@@ -11339,11 -11315,6 +11347,11 @@@ M: Jens Wiklander <jens.wiklander@linar
  S:    Maintained
  F:    drivers/tee/optee/
  
 +OP-TEE RANDOM NUMBER GENERATOR (RNG) DRIVER
 +M:    Sumit Garg <sumit.garg@linaro.org>
 +S:    Maintained
 +F:    drivers/char/hw_random/optee-rng.c
 +
  OPA-VNIC DRIVER
  M:    Dennis Dalessandro <dennis.dalessandro@intel.com>
  M:    Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
@@@ -11643,7 -11614,7 +11651,7 @@@ F:   Documentation/devicetree/bindings/pc
  F:    drivers/pci/controller/pcie-altera.c
  
  PCI DRIVER FOR APPLIEDMICRO XGENE
 -M:    Tanmay Inamdar <tinamdar@apm.com>
 +M:    Toan Le <toan@os.amperecomputing.com>
  L:    linux-pci@vger.kernel.org
  L:    linux-arm-kernel@lists.infradead.org
  S:    Maintained
@@@ -11821,7 -11792,7 +11829,7 @@@ F:   Documentation/devicetree/bindings/pc
  F:    drivers/pci/controller/pcie-altera-msi.c
  
  PCI MSI DRIVER FOR APPLIEDMICRO XGENE
 -M:    Duc Dang <dhdang@apm.com>
 +M:    Toan Le <toan@os.amperecomputing.com>
  L:    linux-pci@vger.kernel.org
  L:    linux-arm-kernel@lists.infradead.org
  S:    Maintained
@@@ -12310,6 -12281,14 +12318,6 @@@ S:  Maintaine
  F:    drivers/net/ppp/pptp.c
  W:    http://sourceforge.net/projects/accel-pptp
  
 -PREEMPTIBLE KERNEL
 -M:    Robert Love <rml@tech9.net>
 -L:    kpreempt-tech@lists.sourceforge.net
 -W:    https://www.kernel.org/pub/linux/kernel/people/rml/preempt-kernel
 -S:    Supported
 -F:    Documentation/preempt-locking.txt
 -F:    include/linux/preempt.h
 -
  PRINTK
  M:    Petr Mladek <pmladek@suse.com>
  M:    Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
@@@ -13047,7 -13026,6 +13055,7 @@@ F:   drivers/reset
  F:    Documentation/devicetree/bindings/reset/
  F:    include/dt-bindings/reset/
  F:    include/linux/reset.h
 +F:    include/linux/reset/
  F:    include/linux/reset-controller.h
  
  RESTARTABLE SEQUENCES SUPPORT
@@@ -13548,7 -13526,6 +13556,7 @@@ F:   kernel/sched
  F:    include/linux/sched.h
  F:    include/uapi/linux/sched.h
  F:    include/linux/wait.h
 +F:    include/linux/preempt.h
  
  SCR24X CHIP CARD INTERFACE DRIVER
  M:    Lubomir Rintel <lkundrak@v3.sk>
@@@ -14790,7 -14767,7 +14798,7 @@@ S:   Maintaine
  F:    drivers/tty/serial/8250/8250_dw.c
  
  SYNOPSYS DESIGNWARE APB GPIO DRIVER
 -M:    Hoan Tran <hotran@apm.com>
 +M:    Hoan Tran <hoan@os.amperecomputing.com>
  L:    linux-gpio@vger.kernel.org
  S:    Maintained
  F:    drivers/gpio/gpio-dwapb.c
diff --combined fs/proc/stat.c
  
  #ifdef arch_idle_time
  
- static u64 get_idle_time(int cpu)
+ static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
  {
        u64 idle;
  
-       idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+       idle = kcs->cpustat[CPUTIME_IDLE];
        if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
                idle += arch_idle_time(cpu);
        return idle;
  }
  
- static u64 get_iowait_time(int cpu)
+ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
  {
        u64 iowait;
  
-       iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+       iowait = kcs->cpustat[CPUTIME_IOWAIT];
        if (cpu_online(cpu) && nr_iowait_cpu(cpu))
                iowait += arch_idle_time(cpu);
        return iowait;
@@@ -45,7 -45,7 +45,7 @@@
  
  #else
  
- static u64 get_idle_time(int cpu)
+ static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
  {
        u64 idle, idle_usecs = -1ULL;
  
  
        if (idle_usecs == -1ULL)
                /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
-               idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+               idle = kcs->cpustat[CPUTIME_IDLE];
        else
                idle = idle_usecs * NSEC_PER_USEC;
  
        return idle;
  }
  
- static u64 get_iowait_time(int cpu)
+ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
  {
        u64 iowait, iowait_usecs = -1ULL;
  
@@@ -70,7 -70,7 +70,7 @@@
  
        if (iowait_usecs == -1ULL)
                /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
-               iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+               iowait = kcs->cpustat[CPUTIME_IOWAIT];
        else
                iowait = iowait_usecs * NSEC_PER_USEC;
  
  
  #endif
  
 +static void show_irq_gap(struct seq_file *p, unsigned int gap)
 +{
 +      static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
 +
 +      while (gap > 0) {
 +              unsigned int inc;
 +
 +              inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
 +              seq_write(p, zeros, 2 * inc);
 +              gap -= inc;
 +      }
 +}
 +
 +static void show_all_irqs(struct seq_file *p)
 +{
 +      unsigned int i, next = 0;
 +
 +      for_each_active_irq(i) {
 +              show_irq_gap(p, i - next);
 +              seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
 +              next = i + 1;
 +      }
 +      show_irq_gap(p, nr_irqs - next);
 +}
 +
  static int show_stat(struct seq_file *p, void *v)
  {
        int i, j;
        getboottime64(&boottime);
  
        for_each_possible_cpu(i) {
-               user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
-               nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-               system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
-               idle += get_idle_time(i);
-               iowait += get_iowait_time(i);
-               irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
-               softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
-               steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
-               guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
-               guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+               struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+               user += kcs->cpustat[CPUTIME_USER];
+               nice += kcs->cpustat[CPUTIME_NICE];
+               system += kcs->cpustat[CPUTIME_SYSTEM];
+               idle += get_idle_time(kcs, i);
+               iowait += get_iowait_time(kcs, i);
+               irq += kcs->cpustat[CPUTIME_IRQ];
+               softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+               steal += kcs->cpustat[CPUTIME_STEAL];
+               guest += kcs->cpustat[CPUTIME_GUEST];
+               guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
                sum += kstat_cpu_irqs_sum(i);
                sum += arch_irq_stat_cpu(i);
  
        seq_putc(p, '\n');
  
        for_each_online_cpu(i) {
+               struct kernel_cpustat *kcs = &kcpustat_cpu(i);
                /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-               user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
-               nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-               system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
-               idle = get_idle_time(i);
-               iowait = get_iowait_time(i);
-               irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
-               softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
-               steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
-               guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
-               guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+               user = kcs->cpustat[CPUTIME_USER];
+               nice = kcs->cpustat[CPUTIME_NICE];
+               system = kcs->cpustat[CPUTIME_SYSTEM];
+               idle = get_idle_time(kcs, i);
+               iowait = get_iowait_time(kcs, i);
+               irq = kcs->cpustat[CPUTIME_IRQ];
+               softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+               steal = kcs->cpustat[CPUTIME_STEAL];
+               guest = kcs->cpustat[CPUTIME_GUEST];
+               guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
                seq_printf(p, "cpu%d", i);
                seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
                seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
        }
        seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
  
 -      /* sum again ? it could be updated? */
 -      for_each_irq_nr(j)
 -              seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
 +      show_all_irqs(p);
  
        seq_printf(p,
                "\nctxt %llu\n"
diff --combined fs/proc/task_nommu.c
@@@ -64,7 -64,7 +64,7 @@@ void task_mem(struct seq_file *m, struc
        else
                bytes += kobjsize(current->files);
  
 -      if (current->sighand && atomic_read(&current->sighand->count) > 1)
 +      if (current->sighand && refcount_read(&current->sighand->count) > 1)
                sbytes += kobjsize(current->sighand);
        else
                bytes += kobjsize(current->sighand);
@@@ -178,7 -178,7 +178,7 @@@ static int nommu_vma_show(struct seq_fi
                seq_file_path(m, file, "");
        } else if (mm && is_stack(vma)) {
                seq_pad(m, ' ');
-               seq_printf(m, "[stack]");
+               seq_puts(m, "[stack]");
        }
  
        seq_putc(m, '\n');
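The seq_puts() change above follows the "proc: use seq_puts() everywhere" commit from the series: for a constant string there is nothing to format, so the cheaper call is preferred. The same reasoning in ordinary stdio terms, as a tiny standalone sketch (userspace analogy, not the seq_file API):

    #include <stdio.h>

    int main(void)
    {
            fputs("[stack]\n", stdout);            /* constant string: no format parsing */
            fprintf(stdout, "pid=%d\n", 1234);     /* formatting only where it is needed */
            return 0;
    }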
diff --combined include/linux/sched.h
@@@ -21,7 -21,6 +21,7 @@@
  #include <linux/seccomp.h>
  #include <linux/nodemask.h>
  #include <linux/rcupdate.h>
 +#include <linux/refcount.h>
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
@@@ -48,6 -47,7 +48,7 @@@ struct pid_namespace
  struct pipe_inode_info;
  struct rcu_node;
  struct reclaim_state;
+ struct capture_control;
  struct robust_list_head;
  struct sched_attr;
  struct sched_param;
@@@ -357,6 -357,12 +358,6 @@@ struct util_est 
   * For cfs_rq, it is the aggregated load_avg of all runnable and
   * blocked sched_entities.
   *
 - * load_avg may also take frequency scaling into account:
 - *
 - *   load_avg = runnable% * scale_load_down(load) * freq%
 - *
 - * where freq% is the CPU frequency normalized to the highest frequency.
 - *
   * [util_avg definition]
   *
   *   util_avg = running% * SCHED_CAPACITY_SCALE
   * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
   * and blocked sched_entities.
   *
 - * util_avg may also factor frequency scaling and CPU capacity scaling:
 +  * load_avg and util_avg don't directly factor in frequency scaling and CPU
 + * capacity scaling. The scaling is done through the rq_clock_pelt that
 + * is used for computing those signals (see update_rq_clock_pelt())
   *
 - *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
 - *
 - * where freq% is the same as above, and capacity% is the CPU capacity
 - * normalized to the greatest capacity (due to uarch differences, etc).
 - *
 - * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
 - * themselves are in the range of [0, 1]. To do fixed point arithmetics,
 - * we therefore scale them to as large a range as necessary. This is for
 - * example reflected by util_avg's SCHED_CAPACITY_SCALE.
 + * N.B., the above ratios (runnable% and running%) themselves are in the
 + * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 + * to as large a range as necessary. This is for example reflected by
 + * util_avg's SCHED_CAPACITY_SCALE.
   *
   * [Overflow issue]
   *
@@@ -599,7 -608,7 +600,7 @@@ struct task_struct 
        randomized_struct_fields_start
  
        void                            *stack;
 -      atomic_t                        usage;
 +      refcount_t                      usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                    flags;
        unsigned int                    ptrace;
  
        struct io_context               *io_context;
  
+ #ifdef CONFIG_COMPACTION
+       struct capture_control          *capture_control;
+ #endif
        /* Ptrace state: */
        unsigned long                   ptrace_message;
        kernel_siginfo_t                *last_siginfo;
  #endif
  #ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
 -      atomic_t                        stack_refcount;
 +      refcount_t                      stack_refcount;
  #endif
  #ifdef CONFIG_LIVEPATCH
        int patch_state;
@@@ -1395,6 -1407,8 +1399,7 @@@ extern struct pid *cad_pid
  #define PF_UMH                        0x02000000      /* I'm an Usermodehelper process */
  #define PF_NO_SETAFFINITY     0x04000000      /* Userland is not allowed to meddle with cpus_allowed */
  #define PF_MCE_EARLY          0x08000000      /* Early kill for mce process policy */
 -#define PF_MUTEX_TESTER               0x20000000      /* Thread belongs to the rt mutex tester */
+ #define PF_MEMALLOC_NOCMA     0x10000000      /* All allocation requests will have __GFP_MOVABLE cleared */
  #define PF_FREEZER_SKIP               0x40000000      /* Freezer should not count it as freezable */
  #define PF_SUSPEND_TASK               0x80000000      /* This thread called freeze_processes() and should not be frozen */
  
@@@ -1444,7 -1458,6 +1449,7 @@@ static inline bool is_percpu_thread(voi
  #define PFA_SPEC_SSB_FORCE_DISABLE    4       /* Speculative Store Bypass force disabled*/
  #define PFA_SPEC_IB_DISABLE           5       /* Indirect branch speculation restricted */
  #define PFA_SPEC_IB_FORCE_DISABLE     6       /* Indirect branch speculation permanently restricted */
 +#define PFA_SPEC_SSB_NOEXEC           7       /* Speculative Store Bypass clear on execve() */
  
  #define TASK_PFA_TEST(name, func)                                     \
        static inline bool task_##func(struct task_struct *p)           \
@@@ -1473,10 -1486,6 +1478,10 @@@ TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ss
  TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
  TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
  
 +TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
 +TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
 +TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)
 +
  TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
  TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
  
@@@ -1744,9 -1753,9 +1749,9 @@@ static __always_inline bool need_resche
  static inline unsigned int task_cpu(const struct task_struct *p)
  {
  #ifdef CONFIG_THREAD_INFO_IN_TASK
 -      return p->cpu;
 +      return READ_ONCE(p->cpu);
  #else
 -      return task_thread_info(p)->cpu;
 +      return READ_ONCE(task_thread_info(p)->cpu);
  #endif
  }
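The last sched.h hunk wraps the task_cpu() load in READ_ONCE() so the compiler is forced to emit a single, untorn read of a field that a remote CPU may update concurrently, rather than merging or repeating the access. A rough userspace illustration of the idiom is below; the READ_ONCE() macro there is a simplified re-definition for the sketch, not the kernel's, and the struct is a stand-in.

    #include <stdio.h>

    /* simplified stand-in for the kernel macro: force one volatile load */
    #define READ_ONCE(x)  (*(volatile typeof(x) *)&(x))

    struct task { int cpu; };

    static int task_cpu(struct task *p)
    {
            /* a single load the compiler may not split, merge or re-issue */
            return READ_ONCE(p->cpu);
    }

    int main(void)
    {
            struct task t = { .cpu = 3 };

            printf("cpu=%d\n", task_cpu(&t));
            return 0;
    }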
  
diff --combined init/init_task.c
@@@ -10,6 -10,7 +10,7 @@@
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/audit.h>
+ #include <linux/numa.h>
  
  #include <asm/pgtable.h>
  #include <linux/uaccess.h>
@@@ -44,7 -45,7 +45,7 @@@ static struct signal_struct init_signal
  };
  
  static struct sighand_struct init_sighand = {
 -      .count          = ATOMIC_INIT(1),
 +      .count          = REFCOUNT_INIT(1),
        .action         = { { { .sa_handler = SIG_DFL, } }, },
        .siglock        = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
        .signalfd_wqh   = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
@@@ -61,11 -62,11 +62,11 @@@ struct task_struct init_tas
  = {
  #ifdef CONFIG_THREAD_INFO_IN_TASK
        .thread_info    = INIT_THREAD_INFO(init_task),
 -      .stack_refcount = ATOMIC_INIT(1),
 +      .stack_refcount = REFCOUNT_INIT(1),
  #endif
        .state          = 0,
        .stack          = init_stack,
 -      .usage          = ATOMIC_INIT(2),
 +      .usage          = REFCOUNT_INIT(2),
        .flags          = PF_KTHREAD,
        .prio           = MAX_PRIO - 20,
        .static_prio    = MAX_PRIO - 20,
        .vtime.state    = VTIME_SYS,
  #endif
  #ifdef CONFIG_NUMA_BALANCING
-       .numa_preferred_nid = -1,
+       .numa_preferred_nid = NUMA_NO_NODE,
        .numa_group     = NULL,
        .numa_faults    = NULL,
  #endif
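Besides the refcount_t initializers, init_task.c switches the preferred-node initializer from a bare -1 to NUMA_NO_NODE from the newly included <linux/numa.h>; kernel/kthread.c and kernel/sched/fair.c below make the same substitution. The constant has the same value, it just names the intent. A toy sketch, assuming nothing beyond standard C (the struct is a stand-in, not task_struct):

    #include <stdio.h>

    #define NUMA_NO_NODE  (-1)   /* same value as the kernel's, but named */

    struct task_stub { int numa_preferred_nid; };

    int main(void)
    {
            struct task_stub init_task = { .numa_preferred_nid = NUMA_NO_NODE };

            if (init_task.numa_preferred_nid == NUMA_NO_NODE)
                    printf("no preferred NUMA node yet\n");
            return 0;
    }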
diff --combined kernel/kthread.c
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/freezer.h>
  #include <linux/ptrace.h>
  #include <linux/uaccess.h>
+ #include <linux/numa.h>
  #include <trace/events/sched.h>
  
  static DEFINE_SPINLOCK(kthread_create_lock);
@@@ -101,12 -102,6 +102,12 @@@ bool kthread_should_stop(void
  }
  EXPORT_SYMBOL(kthread_should_stop);
  
 +bool __kthread_should_park(struct task_struct *k)
 +{
 +      return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
 +}
 +EXPORT_SYMBOL_GPL(__kthread_should_park);
 +
  /**
   * kthread_should_park - should this kthread park now?
   *
   */
  bool kthread_should_park(void)
  {
 -      return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
 +      return __kthread_should_park(current);
  }
  EXPORT_SYMBOL_GPL(kthread_should_park);
  
@@@ -605,7 -600,7 +606,7 @@@ void __kthread_init_worker(struct kthre
                                struct lock_class_key *key)
  {
        memset(worker, 0, sizeof(struct kthread_worker));
 -      spin_lock_init(&worker->lock);
 +      raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);
        INIT_LIST_HEAD(&worker->work_list);
        INIT_LIST_HEAD(&worker->delayed_work_list);
@@@ -647,21 -642,21 +648,21 @@@ repeat
  
        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
 -              spin_lock_irq(&worker->lock);
 +              raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
 -              spin_unlock_irq(&worker->lock);
 +              raw_spin_unlock_irq(&worker->lock);
                return 0;
        }
  
        work = NULL;
 -      spin_lock_irq(&worker->lock);
 +      raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
 -      spin_unlock_irq(&worker->lock);
 +      raw_spin_unlock_irq(&worker->lock);
  
        if (work) {
                __set_current_state(TASK_RUNNING);
@@@ -681,7 -676,7 +682,7 @@@ __kthread_create_worker(int cpu, unsign
  {
        struct kthread_worker *worker;
        struct task_struct *task;
-       int node = -1;
+       int node = NUMA_NO_NODE;
  
        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        if (!worker)
@@@ -818,12 -813,12 +819,12 @@@ bool kthread_queue_work(struct kthread_
        bool ret = false;
        unsigned long flags;
  
 -      spin_lock_irqsave(&worker->lock, flags);
 +      raw_spin_lock_irqsave(&worker->lock, flags);
        if (!queuing_blocked(worker, work)) {
                kthread_insert_work(worker, work, &worker->work_list);
                ret = true;
        }
 -      spin_unlock_irqrestore(&worker->lock, flags);
 +      raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
  }
  EXPORT_SYMBOL_GPL(kthread_queue_work);
@@@ -841,7 -836,6 +842,7 @@@ void kthread_delayed_work_timer_fn(stru
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
 +      unsigned long flags;
  
        /*
         * This might happen when a pending work is reinitialized.
        if (WARN_ON_ONCE(!worker))
                return;
  
 -      spin_lock(&worker->lock);
 +      raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);
  
        list_del_init(&work->node);
        kthread_insert_work(worker, work, &worker->work_list);
  
 -      spin_unlock(&worker->lock);
 +      raw_spin_unlock_irqrestore(&worker->lock, flags);
  }
  EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
  
@@@ -915,14 -909,14 +916,14 @@@ bool kthread_queue_delayed_work(struct 
        unsigned long flags;
        bool ret = false;
  
 -      spin_lock_irqsave(&worker->lock, flags);
 +      raw_spin_lock_irqsave(&worker->lock, flags);
  
        if (!queuing_blocked(worker, work)) {
                __kthread_queue_delayed_work(worker, dwork, delay);
                ret = true;
        }
  
 -      spin_unlock_irqrestore(&worker->lock, flags);
 +      raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
  }
  EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@@ -958,7 -952,7 +959,7 @@@ void kthread_flush_work(struct kthread_
        if (!worker)
                return;
  
 -      spin_lock_irq(&worker->lock);
 +      raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);
  
        else
                noop = true;
  
 -      spin_unlock_irq(&worker->lock);
 +      raw_spin_unlock_irq(&worker->lock);
  
        if (!noop)
                wait_for_completion(&fwork.done);
@@@ -1003,9 -997,9 +1004,9 @@@ static bool __kthread_cancel_work(struc
                 * any queuing is blocked by setting the canceling counter.
                 */
                work->canceling++;
 -              spin_unlock_irqrestore(&worker->lock, *flags);
 +              raw_spin_unlock_irqrestore(&worker->lock, *flags);
                del_timer_sync(&dwork->timer);
 -              spin_lock_irqsave(&worker->lock, *flags);
 +              raw_spin_lock_irqsave(&worker->lock, *flags);
                work->canceling--;
        }
  
@@@ -1052,7 -1046,7 +1053,7 @@@ bool kthread_mod_delayed_work(struct kt
        unsigned long flags;
        int ret = false;
  
 -      spin_lock_irqsave(&worker->lock, flags);
 +      raw_spin_lock_irqsave(&worker->lock, flags);
  
        /* Do not bother with canceling when never queued. */
        if (!work->worker)
  fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
  out:
 -      spin_unlock_irqrestore(&worker->lock, flags);
 +      raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
  }
  EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@@ -1083,7 -1077,7 +1084,7 @@@ static bool __kthread_cancel_work_sync(
        if (!worker)
                goto out;
  
 -      spin_lock_irqsave(&worker->lock, flags);
 +      raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);
  
         * In the meantime, block any queuing by setting the canceling counter.
         */
        work->canceling++;
 -      spin_unlock_irqrestore(&worker->lock, flags);
 +      raw_spin_unlock_irqrestore(&worker->lock, flags);
        kthread_flush_work(work);
 -      spin_lock_irqsave(&worker->lock, flags);
 +      raw_spin_lock_irqsave(&worker->lock, flags);
        work->canceling--;
  
  out_fast:
 -      spin_unlock_irqrestore(&worker->lock, flags);
 +      raw_spin_unlock_irqrestore(&worker->lock, flags);
  out:
        return ret;
  }
diff --combined kernel/sched/core.c
@@@ -107,12 -107,11 +107,12 @@@ struct rq *task_rq_lock(struct task_str
                 *                                      [L] ->on_rq
                 *      RELEASE (rq->lock)
                 *
 -               * If we observe the old CPU in task_rq_lock, the acquire of
 +               * If we observe the old CPU in task_rq_lock(), the acquire of
                 * the old rq->lock will fully serialize against the stores.
                 *
 -               * If we observe the new CPU in task_rq_lock, the acquire will
 -               * pair with the WMB to ensure we must then also see migrating.
 +               * If we observe the new CPU in task_rq_lock(), the address
 +               * dependency headed by '[L] rq = task_rq()' and the acquire
 +               * will pair with the WMB to ensure we then also see migrating.
                 */
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
@@@ -181,7 -180,6 +181,7 @@@ static void update_rq_clock_task(struc
        if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                update_irq_load_avg(rq, irq_delta + steal);
  #endif
 +      update_rq_clock_pelt(rq, delta);
  }
  
  void update_rq_clock(struct rq *rq)
@@@ -398,7 -396,19 +398,7 @@@ static bool set_nr_if_polling(struct ta
  #endif
  #endif
  
 -/**
 - * wake_q_add() - queue a wakeup for 'later' waking.
 - * @head: the wake_q_head to add @task to
 - * @task: the task to queue for 'later' wakeup
 - *
 - * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 - * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 - * instantly.
 - *
 - * This function must be used as-if it were wake_up_process(); IOW the task
 - * must be ready to be woken at this location.
 - */
 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
  {
        struct wake_q_node *node = &task->wake_q;
  
         * state, even in the failed case, an explicit smp_mb() must be used.
         */
        smp_mb__before_atomic();
 -      if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
 -              return;
 -
 -      get_task_struct(task);
 +      if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
 +              return false;
  
        /*
         * The head is context local, there can be no concurrency.
         */
        *head->lastp = node;
        head->lastp = &node->next;
 +      return true;
 +}
 +
 +/**
 + * wake_q_add() - queue a wakeup for 'later' waking.
 + * @head: the wake_q_head to add @task to
 + * @task: the task to queue for 'later' wakeup
 + *
 + * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 + * instantly.
 + *
 + * This function must be used as-if it were wake_up_process(); IOW the task
 + * must be ready to be woken at this location.
 + */
 +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 +{
 +      if (__wake_q_add(head, task))
 +              get_task_struct(task);
 +}
 +
 +/**
 + * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 + * @head: the wake_q_head to add @task to
 + * @task: the task to queue for 'later' wakeup
 + *
 + * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 + * instantly.
 + *
 + * This function must be used as-if it were wake_up_process(); IOW the task
 + * must be ready to be woken at this location.
 + *
 + * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 + * that already hold reference to @task can call the 'safe' version and trust
 + * wake_q to do the right thing depending whether or not the @task is already
 + * queued for wakeup.
 + */
 +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
 +{
 +      if (!__wake_q_add(head, task))
 +              put_task_struct(task);
  }
  
  void wake_up_q(struct wake_q_head *head)
@@@ -958,7 -928,7 +958,7 @@@ static struct rq *move_queued_task(stru
  {
        lockdep_assert_held(&rq->lock);
  
 -      p->on_rq = TASK_ON_RQ_MIGRATING;
 +      WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
        dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
        rq_unlock(rq, rf);
@@@ -2220,6 -2190,9 +2220,9 @@@ static void __sched_fork(unsigned long 
        INIT_HLIST_HEAD(&p->preempt_notifiers);
  #endif
  
+ #ifdef CONFIG_COMPACTION
+       p->capture_control = NULL;
+ #endif
        init_numa_balancing(clone_flags, p);
  }
  
@@@ -2461,7 -2434,7 +2464,7 @@@ void wake_up_new_task(struct task_struc
  #endif
        rq = __task_rq_lock(p, &rf);
        update_rq_clock(rq);
 -      post_init_entity_util_avg(&p->se);
 +      post_init_entity_util_avg(p);
  
        activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@@ -5295,8 -5268,9 +5298,8 @@@ SYSCALL_DEFINE2(sched_rr_get_interval, 
  }
  
  #ifdef CONFIG_COMPAT_32BIT_TIME
 -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
 -                     compat_pid_t, pid,
 -                     struct old_timespec32 __user *, interval)
 +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
 +              struct old_timespec32 __user *, interval)
  {
        struct timespec64 t;
        int retval = sched_rr_get_interval(pid, &t);
@@@ -5896,11 -5870,14 +5899,11 @@@ void __init sched_init_smp(void
        /*
         * There's no userspace yet to cause hotplug operations; hence all the
         * CPU masks are stable and all blatant races in the below code cannot
 -       * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
 -       * but there won't be any contention on it.
 +       * happen.
         */
 -      cpus_read_lock();
        mutex_lock(&sched_domains_mutex);
        sched_init_domains(cpu_active_mask);
        mutex_unlock(&sched_domains_mutex);
 -      cpus_read_unlock();
  
        /* Move init over to a non-isolated CPU */
        if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
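The wake_q changes above split the enqueue into a __wake_q_add() helper that reports whether the task was actually queued. wake_q_add() then takes a task reference only on success, while the new wake_q_add_safe() assumes the caller already handed over a reference and drops it when the cmpxchg loses the race. A small userspace analogue of that reference-balancing split follows; the queue flag and refcount are stand-ins, not the kernel API, and no concurrency is modelled.

    #include <stdbool.h>
    #include <stdio.h>

    struct task_stub { int refs; bool queued; };

    /* report whether we queued it; someone else may have queued it already */
    static bool __queue_add(struct task_stub *t)
    {
            if (t->queued)
                    return false;
            t->queued = true;
            return true;
    }

    /* caller holds no reference for the queue: take one only if we queued */
    static void queue_add(struct task_stub *t)
    {
            if (__queue_add(t))
                    t->refs++;
    }

    /* caller already handed us a reference: drop it if queueing was a no-op */
    static void queue_add_safe(struct task_stub *t)
    {
            if (!__queue_add(t))
                    t->refs--;
    }

    int main(void)
    {
            struct task_stub t = { .refs = 1 };

            queue_add(&t);          /* queued: a reference is taken, refs 1 -> 2    */
            queue_add_safe(&t);     /* already queued: the handed-in ref is dropped */
            printf("refs=%d queued=%d\n", t.refs, (int)t.queued);
            return 0;
    }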
diff --combined kernel/sched/fair.c
@@@ -248,6 -248,13 +248,6 @@@ const struct sched_class fair_sched_cla
   */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -
 -/* cpu runqueue to which this cfs_rq is attached */
 -static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 -{
 -      return cfs_rq->rq;
 -}
 -
  static inline struct task_struct *task_of(struct sched_entity *se)
  {
        SCHED_WARN_ON(!entity_is_task(se));
@@@ -275,103 -282,79 +275,103 @@@ static inline struct cfs_rq *group_cfs_
        return grp->my_q;
  }
  
 -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
 -      if (!cfs_rq->on_list) {
 -              struct rq *rq = rq_of(cfs_rq);
 -              int cpu = cpu_of(rq);
 +      struct rq *rq = rq_of(cfs_rq);
 +      int cpu = cpu_of(rq);
 +
 +      if (cfs_rq->on_list)
 +              return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
 +
 +      cfs_rq->on_list = 1;
 +
 +      /*
 +       * Ensure we either appear before our parent (if already
 +       * enqueued) or force our parent to appear after us when it is
 +       * enqueued. The fact that we always enqueue bottom-up
 +       * reduces this to two cases and a special case for the root
 +       * cfs_rq. Furthermore, it also means that we will always reset
 +       * tmp_alone_branch either when the branch is connected
 +       * to a tree or when we reach the top of the tree
 +       */
 +      if (cfs_rq->tg->parent &&
 +          cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
                /*
 -               * Ensure we either appear before our parent (if already
 -               * enqueued) or force our parent to appear after us when it is
 -               * enqueued. The fact that we always enqueue bottom-up
 -               * reduces this to two cases and a special case for the root
 -               * cfs_rq. Furthermore, it also means that we will always reset
 -               * tmp_alone_branch either when the branch is connected
 -               * to a tree or when we reach the beg of the tree
 +               * If parent is already on the list, we add the child
 +               * just before. Thanks to circular linked property of
 +               * the list, this means to put the child at the tail
 +               * of the list that starts by parent.
                 */
 -              if (cfs_rq->tg->parent &&
 -                  cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 -                      /*
 -                       * If parent is already on the list, we add the child
 -                       * just before. Thanks to circular linked property of
 -                       * the list, this means to put the child at the tail
 -                       * of the list that starts by parent.
 -                       */
 -                      list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 -                              &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 -                      /*
 -                       * The branch is now connected to its tree so we can
 -                       * reset tmp_alone_branch to the beginning of the
 -                       * list.
 -                       */
 -                      rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 -              } else if (!cfs_rq->tg->parent) {
 -                      /*
 -                       * cfs rq without parent should be put
 -                       * at the tail of the list.
 -                       */
 -                      list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 -                              &rq->leaf_cfs_rq_list);
 -                      /*
 -                       * We have reach the beg of a tree so we can reset
 -                       * tmp_alone_branch to the beginning of the list.
 -                       */
 -                      rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 -              } else {
 -                      /*
 -                       * The parent has not already been added so we want to
 -                       * make sure that it will be put after us.
 -                       * tmp_alone_branch points to the beg of the branch
 -                       * where we will add parent.
 -                       */
 -                      list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 -                              rq->tmp_alone_branch);
 -                      /*
 -                       * update tmp_alone_branch to points to the new beg
 -                       * of the branch
 -                       */
 -                      rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 -              }
 +              list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 +                      &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 +              /*
 +               * The branch is now connected to its tree so we can
 +               * reset tmp_alone_branch to the beginning of the
 +               * list.
 +               */
 +              rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 +              return true;
 +      }
  
 -              cfs_rq->on_list = 1;
 +      if (!cfs_rq->tg->parent) {
 +              /*
 +               * cfs rq without parent should be put
 +               * at the tail of the list.
 +               */
 +              list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 +                      &rq->leaf_cfs_rq_list);
 +              /*
 +               * We have reached the top of a tree so we can reset
 +               * tmp_alone_branch to the beginning of the list.
 +               */
 +              rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 +              return true;
        }
 +
 +      /*
 +       * The parent has not already been added so we want to
 +       * make sure that it will be put after us.
 +       * tmp_alone_branch points to the beginning of the branch
 +       * where we will add parent.
 +       */
 +      list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
 +      /*
 +       * update tmp_alone_branch to point to the new beginning
 +       * of the branch
 +       */
 +      rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 +      return false;
  }
  
  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
        if (cfs_rq->on_list) {
 +              struct rq *rq = rq_of(cfs_rq);
 +
 +              /*
 +               * With cfs_rq being unthrottled/throttled during an enqueue,
 +               * it can happen that tmp_alone_branch points to a leaf that
 +               * we finally want to delete. In this case, tmp_alone_branch moves
 +               * to the prev element but it will point to rq->leaf_cfs_rq_list
 +               * at the end of the enqueue.
 +               */
 +              if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
 +                      rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
 +
                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
                cfs_rq->on_list = 0;
        }
  }
  
 -/* Iterate through all leaf cfs_rq's on a runqueue: */
 -#define for_each_leaf_cfs_rq(rq, cfs_rq) \
 -      list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 +static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 +{
 +      SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 +}
 +
 +/* Iterate through all leaf cfs_rq's on a runqueue */
 +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                    \
 +      list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
 +                               leaf_cfs_rq_list)
  
  /* Do the two (enqueued) entities belong to the same group ? */
  static inline struct cfs_rq *
@@@ -427,6 -410,12 +427,6 @@@ static inline struct task_struct *task_
        return container_of(se, struct task_struct, se);
  }
  
 -static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 -{
 -      return container_of(cfs_rq, struct rq, cfs);
 -}
 -
 -
  #define for_each_sched_entity(se) \
                for (; se; se = NULL)
  
@@@ -449,21 -438,16 +449,21 @@@ static inline struct cfs_rq *group_cfs_
        return NULL;
  }
  
 -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
 +      return true;
  }
  
  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
  }
  
 -#define for_each_leaf_cfs_rq(rq, cfs_rq)      \
 -              for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 +static inline void assert_list_leaf_cfs_rq(struct rq *rq)
 +{
 +}
 +
 +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)    \
 +              for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
@@@ -702,8 -686,9 +702,8 @@@ static u64 sched_vslice(struct cfs_rq *
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
 -#ifdef CONFIG_SMP
  #include "pelt.h"
 -#include "sched-pelt.h"
 +#ifdef CONFIG_SMP
  
  static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
@@@ -759,9 -744,8 +759,9 @@@ static void attach_entity_cfs_rq(struc
   * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
   * if util_avg > util_avg_cap.
   */
 -void post_init_entity_util_avg(struct sched_entity *se)
 +void post_init_entity_util_avg(struct task_struct *p)
  {
 +      struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
        long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
                }
        }
  
 -      if (entity_is_task(se)) {
 -              struct task_struct *p = task_of(se);
 -              if (p->sched_class != &fair_sched_class) {
 -                      /*
 -                       * For !fair tasks do:
 -                       *
 -                      update_cfs_rq_load_avg(now, cfs_rq);
 -                      attach_entity_load_avg(cfs_rq, se, 0);
 -                      switched_from_fair(rq, p);
 -                       *
 -                       * such that the next switched_to_fair() has the
 -                       * expected state.
 -                       */
 -                      se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
 -                      return;
 -              }
 +      if (p->sched_class != &fair_sched_class) {
 +              /*
 +               * For !fair tasks do:
 +               *
 +              update_cfs_rq_load_avg(now, cfs_rq);
 +              attach_entity_load_avg(cfs_rq, se, 0);
 +              switched_from_fair(rq, p);
 +               *
 +               * such that the next switched_to_fair() has the
 +               * expected state.
 +               */
 +              se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 +              return;
        }
  
        attach_entity_cfs_rq(se);
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
 -void post_init_entity_util_avg(struct sched_entity *se)
 +void post_init_entity_util_avg(struct task_struct *p)
  {
  }
  static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@@ -1048,7 -1035,7 +1048,7 @@@ unsigned int sysctl_numa_balancing_scan
  unsigned int sysctl_numa_balancing_scan_delay = 1000;
  
  struct numa_group {
 -      atomic_t refcount;
 +      refcount_t refcount;
  
        spinlock_t lock; /* nr_tasks, tasks */
        int nr_tasks;
@@@ -1117,7 -1104,7 +1117,7 @@@ static unsigned int task_scan_start(str
                unsigned long shared = group_faults_shared(ng);
                unsigned long private = group_faults_priv(ng);
  
 -              period *= atomic_read(&ng->refcount);
 +              period *= refcount_read(&ng->refcount);
                period *= shared + 1;
                period /= private + shared + 1;
        }
@@@ -1140,7 -1127,7 +1140,7 @@@ static unsigned int task_scan_max(struc
                unsigned long private = group_faults_priv(ng);
                unsigned long period = smax;
  
 -              period *= atomic_read(&ng->refcount);
 +              period *= refcount_read(&ng->refcount);
                period *= shared + 1;
                period /= private + shared + 1;
  
@@@ -1173,7 -1160,7 +1173,7 @@@ void init_numa_balancing(unsigned long 
  
        /* New address space, reset the preferred nid */
        if (!(clone_flags & CLONE_VM)) {
-               p->numa_preferred_nid = -1;
+               p->numa_preferred_nid = NUMA_NO_NODE;
                return;
        }
  
  
  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
-       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+       rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
        rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
  }
  
  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
  {
-       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
        rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
  }
  
@@@ -1413,7 -1400,7 +1413,7 @@@ bool should_numa_migrate_memory(struct 
         * two full passes of the "multi-stage node selection" test that is
         * executed below.
         */
-       if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+       if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
            (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
                return true;
  
@@@ -1861,7 -1848,7 +1861,7 @@@ static void numa_migrate_preferred(stru
        unsigned long interval = HZ;
  
        /* This task has no NUMA fault statistics yet */
-       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+       if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
                return;
  
        /* Periodically retry migrating the task to the preferred node */
@@@ -2108,7 -2095,7 +2108,7 @@@ static int preferred_group_nid(struct t
  
  static void task_numa_placement(struct task_struct *p)
  {
-       int seq, nid, max_nid = -1;
+       int seq, nid, max_nid = NUMA_NO_NODE;
        unsigned long max_faults = 0;
        unsigned long fault_types[2] = { 0, 0 };
        unsigned long total_faults;
  
  static inline int get_numa_group(struct numa_group *grp)
  {
 -      return atomic_inc_not_zero(&grp->refcount);
 +      return refcount_inc_not_zero(&grp->refcount);
  }
  
  static inline void put_numa_group(struct numa_group *grp)
  {
 -      if (atomic_dec_and_test(&grp->refcount))
 +      if (refcount_dec_and_test(&grp->refcount))
                kfree_rcu(grp, rcu);
  }
  
@@@ -2242,7 -2229,7 +2242,7 @@@ static void task_numa_group(struct task
                if (!grp)
                        return;
  
 -              atomic_set(&grp->refcount, 1);
 +              refcount_set(&grp->refcount, 1);
                grp->active_nodes = 1;
                grp->max_faults_cpu = 0;
                spin_lock_init(&grp->lock);
@@@ -2651,7 -2638,8 +2651,8 @@@ static void update_scan_period(struct t
                 * the preferred node.
                 */
                if (dst_nid == p->numa_preferred_nid ||
-                   (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+                   (p->numa_preferred_nid != NUMA_NO_NODE &&
+                       src_nid != p->numa_preferred_nid))
                        return;
        }
  
@@@ -3135,7 -3123,7 +3136,7 @@@ void set_task_rq_fair(struct sched_enti
        p_last_update_time = prev->avg.last_update_time;
        n_last_update_time = next->avg.last_update_time;
  #endif
 -      __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
 +      __update_load_avg_blocked_se(p_last_update_time, se);
        se->avg.last_update_time = n_last_update_time;
  }
  
@@@ -3270,11 -3258,11 +3271,11 @@@ update_tg_cfs_runnable(struct cfs_rq *c
  
        /*
         * runnable_sum can't be lower than running_sum
 -       * As running sum is scale with CPU capacity wehreas the runnable sum
 -       * is not we rescale running_sum 1st
 +       * Rescale running sum to be in the same range as runnable sum
 +       * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
 +       * runnable_sum is in [0 : LOAD_AVG_MAX]
         */
 -      running_sum = se->avg.util_sum /
 -              arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
 +      running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
        runnable_sum = max(runnable_sum, running_sum);
  
        load_sum = (s64)se_weight(se) * runnable_sum;
@@@ -3377,7 -3365,7 +3378,7 @@@ static inline void add_tg_cfs_propagate
  
  /**
   * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 - * @now: current time, as per cfs_rq_clock_task()
 + * @now: current time, as per cfs_rq_clock_pelt()
   * @cfs_rq: cfs_rq to update
   *
   * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@@ -3422,7 -3410,7 +3423,7 @@@ update_cfs_rq_load_avg(u64 now, struct 
                decayed = 1;
        }
  
 -      decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 +      decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
  
  #ifndef CONFIG_64BIT
        smp_wmb();
@@@ -3512,7 -3500,9 +3513,7 @@@ static void detach_entity_load_avg(stru
  /* Update task and its cfs_rq load average */
  static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
 -      u64 now = cfs_rq_clock_task(cfs_rq);
 -      struct rq *rq = rq_of(cfs_rq);
 -      int cpu = cpu_of(rq);
 +      u64 now = cfs_rq_clock_pelt(cfs_rq);
        int decayed;
  
        /*
         * track group sched_entity load average for task_h_load calc in migration
         */
        if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
 -              __update_load_avg_se(now, cpu, cfs_rq, se);
 +              __update_load_avg_se(now, cfs_rq, se);
  
        decayed  = update_cfs_rq_load_avg(now, cfs_rq);
        decayed |= propagate_entity_load_avg(se);
@@@ -3572,7 -3562,7 +3573,7 @@@ void sync_entity_load_avg(struct sched_
        u64 last_update_time;
  
        last_update_time = cfs_rq_last_update_time(cfs_rq);
 -      __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
 +      __update_load_avg_blocked_se(last_update_time, se);
  }
  
  /*
@@@ -3588,6 -3578,10 +3589,6 @@@ void remove_entity_load_avg(struct sche
         * tasks cannot exit without having gone through wake_up_new_task() ->
         * post_init_entity_util_avg() which will have added things to the
         * cfs_rq, so we can remove unconditionally.
 -       *
 -       * Similarly for groups, they will have passed through
 -       * post_init_entity_util_avg() before unregister_sched_fair_group()
 -       * calls this.
         */
  
        sync_entity_load_avg(se);
@@@ -3661,7 -3655,6 +3662,7 @@@ util_est_dequeue(struct cfs_rq *cfs_rq
  {
        long last_ewma_diff;
        struct util_est ue;
 +      int cpu;
  
        if (!sched_feat(UTIL_EST))
                return;
                return;
  
        /*
 +       * To avoid overestimating actual task utilization, skip updates if
 +       * we cannot guarantee there is idle time on this CPU.
 +       */
 +      cpu = cpu_of(rq_of(cfs_rq));
 +      if (task_util(p) > capacity_orig_of(cpu))
 +              return;
 +
 +      /*
         * Update Task's estimated utilization
         *
         * When *p completes an activation we can consolidate another sample
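
For illustration, a small userspace sketch of the guard added above (the capacity and utilization numbers are invented, and task_util()/capacity_orig_of() are modelled as plain parameters rather than the real kernel helpers): the estimated-utilization update is skipped whenever the task's utilization already exceeds the CPU's original capacity, since there can then be no idle time left and the sample would only overestimate.

/* Userspace sketch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static bool util_est_update_allowed(unsigned long task_util,
				    unsigned long capacity_orig)
{
	/*
	 * Skip the update when the task alone exceeds the CPU's original
	 * capacity: with no idle time on the CPU, the sample would only
	 * overestimate the task's utilization.
	 */
	return task_util <= capacity_orig;
}

int main(void)
{
	printf("%d\n", util_est_update_allowed(300, 1024));  /* 1: update allowed */
	printf("%d\n", util_est_update_allowed(1200, 1024)); /* 0: update skipped */
	return 0;
}
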
@@@ -4445,10 -4430,6 +4446,10 @@@ static int tg_unthrottle_up(struct task
                /* adjust cfs_rq_clock_task() */
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
 +
 +              /* Put a cfs_rq with an already running entity back on the leaf list */
 +              if (cfs_rq->nr_running >= 1)
 +                      list_add_leaf_cfs_rq(cfs_rq);
        }
  
        return 0;
@@@ -4460,10 -4441,8 +4461,10 @@@ static int tg_throttle_down(struct task
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
        /* group is entering throttled state, stop time */
 -      if (!cfs_rq->throttle_count)
 +      if (!cfs_rq->throttle_count) {
                cfs_rq->throttled_clock_task = rq_clock_task(rq);
 +              list_del_leaf_cfs_rq(cfs_rq);
 +      }
        cfs_rq->throttle_count++;
  
        return 0;
@@@ -4566,8 -4545,6 +4567,8 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
                        break;
        }
  
 +      assert_list_leaf_cfs_rq(rq);
 +
        if (!se)
                add_nr_running(rq, task_delta);
  
@@@ -4589,7 -4566,7 +4590,7 @@@ static u64 distribute_cfs_runtime(struc
                struct rq *rq = rq_of(cfs_rq);
                struct rq_flags rf;
  
 -              rq_lock(rq, &rf);
 +              rq_lock_irqsave(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
  
                        unthrottle_cfs_rq(cfs_rq);
  
  next:
 -              rq_unlock(rq, &rf);
 +              rq_unlock_irqrestore(rq, &rf);
  
                if (!remaining)
                        break;
   * period the timer is deactivated until scheduling resumes; cfs_b->idle is
   * used to track this state.
   */
 -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
  {
        u64 runtime, runtime_expires;
        int throttled;
        while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                runtime = cfs_b->runtime;
                cfs_b->distribute_running = 1;
 -              raw_spin_unlock(&cfs_b->lock);
 +              raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                /* we can't nest cfs_b->lock while distributing bandwidth */
                runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                 runtime_expires);
 -              raw_spin_lock(&cfs_b->lock);
 +              raw_spin_lock_irqsave(&cfs_b->lock, flags);
  
                cfs_b->distribute_running = 0;
                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@@ -4777,18 -4754,17 +4778,18 @@@ static __always_inline void return_cfs_
  static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  {
        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
 +      unsigned long flags;
        u64 expires;
  
        /* confirm we're still not at a refresh boundary */
 -      raw_spin_lock(&cfs_b->lock);
 +      raw_spin_lock_irqsave(&cfs_b->lock, flags);
        if (cfs_b->distribute_running) {
 -              raw_spin_unlock(&cfs_b->lock);
 +              raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                return;
        }
  
        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 -              raw_spin_unlock(&cfs_b->lock);
 +              raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
                return;
        }
  
        if (runtime)
                cfs_b->distribute_running = 1;
  
 -      raw_spin_unlock(&cfs_b->lock);
 +      raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  
        if (!runtime)
                return;
  
        runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
  
 -      raw_spin_lock(&cfs_b->lock);
 +      raw_spin_lock_irqsave(&cfs_b->lock, flags);
        if (expires == cfs_b->runtime_expires)
                lsub_positive(&cfs_b->runtime, runtime);
        cfs_b->distribute_running = 0;
 -      raw_spin_unlock(&cfs_b->lock);
 +      raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  }
  
  /*
@@@ -4888,21 -4864,20 +4889,21 @@@ static enum hrtimer_restart sched_cfs_p
  {
        struct cfs_bandwidth *cfs_b =
                container_of(timer, struct cfs_bandwidth, period_timer);
 +      unsigned long flags;
        int overrun;
        int idle = 0;
  
 -      raw_spin_lock(&cfs_b->lock);
 +      raw_spin_lock_irqsave(&cfs_b->lock, flags);
        for (;;) {
                overrun = hrtimer_forward_now(timer, cfs_b->period);
                if (!overrun)
                        break;
  
 -              idle = do_sched_cfs_period_timer(cfs_b, overrun);
 +              idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
        }
        if (idle)
                cfs_b->period_active = 0;
 -      raw_spin_unlock(&cfs_b->lock);
 +      raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
  
        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  }
@@@ -5012,12 -4987,6 +5013,12 @@@ static void __maybe_unused unthrottle_o
  }
  
  #else /* CONFIG_CFS_BANDWIDTH */
 +
 +static inline bool cfs_bandwidth_used(void)
 +{
 +      return false;
 +}
 +
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
        return rq_clock_task(rq_of(cfs_rq));
@@@ -5209,23 -5178,6 +5210,23 @@@ enqueue_task_fair(struct rq *rq, struc
  
        }
  
 +      if (cfs_bandwidth_used()) {
 +              /*
 +               * When bandwidth control is enabled, the cfs_rq_throttled()
 +               * breaks in the above iteration can leave the leaf list
 +               * incompletely maintained, which would trigger the assertion
 +               * below.
 +               */
 +              for_each_sched_entity(se) {
 +                      cfs_rq = cfs_rq_of(se);
 +
 +                      if (list_add_leaf_cfs_rq(cfs_rq))
 +                              break;
 +              }
 +      }
 +
 +      assert_list_leaf_cfs_rq(rq);
 +
        hrtick_update(rq);
  }
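
A rough userspace model of the enqueue fixup above (the toy struct and helpers are invented stand-ins for cfs_rq, list_add_leaf_cfs_rq() and for_each_sched_entity(), and the stop condition is a simplified reading): walk the entity's ancestors bottom-up, link each cfs_rq that is missing from the leaf list, and stop once an already-linked ancestor is reached.

/*
 * Userspace sketch only; not the kernel data structures.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_cfs_rq {
	const char *name;
	bool on_leaf_list;
	struct toy_cfs_rq *parent;	/* NULL at the root */
};

/* Link this cfs_rq; report true if it was already linked, so the walk can stop. */
static bool toy_list_add_leaf(struct toy_cfs_rq *cfs_rq)
{
	if (cfs_rq->on_leaf_list)
		return true;
	cfs_rq->on_leaf_list = true;
	printf("linked %s\n", cfs_rq->name);
	return false;
}

int main(void)
{
	struct toy_cfs_rq root = { "root", true,  NULL  };
	struct toy_cfs_rq mid  = { "mid",  false, &root };
	struct toy_cfs_rq leaf = { "leaf", false, &mid  };

	/* Bottom-up fixup, mirroring the shape of the loop in the hunk above. */
	for (struct toy_cfs_rq *rq = &leaf; rq; rq = rq->parent)
		if (toy_list_add_leaf(rq))
			break;
	return 0;
}
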
  
@@@ -5605,6 -5557,11 +5606,6 @@@ static unsigned long capacity_of(int cp
        return cpu_rq(cpu)->cpu_capacity;
  }
  
 -static unsigned long capacity_orig_of(int cpu)
 -{
 -      return cpu_rq(cpu)->cpu_capacity_orig;
 -}
 -
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
        struct rq *rq = cpu_rq(cpu);
@@@ -6097,7 -6054,7 +6098,7 @@@ static int select_idle_core(struct task
                bool idle = true;
  
                for_each_cpu(cpu, cpu_smt_mask(core)) {
 -                      cpumask_clear_cpu(cpu, cpus);
 +                      __cpumask_clear_cpu(cpu, cpus);
                        if (!available_idle_cpu(cpu))
                                idle = false;
                }
  /*
   * Scan the local SMT mask for idle CPUs.
   */
 -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 +static int select_idle_smt(struct task_struct *p, int target)
  {
        int cpu;
  
@@@ -6141,7 -6098,7 +6142,7 @@@ static inline int select_idle_core(stru
        return -1;
  }
  
 -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 +static inline int select_idle_smt(struct task_struct *p, int target)
  {
        return -1;
  }
@@@ -6246,7 -6203,7 +6247,7 @@@ static int select_idle_sibling(struct t
        if ((unsigned)i < nr_cpumask_bits)
                return i;
  
 -      i = select_idle_smt(p, sd, target);
 +      i = select_idle_smt(p, target);
        if ((unsigned)i < nr_cpumask_bits)
                return i;
  
@@@ -6652,7 -6609,7 +6653,7 @@@ select_task_rq_fair(struct task_struct 
        if (sd_flag & SD_BALANCE_WAKE) {
                record_wakee(p);
  
 -              if (static_branch_unlikely(&sched_energy_present)) {
 +              if (sched_energy_enabled()) {
                        new_cpu = find_energy_efficient_cpu(p, prev_cpu);
                        if (new_cpu >= 0)
                                return new_cpu;
@@@ -7071,12 -7028,6 +7072,12 @@@ idle
        if (new_tasks > 0)
                goto again;
  
 +      /*
 +       * rq is about to be idle, check if we need to update the
 +       * lost_idle_time of clock_pelt
 +       */
 +      update_idle_rq_clock_pelt(rq);
 +
        return NULL;
  }
  
@@@ -7697,27 -7648,10 +7698,27 @@@ static inline bool others_have_blocked(
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
 +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 +{
 +      if (cfs_rq->load.weight)
 +              return false;
 +
 +      if (cfs_rq->avg.load_sum)
 +              return false;
 +
 +      if (cfs_rq->avg.util_sum)
 +              return false;
 +
 +      if (cfs_rq->avg.runnable_load_sum)
 +              return false;
 +
 +      return true;
 +}
 +
  static void update_blocked_averages(int cpu)
  {
        struct rq *rq = cpu_rq(cpu);
 -      struct cfs_rq *cfs_rq;
 +      struct cfs_rq *cfs_rq, *pos;
        const struct sched_class *curr_class;
        struct rq_flags rf;
        bool done = true;
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
 -      for_each_leaf_cfs_rq(rq, cfs_rq) {
 +      for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
                struct sched_entity *se;
  
 -              /* throttled entities do not contribute to load */
 -              if (throttled_hierarchy(cfs_rq))
 -                      continue;
 -
 -              if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 +              if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
                        update_tg_load_avg(cfs_rq, 0);
  
                /* Propagate pending load changes to the parent, if any: */
                if (se && !skip_blocked_update(se))
                        update_load_avg(cfs_rq_of(se), se, 0);
  
 +              /*
 +               * There can be a lot of idle CPU cgroups.  Don't let fully
 +               * decayed cfs_rqs linger on the list.
 +               */
 +              if (cfs_rq_is_decayed(cfs_rq))
 +                      list_del_leaf_cfs_rq(cfs_rq);
 +
                /* Don't need periodic decay once load/util_avg are null */
                if (cfs_rq_has_blocked(cfs_rq))
                        done = false;
        }
  
        curr_class = rq->curr->sched_class;
 -      update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
 -      update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
 +      update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
 +      update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
        update_irq_load_avg(rq, 0);
        /* Don't need periodic decay once load/util_avg are null */
        if (others_have_blocked(rq))
@@@ -7824,11 -7755,11 +7825,11 @@@ static inline void update_blocked_avera
  
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 -      update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 +      update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
  
        curr_class = rq->curr->sched_class;
 -      update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
 -      update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
 +      update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
 +      update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
        update_irq_load_avg(rq, 0);
  #ifdef CONFIG_NO_HZ_COMMON
        rq->last_blocked_load_update_tick = jiffies;
@@@ -8522,7 -8453,9 +8523,7 @@@ static int check_asym_packing(struct lb
        if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
                return 0;
  
 -      env->imbalance = DIV_ROUND_CLOSEST(
 -              sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
 -              SCHED_CAPACITY_SCALE);
 +      env->imbalance = sds->busiest_stat.group_load;
  
        return 1;
  }
@@@ -8704,7 -8637,7 +8705,7 @@@ static struct sched_group *find_busiest
         */
        update_sd_lb_stats(env, &sds);
  
 -      if (static_branch_unlikely(&sched_energy_present)) {
 +      if (sched_energy_enabled()) {
                struct root_domain *rd = env->dst_rq->rd;
  
                if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
@@@ -8895,25 -8828,21 +8896,25 @@@ static struct rq *find_busiest_queue(st
   */
  #define MAX_PINNED_INTERVAL   512
  
 -static int need_active_balance(struct lb_env *env)
 +static inline bool
 +asym_active_balance(struct lb_env *env)
  {
 -      struct sched_domain *sd = env->sd;
 +      /*
 +       * ASYM_PACKING needs to force migrate tasks from busy but
 +       * lower priority CPUs in order to pack all tasks in the
 +       * highest priority CPUs.
 +       */
 +      return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
 +             sched_asym_prefer(env->dst_cpu, env->src_cpu);
 +}
  
 -      if (env->idle == CPU_NEWLY_IDLE) {
 +static inline bool
 +voluntary_active_balance(struct lb_env *env)
 +{
 +      struct sched_domain *sd = env->sd;
  
 -              /*
 -               * ASYM_PACKING needs to force migrate tasks from busy but
 -               * lower priority CPUs in order to pack all tasks in the
 -               * highest priority CPUs.
 -               */
 -              if ((sd->flags & SD_ASYM_PACKING) &&
 -                  sched_asym_prefer(env->dst_cpu, env->src_cpu))
 -                      return 1;
 -      }
 +      if (asym_active_balance(env))
 +              return 1;
  
        /*
         * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
        if (env->src_grp_type == group_misfit_task)
                return 1;
  
 +      return 0;
 +}
 +
 +static int need_active_balance(struct lb_env *env)
 +{
 +      struct sched_domain *sd = env->sd;
 +
 +      if (voluntary_active_balance(env))
 +              return 1;
 +
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
  }
  
@@@ -9105,7 -9024,7 +9106,7 @@@ more_balance
                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
  
                        /* Prevent to re-select dst_cpu via env's CPUs */
 -                      cpumask_clear_cpu(env.dst_cpu, env.cpus);
 +                      __cpumask_clear_cpu(env.dst_cpu, env.cpus);
  
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
  
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
 -                      cpumask_clear_cpu(cpu_of(busiest), cpus);
 +                      __cpumask_clear_cpu(cpu_of(busiest), cpus);
                        /*
                         * Attempting to continue load balancing at the current
                         * sched_domain level only makes sense if there are
        } else
                sd->nr_balance_failed = 0;
  
 -      if (likely(!active_balance)) {
 +      if (likely(!active_balance) || voluntary_active_balance(&env)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
        } else {
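
A small userspace sketch (not kernel code; struct toy_sd, the backoff step and the sample numbers are assumptions, since the else branch is not shown in the hunk) of the policy change at the end of the hunk above: the balancing interval is now also reset to its minimum when the active balance was "voluntary" (asym packing, reduced capacity, misfit task), and presumably only backs off when active balance was forced by repeated failures.

/* Userspace sketch only; the condition mirrors the hunk, the rest is assumed. */
#include <stdbool.h>
#include <stdio.h>

struct toy_sd {
	unsigned int balance_interval;
	unsigned int min_interval;
	unsigned int max_interval;
};

static void toy_update_interval(struct toy_sd *sd, bool active_balance,
				bool voluntary)
{
	if (!active_balance || voluntary) {
		/* Treated like a regular balance: reset the interval. */
		sd->balance_interval = sd->min_interval;
	} else if (sd->balance_interval < sd->max_interval) {
		/* Assumed backoff for forced active balance (not shown above). */
		sd->balance_interval *= 2;
	}
}

int main(void)
{
	struct toy_sd sd = { .balance_interval = 64, .min_interval = 8, .max_interval = 512 };

	toy_update_interval(&sd, true, true);			/* voluntary active balance */
	printf("after voluntary: %u\n", sd.balance_interval);	/* 8  */

	toy_update_interval(&sd, true, false);			/* forced active balance */
	printf("after forced:    %u\n", sd.balance_interval);	/* 16 */
	return 0;
}
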
@@@ -9551,8 -9470,15 +9552,8 @@@ static void kick_ilb(unsigned int flags
  }
  
  /*
 - * Current heuristic for kicking the idle load balancer in the presence
 - * of an idle cpu in the system.
 - *   - This rq has more than one task.
 - *   - This rq has at least one CFS task and the capacity of the CPU is
 - *     significantly reduced because of RT tasks or IRQs.
 - *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
 - *     multiple busy cpu.
 - *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
 - *     domain span are idle.
 + * Current decision point for kicking the idle load balancer in the presence
 + * of idle CPUs in the system.
   */
  static void nohz_balancer_kick(struct rq *rq)
  {
        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
        if (sds) {
                /*
 -               * XXX: write a coherent comment on why we do this.
 -               * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
 +               * If there is an imbalance between LLC domains (IOW we could
 +               * increase the overall cache use), we need some less-loaded LLC
 +               * domain to pull some load. Likewise, we may need to spread
 +               * load within the current LLC domain (e.g. packed SMT cores but
 +               * other CPUs are idle). We can't really know from here how busy
 +               * the others are - so just get a nohz balance going if it looks
 +               * like this LLC domain has tasks we could move.
                 */
                nr_busy = atomic_read(&sds->nr_busy_cpus);
                if (nr_busy > 1) {
        sd = rcu_dereference(rq->sd);
        if (sd) {
                if ((rq->cfs.h_nr_running >= 1) &&
 -                              check_cpu_capacity(rq, sd)) {
 +                  check_cpu_capacity(rq, sd)) {
                        flags = NOHZ_KICK_MASK;
                        goto unlock;
                }
  
        sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
        if (sd) {
 -              for_each_cpu(i, sched_domain_span(sd)) {
 -                      if (i == cpu ||
 -                          !cpumask_test_cpu(i, nohz.idle_cpus_mask))
 -                              continue;
 -
 +              for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
                        if (sched_asym_prefer(i, cpu)) {
                                flags = NOHZ_KICK_MASK;
                                goto unlock;
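
For illustration, a userspace sketch of the iterator change above (plain unsigned long bitmasks stand in for struct cpumask and the values are made up): for_each_cpu_and() walks only the CPUs present in both the scheduling-domain span and nohz.idle_cpus_mask, instead of scanning the whole span and filtering out non-idle CPUs inside the loop body.

/* Userspace sketch only; not the kernel cpumask API. */
#include <stdio.h>

int main(void)
{
	unsigned long span = 0xffUL;	/* CPUs 0-7 in the domain span (made up) */
	unsigned long idle = 0x95UL;	/* idle CPUs: 0, 2, 4, 7 (made up)       */
	unsigned long both = span & idle;

	for (int cpu = 0; cpu < 8; cpu++) {
		if (!(both & (1UL << cpu)))
			continue;
		/* Only CPUs present in both masks are ever looked at here. */
		printf("candidate idle CPU %d\n", cpu);
	}
	return 0;
}
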
@@@ -10622,10 -10547,10 +10623,10 @@@ const struct sched_class fair_sched_cla
  #ifdef CONFIG_SCHED_DEBUG
  void print_cfs_stats(struct seq_file *m, int cpu)
  {
 -      struct cfs_rq *cfs_rq;
 +      struct cfs_rq *cfs_rq, *pos;
  
        rcu_read_lock();
 -      for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 +      for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
                print_cfs_rq(m, cpu, cfs_rq);
        rcu_read_unlock();
  }
diff --combined kernel/sysctl.c
@@@ -472,17 -472,6 +472,17 @@@ static struct ctl_table kern_table[] = 
                .extra1         = &one,
        },
  #endif
 +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 +      {
 +              .procname       = "sched_energy_aware",
 +              .data           = &sysctl_sched_energy_aware,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = sched_energy_aware_handler,
 +              .extra1         = &zero,
 +              .extra2         = &one,
 +      },
 +#endif
  #ifdef CONFIG_PROVE_LOCKING
        {
                .procname       = "prove_locking",
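
Assuming the new entry above is registered under kern_table as usual, the knob should surface as /proc/sys/kernel/sched_energy_aware on ENERGY_MODEL + CPU_FREQ_GOV_SCHEDUTIL builds, clamped to 0 or 1 by the extra1/extra2 bounds. A minimal userspace C sketch for reading (and, if uncommented, writing) it; the path is inferred from the procname, not stated in the hunk.

/* Userspace sketch; error handling kept minimal on purpose. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_energy_aware"; /* inferred path */
	FILE *f = fopen(path, "r+");

	if (!f) {
		perror(path);	/* kernel built without the knob, or no permission */
		return 1;
	}

	int val = -1;
	if (fscanf(f, "%d", &val) == 1)
		printf("sched_energy_aware = %d\n", val);

	/* Writing "0" presumably disables energy-aware scheduling; "1" re-enables it. */
	/* fprintf(f, "0\n"); */

	fclose(f);
	return 0;
}
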
@@@ -1471,7 -1460,7 +1471,7 @@@ static struct ctl_table vm_table[] = 
                .data           = &sysctl_extfrag_threshold,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = sysctl_extfrag_handler,
+               .proc_handler   = proc_dointvec_minmax,
                .extra1         = &min_extfrag_threshold,
                .extra2         = &max_extfrag_threshold,
        },