Merge tag 'timers-nohz-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author		Linus Torvalds <torvalds@linux-foundation.org>
Mon, 28 Jun 2021 19:22:06 +0000 (12:22 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Mon, 28 Jun 2021 19:22:06 +0000 (12:22 -0700)
Pull timers/nohz updates from Ingo Molnar:

 - Micro-optimize tick_nohz_full_cpu() (the change is reproduced below this
   list for reference)

 - Optimize idle exit tick restarts to be less eager

 - Optimize tick_nohz_dep_set_task() to only wake up a single CPU.
   This reduces IPIs and interruptions on nohz_full CPUs.

 - Optimize tick_nohz_dep_set_signal() in a similar fashion.

 - Skip IPIs in tick_nohz_kick_task() when trying to kick a
   non-running task.

 - Micro-optimize tick_nohz_task_switch() IRQ flags handling to
   reduce context switching costs.

 - Misc cleanups and fixes
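
   For reference, the tick_nohz_full_cpu() change mentioned in the first
   item, reproduced from the include/linux/tick.h hunk further down in this
   diff. With the inline function, the caller's cpu expression (typically
   smp_processor_id()) was evaluated before the static-key test; with the
   statement-expression macro it is only evaluated once the static key has
   been found enabled, so when the key is off the cpu expression is never
   computed at all.

   Old (inline function, cpu already evaluated by the caller):

	static inline bool tick_nohz_full_cpu(int cpu)
	{
		if (!tick_nohz_full_enabled())
			return false;
		return cpumask_test_cpu(cpu, tick_nohz_full_mask);
	}

   New (statement expression, _cpu evaluated only behind the static key):

	#define tick_nohz_full_cpu(_cpu) ({					\
		bool __ret = false;						\
		if (tick_nohz_full_enabled())					\
			__ret = cpumask_test_cpu((_cpu), tick_nohz_full_mask);	\
		__ret;								\
	})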

* tag 'timers-nohz-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  MAINTAINERS: Add myself as context tracking maintainer
  tick/nohz: Call tick_nohz_task_switch() with interrupts disabled
  tick/nohz: Kick only _queued_ task whose tick dependency is updated
  tick/nohz: Change signal tick dependency to wake up CPUs of member tasks
  tick/nohz: Only wake up a single target cpu when kicking a task
  tick/nohz: Update nohz_full Kconfig help
  tick/nohz: Update idle_exittime on actual idle exit
  tick/nohz: Remove superflous check for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  tick/nohz: Conditionally restart tick on idle exit
  tick/nohz: Evaluate the CPU expression after the static key

MAINTAINERS
include/linux/sched.h
include/linux/tick.h
kernel/sched/core.c
kernel/time/tick-sched.c

diff --combined MAINTAINERS
@@@ -1578,7 -1578,7 +1578,7 @@@ F:      drivers/clk/sunxi
  ARM/Allwinner sunXi SoC support
  M:    Maxime Ripard <mripard@kernel.org>
  M:    Chen-Yu Tsai <wens@csie.org>
 -R:    Jernej Skrabec <jernej.skrabec@siol.net>
 +R:    Jernej Skrabec <jernej.skrabec@gmail.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux.git
@@@ -1618,8 -1618,8 +1618,8 @@@ F:      Documentation/devicetree/bindings/so
  F:    sound/soc/meson/
  
  ARM/Amlogic Meson SoC support
 +M:    Neil Armstrong <narmstrong@baylibre.com>
  M:    Kevin Hilman <khilman@baylibre.com>
 -R:    Neil Armstrong <narmstrong@baylibre.com>
  R:    Jerome Brunet <jbrunet@baylibre.com>
  R:    Martin Blumenstingl <martin.blumenstingl@googlemail.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1816,7 -1816,7 +1816,7 @@@ F:      drivers/pinctrl/pinctrl-gemini.
  F:    drivers/rtc/rtc-ftrtc010.c
  
  ARM/CZ.NIC TURRIS SUPPORT
 -M:    Marek Behun <kabel@kernel.org>
 +M:    Marek Behún <kabel@kernel.org>
  S:    Maintained
  W:    https://www.turris.cz/
  F:    Documentation/ABI/testing/debugfs-moxtet
@@@ -3877,7 -3877,6 +3877,7 @@@ L:      linux-btrfs@vger.kernel.or
  S:    Maintained
  W:    http://btrfs.wiki.kernel.org/
  Q:    http://patchwork.kernel.org/project/linux-btrfs/list/
 +C:    irc://irc.libera.chat/btrfs
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git
  F:    Documentation/filesystems/btrfs.rst
  F:    fs/btrfs/
@@@ -4139,14 -4138,6 +4139,14 @@@ S:    Odd Fixe
  F:    Documentation/devicetree/bindings/arm/cavium-thunder2.txt
  F:    arch/arm64/boot/dts/cavium/thunder2-99xx*
  
 +CBS/ETF/TAPRIO QDISCS
 +M:    Vinicius Costa Gomes <vinicius.gomes@intel.com>
 +S:    Maintained
 +L:    netdev@vger.kernel.org
 +F:    net/sched/sch_cbs.c
 +F:    net/sched/sch_etf.c
 +F:    net/sched/sch_taprio.c
 +
  CC2520 IEEE-802.15.4 RADIO DRIVER
  M:    Varka Bhadram <varkabhadram@gmail.com>
  L:    linux-wpan@vger.kernel.org
@@@ -4610,6 -4601,12 +4610,12 @@@ S:    Supporte
  F:    drivers/video/console/
  F:    include/linux/console*
  
+ CONTEXT TRACKING
+ M:    Frederic Weisbecker <frederic@kernel.org>
+ S:    Maintained
+ F:    kernel/context_tracking.c
+ F:    include/linux/context_tracking*
+ 
  CONTROL GROUP (CGROUP)
  M:    Tejun Heo <tj@kernel.org>
  M:    Zefan Li <lizefan.x@bytedance.com>
@@@ -5098,7 -5095,7 +5104,7 @@@ S:      Maintaine
  F:    drivers/net/fddi/defza.*
  
  DEINTERLACE DRIVERS FOR ALLWINNER H3
 -M:    Jernej Skrabec <jernej.skrabec@siol.net>
 +M:    Jernej Skrabec <jernej.skrabec@gmail.com>
  L:    linux-media@vger.kernel.org
  S:    Maintained
  T:    git git://linuxtv.org/media_tree.git
@@@ -5189,13 -5186,6 +5195,13 @@@ W:    https://linuxtv.or
  T:    git git://linuxtv.org/media_tree.git
  F:    drivers/media/platform/sti/delta
  
 +DELTA DPS920AB PSU DRIVER
 +M:    Robert Marko <robert.marko@sartura.hr>
 +L:    linux-hwmon@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/hwmon/dps920ab.rst
 +F:    drivers/hwmon/pmbus/dps920ab.c
 +
  DENALI NAND DRIVER
  L:    linux-mtd@lists.infradead.org
  S:    Orphan
@@@ -5253,7 -5243,7 +5259,7 @@@ DEVICE DIRECT ACCESS (DAX
  M:    Dan Williams <dan.j.williams@intel.com>
  M:    Vishal Verma <vishal.l.verma@intel.com>
  M:    Dave Jiang <dave.jiang@intel.com>
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  F:    drivers/dax/
  
@@@ -5585,6 -5575,7 +5591,6 @@@ F:      drivers/soc/fsl/dpi
  
  DPAA2 ETHERNET DRIVER
  M:    Ioana Ciornei <ioana.ciornei@nxp.com>
 -M:    Ioana Radulescu <ruxandra.radulescu@nxp.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    Documentation/networking/device_drivers/ethernet/freescale/dpaa2/ethernet-driver.rst
@@@ -5647,14 -5638,14 +5653,14 @@@ F:   include/linux/power/smartreflex.
  DRM DRIVER FOR ALLWINNER DE2 AND DE3 ENGINE
  M:    Maxime Ripard <mripard@kernel.org>
  M:    Chen-Yu Tsai <wens@csie.org>
 -R:    Jernej Skrabec <jernej.skrabec@siol.net>
 +R:    Jernej Skrabec <jernej.skrabec@gmail.com>
  L:    dri-devel@lists.freedesktop.org
  S:    Supported
  T:    git git://anongit.freedesktop.org/drm/drm-misc
  F:    drivers/gpu/drm/sun4i/sun8i*
  
  DRM DRIVER FOR ARM PL111 CLCD
 -M:    Eric Anholt <eric@anholt.net>
 +M:    Emma Anholt <emma@anholt.net>
  S:    Supported
  T:    git git://anongit.freedesktop.org/drm/drm-misc
  F:    drivers/gpu/drm/pl111/
@@@ -5734,7 -5725,7 +5740,7 @@@ T:      git git://anongit.freedesktop.org/dr
  F:    drivers/gpu/drm/tiny/gm12u320.c
  
  DRM DRIVER FOR HX8357D PANELS
 -M:    Eric Anholt <eric@anholt.net>
 +M:    Emma Anholt <emma@anholt.net>
  S:    Maintained
  T:    git git://anongit.freedesktop.org/drm/drm-misc
  F:    Documentation/devicetree/bindings/display/himax,hx8357d.txt
@@@ -6038,7 -6029,7 +6044,7 @@@ M:      Neil Armstrong <narmstrong@baylibre.
  M:    Robert Foss <robert.foss@linaro.org>
  R:    Laurent Pinchart <Laurent.pinchart@ideasonboard.com>
  R:    Jonas Karlman <jonas@kwiboo.se>
 -R:    Jernej Skrabec <jernej.skrabec@siol.net>
 +R:    Jernej Skrabec <jernej.skrabec@gmail.com>
  S:    Maintained
  T:    git git://anongit.freedesktop.org/drm/drm-misc
  F:    drivers/gpu/drm/bridge/
@@@ -6192,7 -6183,7 +6198,7 @@@ F:      Documentation/devicetree/bindings/di
  F:    drivers/gpu/drm/omapdrm/
  
  DRM DRIVERS FOR V3D
 -M:    Eric Anholt <eric@anholt.net>
 +M:    Emma Anholt <emma@anholt.net>
  S:    Supported
  T:    git git://anongit.freedesktop.org/drm/drm-misc
  F:    Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml
@@@ -6200,7 -6191,7 +6206,7 @@@ F:      drivers/gpu/drm/v3d
  F:    include/uapi/drm/v3d_drm.h
  
  DRM DRIVERS FOR VC4
 -M:    Eric Anholt <eric@anholt.net>
 +M:    Emma Anholt <emma@anholt.net>
  M:    Maxime Ripard <mripard@kernel.org>
  S:    Supported
  T:    git git://github.com/anholt/linux
@@@ -6953,7 -6944,6 +6959,7 @@@ F:      net/core/failover.
  FANOTIFY
  M:    Jan Kara <jack@suse.cz>
  R:    Amir Goldstein <amir73il@gmail.com>
 +R:    Matthew Bobrowski <repnop@google.com>
  L:    linux-fsdevel@vger.kernel.org
  S:    Maintained
  F:    fs/notify/fanotify/
@@@ -7022,7 -7012,7 +7028,7 @@@ M:      Dan Williams <dan.j.williams@intel.c
  R:    Matthew Wilcox <willy@infradead.org>
  R:    Jan Kara <jack@suse.cz>
  L:    linux-fsdevel@vger.kernel.org
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  F:    fs/dax.c
  F:    include/linux/dax.h
@@@ -7361,6 -7351,7 +7367,6 @@@ F:      drivers/net/ethernet/freescale/fs_en
  F:    include/linux/fs_enet_pd.h
  
  FREESCALE SOC SOUND DRIVERS
 -M:    Timur Tabi <timur@kernel.org>
  M:    Nicolin Chen <nicoleotsuka@gmail.com>
  M:    Xiubo Li <Xiubo.Lee@gmail.com>
  R:    Fabio Estevam <festevam@gmail.com>
@@@ -8778,6 -8769,22 +8784,6 @@@ L:     linux-i2c@vger.kernel.or
  S:    Maintained
  F:    drivers/i2c/busses/i2c-icy.c
  
 -IDE SUBSYSTEM
 -M:    "David S. Miller" <davem@davemloft.net>
 -L:    linux-ide@vger.kernel.org
 -S:    Maintained
 -Q:    http://patchwork.ozlabs.org/project/linux-ide/list/
 -T:    git git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide.git
 -F:    Documentation/ide/
 -F:    drivers/ide/
 -F:    include/linux/ide.h
 -
 -IDE/ATAPI DRIVERS
 -L:    linux-ide@vger.kernel.org
 -S:    Orphan
 -F:    Documentation/cdrom/ide-cd.rst
 -F:    drivers/ide/ide-cd*
 -
  IDEAPAD LAPTOP EXTRAS DRIVER
  M:    Ike Panhc <ike.pan@canonical.com>
  L:    platform-driver-x86@vger.kernel.org
@@@ -10377,7 -10384,7 +10383,7 @@@ LIBNVDIMM BLK: MMIO-APERTURE DRIVE
  M:    Dan Williams <dan.j.williams@intel.com>
  M:    Vishal Verma <vishal.l.verma@intel.com>
  M:    Dave Jiang <dave.jiang@intel.com>
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  Q:    https://patchwork.kernel.org/project/linux-nvdimm/list/
  P:    Documentation/nvdimm/maintainer-entry-profile.rst
@@@ -10388,7 -10395,7 +10394,7 @@@ LIBNVDIMM BTT: BLOCK TRANSLATION TABL
  M:    Vishal Verma <vishal.l.verma@intel.com>
  M:    Dan Williams <dan.j.williams@intel.com>
  M:    Dave Jiang <dave.jiang@intel.com>
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  Q:    https://patchwork.kernel.org/project/linux-nvdimm/list/
  P:    Documentation/nvdimm/maintainer-entry-profile.rst
@@@ -10398,7 -10405,7 +10404,7 @@@ LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVE
  M:    Dan Williams <dan.j.williams@intel.com>
  M:    Vishal Verma <vishal.l.verma@intel.com>
  M:    Dave Jiang <dave.jiang@intel.com>
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  Q:    https://patchwork.kernel.org/project/linux-nvdimm/list/
  P:    Documentation/nvdimm/maintainer-entry-profile.rst
@@@ -10406,7 -10413,7 +10412,7 @@@ F:   drivers/nvdimm/pmem
  
  LIBNVDIMM: DEVICETREE BINDINGS
  M:    Oliver O'Halloran <oohall@gmail.com>
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  Q:    https://patchwork.kernel.org/project/linux-nvdimm/list/
  F:    Documentation/devicetree/bindings/pmem/pmem-region.txt
@@@ -10417,7 -10424,7 +10423,7 @@@ M:   Dan Williams <dan.j.williams@intel.c
  M:    Vishal Verma <vishal.l.verma@intel.com>
  M:    Dave Jiang <dave.jiang@intel.com>
  M:    Ira Weiny <ira.weiny@intel.com>
 -L:    linux-nvdimm@lists.01.org
 +L:    nvdimm@lists.linux.dev
  S:    Supported
  Q:    https://patchwork.kernel.org/project/linux-nvdimm/list/
  P:    Documentation/nvdimm/maintainer-entry-profile.rst
@@@ -10936,7 -10943,7 +10942,7 @@@ F:   include/linux/mv643xx.
  
  MARVELL MV88X3310 PHY DRIVER
  M:    Russell King <linux@armlinux.org.uk>
 -M:    Marek Behun <marek.behun@nic.cz>
 +M:    Marek Behún <kabel@kernel.org>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/phy/marvell10g.c
@@@ -12179,7 -12186,6 +12185,7 @@@ F:   drivers/platform/surface/surfacepro3
  
  MICROSOFT SURFACE SYSTEM AGGREGATOR SUBSYSTEM
  M:    Maximilian Luz <luzmaximilian@gmail.com>
 +L:    platform-driver-x86@vger.kernel.org
  S:    Maintained
  W:    https://github.com/linux-surface/surface-aggregator-module
  C:    irc://chat.freenode.net/##linux-surface
@@@ -12680,9 -12686,9 +12686,9 @@@ F:   drivers/rtc/rtc-ntxec.
  F:    include/linux/mfd/ntxec.h
  
  NETRONOME ETHERNET DRIVERS
 -M:    Simon Horman <simon.horman@netronome.com>
 +M:    Simon Horman <simon.horman@corigine.com>
  R:    Jakub Kicinski <kuba@kernel.org>
 -L:    oss-drivers@netronome.com
 +L:    oss-drivers@corigine.com
  S:    Maintained
  F:    drivers/net/ethernet/netronome/
  
@@@ -12709,6 -12715,7 +12715,6 @@@ M:   "David S. Miller" <davem@davemloft.n
  M:    Jakub Kicinski <kuba@kernel.org>
  L:    netdev@vger.kernel.org
  S:    Maintained
 -W:    http://www.linuxfoundation.org/en/Net
  Q:    https://patchwork.kernel.org/project/netdevbpf/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
@@@ -12753,6 -12760,7 +12759,6 @@@ M:   "David S. Miller" <davem@davemloft.n
  M:    Jakub Kicinski <kuba@kernel.org>
  L:    netdev@vger.kernel.org
  S:    Maintained
 -W:    http://www.linuxfoundation.org/en/Net
  Q:    https://patchwork.kernel.org/project/netdevbpf/list/
  B:    mailto:netdev@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git
@@@ -12894,10 -12902,8 +12900,10 @@@ F: include/uapi/linux/nexthop.
  F:    net/ipv4/nexthop.c
  
  NFC SUBSYSTEM
 +M:    Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
 +L:    linux-nfc@lists.01.org (subscribers-only)
  L:    netdev@vger.kernel.org
 -S:    Orphan
 +S:    Maintained
  F:    Documentation/devicetree/bindings/net/nfc/
  F:    drivers/nfc/
  F:    include/linux/platform_data/nfcmrvl.h
@@@ -12908,7 -12914,7 +12914,7 @@@ F:   net/nfc
  NFC VIRTUAL NCI DEVICE DRIVER
  M:    Bongsu Jeon <bongsu.jeon@samsung.com>
  L:    netdev@vger.kernel.org
 -L:    linux-nfc@lists.01.org (moderated for non-subscribers)
 +L:    linux-nfc@lists.01.org (subscribers-only)
  S:    Supported
  F:    drivers/nfc/virtual_ncidev.c
  F:    tools/testing/selftests/nci/
@@@ -13205,8 -13211,9 +13211,8 @@@ F:   Documentation/devicetree/bindings/so
  F:    sound/soc/codecs/tfa9879*
  
  NXP-NCI NFC DRIVER
 -M:    Clément Perrochaud <clement.perrochaud@effinnov.com>
  R:    Charles Gorand <charles.gorand@effinnov.com>
 -L:    linux-nfc@lists.01.org (moderated for non-subscribers)
 +L:    linux-nfc@lists.01.org (subscribers-only)
  S:    Supported
  F:    drivers/nfc/nxp-nci
  
@@@ -14109,7 -14116,6 +14115,7 @@@ F:   drivers/pci/controller/pci-v3-semi.
  PCI ENDPOINT SUBSYSTEM
  M:    Kishon Vijay Abraham I <kishon@ti.com>
  M:    Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
 +R:    Krzysztof Wilczyński <kw@linux.com>
  L:    linux-pci@vger.kernel.org
  S:    Supported
  F:    Documentation/PCI/endpoint/*
@@@ -14158,7 -14164,6 +14164,7 @@@ F:   drivers/pci/controller/pci-xgene-msi
  PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS
  M:    Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
  R:    Rob Herring <robh@kernel.org>
 +R:    Krzysztof Wilczyński <kw@linux.com>
  L:    linux-pci@vger.kernel.org
  S:    Supported
  Q:    http://patchwork.ozlabs.org/project/linux-pci/list/
@@@ -14318,12 -14323,10 +14324,12 @@@ PER-CPU MEMORY ALLOCATO
  M:    Dennis Zhou <dennis@kernel.org>
  M:    Tejun Heo <tj@kernel.org>
  M:    Christoph Lameter <cl@linux.com>
 +L:    linux-mm@kvack.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git
  F:    arch/*/include/asm/percpu.h
  F:    include/linux/percpu*.h
 +F:    lib/percpu*.c
  F:    mm/percpu*.c
  
  PER-TASK DELAY ACCOUNTING
@@@ -14737,6 -14740,7 +14743,6 @@@ W:   https://wireless.wiki.kernel.org/en/
  F:    drivers/net/wireless/intersil/prism54/
  
  PROC FILESYSTEM
 -R:    Alexey Dobriyan <adobriyan@gmail.com>
  L:    linux-kernel@vger.kernel.org
  L:    linux-fsdevel@vger.kernel.org
  S:    Maintained
@@@ -15817,7 -15821,7 +15823,7 @@@ F:   include/uapi/linux/rose.
  F:    net/rose/
  
  ROTATION DRIVER FOR ALLWINNER A83T
 -M:    Jernej Skrabec <jernej.skrabec@siol.net>
 +M:    Jernej Skrabec <jernej.skrabec@gmail.com>
  L:    linux-media@vger.kernel.org
  S:    Maintained
  T:    git git://linuxtv.org/media_tree.git
@@@ -15947,7 -15951,6 +15953,7 @@@ S390 IUCV NETWORK LAYE
  M:    Julian Wiedmann <jwi@linux.ibm.com>
  M:    Karsten Graul <kgraul@linux.ibm.com>
  L:    linux-s390@vger.kernel.org
 +L:    netdev@vger.kernel.org
  S:    Supported
  W:    http://www.ibm.com/developerworks/linux/linux390/
  F:    drivers/s390/net/*iucv*
@@@ -15958,7 -15961,6 +15964,7 @@@ S390 NETWORK DRIVER
  M:    Julian Wiedmann <jwi@linux.ibm.com>
  M:    Karsten Graul <kgraul@linux.ibm.com>
  L:    linux-s390@vger.kernel.org
 +L:    netdev@vger.kernel.org
  S:    Supported
  W:    http://www.ibm.com/developerworks/linux/linux390/
  F:    drivers/s390/net/
@@@ -16137,7 -16139,7 +16143,7 @@@ F:   include/media/drv-intf/s3c_camif.
  SAMSUNG S3FWRN5 NFC DRIVER
  M:    Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
  M:    Krzysztof Opasiak <k.opasiak@samsung.com>
 -L:    linux-nfc@lists.01.org (moderated for non-subscribers)
 +L:    linux-nfc@lists.01.org (subscribers-only)
  S:    Maintained
  F:    Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml
  F:    drivers/nfc/s3fwrn5
@@@ -16550,7 -16552,6 +16556,7 @@@ F:   drivers/misc/sgi-xp
  
  SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS
  M:    Karsten Graul <kgraul@linux.ibm.com>
 +M:    Guvenc Gulce <guvenc@linux.ibm.com>
  L:    linux-s390@vger.kernel.org
  S:    Supported
  W:    http://www.ibm.com/developerworks/linux/linux390/
@@@ -17309,12 -17310,6 +17315,12 @@@ L: linux-i2c@vger.kernel.or
  S:    Maintained
  F:    drivers/i2c/busses/i2c-stm32*
  
 +ST STM32 SPI DRIVER
 +M:    Alain Volmat <alain.volmat@foss.st.com>
 +L:    linux-spi@vger.kernel.org
 +S:    Maintained
 +F:    drivers/spi/spi-stm32.c
 +
  ST STPDDC60 DRIVER
  M:    Daniel Nilsson <daniel.nilsson@flex.com>
  L:    linux-hwmon@vger.kernel.org
@@@ -17673,6 -17668,7 +17679,6 @@@ R:   Mika Westerberg <mika.westerberg@lin
  L:    linux-i2c@vger.kernel.org
  S:    Maintained
  F:    drivers/i2c/busses/i2c-designware-*
 -F:    include/linux/platform_data/i2c-designware.h
  
  SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER
  M:    Jaehoon Chung <jh80.chung@samsung.com>
@@@ -18328,7 -18324,7 +18334,7 @@@ F:   sound/soc/codecs/tas571x
  TI TRF7970A NFC DRIVER
  M:    Mark Greer <mgreer@animalcreek.com>
  L:    linux-wireless@vger.kernel.org
 -L:    linux-nfc@lists.01.org (moderated for non-subscribers)
 +L:    linux-nfc@lists.01.org (subscribers-only)
  S:    Supported
  F:    Documentation/devicetree/bindings/net/nfc/trf7970a.txt
  F:    drivers/nfc/trf7970a.c
@@@ -18864,13 -18860,6 +18870,13 @@@ S: Maintaine
  F:    drivers/usb/host/isp116x*
  F:    include/linux/usb/isp116x.h
  
 +USB ISP1760 DRIVER
 +M:    Rui Miguel Silva <rui.silva@linaro.org>
 +L:    linux-usb@vger.kernel.org
 +S:    Maintained
 +F:    drivers/usb/isp1760/*
 +F:    Documentation/devicetree/bindings/usb/nxp,isp1760.yaml
 +
  USB LAN78XX ETHERNET DRIVER
  M:    Woojung Huh <woojung.huh@microchip.com>
  M:    UNGLinuxDriver@microchip.com
@@@ -19568,10 -19557,6 +19574,10 @@@ F: include/dt-bindings/regulator
  F:    include/linux/regulator/
  K:    regulator_get_optional
  
 +VOLTAGE AND CURRENT REGULATOR IRQ HELPERS
 +R:    Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
 +F:    drivers/regulator/irq_helpers.c
 +
  VRF
  M:    David Ahern <dsahern@kernel.org>
  L:    netdev@vger.kernel.org
@@@ -20019,7 -20004,6 +20025,7 @@@ F:   arch/x86/xen/*swiotlb
  F:    drivers/xen/*swiotlb*
  
  XFS FILESYSTEM
 +C:    irc://irc.oftc.net/xfs
  M:    Darrick J. Wong <djwong@kernel.org>
  M:    linux-xfs@vger.kernel.org
  L:    linux-xfs@vger.kernel.org
diff --combined include/linux/sched.h
@@@ -113,13 -113,11 +113,13 @@@ struct task_group
                                         __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
                                         TASK_PARKED)
  
 -#define task_is_traced(task)          ((task->state & __TASK_TRACED) != 0)
 +#define task_is_running(task)         (READ_ONCE((task)->__state) == TASK_RUNNING)
  
 -#define task_is_stopped(task)         ((task->state & __TASK_STOPPED) != 0)
 +#define task_is_traced(task)          ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
  
 -#define task_is_stopped_or_traced(task)       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 +#define task_is_stopped(task)         ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 +
 +#define task_is_stopped_or_traced(task)       ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  
        do {                                                    \
                WARN_ON_ONCE(is_special_task_state(state_value));\
                current->task_state_change = _THIS_IP_;         \
 -              current->state = (state_value);                 \
 +              WRITE_ONCE(current->__state, (state_value));    \
        } while (0)
  
  #define set_current_state(state_value)                                \
        do {                                                    \
                WARN_ON_ONCE(is_special_task_state(state_value));\
                current->task_state_change = _THIS_IP_;         \
 -              smp_store_mb(current->state, (state_value));    \
 +              smp_store_mb(current->__state, (state_value));  \
        } while (0)
  
  #define set_special_state(state_value)                                        \
                WARN_ON_ONCE(!is_special_task_state(state_value));      \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->task_state_change = _THIS_IP_;                 \
 -              current->state = (state_value);                         \
 +              WRITE_ONCE(current->__state, (state_value));            \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
        } while (0)
  #else
   * Also see the comments of try_to_wake_up().
   */
  #define __set_current_state(state_value)                              \
 -      current->state = (state_value)
 +      WRITE_ONCE(current->__state, (state_value))
  
  #define set_current_state(state_value)                                        \
 -      smp_store_mb(current->state, (state_value))
 +      smp_store_mb(current->__state, (state_value))
  
  /*
   * set_special_state() should be used for those states when the blocking task
        do {                                                            \
                unsigned long flags; /* may shadow */                   \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
 -              current->state = (state_value);                         \
 +              WRITE_ONCE(current->__state, (state_value));            \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
        } while (0)
  
  #endif
  
 +#define get_current_state()   READ_ONCE(current->__state)
 +
  /* Task command name length: */
  #define TASK_COMM_LEN                 16
  
@@@ -354,19 -350,11 +354,19 @@@ struct load_weight 
   * Only for tasks we track a moving average of the past instantaneous
   * estimated utilization. This allows to absorb sporadic drops in utilization
   * of an otherwise almost periodic task.
 + *
 + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 + * updates. When a task is dequeued, its util_est should not be updated if its
 + * util_avg has not been updated in the meantime.
 + * This information is mapped into the MSB bit of util_est.enqueued at dequeue
 + * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
 + * for a task) it is safe to use MSB.
   */
  struct util_est {
        unsigned int                    enqueued;
        unsigned int                    ewma;
  #define UTIL_EST_WEIGHT_SHIFT         2
 +#define UTIL_AVG_UNCHANGED            0x80000000
  } __attribute__((__aligned__(sizeof(u64))));
  
  /*
@@@ -666,7 -654,8 +666,7 @@@ struct task_struct 
         */
        struct thread_info              thread_info;
  #endif
 -      /* -1 unrunnable, 0 runnable, >0 stopped: */
 -      volatile long                   state;
 +      unsigned int                    __state;
  
        /*
         * This begins the randomizable portion of task_struct. Only
        const struct sched_class        *sched_class;
        struct sched_entity             se;
        struct sched_rt_entity          rt;
 +      struct sched_dl_entity          dl;
 +
 +#ifdef CONFIG_SCHED_CORE
 +      struct rb_node                  core_node;
 +      unsigned long                   core_cookie;
 +      unsigned int                    core_occupation;
 +#endif
 +
  #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
  #endif
 -      struct sched_dl_entity          dl;
  
  #ifdef CONFIG_UCLAMP_TASK
        /*
        /* Signal handlers: */
        struct signal_struct            *signal;
        struct sighand_struct __rcu             *sighand;
 -      struct sigqueue                 *sigqueue_cache;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
@@@ -1530,7 -1513,7 +1530,7 @@@ static inline pid_t task_pgrp_nr(struc
  
  static inline unsigned int task_state_index(struct task_struct *tsk)
  {
 -      unsigned int tsk_state = READ_ONCE(tsk->state);
 +      unsigned int tsk_state = READ_ONCE(tsk->__state);
        unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
  
        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
@@@ -1838,10 -1821,10 +1838,10 @@@ static __always_inline void scheduler_i
         */
        preempt_fold_need_resched();
  }
 -extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 +extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
  #else
  static inline void scheduler_ipi(void) { }
 -static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 +static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
  {
        return 1;
  }
@@@ -2028,6 -2011,8 +2028,8 @@@ static inline void set_task_cpu(struct 
  
  #endif /* CONFIG_SMP */
  
+ extern bool sched_task_on_rq(struct task_struct *p);
+ 
  /*
   * In order to reduce various lock holder preemption latencies provide an
   * interface to see if a vCPU is currently running or not.
@@@ -2189,14 -2174,4 +2191,14 @@@ int sched_trace_rq_nr_running(struct r
  
  const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
  
 +#ifdef CONFIG_SCHED_CORE
 +extern void sched_core_free(struct task_struct *tsk);
 +extern void sched_core_fork(struct task_struct *p);
 +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
 +                              unsigned long uaddr);
 +#else
 +static inline void sched_core_free(struct task_struct *tsk) { }
 +static inline void sched_core_fork(struct task_struct *p) { }
 +#endif
 +
  #endif
diff --combined include/linux/tick.h
@@@ -11,7 -11,6 +11,7 @@@
  #include <linux/context_tracking_state.h>
  #include <linux/cpumask.h>
  #include <linux/sched.h>
 +#include <linux/rcupdate.h>
  
  #ifdef CONFIG_GENERIC_CLOCKEVENTS
  extern void __init tick_init(void);
@@@ -186,13 -185,17 +186,17 @@@ static inline bool tick_nohz_full_enabl
        return tick_nohz_full_running;
  }
  
- static inline bool tick_nohz_full_cpu(int cpu)
- {
-       if (!tick_nohz_full_enabled())
-               return false;
-       return cpumask_test_cpu(cpu, tick_nohz_full_mask);
- }
+ /*
+  * Check if a CPU is part of the nohz_full subset. Arrange for evaluating
+  * the cpu expression (typically smp_processor_id()) _after_ the static
+  * key.
+  */
+ #define tick_nohz_full_cpu(_cpu) ({                                   \
+       bool __ret = false;                                             \
+       if (tick_nohz_full_enabled())                                   \
+               __ret = cpumask_test_cpu((_cpu), tick_nohz_full_mask);  \
+       __ret;                                                          \
+ })
  
  static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
  {
@@@ -208,7 -211,7 +212,7 @@@ extern void tick_nohz_dep_set_task(stru
                                   enum tick_dep_bits bit);
  extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
                                     enum tick_dep_bits bit);
- extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+ extern void tick_nohz_dep_set_signal(struct task_struct *tsk,
                                     enum tick_dep_bits bit);
  extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
                                       enum tick_dep_bits bit);
@@@ -253,11 -256,11 +257,11 @@@ static inline void tick_dep_clear_task(
        if (tick_nohz_full_enabled())
                tick_nohz_dep_clear_task(tsk, bit);
  }
- static inline void tick_dep_set_signal(struct signal_struct *signal,
+ static inline void tick_dep_set_signal(struct task_struct *tsk,
                                       enum tick_dep_bits bit)
  {
        if (tick_nohz_full_enabled())
-               tick_nohz_dep_set_signal(signal, bit);
+               tick_nohz_dep_set_signal(tsk, bit);
  }
  static inline void tick_dep_clear_signal(struct signal_struct *signal,
                                         enum tick_dep_bits bit)
@@@ -285,7 -288,7 +289,7 @@@ static inline void tick_dep_set_task(st
                                     enum tick_dep_bits bit) { }
  static inline void tick_dep_clear_task(struct task_struct *tsk,
                                       enum tick_dep_bits bit) { }
- static inline void tick_dep_set_signal(struct signal_struct *signal,
+ static inline void tick_dep_set_signal(struct task_struct *tsk,
                                       enum tick_dep_bits bit) { }
  static inline void tick_dep_clear_signal(struct signal_struct *signal,
                                         enum tick_dep_bits bit) { }
@@@ -301,10 -304,4 +305,10 @@@ static inline void tick_nohz_task_switc
                __tick_nohz_task_switch();
  }
  
 +static inline void tick_nohz_user_enter_prepare(void)
 +{
 +      if (tick_nohz_full_cpu(smp_processor_id()))
 +              rcu_nocb_flush_deferred_wakeup();
 +}
 +
  #endif
diff --combined kernel/sched/core.c
@@@ -84,272 -84,6 +84,272 @@@ unsigned int sysctl_sched_rt_period = 1
  
  __read_mostly int scheduler_running;
  
 +#ifdef CONFIG_SCHED_CORE
 +
 +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
 +
 +/* kernel prio, less is more */
 +static inline int __task_prio(struct task_struct *p)
 +{
 +      if (p->sched_class == &stop_sched_class) /* trumps deadline */
 +              return -2;
 +
 +      if (rt_prio(p->prio)) /* includes deadline */
 +              return p->prio; /* [-1, 99] */
 +
 +      if (p->sched_class == &idle_sched_class)
 +              return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
 +
 +      return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
 +}
 +
 +/*
 + * l(a,b)
 + * le(a,b) := !l(b,a)
 + * g(a,b)  := l(b,a)
 + * ge(a,b) := !l(a,b)
 + */
 +
 +/* real prio, less is less */
 +static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 +{
 +
 +      int pa = __task_prio(a), pb = __task_prio(b);
 +
 +      if (-pa < -pb)
 +              return true;
 +
 +      if (-pb < -pa)
 +              return false;
 +
 +      if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
 +              return !dl_time_before(a->dl.deadline, b->dl.deadline);
 +
 +      if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
 +              return cfs_prio_less(a, b, in_fi);
 +
 +      return false;
 +}
 +
 +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
 +{
 +      if (a->core_cookie < b->core_cookie)
 +              return true;
 +
 +      if (a->core_cookie > b->core_cookie)
 +              return false;
 +
 +      /* flip prio, so high prio is leftmost */
 +      if (prio_less(b, a, task_rq(a)->core->core_forceidle))
 +              return true;
 +
 +      return false;
 +}
 +
 +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
 +
 +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
 +{
 +      return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
 +}
 +
 +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 +{
 +      const struct task_struct *p = __node_2_sc(node);
 +      unsigned long cookie = (unsigned long)key;
 +
 +      if (cookie < p->core_cookie)
 +              return -1;
 +
 +      if (cookie > p->core_cookie)
 +              return 1;
 +
 +      return 0;
 +}
 +
 +void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 +{
 +      rq->core->core_task_seq++;
 +
 +      if (!p->core_cookie)
 +              return;
 +
 +      rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 +}
 +
 +void sched_core_dequeue(struct rq *rq, struct task_struct *p)
 +{
 +      rq->core->core_task_seq++;
 +
 +      if (!sched_core_enqueued(p))
 +              return;
 +
 +      rb_erase(&p->core_node, &rq->core_tree);
 +      RB_CLEAR_NODE(&p->core_node);
 +}
 +
 +/*
 + * Find left-most (aka, highest priority) task matching @cookie.
 + */
 +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
 +{
 +      struct rb_node *node;
 +
 +      node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
 +      /*
 +       * The idle task always matches any cookie!
 +       */
 +      if (!node)
 +              return idle_sched_class.pick_task(rq);
 +
 +      return __node_2_sc(node);
 +}
 +
 +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
 +{
 +      struct rb_node *node = &p->core_node;
 +
 +      node = rb_next(node);
 +      if (!node)
 +              return NULL;
 +
 +      p = container_of(node, struct task_struct, core_node);
 +      if (p->core_cookie != cookie)
 +              return NULL;
 +
 +      return p;
 +}
 +
 +/*
 + * Magic required such that:
 + *
 + *    raw_spin_rq_lock(rq);
 + *    ...
 + *    raw_spin_rq_unlock(rq);
 + *
 + * ends up locking and unlocking the _same_ lock, and all CPUs
 + * always agree on what rq has what lock.
 + *
 + * XXX entirely possible to selectively enable cores, don't bother for now.
 + */
 +
 +static DEFINE_MUTEX(sched_core_mutex);
 +static atomic_t sched_core_count;
 +static struct cpumask sched_core_mask;
 +
 +static void __sched_core_flip(bool enabled)
 +{
 +      int cpu, t, i;
 +
 +      cpus_read_lock();
 +
 +      /*
 +       * Toggle the online cores, one by one.
 +       */
 +      cpumask_copy(&sched_core_mask, cpu_online_mask);
 +      for_each_cpu(cpu, &sched_core_mask) {
 +              const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +
 +              i = 0;
 +              local_irq_disable();
 +              for_each_cpu(t, smt_mask) {
 +                      /* supports up to SMT8 */
 +                      raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 +              }
 +
 +              for_each_cpu(t, smt_mask)
 +                      cpu_rq(t)->core_enabled = enabled;
 +
 +              for_each_cpu(t, smt_mask)
 +                      raw_spin_unlock(&cpu_rq(t)->__lock);
 +              local_irq_enable();
 +
 +              cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
 +      }
 +
 +      /*
 +       * Toggle the offline CPUs.
 +       */
 +      cpumask_copy(&sched_core_mask, cpu_possible_mask);
 +      cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
 +
 +      for_each_cpu(cpu, &sched_core_mask)
 +              cpu_rq(cpu)->core_enabled = enabled;
 +
 +      cpus_read_unlock();
 +}
 +
 +static void sched_core_assert_empty(void)
 +{
 +      int cpu;
 +
 +      for_each_possible_cpu(cpu)
 +              WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
 +}
 +
 +static void __sched_core_enable(void)
 +{
 +      static_branch_enable(&__sched_core_enabled);
 +      /*
 +       * Ensure all previous instances of raw_spin_rq_*lock() have finished
 +       * and future ones will observe !sched_core_disabled().
 +       */
 +      synchronize_rcu();
 +      __sched_core_flip(true);
 +      sched_core_assert_empty();
 +}
 +
 +static void __sched_core_disable(void)
 +{
 +      sched_core_assert_empty();
 +      __sched_core_flip(false);
 +      static_branch_disable(&__sched_core_enabled);
 +}
 +
 +void sched_core_get(void)
 +{
 +      if (atomic_inc_not_zero(&sched_core_count))
 +              return;
 +
 +      mutex_lock(&sched_core_mutex);
 +      if (!atomic_read(&sched_core_count))
 +              __sched_core_enable();
 +
 +      smp_mb__before_atomic();
 +      atomic_inc(&sched_core_count);
 +      mutex_unlock(&sched_core_mutex);
 +}
 +
 +static void __sched_core_put(struct work_struct *work)
 +{
 +      if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
 +              __sched_core_disable();
 +              mutex_unlock(&sched_core_mutex);
 +      }
 +}
 +
 +void sched_core_put(void)
 +{
 +      static DECLARE_WORK(_work, __sched_core_put);
 +
 +      /*
 +       * "There can be only one"
 +       *
 +       * Either this is the last one, or we don't actually need to do any
 +       * 'work'. If it is the last *again*, we rely on
 +       * WORK_STRUCT_PENDING_BIT.
 +       */
 +      if (!atomic_add_unless(&sched_core_count, -1, 1))
 +              schedule_work(&_work);
 +}
 +
 +#else /* !CONFIG_SCHED_CORE */
 +
 +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
 +static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
 +
 +#endif /* CONFIG_SCHED_CORE */
 +
  /*
   * part of the period that we allow rt tasks to run in us.
   * default: 0.95s
@@@ -450,79 -184,6 +450,79 @@@ int sysctl_sched_rt_runtime = 950000
   *
   */
  
 +void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 +{
 +      raw_spinlock_t *lock;
 +
 +      /* Matches synchronize_rcu() in __sched_core_enable() */
 +      preempt_disable();
 +      if (sched_core_disabled()) {
 +              raw_spin_lock_nested(&rq->__lock, subclass);
 +              /* preempt_count *MUST* be > 1 */
 +              preempt_enable_no_resched();
 +              return;
 +      }
 +
 +      for (;;) {
 +              lock = __rq_lockp(rq);
 +              raw_spin_lock_nested(lock, subclass);
 +              if (likely(lock == __rq_lockp(rq))) {
 +                      /* preempt_count *MUST* be > 1 */
 +                      preempt_enable_no_resched();
 +                      return;
 +              }
 +              raw_spin_unlock(lock);
 +      }
 +}
 +
 +bool raw_spin_rq_trylock(struct rq *rq)
 +{
 +      raw_spinlock_t *lock;
 +      bool ret;
 +
 +      /* Matches synchronize_rcu() in __sched_core_enable() */
 +      preempt_disable();
 +      if (sched_core_disabled()) {
 +              ret = raw_spin_trylock(&rq->__lock);
 +              preempt_enable();
 +              return ret;
 +      }
 +
 +      for (;;) {
 +              lock = __rq_lockp(rq);
 +              ret = raw_spin_trylock(lock);
 +              if (!ret || (likely(lock == __rq_lockp(rq)))) {
 +                      preempt_enable();
 +                      return ret;
 +              }
 +              raw_spin_unlock(lock);
 +      }
 +}
 +
 +void raw_spin_rq_unlock(struct rq *rq)
 +{
 +      raw_spin_unlock(rq_lockp(rq));
 +}
 +
 +#ifdef CONFIG_SMP
 +/*
 + * double_rq_lock - safely lock two runqueues
 + */
 +void double_rq_lock(struct rq *rq1, struct rq *rq2)
 +{
 +      lockdep_assert_irqs_disabled();
 +
 +      if (rq_order_less(rq2, rq1))
 +              swap(rq1, rq2);
 +
 +      raw_spin_rq_lock(rq1);
 +      if (__rq_lockp(rq1) == __rq_lockp(rq2))
 +              return;
 +
 +      raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
 +}
 +#endif
 +
  /*
   * __task_rq_lock - lock the rq @p resides on.
   */
@@@ -535,12 -196,12 +535,12 @@@ struct rq *__task_rq_lock(struct task_s
  
        for (;;) {
                rq = task_rq(p);
 -              raw_spin_lock(&rq->lock);
 +              raw_spin_rq_lock(rq);
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
                        return rq;
                }
 -              raw_spin_unlock(&rq->lock);
 +              raw_spin_rq_unlock(rq);
  
                while (unlikely(task_on_rq_migrating(p)))
                        cpu_relax();
@@@ -559,7 -220,7 +559,7 @@@ struct rq *task_rq_lock(struct task_str
        for (;;) {
                raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
                rq = task_rq(p);
 -              raw_spin_lock(&rq->lock);
 +              raw_spin_rq_lock(rq);
                /*
                 *      move_queued_task()              task_rq_lock()
                 *
                        rq_pin_lock(rq, rf);
                        return rq;
                }
 -              raw_spin_unlock(&rq->lock);
 +              raw_spin_rq_unlock(rq);
                raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
  
                while (unlikely(task_on_rq_migrating(p)))
@@@ -651,7 -312,7 +651,7 @@@ void update_rq_clock(struct rq *rq
  {
        s64 delta;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        if (rq->clock_update_flags & RQCF_ACT_SKIP)
                return;
@@@ -924,6 -585,7 +924,6 @@@ void wake_up_q(struct wake_q_head *head
                struct task_struct *task;
  
                task = container_of(node, struct task_struct, wake_q);
 -              BUG_ON(!task);
                /* Task can safely be re-inserted now: */
                node = node->next;
                task->wake_q.next = NULL;
@@@ -949,7 -611,7 +949,7 @@@ void resched_curr(struct rq *rq
        struct task_struct *curr = rq->curr;
        int cpu;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        if (test_tsk_need_resched(curr))
                return;
@@@ -973,10 -635,10 +973,10 @@@ void resched_cpu(int cpu
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
  
 -      raw_spin_lock_irqsave(&rq->lock, flags);
 +      raw_spin_rq_lock_irqsave(rq, flags);
        if (cpu_online(cpu) || cpu == smp_processor_id())
                resched_curr(rq);
 -      raw_spin_unlock_irqrestore(&rq->lock, flags);
 +      raw_spin_rq_unlock_irqrestore(rq, flags);
  }
  
  #ifdef CONFIG_SMP
@@@ -1403,10 -1065,9 +1403,10 @@@ static void uclamp_sync_util_min_rt_def
  static inline struct uclamp_se
  uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
  {
 +      /* Copy by value as we could modify it */
        struct uclamp_se uc_req = p->uclamp_req[clamp_id];
  #ifdef CONFIG_UCLAMP_TASK_GROUP
 -      struct uclamp_se uc_max;
 +      unsigned int tg_min, tg_max, value;
  
        /*
         * Tasks in autogroups or root task group will be
        if (task_group(p) == &root_task_group)
                return uc_req;
  
 -      uc_max = task_group(p)->uclamp[clamp_id];
 -      if (uc_req.value > uc_max.value || !uc_req.user_defined)
 -              return uc_max;
 +      tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
 +      tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
 +      value = uc_req.value;
 +      value = clamp(value, tg_min, tg_max);
 +      uclamp_se_set(&uc_req, value, false);
  #endif
  
        return uc_req;
@@@ -1478,7 -1137,7 +1478,7 @@@ static inline void uclamp_rq_inc_id(str
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
        struct uclamp_bucket *bucket;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        /* Update task effective clamp */
        p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@@ -1518,7 -1177,7 +1518,7 @@@ static inline void uclamp_rq_dec_id(str
        unsigned int bkt_clamp;
        unsigned int rq_clamp;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        /*
         * If sched_uclamp_used was enabled after task @p was enqueued,
@@@ -1620,9 -1279,8 +1620,9 @@@ static inline void uclamp_rq_dec(struc
  }
  
  static inline void
 -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 +uclamp_update_active(struct task_struct *p)
  {
 +      enum uclamp_id clamp_id;
        struct rq_flags rf;
        struct rq *rq;
  
         * affecting a valid clamp bucket, the next time it's enqueued,
         * it will already see the updated clamp bucket value.
         */
 -      if (p->uclamp[clamp_id].active) {
 -              uclamp_rq_dec_id(rq, p, clamp_id);
 -              uclamp_rq_inc_id(rq, p, clamp_id);
 +      for_each_clamp_id(clamp_id) {
 +              if (p->uclamp[clamp_id].active) {
 +                      uclamp_rq_dec_id(rq, p, clamp_id);
 +                      uclamp_rq_inc_id(rq, p, clamp_id);
 +              }
        }
  
        task_rq_unlock(rq, p, &rf);
  
  #ifdef CONFIG_UCLAMP_TASK_GROUP
  static inline void
 -uclamp_update_active_tasks(struct cgroup_subsys_state *css,
 -                         unsigned int clamps)
 +uclamp_update_active_tasks(struct cgroup_subsys_state *css)
  {
 -      enum uclamp_id clamp_id;
        struct css_task_iter it;
        struct task_struct *p;
  
        css_task_iter_start(css, 0, &it);
 -      while ((p = css_task_iter_next(&it))) {
 -              for_each_clamp_id(clamp_id) {
 -                      if ((0x1 << clamp_id) & clamps)
 -                              uclamp_update_active(p, clamp_id);
 -              }
 -      }
 +      while ((p = css_task_iter_next(&it)))
 +              uclamp_update_active(p);
        css_task_iter_end(&it);
  }
  
@@@ -1928,33 -1590,32 +1928,38 @@@ static inline void uclamp_post_fork(str
  static inline void init_uclamp(void) { }
  #endif /* CONFIG_UCLAMP_TASK */
  
+ bool sched_task_on_rq(struct task_struct *p)
+ {
+       return task_on_rq_queued(p);
+ }
+ 
  static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        if (!(flags & ENQUEUE_NOCLOCK))
                update_rq_clock(rq);
  
        if (!(flags & ENQUEUE_RESTORE)) {
 -              sched_info_queued(rq, p);
 +              sched_info_enqueue(rq, p);
                psi_enqueue(p, flags & ENQUEUE_WAKEUP);
        }
  
        uclamp_rq_inc(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 +
 +      if (sched_core_enabled(rq))
 +              sched_core_enqueue(rq, p);
  }
  
  static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
 +      if (sched_core_enabled(rq))
 +              sched_core_dequeue(rq, p);
 +
        if (!(flags & DEQUEUE_NOCLOCK))
                update_rq_clock(rq);
  
        if (!(flags & DEQUEUE_SAVE)) {
 -              sched_info_dequeued(rq, p);
 +              sched_info_dequeue(rq, p);
                psi_dequeue(p, flags & DEQUEUE_SLEEP);
        }
  
@@@ -2194,7 -1855,7 +2199,7 @@@ static inline bool is_cpu_allowed(struc
  static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
                                   struct task_struct *p, int new_cpu)
  {
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        deactivate_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
@@@ -2260,6 -1921,7 +2265,6 @@@ static int migration_cpu_stop(void *dat
        struct migration_arg *arg = data;
        struct set_affinity_pending *pending = arg->pending;
        struct task_struct *p = arg->task;
 -      int dest_cpu = arg->dest_cpu;
        struct rq *rq = this_rq();
        bool complete = false;
        struct rq_flags rf;
                if (pending) {
                        p->migration_pending = NULL;
                        complete = true;
 -              }
  
 -              if (dest_cpu < 0) {
                        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
                                goto out;
 -
 -                      dest_cpu = cpumask_any_distribute(&p->cpus_mask);
                }
  
                if (task_on_rq_queued(p))
 -                      rq = __migrate_task(rq, &rf, p, dest_cpu);
 +                      rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
                else
 -                      p->wake_cpu = dest_cpu;
 +                      p->wake_cpu = arg->dest_cpu;
  
                /*
                 * XXX __migrate_task() can fail, at which point we might end
@@@ -2363,7 -2029,7 +2368,7 @@@ int push_cpu_stop(void *arg
        struct task_struct *p = arg;
  
        raw_spin_lock_irq(&p->pi_lock);
 -      raw_spin_lock(&rq->lock);
 +      raw_spin_rq_lock(rq);
  
        if (task_rq(p) != rq)
                goto out_unlock;
  
  out_unlock:
        rq->push_busy = false;
 -      raw_spin_unlock(&rq->lock);
 +      raw_spin_rq_unlock(rq);
        raw_spin_unlock_irq(&p->pi_lock);
  
        put_task_struct(p);
@@@ -2446,7 -2112,7 +2451,7 @@@ __do_set_cpus_allowed(struct task_struc
                 * Because __kthread_bind() calls this on blocked tasks without
                 * holding rq->lock.
                 */
 -              lockdep_assert_held(&rq->lock);
 +              lockdep_assert_rq_held(rq);
                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        }
        if (running)
@@@ -2588,7 -2254,7 +2593,7 @@@ static int affine_move_task(struct rq *
                        init_completion(&my_pending.done);
                        my_pending.arg = (struct migration_arg) {
                                .task = p,
 -                              .dest_cpu = -1,         /* any */
 +                              .dest_cpu = dest_cpu,
                                .pending = &my_pending,
                        };
  
                } else {
                        pending = p->migration_pending;
                        refcount_inc(&pending->refs);
 +                      /*
 +                       * Affinity has changed, but we've already installed a
 +                       * pending. migration_cpu_stop() *must* see this, else
 +                       * we risk a completion of the pending despite having a
 +                       * task on a disallowed CPU.
 +                       *
 +                       * Serialized by p->pi_lock, so this is safe.
 +                       */
 +                      pending->arg.dest_cpu = dest_cpu;
                }
        }
        pending = p->migration_pending;
                return -EINVAL;
        }
  
 -      if (task_running(rq, p) || p->state == TASK_WAKING) {
 +      if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
                /*
                 * MIGRATE_ENABLE gets here because 'p == current', but for
                 * anything else we cannot do is_migration_disabled(), punt
@@@ -2768,20 -2425,19 +2773,20 @@@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr)
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
 +      unsigned int state = READ_ONCE(p->__state);
 +
        /*
         * We should never call set_task_cpu() on a blocked task,
         * ttwu() will sort out the placement.
         */
 -      WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 -                      !p->on_rq);
 +      WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
  
        /*
         * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
         * because schedstat_wait_{start,end} rebase migrating task's wait_start
         * time relying on p->on_rq.
         */
 -      WARN_ON_ONCE(p->state == TASK_RUNNING &&
 +      WARN_ON_ONCE(state == TASK_RUNNING &&
                     p->sched_class == &fair_sched_class &&
                     (p->on_rq && !task_on_rq_migrating(p)));
  
         * task_rq_lock().
         */
        WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
 -                                    lockdep_is_held(&task_rq(p)->lock)));
 +                                    lockdep_is_held(__rq_lockp(task_rq(p)))));
  #endif
        /*
         * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
@@@ -2953,7 -2609,7 +2958,7 @@@ out
   * smp_call_function() if an IPI is sent by the same process we are
   * waiting to become inactive.
   */
 -unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
  {
        int running, queued;
        struct rq_flags rf;
                 * is actually now running somewhere else!
                 */
                while (task_running(rq, p)) {
 -                      if (match_state && unlikely(p->state != match_state))
 +                      if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
                                return 0;
                        cpu_relax();
                }
                running = task_running(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
 -              if (!match_state || p->state == match_state)
 +              if (!match_state || READ_ONCE(p->__state) == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, p, &rf);
  
@@@ -3305,7 -2961,7 +3310,7 @@@ static void ttwu_do_wakeup(struct rq *r
                           struct rq_flags *rf)
  {
        check_preempt_curr(rq, p, wake_flags);
 -      p->state = TASK_RUNNING;
 +      WRITE_ONCE(p->__state, TASK_RUNNING);
        trace_sched_wakeup(p);
  
  #ifdef CONFIG_SMP
                if (rq->avg_idle > max)
                        rq->avg_idle = max;
  
 +              rq->wake_stamp = jiffies;
 +              rq->wake_avg_idle = rq->avg_idle / 2;
 +
                rq->idle_stamp = 0;
        }
  #endif
@@@ -3342,7 -2995,7 +3347,7 @@@ ttwu_do_activate(struct rq *rq, struct 
  {
        int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        if (p->sched_contributes_to_load)
                rq->nr_uninterruptible--;
@@@ -3697,12 -3350,12 +3702,12 @@@ try_to_wake_up(struct task_struct *p, u
                 *  - we're serialized against set_special_state() by virtue of
                 *    it disabling IRQs (this allows not taking ->pi_lock).
                 */
 -              if (!(p->state & state))
 +              if (!(READ_ONCE(p->__state) & state))
                        goto out;
  
                success = 1;
                trace_sched_waking(p);
 -              p->state = TASK_RUNNING;
 +              WRITE_ONCE(p->__state, TASK_RUNNING);
                trace_sched_wakeup(p);
                goto out;
        }
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        smp_mb__after_spinlock();
 -      if (!(p->state & state))
 +      if (!(READ_ONCE(p->__state) & state))
                goto unlock;
  
        trace_sched_waking(p);
         * TASK_WAKING such that we can unlock p->pi_lock before doing the
         * enqueue, such as ttwu_queue_wakelist().
         */
 -      p->state = TASK_WAKING;
 +      WRITE_ONCE(p->__state, TASK_WAKING);
  
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
@@@ -3874,7 -3527,7 +3879,7 @@@ bool try_invoke_on_locked_down_task(str
                        ret = func(p, arg);
                rq_unlock(rq, &rf);
        } else {
 -              switch (p->state) {
 +              switch (READ_ONCE(p->__state)) {
                case TASK_RUNNING:
                case TASK_WAKING:
                        break;
@@@ -4000,6 -3653,7 +4005,6 @@@ int sysctl_numa_balancing(struct ctl_ta
  #ifdef CONFIG_SCHEDSTATS
  
  DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 -static bool __initdata __sched_schedstats = false;
  
  static void set_schedstats(bool enabled)
  {
@@@ -4023,11 -3677,16 +4028,11 @@@ static int __init setup_schedstats(cha
        if (!str)
                goto out;
  
 -      /*
 -       * This code is called before jump labels have been set up, so we can't
 -       * change the static branch directly just yet.  Instead set a temporary
 -       * variable so init_schedstats() can do it later.
 -       */
        if (!strcmp(str, "enable")) {
 -              __sched_schedstats = true;
 +              set_schedstats(true);
                ret = 1;
        } else if (!strcmp(str, "disable")) {
 -              __sched_schedstats = false;
 +              set_schedstats(false);
                ret = 1;
        }
  out:
  }
  __setup("schedstats=", setup_schedstats);
  
 -static void __init init_schedstats(void)
 -{
 -      set_schedstats(__sched_schedstats);
 -}
 -
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
        return err;
  }
  #endif /* CONFIG_PROC_SYSCTL */
 -#else  /* !CONFIG_SCHEDSTATS */
 -static inline void init_schedstats(void) {}
  #endif /* CONFIG_SCHEDSTATS */
  
  /*
@@@ -4074,7 -3740,7 +4079,7 @@@ int sched_fork(unsigned long clone_flag
         * nobody will actually run it, and a signal or other external
         * event cannot wake it up and insert it on the runqueue either.
         */
 -      p->state = TASK_NEW;
 +      p->__state = TASK_NEW;
  
        /*
         * Make sure we do not leak PI boosting priority to the child.
@@@ -4180,7 -3846,7 +4185,7 @@@ void wake_up_new_task(struct task_struc
        struct rq *rq;
  
        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 -      p->state = TASK_RUNNING;
 +      WRITE_ONCE(p->__state, TASK_RUNNING);
  #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
@@@ -4340,7 -4006,7 +4345,7 @@@ static void do_balance_callbacks(struc
        void (*func)(struct rq *rq);
        struct callback_head *next;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        while (head) {
                func = (void (*)(struct rq *))head->func;
@@@ -4363,7 -4029,7 +4368,7 @@@ static inline struct callback_head *spl
  {
        struct callback_head *head = rq->balance_callback;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
        if (head)
                rq->balance_callback = NULL;
  
@@@ -4380,9 -4046,9 +4385,9 @@@ static inline void balance_callbacks(st
        unsigned long flags;
  
        if (unlikely(head)) {
 -              raw_spin_lock_irqsave(&rq->lock, flags);
 +              raw_spin_rq_lock_irqsave(rq, flags);
                do_balance_callbacks(rq, head);
 -              raw_spin_unlock_irqrestore(&rq->lock, flags);
 +              raw_spin_rq_unlock_irqrestore(rq, flags);
        }
  }
  
@@@ -4413,10 -4079,10 +4418,10 @@@ prepare_lock_switch(struct rq *rq, stru
         * do an early lockdep release here:
         */
        rq_unpin_lock(rq, rf);
 -      spin_release(&rq->lock.dep_map, _THIS_IP_);
 +      spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
  #ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
 -      rq->lock.owner = next;
 +      rq_lockp(rq)->owner = next;
  #endif
  }
  
@@@ -4427,9 -4093,9 +4432,9 @@@ static inline void finish_lock_switch(s
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
 -      spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 +      spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
        __balance_callbacks(rq);
 -      raw_spin_unlock_irq(&rq->lock);
 +      raw_spin_rq_unlock_irq(rq);
  }
  
  /*
@@@ -4542,10 -4208,11 +4547,11 @@@ static struct rq *finish_task_switch(st
         * running on another CPU and we could race with its RUNNING -> DEAD
         * transition, resulting in a double drop.
         */
 -      prev_state = prev->state;
 +      prev_state = READ_ONCE(prev->__state);
        vtime_task_switch(prev);
        perf_event_task_sched_in(prev, current);
        finish_task(prev);
+       tick_nohz_task_switch();
        finish_lock_switch(rq);
        finish_arch_post_lock_switch();
        kcov_finish_switch(current);
                put_task_struct_rcu_user(prev);
        }
  
-       tick_nohz_task_switch();
        return rq;
  }
  
@@@ -4687,9 -4353,9 +4692,9 @@@ context_switch(struct rq *rq, struct ta
   * externally visible scheduler statistics: current number of runnable
   * threads, total number of context switches performed since bootup.
   */
 -unsigned long nr_running(void)
 +unsigned int nr_running(void)
  {
 -      unsigned long i, sum = 0;
 +      unsigned int i, sum = 0;
  
        for_each_online_cpu(i)
                sum += cpu_rq(i)->nr_running;
@@@ -4734,7 -4400,7 +4739,7 @@@ unsigned long long nr_context_switches(
   * it does become runnable.
   */
  
 -unsigned long nr_iowait_cpu(int cpu)
 +unsigned int nr_iowait_cpu(int cpu)
  {
        return atomic_read(&cpu_rq(cpu)->nr_iowait);
  }
   * Task CPU affinities can make all that even more 'interesting'.
   */
  
 -unsigned long nr_iowait(void)
 +unsigned int nr_iowait(void)
  {
 -      unsigned long i, sum = 0;
 +      unsigned int i, sum = 0;
  
        for_each_possible_cpu(i)
                sum += nr_iowait_cpu(i);
@@@ -5236,7 -4902,7 +5241,7 @@@ static inline void schedule_debug(struc
  #endif
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 -      if (!preempt && prev->state && prev->non_block_count) {
 +      if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
                printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
                        prev->comm, prev->pid, prev->non_block_count);
                dump_stack();
@@@ -5282,7 -4948,7 +5287,7 @@@ static void put_prev_task_balance(struc
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
 -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
        const struct sched_class *class;
        struct task_struct *p;
                if (unlikely(p == RETRY_TASK))
                        goto restart;
  
 -              /* Assumes fair_sched_class->next == idle_sched_class */
 +              /* Assume the next prioritized class is idle_sched_class */
                if (!p) {
                        put_prev_task(rq, prev);
                        p = pick_next_task_idle(rq);
@@@ -5322,455 -4988,6 +5327,455 @@@ restart
        BUG();
  }
  
 +#ifdef CONFIG_SCHED_CORE
 +static inline bool is_task_rq_idle(struct task_struct *t)
 +{
 +      return (task_rq(t)->idle == t);
 +}
 +
 +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
 +{
 +      return is_task_rq_idle(a) || (a->core_cookie == cookie);
 +}
 +
 +static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
 +{
 +      if (is_task_rq_idle(a) || is_task_rq_idle(b))
 +              return true;
 +
 +      return a->core_cookie == b->core_cookie;
 +}
 +
 +// XXX fairness/fwd progress conditions
 +/*
 + * Returns
 + * - NULL if there is no runnable task for this class.
 + * - the highest priority task for this runqueue if it matches
 + *   rq->core->core_cookie or its priority is greater than max.
 + * - Else returns idle_task.
 + */
 +static struct task_struct *
 +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
 +{
 +      struct task_struct *class_pick, *cookie_pick;
 +      unsigned long cookie = rq->core->core_cookie;
 +
 +      class_pick = class->pick_task(rq);
 +      if (!class_pick)
 +              return NULL;
 +
 +      if (!cookie) {
 +              /*
 +               * If class_pick is tagged, return it only if it has
 +               * higher priority than max.
 +               */
 +              if (max && class_pick->core_cookie &&
 +                  prio_less(class_pick, max, in_fi))
 +                      return idle_sched_class.pick_task(rq);
 +
 +              return class_pick;
 +      }
 +
 +      /*
 +       * If class_pick is idle or matches cookie, return early.
 +       */
 +      if (cookie_equals(class_pick, cookie))
 +              return class_pick;
 +
 +      cookie_pick = sched_core_find(rq, cookie);
 +
 +      /*
 +       * If class > max && class > cookie, it is the highest priority task on
 +       * the core (so far) and it must be selected, otherwise we must go with
 +       * the cookie pick in order to satisfy the constraint.
 +       */
 +      if (prio_less(cookie_pick, class_pick, in_fi) &&
 +          (!max || prio_less(max, class_pick, in_fi)))
 +              return class_pick;
 +
 +      return cookie_pick;
 +}
 +
 +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
 +
 +static struct task_struct *
 +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 +{
 +      struct task_struct *next, *max = NULL;
 +      const struct sched_class *class;
 +      const struct cpumask *smt_mask;
 +      bool fi_before = false;
 +      int i, j, cpu, occ = 0;
 +      bool need_sync;
 +
 +      if (!sched_core_enabled(rq))
 +              return __pick_next_task(rq, prev, rf);
 +
 +      cpu = cpu_of(rq);
 +
  +      /* Stopper task is switching into idle, no need for core-wide selection. */
 +      if (cpu_is_offline(cpu)) {
 +              /*
 +               * Reset core_pick so that we don't enter the fastpath when
 +               * coming online. core_pick would already be migrated to
 +               * another cpu during offline.
 +               */
 +              rq->core_pick = NULL;
 +              return __pick_next_task(rq, prev, rf);
 +      }
 +
 +      /*
 +       * If there were no {en,de}queues since we picked (IOW, the task
 +       * pointers are all still valid), and we haven't scheduled the last
 +       * pick yet, do so now.
 +       *
 +       * rq->core_pick can be NULL if no selection was made for a CPU because
 +       * it was either offline or went offline during a sibling's core-wide
 +       * selection. In this case, do a core-wide selection.
 +       */
 +      if (rq->core->core_pick_seq == rq->core->core_task_seq &&
 +          rq->core->core_pick_seq != rq->core_sched_seq &&
 +          rq->core_pick) {
 +              WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
 +
 +              next = rq->core_pick;
 +              if (next != prev) {
 +                      put_prev_task(rq, prev);
 +                      set_next_task(rq, next);
 +              }
 +
 +              rq->core_pick = NULL;
 +              return next;
 +      }
 +
 +      put_prev_task_balance(rq, prev, rf);
 +
 +      smt_mask = cpu_smt_mask(cpu);
 +      need_sync = !!rq->core->core_cookie;
 +
 +      /* reset state */
 +      rq->core->core_cookie = 0UL;
 +      if (rq->core->core_forceidle) {
 +              need_sync = true;
 +              fi_before = true;
 +              rq->core->core_forceidle = false;
 +      }
 +
 +      /*
 +       * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
 +       *
 +       * @task_seq guards the task state ({en,de}queues)
 +       * @pick_seq is the @task_seq we did a selection on
 +       * @sched_seq is the @pick_seq we scheduled
 +       *
 +       * However, preemptions can cause multiple picks on the same task set.
 +       * 'Fix' this by also increasing @task_seq for every pick.
 +       */
 +      rq->core->core_task_seq++;
 +
 +      /*
  +       * Optimize for the common case where this CPU has no cookies
 +       * and there are no cookied tasks running on siblings.
 +       */
 +      if (!need_sync) {
 +              for_each_class(class) {
 +                      next = class->pick_task(rq);
 +                      if (next)
 +                              break;
 +              }
 +
 +              if (!next->core_cookie) {
 +                      rq->core_pick = NULL;
 +                      /*
 +                       * For robustness, update the min_vruntime_fi for
 +                       * unconstrained picks as well.
 +                       */
 +                      WARN_ON_ONCE(fi_before);
 +                      task_vruntime_update(rq, next, false);
 +                      goto done;
 +              }
 +      }
 +
 +      for_each_cpu(i, smt_mask) {
 +              struct rq *rq_i = cpu_rq(i);
 +
 +              rq_i->core_pick = NULL;
 +
 +              if (i != cpu)
 +                      update_rq_clock(rq_i);
 +      }
 +
 +      /*
 +       * Try and select tasks for each sibling in descending sched_class
 +       * order.
 +       */
 +      for_each_class(class) {
 +again:
 +              for_each_cpu_wrap(i, smt_mask, cpu) {
 +                      struct rq *rq_i = cpu_rq(i);
 +                      struct task_struct *p;
 +
 +                      if (rq_i->core_pick)
 +                              continue;
 +
 +                      /*
 +                       * If this sibling doesn't yet have a suitable task to
 +                       * run; ask for the most eligible task, given the
 +                       * highest priority task already selected for this
 +                       * core.
 +                       */
 +                      p = pick_task(rq_i, class, max, fi_before);
 +                      if (!p)
 +                              continue;
 +
 +                      if (!is_task_rq_idle(p))
 +                              occ++;
 +
 +                      rq_i->core_pick = p;
 +                      if (rq_i->idle == p && rq_i->nr_running) {
 +                              rq->core->core_forceidle = true;
 +                              if (!fi_before)
 +                                      rq->core->core_forceidle_seq++;
 +                      }
 +
 +                      /*
 +                       * If this new candidate is of higher priority than the
 +                       * previous; and they're incompatible; we need to wipe
 +                       * the slate and start over. pick_task makes sure that
 +                       * p's priority is more than max if it doesn't match
 +                       * max's cookie.
 +                       *
 +                       * NOTE: this is a linear max-filter and is thus bounded
 +                       * in execution time.
 +                       */
 +                      if (!max || !cookie_match(max, p)) {
 +                              struct task_struct *old_max = max;
 +
 +                              rq->core->core_cookie = p->core_cookie;
 +                              max = p;
 +
 +                              if (old_max) {
 +                                      rq->core->core_forceidle = false;
 +                                      for_each_cpu(j, smt_mask) {
 +                                              if (j == i)
 +                                                      continue;
 +
 +                                              cpu_rq(j)->core_pick = NULL;
 +                                      }
 +                                      occ = 1;
 +                                      goto again;
 +                              }
 +                      }
 +              }
 +      }
 +
 +      rq->core->core_pick_seq = rq->core->core_task_seq;
 +      next = rq->core_pick;
 +      rq->core_sched_seq = rq->core->core_pick_seq;
 +
 +      /* Something should have been selected for current CPU */
 +      WARN_ON_ONCE(!next);
 +
 +      /*
 +       * Reschedule siblings
 +       *
 +       * NOTE: L1TF -- at this point we're no longer running the old task and
 +       * sending an IPI (below) ensures the sibling will no longer be running
 +       * their task. This ensures there is no inter-sibling overlap between
 +       * non-matching user state.
 +       */
 +      for_each_cpu(i, smt_mask) {
 +              struct rq *rq_i = cpu_rq(i);
 +
 +              /*
 +               * An online sibling might have gone offline before a task
 +               * could be picked for it, or it might be offline but later
  +               * happen to come online, but it's too late and nothing was
 +               * picked for it.  That's Ok - it will pick tasks for itself,
 +               * so ignore it.
 +               */
 +              if (!rq_i->core_pick)
 +                      continue;
 +
 +              /*
 +               * Update for new !FI->FI transitions, or if continuing to be in !FI:
 +               * fi_before     fi      update?
 +               *  0            0       1
 +               *  0            1       1
 +               *  1            0       1
 +               *  1            1       0
 +               */
 +              if (!(fi_before && rq->core->core_forceidle))
 +                      task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
 +
 +              rq_i->core_pick->core_occupation = occ;
 +
 +              if (i == cpu) {
 +                      rq_i->core_pick = NULL;
 +                      continue;
 +              }
 +
 +              /* Did we break L1TF mitigation requirements? */
 +              WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
 +
 +              if (rq_i->curr == rq_i->core_pick) {
 +                      rq_i->core_pick = NULL;
 +                      continue;
 +              }
 +
 +              resched_curr(rq_i);
 +      }
 +
 +done:
 +      set_next_task(rq, next);
 +      return next;
 +}
 +
 +static bool try_steal_cookie(int this, int that)
 +{
 +      struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
 +      struct task_struct *p;
 +      unsigned long cookie;
 +      bool success = false;
 +
 +      local_irq_disable();
 +      double_rq_lock(dst, src);
 +
 +      cookie = dst->core->core_cookie;
 +      if (!cookie)
 +              goto unlock;
 +
 +      if (dst->curr != dst->idle)
 +              goto unlock;
 +
 +      p = sched_core_find(src, cookie);
 +      if (p == src->idle)
 +              goto unlock;
 +
 +      do {
 +              if (p == src->core_pick || p == src->curr)
 +                      goto next;
 +
 +              if (!cpumask_test_cpu(this, &p->cpus_mask))
 +                      goto next;
 +
 +              if (p->core_occupation > dst->idle->core_occupation)
 +                      goto next;
 +
 +              p->on_rq = TASK_ON_RQ_MIGRATING;
 +              deactivate_task(src, p, 0);
 +              set_task_cpu(p, this);
 +              activate_task(dst, p, 0);
 +              p->on_rq = TASK_ON_RQ_QUEUED;
 +
 +              resched_curr(dst);
 +
 +              success = true;
 +              break;
 +
 +next:
 +              p = sched_core_next(p, cookie);
 +      } while (p);
 +
 +unlock:
 +      double_rq_unlock(dst, src);
 +      local_irq_enable();
 +
 +      return success;
 +}
 +
 +static bool steal_cookie_task(int cpu, struct sched_domain *sd)
 +{
 +      int i;
 +
 +      for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
 +              if (i == cpu)
 +                      continue;
 +
 +              if (need_resched())
 +                      break;
 +
 +              if (try_steal_cookie(cpu, i))
 +                      return true;
 +      }
 +
 +      return false;
 +}
 +
 +static void sched_core_balance(struct rq *rq)
 +{
 +      struct sched_domain *sd;
 +      int cpu = cpu_of(rq);
 +
 +      preempt_disable();
 +      rcu_read_lock();
 +      raw_spin_rq_unlock_irq(rq);
 +      for_each_domain(cpu, sd) {
 +              if (need_resched())
 +                      break;
 +
 +              if (steal_cookie_task(cpu, sd))
 +                      break;
 +      }
 +      raw_spin_rq_lock_irq(rq);
 +      rcu_read_unlock();
 +      preempt_enable();
 +}
 +
 +static DEFINE_PER_CPU(struct callback_head, core_balance_head);
 +
 +void queue_core_balance(struct rq *rq)
 +{
 +      if (!sched_core_enabled(rq))
 +              return;
 +
 +      if (!rq->core->core_cookie)
 +              return;
 +
 +      if (!rq->nr_running) /* not forced idle */
 +              return;
 +
 +      queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
 +}
 +
 +static inline void sched_core_cpu_starting(unsigned int cpu)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      struct rq *rq, *core_rq = NULL;
 +      int i;
 +
 +      core_rq = cpu_rq(cpu)->core;
 +
 +      if (!core_rq) {
 +              for_each_cpu(i, smt_mask) {
 +                      rq = cpu_rq(i);
 +                      if (rq->core && rq->core == rq)
 +                              core_rq = rq;
 +              }
 +
 +              if (!core_rq)
 +                      core_rq = cpu_rq(cpu);
 +
 +              for_each_cpu(i, smt_mask) {
 +                      rq = cpu_rq(i);
 +
 +                      WARN_ON_ONCE(rq->core && rq->core != core_rq);
 +                      rq->core = core_rq;
 +              }
 +      }
 +}
 +#else /* !CONFIG_SCHED_CORE */
 +
 +static inline void sched_core_cpu_starting(unsigned int cpu) {}
 +
 +static struct task_struct *
 +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 +{
 +      return __pick_next_task(rq, prev, rf);
 +}
 +
 +#endif /* CONFIG_SCHED_CORE */
 +
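A userspace usage sketch, not taken from this merge: the core-scheduling machinery above keys everything off a per-task cookie, and the PR_SCHED_CORE prctl() interface that accompanies CONFIG_SCHED_CORE lets a task create one so SMT siblings only co-run tasks sharing that cookie. Treat the snippet as an assumption about the uapi side (it is not shown in this diff); the constants are spelled out locally in case <sys/prctl.h> on the build host predates them.

/* Hedged sketch: give the calling thread a unique core-scheduling cookie. */
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SCHED_CORE
#define PR_SCHED_CORE           62
#define PR_SCHED_CORE_CREATE    1
#endif

int main(void)
{
        /* arg3 == 0: act on the caller; arg4 == 0: thread (PIDTYPE_PID) scope */
        if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0)) {
                perror("PR_SCHED_CORE_CREATE");
                return 1;
        }
        printf("core scheduling cookie created for pid %d\n", getpid());
        return 0;
}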
  /*
   * __schedule() is the main scheduler function.
   *
@@@ -5862,10 -5079,10 +5867,10 @@@ static void __sched notrace __schedule(
         *  - we form a control dependency vs deactivate_task() below.
         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
         */
 -      prev_state = prev->state;
 +      prev_state = READ_ONCE(prev->__state);
        if (!preempt && prev_state) {
                if (signal_pending_state(prev_state, prev)) {
 -                      prev->state = TASK_RUNNING;
 +                      WRITE_ONCE(prev->__state, TASK_RUNNING);
                } else {
                        prev->sched_contributes_to_load =
                                (prev_state & TASK_UNINTERRUPTIBLE) &&
  
                rq_unpin_lock(rq, &rf);
                __balance_callbacks(rq);
 -              raw_spin_unlock_irq(&rq->lock);
 +              raw_spin_rq_unlock_irq(rq);
        }
  }
  
@@@ -5962,7 -5179,7 +5967,7 @@@ static inline void sched_submit_work(st
  {
        unsigned int task_flags;
  
 -      if (!tsk->state)
 +      if (task_is_running(tsk))
                return;
  
        task_flags = tsk->flags;
@@@ -6037,7 -5254,7 +6042,7 @@@ void __sched schedule_idle(void
         * current task can be in any other state. Note, idle is always in the
         * TASK_RUNNING state.
         */
 -      WARN_ON_ONCE(current->state);
 +      WARN_ON_ONCE(current->__state);
        do {
                __schedule(false);
        } while (need_resched());
@@@ -6480,7 -5697,7 +6485,7 @@@ out_unlock
  
        rq_unpin_lock(rq, &rf);
        __balance_callbacks(rq);
 -      raw_spin_unlock(&rq->lock);
 +      raw_spin_rq_unlock(rq);
  
        preempt_enable();
  }
@@@ -7177,6 -6394,7 +7182,6 @@@ int sched_setattr_nocheck(struct task_s
  {
        return __sched_setscheduler(p, attr, false, true);
  }
 -EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
  
  /**
   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@@ -7936,7 -7154,7 +7941,7 @@@ again
        if (curr->sched_class != p->sched_class)
                goto out_unlock;
  
 -      if (task_running(p_rq, p) || p->state)
 +      if (task_running(p_rq, p) || !task_is_running(p))
                goto out_unlock;
  
        yielded = curr->sched_class->yield_to_task(rq, p);
@@@ -8139,7 -7357,7 +8144,7 @@@ void sched_show_task(struct task_struc
  
        pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
  
 -      if (p->state == TASK_RUNNING)
 +      if (task_is_running(p))
                pr_cont("  running task    ");
  #ifdef CONFIG_DEBUG_STACK_USAGE
        free = stack_not_used(p);
@@@ -8163,28 -7381,26 +8168,28 @@@ EXPORT_SYMBOL_GPL(sched_show_task)
  static inline bool
  state_filter_match(unsigned long state_filter, struct task_struct *p)
  {
 +      unsigned int state = READ_ONCE(p->__state);
 +
        /* no filter, everything matches */
        if (!state_filter)
                return true;
  
        /* filter, but doesn't match */
 -      if (!(p->state & state_filter))
 +      if (!(state & state_filter))
                return false;
  
        /*
         * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
         * TASK_KILLABLE).
         */
 -      if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
 +      if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
                return false;
  
        return true;
  }
  
  
 -void show_state_filter(unsigned long state_filter)
 +void show_state_filter(unsigned int state_filter)
  {
        struct task_struct *g, *p;
  
   * NOTE: this function does not set the idle thread's NEED_RESCHED
   * flag, to make booting more robust.
   */
 -void init_idle(struct task_struct *idle, int cpu)
 +void __init init_idle(struct task_struct *idle, int cpu)
  {
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
  
        __sched_fork(0, idle);
  
 +      /*
 +       * The idle task doesn't need the kthread struct to function, but it
 +       * is dressed up as a per-CPU kthread and thus needs to play the part
 +       * if we want to avoid special-casing it in code that deals with per-CPU
 +       * kthreads.
 +       */
 +      set_kthread_struct(idle);
 +
        raw_spin_lock_irqsave(&idle->pi_lock, flags);
 -      raw_spin_lock(&rq->lock);
 +      raw_spin_rq_lock(rq);
  
 -      idle->state = TASK_RUNNING;
 +      idle->__state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 -      idle->flags |= PF_IDLE;
 +      /*
 +       * PF_KTHREAD should already be set at this point; regardless, make it
 +       * look like a proper per-CPU kthread.
 +       */
 +      idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
 +      kthread_set_per_cpu(idle, cpu);
  
        scs_task_reset(idle);
        kasan_unpoison_task_stack(idle);
  #ifdef CONFIG_SMP
        idle->on_cpu = 1;
  #endif
 -      raw_spin_unlock(&rq->lock);
 +      raw_spin_rq_unlock(rq);
        raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
  
        /* Set the preempt count _outside_ the spinlocks! */
@@@ -8448,7 -7651,7 +8453,7 @@@ static void balance_push(struct rq *rq
  {
        struct task_struct *push_task = rq->curr;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
        SCHED_WARN_ON(rq->cpu != smp_processor_id());
  
        /*
        /*
         * Both the cpu-hotplug and stop task are in this case and are
         * required to complete the hotplug process.
 -       *
 -       * XXX: the idle task does not match kthread_is_per_cpu() due to
 -       * histerical raisins.
         */
 -      if (rq->idle == push_task ||
 -          kthread_is_per_cpu(push_task) ||
 +      if (kthread_is_per_cpu(push_task) ||
            is_migration_disabled(push_task)) {
  
                /*
                 */
                if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
                    rcuwait_active(&rq->hotplug_wait)) {
 -                      raw_spin_unlock(&rq->lock);
 +                      raw_spin_rq_unlock(rq);
                        rcuwait_wake_up(&rq->hotplug_wait);
 -                      raw_spin_lock(&rq->lock);
 +                      raw_spin_rq_lock(rq);
                }
                return;
        }
         * Temporarily drop rq->lock such that we can wake-up the stop task.
         * Both preemption and IRQs are still disabled.
         */
 -      raw_spin_unlock(&rq->lock);
 +      raw_spin_rq_unlock(rq);
        stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
                            this_cpu_ptr(&push_work));
        /*
         * schedule(). The next pick is obviously going to be the stop task
         * which is a per-CPU kthread (kthread_is_per_cpu()) and will push this task away.
         */
 -      raw_spin_lock(&rq->lock);
 +      raw_spin_rq_lock(rq);
  }
  
  static void balance_push_set(int cpu, bool on)
@@@ -8746,7 -7953,6 +8751,7 @@@ static void sched_rq_cpu_starting(unsig
  
  int sched_cpu_starting(unsigned int cpu)
  {
 +      sched_core_cpu_starting(cpu);
        sched_rq_cpu_starting(cpu);
        sched_tick_start(cpu);
        return 0;
@@@ -8793,7 -7999,7 +8798,7 @@@ static void dump_rq_tasks(struct rq *rq
        struct task_struct *g, *p;
        int cpu = cpu_of(rq);
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
        for_each_process_thread(g, p) {
@@@ -8845,7 -8051,6 +8850,7 @@@ void __init sched_init_smp(void
        /* Move init over to a non-isolated CPU */
        if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
                BUG();
 +      current->flags &= ~PF_NO_SETAFFINITY;
        sched_init_granularity();
  
        init_sched_rt_class();
@@@ -8967,7 -8172,7 +8972,7 @@@ void __init sched_init(void
                struct rq *rq;
  
                rq = cpu_rq(i);
 -              raw_spin_lock_init(&rq->lock);
 +              raw_spin_lock_init(&rq->__lock);
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
 +              rq->wake_stamp = jiffies;
 +              rq->wake_avg_idle = rq->avg_idle;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
  
                INIT_LIST_HEAD(&rq->cfs_tasks);
  #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
                atomic_set(&rq->nr_iowait, 0);
 +
 +#ifdef CONFIG_SCHED_CORE
 +              rq->core = NULL;
 +              rq->core_pick = NULL;
 +              rq->core_enabled = 0;
 +              rq->core_tree = RB_ROOT;
 +              rq->core_forceidle = false;
 +
 +              rq->core_cookie = 0UL;
 +#endif
        }
  
        set_load_weight(&init_task, false);
  #endif
        init_sched_fair_class();
  
 -      init_schedstats();
 -
        psi_init();
  
        init_uclamp();
@@@ -9087,15 -8282,15 +9092,15 @@@ static inline int preempt_count_equals(
  
  void __might_sleep(const char *file, int line, int preempt_offset)
  {
 +      unsigned int state = get_current_state();
        /*
         * Blocking primitives will set (and therefore destroy) current->state,
         * since we will exit with TASK_RUNNING, make sure we enter with it,
         * otherwise we will destroy state.
         */
 -      WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
 +      WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
                        "do not call blocking ops when !TASK_RUNNING; "
 -                      "state=%lx set at [<%p>] %pS\n",
 -                      current->state,
 +                      "state=%x set at [<%p>] %pS\n", state,
                        (void *)current->task_state_change,
                        (void *)current->task_state_change);
  
@@@ -9491,11 -8686,7 +9496,11 @@@ static int cpu_cgroup_css_online(struc
  
  #ifdef CONFIG_UCLAMP_TASK_GROUP
        /* Propagate the effective uclamp value for the new group */
 +      mutex_lock(&uclamp_mutex);
 +      rcu_read_lock();
        cpu_util_update_eff(css);
 +      rcu_read_unlock();
 +      mutex_unlock(&uclamp_mutex);
  #endif
  
        return 0;
@@@ -9556,7 -8747,7 +9561,7 @@@ static int cpu_cgroup_can_attach(struc
                 * has happened. This would lead to problems with PELT, due to
                 * move wanting to detach+attach while we're not attached yet.
                 */
 -              if (task->state == TASK_NEW)
 +              if (READ_ONCE(task->__state) == TASK_NEW)
                        ret = -EINVAL;
                raw_spin_unlock_irq(&task->pi_lock);
  
@@@ -9585,9 -8776,6 +9590,9 @@@ static void cpu_util_update_eff(struct 
        enum uclamp_id clamp_id;
        unsigned int clamps;
  
 +      lockdep_assert_held(&uclamp_mutex);
 +      SCHED_WARN_ON(!rcu_read_lock_held());
 +
        css_for_each_descendant_pre(css, top_css) {
                uc_parent = css_tg(css)->parent
                        ? css_tg(css)->parent->uclamp : NULL;
                }
  
                /* Immediately update descendants RUNNABLE tasks */
 -              uclamp_update_active_tasks(css, clamps);
 +              uclamp_update_active_tasks(css);
        }
  }
  
@@@ -9779,8 -8967,7 +9784,8 @@@ static const u64 max_cfs_runtime = MAX_
  
  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
  
 -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 +                              u64 burst)
  {
        int i, ret = 0, runtime_enabled, runtime_was_enabled;
        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
        if (quota != RUNTIME_INF && quota > max_cfs_runtime)
                return -EINVAL;
  
 +      if (quota != RUNTIME_INF && (burst > quota ||
 +                                   burst + quota > max_cfs_runtime))
 +              return -EINVAL;
 +
        /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
        raw_spin_lock_irq(&cfs_b->lock);
        cfs_b->period = ns_to_ktime(period);
        cfs_b->quota = quota;
 +      cfs_b->burst = burst;
  
        __refill_cfs_bandwidth_runtime(cfs_b);
  
@@@ -9869,10 -9051,9 +9874,10 @@@ out_unlock
  
  static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
  {
 -      u64 quota, period;
 +      u64 quota, period, burst;
  
        period = ktime_to_ns(tg->cfs_bandwidth.period);
 +      burst = tg->cfs_bandwidth.burst;
        if (cfs_quota_us < 0)
                quota = RUNTIME_INF;
        else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
        else
                return -EINVAL;
  
 -      return tg_set_cfs_bandwidth(tg, period, quota);
 +      return tg_set_cfs_bandwidth(tg, period, quota, burst);
  }
  
  static long tg_get_cfs_quota(struct task_group *tg)
  
  static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
  {
 -      u64 quota, period;
 +      u64 quota, period, burst;
  
        if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
                return -EINVAL;
  
        period = (u64)cfs_period_us * NSEC_PER_USEC;
        quota = tg->cfs_bandwidth.quota;
 +      burst = tg->cfs_bandwidth.burst;
  
 -      return tg_set_cfs_bandwidth(tg, period, quota);
 +      return tg_set_cfs_bandwidth(tg, period, quota, burst);
  }
  
  static long tg_get_cfs_period(struct task_group *tg)
        return cfs_period_us;
  }
  
 +static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
 +{
 +      u64 quota, period, burst;
 +
 +      if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
 +              return -EINVAL;
 +
 +      burst = (u64)cfs_burst_us * NSEC_PER_USEC;
 +      period = ktime_to_ns(tg->cfs_bandwidth.period);
 +      quota = tg->cfs_bandwidth.quota;
 +
 +      return tg_set_cfs_bandwidth(tg, period, quota, burst);
 +}
 +
 +static long tg_get_cfs_burst(struct task_group *tg)
 +{
 +      u64 burst_us;
 +
 +      burst_us = tg->cfs_bandwidth.burst;
 +      do_div(burst_us, NSEC_PER_USEC);
 +
 +      return burst_us;
 +}
 +
  static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
                                  struct cftype *cft)
  {
@@@ -9968,18 -9124,6 +9973,18 @@@ static int cpu_cfs_period_write_u64(str
        return tg_set_cfs_period(css_tg(css), cfs_period_us);
  }
  
 +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
 +                                struct cftype *cft)
 +{
 +      return tg_get_cfs_burst(css_tg(css));
 +}
 +
 +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
 +                                 struct cftype *cftype, u64 cfs_burst_us)
 +{
 +      return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
 +}
 +
  struct cfs_schedulable_data {
        struct task_group *tg;
        u64 period, quota;
@@@ -10133,11 -9277,6 +10138,11 @@@ static struct cftype cpu_legacy_files[
                .write_u64 = cpu_cfs_period_write_u64,
        },
        {
 +              .name = "cfs_burst_us",
 +              .read_u64 = cpu_cfs_burst_read_u64,
 +              .write_u64 = cpu_cfs_burst_write_u64,
 +      },
 +      {
                .name = "stat",
                .seq_show = cpu_cfs_stat_show,
        },
@@@ -10302,13 -9441,12 +10307,13 @@@ static ssize_t cpu_max_write(struct ker
  {
        struct task_group *tg = css_tg(of_css(of));
        u64 period = tg_get_cfs_period(tg);
 +      u64 burst = tg_get_cfs_burst(tg);
        u64 quota;
        int ret;
  
        ret = cpu_period_quota_parse(buf, &period, &quota);
        if (!ret)
 -              ret = tg_set_cfs_bandwidth(tg, period, quota);
 +              ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
        return ret ?: nbytes;
  }
  #endif
@@@ -10335,12 -9473,6 +10340,12 @@@ static struct cftype cpu_files[] = 
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
 +      {
 +              .name = "max.burst",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .read_u64 = cpu_cfs_burst_read_u64,
 +              .write_u64 = cpu_cfs_burst_write_u64,
 +      },
  #endif
  #ifdef CONFIG_UCLAMP_TASK_GROUP
        {
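Usage note, hedged: the burst knob added above is exposed as cpu.cfs_burst_us on the legacy (cgroup v1) hierarchy and as cpu.max.burst on cgroup v2, and tg_set_cfs_bandwidth() rejects a burst larger than the quota. A minimal sketch of configuring it from C follows; the mount point and group name (/sys/fs/cgroup/demo) are assumptions made only for illustration.

/* Hedged sketch: request a 20ms CFS burst for an assumed cgroup v2 group.
 * Only the file name cpu.max.burst comes from the diff above; the path
 * is made up for the example.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/fs/cgroup/demo/cpu.max.burst", "w");

        if (!f) {
                perror("cpu.max.burst");
                return 1;
        }
        /* value in microseconds; must not exceed the group's CPU quota */
        fprintf(f, "20000\n");
        return fclose(f) ? 1 : 0;
}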
diff --combined kernel/time/tick-sched.c
@@@ -230,7 -230,6 +230,7 @@@ static void tick_sched_handle(struct ti
  
  #ifdef CONFIG_NO_HZ_FULL
  cpumask_var_t tick_nohz_full_mask;
 +EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
  bool tick_nohz_full_running;
  EXPORT_SYMBOL_GPL(tick_nohz_full_running);
  static atomic_t tick_dep_mask;
@@@ -323,6 -322,46 +323,46 @@@ void tick_nohz_full_kick_cpu(int cpu
        irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
  }
  
+ static void tick_nohz_kick_task(struct task_struct *tsk)
+ {
+       int cpu;
+
+       /*
+        * If the task is not running, run_posix_cpu_timers()
+        * has nothing to elapse, so the IPI can be spared.
+        *
+        * activate_task()                      STORE p->tick_dep_mask
+        *   STORE p->on_rq
+        * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
+        *   LOCK rq->lock                      LOAD p->on_rq
+        *   smp_mb__after_spin_lock()
+        *   tick_nohz_task_switch()
+        *     LOAD p->tick_dep_mask
+        */
+       if (!sched_task_on_rq(tsk))
+               return;
+
+       /*
+        * If the task concurrently migrates to another CPU,
+        * we guarantee it sees the new tick dependency upon
+        * schedule.
+        *
+        * set_task_cpu(p, cpu);
+        *   STORE p->cpu = @cpu
+        * __schedule() (switch to task 'p')
+        *   LOCK rq->lock
+        *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
+        *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
+        *      LOAD p->tick_dep_mask           LOAD p->cpu
+        */
+       cpu = task_cpu(tsk);
+
+       preempt_disable();
+       if (cpu_online(cpu))
+               tick_nohz_full_kick_cpu(cpu);
+       preempt_enable();
+ }
+
  /*
   * Kick all full dynticks CPUs in order to force these to re-evaluate
   * their dependency on the tick and restart it if necessary.
@@@ -405,19 -444,8 +445,8 @@@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_c
   */
  void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
  {
-       if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
-               if (tsk == current) {
-                       preempt_disable();
-                       tick_nohz_full_kick();
-                       preempt_enable();
-               } else {
-                       /*
-                        * Some future tick_nohz_full_kick_task()
-                        * should optimize this.
-                        */
-                       tick_nohz_full_kick_all();
-               }
-       }
+       if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+               tick_nohz_kick_task(tsk);
  }
  EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
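To illustrate how a caller reaches the reworked kick path, here is a hedged, out-of-tree sketch: only tick_dep_set_task()/tick_dep_clear_task() and the export above come from the kernel; the module name, parameter, and the choice of TICK_DEP_BIT_PERF_EVENTS are assumptions made for the example. With this merge, setting the dependency IPIs at most the one CPU where the target task is queued instead of kicking every nohz_full CPU.

/* Hedged sketch of an out-of-tree caller: pin the tick on for one task. */
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/tick.h>

static int target_pid;
module_param(target_pid, int, 0444);

static struct task_struct *target;

static int __init tickdep_demo_init(void)
{
        struct pid *pid = find_get_pid(target_pid);

        if (!pid)
                return -ESRCH;
        target = get_pid_task(pid, PIDTYPE_PID);
        put_pid(pid);
        if (!target)
                return -ESRCH;

        /* Only the CPU where @target is currently queued gets an IPI. */
        tick_dep_set_task(target, TICK_DEP_BIT_PERF_EVENTS);
        return 0;
}

static void __exit tickdep_demo_exit(void)
{
        tick_dep_clear_task(target, TICK_DEP_BIT_PERF_EVENTS);
        put_task_struct(target);
}

module_init(tickdep_demo_init);
module_exit(tickdep_demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Illustrative tick dependency demo");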
  
@@@ -431,9 -459,20 +460,20 @@@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_t
   * Set a per-taskgroup tick dependency. POSIX CPU timers need this in order to elapse
   * per-process timers.
   */
- void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+ void tick_nohz_dep_set_signal(struct task_struct *tsk,
+                             enum tick_dep_bits bit)
  {
-       tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+       int prev;
+       struct signal_struct *sig = tsk->signal;
+
+       prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+       if (!prev) {
+               struct task_struct *t;
+
+               lockdep_assert_held(&tsk->sighand->siglock);
+               __for_each_thread(sig, t)
+                       tick_nohz_kick_task(t);
+       }
  }
  
  void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
   */
  void __tick_nohz_task_switch(void)
  {
-       unsigned long flags;
        struct tick_sched *ts;
  
-       local_irq_save(flags);
        if (!tick_nohz_full_cpu(smp_processor_id()))
-               goto out;
+               return;
  
        ts = this_cpu_ptr(&tick_cpu_sched);
  
                    atomic_read(&current->signal->tick_dep_mask))
                        tick_nohz_full_kick();
        }
- out:
-       local_irq_restore(flags);
  }
  
  /* Get the boot-time nohz CPU list from the kernel parameters. */
@@@ -922,27 -956,31 +957,31 @@@ static void tick_nohz_restart_sched_tic
         * Cancel the scheduled timer and restore the tick
         */
        ts->tick_stopped  = 0;
-       ts->idle_exittime = now;
        tick_nohz_restart(ts, now);
  }
  
- static void tick_nohz_full_update_tick(struct tick_sched *ts)
+ static void __tick_nohz_full_update_tick(struct tick_sched *ts,
+                                        ktime_t now)
  {
  #ifdef CONFIG_NO_HZ_FULL
        int cpu = smp_processor_id();
  
-       if (!tick_nohz_full_cpu(cpu))
+       if (can_stop_full_tick(cpu, ts))
+               tick_nohz_stop_sched_tick(ts, cpu);
+       else if (ts->tick_stopped)
+               tick_nohz_restart_sched_tick(ts, now);
+ #endif
+ }
+
+ static void tick_nohz_full_update_tick(struct tick_sched *ts)
+ {
+       if (!tick_nohz_full_cpu(smp_processor_id()))
                return;
  
        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
                return;
  
-       if (can_stop_full_tick(cpu, ts))
-               tick_nohz_stop_sched_tick(ts, cpu);
-       else if (ts->tick_stopped)
-               tick_nohz_restart_sched_tick(ts, ktime_get());
- #endif
+       __tick_nohz_full_update_tick(ts, ktime_get());
  }
  
  static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
@@@ -1189,11 -1227,13 +1228,13 @@@ unsigned long tick_nohz_get_idle_calls(
        return ts->idle_calls;
  }
  
- static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+ static void tick_nohz_account_idle_time(struct tick_sched *ts,
+                                       ktime_t now)
  {
- #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        unsigned long ticks;
  
+       ts->idle_exittime = now;
+
        if (vtime_accounting_enabled_this_cpu())
                return;
        /*
         */
        if (ticks && ticks < LONG_MAX)
                account_idle_ticks(ticks);
- #endif
  }
  
- static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+ void tick_nohz_idle_restart_tick(void)
  {
-       tick_nohz_restart_sched_tick(ts, now);
-       tick_nohz_account_idle_ticks(ts);
+       struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+       if (ts->tick_stopped) {
+               ktime_t now = ktime_get();
+
+               tick_nohz_restart_sched_tick(ts, now);
+               tick_nohz_account_idle_time(ts, now);
+       }
  }
  
- void tick_nohz_idle_restart_tick(void)
+ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
  {
-       struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+       if (tick_nohz_full_cpu(smp_processor_id()))
+               __tick_nohz_full_update_tick(ts, now);
+       else
+               tick_nohz_restart_sched_tick(ts, now);
  
-       if (ts->tick_stopped)
-               __tick_nohz_idle_restart_tick(ts, ktime_get());
+       tick_nohz_account_idle_time(ts, now);
  }
  
  /**
@@@ -1253,7 -1299,7 +1300,7 @@@ void tick_nohz_idle_exit(void
                tick_nohz_stop_idle(ts, now);
  
        if (tick_stopped)
-               __tick_nohz_idle_restart_tick(ts, now);
+               tick_nohz_idle_update_tick(ts, now);
  
        local_irq_enable();
  }