Merge branch 'for-4.14/block' of git://git.kernel.dk/linux-block
author    Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
Pull block layer updates from Jens Axboe:
 "This is the first pull request for 4.14, containing most of the code
  changes. It's a quiet series this round, which I think we needed after
  the churn of the last few series. This contains:

   - Fix for a registration race in loop, from Anton Volkov.

   - Overflow complaint fix from Arnd for DAC960.

   - Series of drbd changes from the usual suspects.

   - Conversion of the stec/skd driver to blk-mq. From Bart.

   - A few BFQ improvements/fixes from Paolo.

   - CFQ improvement from Ritesh, allowing idling for group idle.

   - A few fixes found by Dan's smatch, courtesy of Dan.

   - A warning fixup for a race between changing the IO scheduler and
     device removal. From David Jeffery.

   - A few nbd fixes from Josef.

   - Support for cgroup info in blktrace, from Shaohua.

   - Also from Shaohua, new features in the null_blk driver to allow it
     to actually hold data, among other things.

   - Various corner cases and error handling fixes from Weiping Zhang.

   - Improvements to the IO stats tracking for blk-mq from me. Can
     drastically improve performance for fast devices and/or big
     machines.

   - Series from Christoph removing bi_bdev as being needed for IO
     submission, in preparation for nvme multipathing code.

   - Series from Bart, including various cleanups and fixes for switch
     fall through case complaints"
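
The "IO stats tracking" item above replaces the per-partition atomic in_flight
counters, which blk-mq no longer touches on every request, with on-demand
counting over the busy tags. A minimal caller sketch, mirroring the new
part_in_flight() helper as used by diskstats_show() in the block/genhd.c hunk
below (gp/hd being the gendisk and partition that are being reported):

    unsigned int inflight[2];

    /*
     * inflight[0] counts requests in flight against the partition asked
     * about; inflight[1] counts requests against the whole device when
     * that part is a partition. For blk-mq queues this walks the busy
     * tags via blk_mq_in_flight(); legacy queues still read the atomics.
     */
    part_in_flight(gp->queue, hd, inflight);
    seq_printf(seqf, " ... %u ...\n", inflight[0]);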
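
The bi_bdev series mentioned above changes how a bio names its target device:
rather than carrying a struct block_device and reaching the disk through
bio->bi_bdev->bd_disk, a bio now carries the gendisk (plus a partition index)
directly. A rough before/after sketch of the conversion, as it shows up in the
brd, axonram and bio-integrity hunks below:

    /* 4.13 and earlier: dereference the block_device on every submission */
    struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
    struct request_queue *q = bdev_get_queue(bio->bi_bdev);

    /* 4.14: the bio points at the gendisk directly */
    struct brd_device *brd = bio->bi_disk->private_data;
    struct request_queue *q = bio->bi_disk->queue;

Dropping the block_device dependence from the submission path is what the
upcoming nvme multipathing code builds on.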

* 'for-4.14/block' of git://git.kernel.dk/linux-block: (162 commits)
  kernfs: checking for IS_ERR() instead of NULL
  drbd: remove BIOSET_NEED_RESCUER flag from drbd_{md_,}io_bio_set
  drbd: Fix allyesconfig build, fix recent commit
  drbd: switch from kmalloc() to kmalloc_array()
  drbd: abort drbd_start_resync if there is no connection
  drbd: move global variables to drbd namespace and make some static
  drbd: rename "usermode_helper" to "drbd_usermode_helper"
  drbd: fix race between handshake and admin disconnect/down
  drbd: fix potential deadlock when trying to detach during handshake
  drbd: A single dot should be put into a sequence.
  drbd: fix rmmod cleanup, remove _all_ debugfs entries
  drbd: Use setup_timer() instead of init_timer() to simplify the code.
  drbd: fix potential get_ldev/put_ldev refcount imbalance during attach
  drbd: new disk-option disable-write-same
  drbd: Fix resource role for newly created resources in events2
  drbd: mark symbols static where possible
  drbd: Send P_NEG_ACK upon write error in protocol != C
  drbd: add explicit plugging when submitting batches
  drbd: change list_for_each_safe to while(list_first_entry_or_null)
  drbd: introduce drbd_recv_header_maybe_unplug
  ...

40 files changed:
MAINTAINERS
arch/powerpc/sysdev/axonram.c
block/bfq-iosched.h
block/bio-integrity.c
block/blk-mq-debugfs.c
block/blk-mq.c
block/blk-throttle.c
block/genhd.c
drivers/block/Kconfig
drivers/block/brd.c
drivers/block/loop.c
drivers/block/null_blk.c
drivers/block/virtio_blk.c
drivers/block/xen-blkback/xenbus.c
drivers/block/xen-blkfront.c
drivers/block/zram/zram_drv.c
drivers/md/dm-crypt.c
drivers/md/dm-mpath.c
drivers/md/dm.c
drivers/md/md.c
drivers/md/raid5-cache.c
drivers/nvme/host/core.c
drivers/nvme/host/rdma.c
fs/btrfs/disk-io.c
fs/btrfs/raid56.c
fs/btrfs/volumes.c
fs/buffer.c
fs/gfs2/lops.c
fs/gfs2/meta_io.c
fs/gfs2/ops_fstype.c
fs/iomap.c
fs/kernfs/file.c
fs/ocfs2/cluster/heartbeat.c
fs/xfs/xfs_aops.c
include/linux/bio.h
include/linux/blkdev.h
include/linux/cgroup.h
include/linux/fs.h
kernel/cgroup/cgroup.c
mm/page_io.c

diff --combined MAINTAINERS
@@@ -301,7 -301,6 +301,7 @@@ S: Supporte
  F:    drivers/acpi/
  F:    drivers/pnp/pnpacpi/
  F:    include/linux/acpi.h
 +F:    include/linux/fwnode.h
  F:    include/acpi/
  F:    Documentation/acpi/
  F:    Documentation/ABI/testing/sysfs-bus-acpi
@@@ -311,14 -310,6 +311,14 @@@ F:       drivers/pci/*/*acpi
  F:    drivers/pci/*/*/*acpi*
  F:    tools/power/acpi/
  
 +ACPI APEI
 +M:    "Rafael J. Wysocki" <rjw@rjwysocki.net>
 +M:    Len Brown <lenb@kernel.org>
 +L:    linux-acpi@vger.kernel.org
 +R:    Tony Luck <tony.luck@intel.com>
 +R:    Borislav Petkov <bp@alien8.de>
 +F:    drivers/acpi/apei/
 +
  ACPI COMPONENT ARCHITECTURE (ACPICA)
  M:    Robert Moore <robert.moore@intel.com>
  M:    Lv Zheng <lv.zheng@intel.com>
@@@ -1162,7 -1153,6 +1162,7 @@@ L:      linux-arm-kernel@axis.co
  F:    arch/arm/mach-artpec
  F:    arch/arm/boot/dts/artpec6*
  F:    drivers/clk/axis
 +F:    drivers/crypto/axis
  F:    drivers/pinctrl/pinctrl-artpec*
  F:    Documentation/devicetree/bindings/pinctrl/axis,artpec6-pinctrl.txt
  
@@@ -1171,7 -1161,7 +1171,7 @@@ M:      Brendan Higgins <brendanhiggins@goog
  R:    Benjamin Herrenschmidt <benh@kernel.crashing.org>
  R:    Joel Stanley <joel@jms.id.au>
  L:    linux-i2c@vger.kernel.org
 -L:    openbmc@lists.ozlabs.org
 +L:    openbmc@lists.ozlabs.org (moderated for non-subscribers)
  S:    Maintained
  F:    drivers/irqchip/irq-aspeed-i2c-ic.c
  F:    drivers/i2c/busses/i2c-aspeed.c
@@@ -1292,15 -1282,10 +1292,15 @@@ S:   Maintaine
  
  ARM/CORTINA SYSTEMS GEMINI ARM ARCHITECTURE
  M:    Hans Ulli Kroll <ulli.kroll@googlemail.com>
 +M:    Linus Walleij <linus.walleij@linaro.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  T:    git git://github.com/ulli-kroll/linux.git
  S:    Maintained
 +F:    Documentation/devicetree/bindings/arm/gemini.txt
 +F:    Documentation/devicetree/bindings/pinctrl/cortina,gemini-pinctrl.txt
 +F:    Documentation/devicetree/bindings/rtc/faraday,ftrtc010.txt
  F:    arch/arm/mach-gemini/
 +F:    drivers/pinctrl/pinctrl-gemini.c
  F:    drivers/rtc/rtc-ftrtc010.c
  
  ARM/CSR SIRFPRIMA2 MACHINE SUPPORT
@@@ -1585,7 -1570,7 +1585,7 @@@ M:      Chunfeng Yun <chunfeng.yun@mediatek.
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 -F:    drivers/phy/phy-mt65xx-usb3.c
 +F:    drivers/phy/mediatek/phy-mtk-tphy.c
  
  ARM/MICREL KS8695 ARCHITECTURE
  M:    Greg Ungerer <gerg@uclinux.org>
@@@ -2008,7 -1993,6 +2008,7 @@@ F:      arch/arm64/boot/dts/socionext
  F:    drivers/bus/uniphier-system-bus.c
  F:    drivers/clk/uniphier/
  F:    drivers/i2c/busses/i2c-uniphier*
 +F:    drivers/irqchip/irq-uniphier-aidet.c
  F:    drivers/pinctrl/uniphier/
  F:    drivers/reset/reset-uniphier.c
  F:    drivers/tty/serial/8250/8250_uniphier.c
@@@ -2493,7 -2477,7 +2493,7 @@@ Q:      https://patchwork.open-mesh.org/proj
  S:    Maintained
  F:    Documentation/ABI/testing/sysfs-class-net-batman-adv
  F:    Documentation/ABI/testing/sysfs-class-net-mesh
 -F:    Documentation/networking/batman-adv.txt
 +F:    Documentation/networking/batman-adv.rst
  F:    include/uapi/linux/batman_adv.h
  F:    net/batman-adv/
  
@@@ -4375,12 -4359,6 +4375,12 @@@ S:    Maintaine
  F:    drivers/gpu/drm/qxl/
  F:    include/uapi/drm/qxl_drm.h
  
 +DRM DRIVER FOR PERVASIVE DISPLAYS REPAPER PANELS
 +M:    Noralf Trønnes <noralf@tronnes.org>
 +S:    Maintained
 +F:    drivers/gpu/drm/tinydrm/repaper.c
 +F:    Documentation/devicetree/bindings/display/repaper.txt
 +
  DRM DRIVER FOR RAGE 128 VIDEO CARDS
  S:    Orphan / Obsolete
  F:    drivers/gpu/drm/r128/
@@@ -4396,12 -4374,6 +4396,12 @@@ S:    Orphan / Obsolet
  F:    drivers/gpu/drm/sis/
  F:    include/uapi/drm/sis_drm.h
  
 +DRM DRIVER FOR SITRONIX ST7586 PANELS
 +M:    David Lechner <david@lechnology.com>
 +S:    Maintained
 +F:    drivers/gpu/drm/tinydrm/st7586.c
 +F:    Documentation/devicetree/bindings/display/st7586.txt
 +
  DRM DRIVER FOR TDFX VIDEO CARDS
  S:    Orphan / Obsolete
  F:    drivers/gpu/drm/tdfx/
@@@ -4650,14 -4622,6 +4650,14 @@@ F:    drivers/gpu/drm/panel
  F:    include/drm/drm_panel.h
  F:    Documentation/devicetree/bindings/display/panel/
  
 +DRM TINYDRM DRIVERS
 +M:    Noralf Trønnes <noralf@tronnes.org>
 +W:    https://github.com/notro/tinydrm/wiki/Development
 +T:    git git://anongit.freedesktop.org/drm/drm-misc
 +S:    Maintained
 +F:    drivers/gpu/drm/tinydrm/
 +F:    include/drm/tinydrm/
 +
  DSBR100 USB FM RADIO DRIVER
  M:    Alexey Klimov <klimov.linux@gmail.com>
  L:    linux-media@vger.kernel.org
@@@ -5126,21 -5090,12 +5126,21 @@@ M:   Andrew Lunn <andrew@lunn.ch
  M:    Florian Fainelli <f.fainelli@gmail.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    include/linux/phy.h
 -F:    include/linux/phy_fixed.h
 -F:    drivers/net/phy/
 +F:    Documentation/ABI/testing/sysfs-bus-mdio
 +F:    Documentation/devicetree/bindings/net/mdio*
  F:    Documentation/networking/phy.txt
 +F:    drivers/net/phy/
  F:    drivers/of/of_mdio.c
  F:    drivers/of/of_net.c
 +F:    include/linux/*mdio*.h
 +F:    include/linux/of_net.h
 +F:    include/linux/phy.h
 +F:    include/linux/phy_fixed.h
 +F:    include/linux/platform_data/mdio-gpio.h
 +F:    include/linux/platform_data/mdio-bcm-unimac.h
 +F:    include/trace/events/mdio.h
 +F:    include/uapi/linux/mdio.h
 +F:    include/uapi/linux/mii.h
  
  EXT2 FILE SYSTEM
  M:    Jan Kara <jack@suse.com>
@@@ -5378,11 -5333,10 +5378,11 @@@ K:   fmc_d.*registe
  
  FPGA MANAGER FRAMEWORK
  M:    Alan Tull <atull@kernel.org>
 -R:    Moritz Fischer <moritz.fischer@ettus.com>
 +R:    Moritz Fischer <mdf@kernel.org>
  L:    linux-fpga@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/atull/linux-fpga.git
 +Q:    http://patchwork.kernel.org/project/linux-fpga/list/
  F:    Documentation/fpga/
  F:    Documentation/devicetree/bindings/fpga/
  F:    drivers/fpga/
@@@ -5872,7 -5826,7 +5872,7 @@@ F:      drivers/staging/greybus/spi.
  F:    drivers/staging/greybus/spilib.c
  F:    drivers/staging/greybus/spilib.h
  
 -GREYBUS LOOBACK/TIME PROTOCOLS DRIVERS
 +GREYBUS LOOPBACK/TIME PROTOCOLS DRIVERS
  M:    Bryan O'Donoghue <pure.logic@nexus-software.ie>
  S:    Maintained
  F:    drivers/staging/greybus/loopback.c
@@@ -6185,14 -6139,6 +6185,14 @@@ S:    Maintaine
  F:    drivers/net/ethernet/hisilicon/
  F:    Documentation/devicetree/bindings/net/hisilicon*.txt
  
 +HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
 +M:    Yisen Zhuang <yisen.zhuang@huawei.com>
 +M:    Salil Mehta <salil.mehta@huawei.com>
 +L:    netdev@vger.kernel.org
 +W:    http://www.hisilicon.com
 +S:    Maintained
 +F:    drivers/net/ethernet/hisilicon/hns3/
 +
  HISILICON ROCE DRIVER
  M:    Lijun Ou <oulijun@huawei.com>
  M:    Wei Hu(Xavier) <xavier.huwei@huawei.com>
@@@ -6277,13 -6223,6 +6277,13 @@@ L:    linux-input@vger.kernel.or
  S:    Maintained
  F:    drivers/input/touchscreen/htcpen.c
  
 +HUAWEI ETHERNET DRIVER
 +M:    Aviad Krawczyk <aviad.krawczyk@huawei.com>
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    Documentation/networking/hinic.txt
 +F:    drivers/net/ethernet/huawei/hinic/
 +
  HUGETLB FILESYSTEM
  M:    Nadia Yvette Chambers <nyc@holomorphy.com>
  S:    Maintained
@@@ -6310,9 -6249,7 +6310,9 @@@ M:      Haiyang Zhang <haiyangz@microsoft.co
  M:    Stephen Hemminger <sthemmin@microsoft.com>
  L:    devel@linuxdriverproject.org
  S:    Maintained
 +F:    Documentation/networking/netvsc.txt
  F:    arch/x86/include/asm/mshyperv.h
 +F:    arch/x86/include/asm/trace/hyperv.h
  F:    arch/x86/include/uapi/asm/hyperv.h
  F:    arch/x86/kernel/cpu/mshyperv.c
  F:    arch/x86/hyperv
@@@ -6324,9 -6261,7 +6324,9 @@@ F:      drivers/net/hyperv
  F:    drivers/scsi/storvsc_drv.c
  F:    drivers/uio/uio_hv_generic.c
  F:    drivers/video/fbdev/hyperv_fb.c
 +F:    net/vmw_vsock/hyperv_transport.c
  F:    include/linux/hyperv.h
 +F:    include/uapi/linux/hyperv.h
  F:    tools/hv/
  F:    Documentation/ABI/stable/sysfs-bus-vmbus
  
@@@ -6494,15 -6429,6 +6494,15 @@@ L:    netdev@vger.kernel.or
  S:    Supported
  F:    drivers/net/ethernet/ibm/ibmvnic.*
  
 +IBM Power Virtual Accelerator Switchboard
 +M:    Sukadev Bhattiprolu
 +L:    linuxppc-dev@lists.ozlabs.org
 +S:    Supported
 +F:    arch/powerpc/platforms/powernv/vas*
 +F:    arch/powerpc/platforms/powernv/copy-paste.h
 +F:    arch/powerpc/include/asm/vas.h
 +F:    arch/powerpc/include/uapi/asm/vas.h
 +
  IBM Power Virtual Ethernet Device Driver
  M:    Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
  L:    netdev@vger.kernel.org
@@@ -6810,9 -6736,8 +6810,9 @@@ S:      Supporte
  F:    drivers/scsi/isci/
  
  INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets)
 -M:    Daniel Vetter <daniel.vetter@intel.com>
  M:    Jani Nikula <jani.nikula@linux.intel.com>
 +M:    Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
 +M:    Rodrigo Vivi <rodrigo.vivi@intel.com>
  L:    intel-gfx@lists.freedesktop.org
  W:    https://01.org/linuxgraphics/
  B:    https://01.org/linuxgraphics/documentation/how-report-bugs
@@@ -7150,7 -7075,9 +7150,7 @@@ W:      http://irda.sourceforge.net
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/sameo/irda-2.6.git
  F:    Documentation/networking/irda.txt
 -F:    drivers/net/irda/
 -F:    include/net/irda/
 -F:    net/irda/
 +F:    drivers/staging/irda/
  
  IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY)
  M:    Marc Zyngier <marc.zyngier@arm.com>
@@@ -7175,6 -7102,7 +7175,6 @@@ M:      Marc Zyngier <marc.zyngier@arm.com
  L:    linux-kernel@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
 -T:    git git://git.infradead.org/users/jcooper/linux.git irqchip/core
  F:    Documentation/devicetree/bindings/interrupt-controller/
  F:    drivers/irqchip/
  
@@@ -7704,6 -7632,17 +7704,6 @@@ T:     git git://linuxtv.org/mkrufky/tuners
  S:    Maintained
  F:    drivers/media/dvb-frontends/lgdt3305.*
  
 -LGUEST
 -M:    Rusty Russell <rusty@rustcorp.com.au>
 -L:    lguest@lists.ozlabs.org
 -W:    http://lguest.ozlabs.org/
 -S:    Odd Fixes
 -F:    arch/x86/include/asm/lguest*.h
 -F:    arch/x86/lguest/
 -F:    drivers/lguest/
 -F:    include/linux/lguest*.h
 -F:    tools/lguest/
 -
  LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
  M:    Viresh Kumar <vireshk@kernel.org>
  L:    linux-ide@vger.kernel.org
@@@ -7839,7 -7778,6 +7839,7 @@@ F:      drivers/pci/hotplug/rpa
  F:    drivers/rtc/rtc-opal.c
  F:    drivers/scsi/ibmvscsi/
  F:    drivers/tty/hvc/hvc_opal.c
 +F:    drivers/watchdog/wdrtas.c
  F:    tools/testing/selftests/powerpc
  N:    /pmac
  N:    powermac
@@@ -8478,9 -8416,7 +8478,9 @@@ F:      include/uapi/linux/uvcvideo.
  
  MEDIATEK ETHERNET DRIVER
  M:    Felix Fietkau <nbd@openwrt.org>
 -M:    John Crispin <blogic@openwrt.org>
 +M:    John Crispin <john@phrozen.org>
 +M:    Sean Wang <sean.wang@mediatek.com>
 +M:    Nelson Chang <nelson.chang@mediatek.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/ethernet/mediatek/
@@@ -8521,14 -8457,6 +8521,14 @@@ M:    Sean Wang <sean.wang@mediatek.com
  S:    Maintained
  F:    drivers/char/hw_random/mtk-rng.c
  
 +MEDIATEK USB3 DRD IP DRIVER
 +M:    Chunfeng Yun <chunfeng.yun@mediatek.com>
 +L:    linux-usb@vger.kernel.org (moderated for non-subscribers)
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
 +S:    Maintained
 +F:    drivers/usb/mtu3/
 +
  MEGACHIPS STDPXXXX-GE-B850V3-FW LVDS/DP++ BRIDGES
  M:    Peter Senna Tschudin <peter.senna@collabora.com>
  M:    Martin Donnelly <martin.donnelly@ge.com>
@@@ -8693,7 -8621,7 +8693,7 @@@ M:      Mathieu Desnoyers <mathieu.desnoyers
  M:    "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
  L:    linux-kernel@vger.kernel.org
  S:    Supported
 -F:    kernel/membarrier.c
 +F:    kernel/sched/membarrier.c
  F:    include/uapi/linux/membarrier.h
  
  MEMORY MANAGEMENT
@@@ -8783,12 -8711,6 +8783,12 @@@ F:    drivers/dma/at_hdmac.
  F:    drivers/dma/at_hdmac_regs.h
  F:    include/linux/platform_data/dma-atmel.h
  
 +MICROCHIP / ATMEL ECC DRIVER
 +M:    Tudor Ambarus <tudor.ambarus@microchip.com>
 +L:    linux-crypto@vger.kernel.org
 +S:    Maintained
 +F:    drivers/crypto/atmel-ecc.*
 +
  MICROCHIP / ATMEL ISC DRIVER
  M:    Songjun Wu <songjun.wu@microchip.com>
  L:    linux-media@vger.kernel.org
@@@ -9536,7 -9458,6 +9536,7 @@@ M:      Srinivas Kandagatla <srinivas.kandag
  S:    Maintained
  F:    drivers/nvmem/
  F:    Documentation/devicetree/bindings/nvmem/
 +F:    Documentation/ABI/stable/sysfs-bus-nvmem
  F:    include/linux/nvmem-consumer.h
  F:    include/linux/nvmem-provider.h
  
@@@ -10454,7 -10375,7 +10454,7 @@@ L:   linux-gpio@vger.kernel.or
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git
  S:    Maintained
  F:    Documentation/devicetree/bindings/pinctrl/
 -F:    Documentation/pinctrl.txt
 +F:    Documentation/driver-api/pinctl.rst
  F:    drivers/pinctrl/
  F:    include/linux/pinctrl/
  
@@@ -11182,7 -11103,7 +11182,7 @@@ M:   Fenghua Yu <fenghua.yu@intel.com
  L:    linux-kernel@vger.kernel.org
  S:    Supported
  F:    arch/x86/kernel/cpu/intel_rdt*
 -F:    arch/x86/include/asm/intel_rdt*
 +F:    arch/x86/include/asm/intel_rdt_sched.h
  F:    Documentation/x86/intel_rdt*
  
  READ-COPY UPDATE (RCU)
@@@ -12561,6 -12482,12 +12561,12 @@@ M: Ion Badulescu <ionut@badula.org
  S:    Odd Fixes
  F:    drivers/net/ethernet/adaptec/starfire*
  
+ STEC S1220 SKD DRIVER
+ M:    Bart Van Assche <bart.vanassche@wdc.com>
+ L:    linux-block@vger.kernel.org
+ S:    Maintained
+ F:    drivers/block/skd*[ch]
  STI CEC DRIVER
  M:    Benjamin Gaignard <benjamin.gaignard@linaro.org>
  S:    Maintained
@@@ -13064,11 -12991,6 +13070,11 @@@ M: Yehezkel Bernat <yehezkel.bernat@int
  S:    Maintained
  F:    drivers/thunderbolt/
  
 +THUNDERX GPIO DRIVER
 +M:    David Daney <david.daney@cavium.com>
 +S:    Maintained
 +F:    drivers/gpio/gpio-thunderx.c
 +
  TI AM437X VPFE DRIVER
  M:    "Lad, Prabhakar" <prabhakar.csengg@gmail.com>
  L:    linux-media@vger.kernel.org
@@@ -14080,7 -14002,6 +14086,7 @@@ F:   drivers/block/virtio_blk.
  F:    include/linux/virtio*.h
  F:    include/uapi/linux/virtio_*.h
  F:    drivers/crypto/virtio/
 +F:    mm/balloon_compaction.c
  
  VIRTIO CRYPTO DRIVER
  M:    Gonglei <arei.gonglei@huawei.com>
@@@ -14295,12 -14216,6 +14301,12 @@@ F: drivers/watchdog
  F:    include/linux/watchdog.h
  F:    include/uapi/linux/watchdog.h
  
 +WHISKEYCOVE PMIC GPIO DRIVER
 +M:    Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
 +L:    linux-gpio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/gpio/gpio-wcove.c
 +
  WIIMOTE HID DRIVER
  M:    David Herrmann <dh.herrmann@googlemail.com>
  L:    linux-input@vger.kernel.org
@@@ -110,7 -110,7 +110,7 @@@ axon_ram_irq_handler(int irq, void *dev
  static blk_qc_t
  axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  {
-       struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data;
+       struct axon_ram_bank *bank = bio->bi_disk->private_data;
        unsigned long phys_mem, phys_end;
        void *user_mem;
        struct bio_vec vec;
@@@ -188,12 -188,15 +188,12 @@@ static int axon_ram_probe(struct platfo
  
        axon_ram_bank_id++;
  
 -      dev_info(&device->dev, "Found memory controller on %s\n",
 -                      device->dev.of_node->full_name);
 +      dev_info(&device->dev, "Found memory controller on %pOF\n",
 +                      device->dev.of_node);
  
 -      bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL);
 -      if (bank == NULL) {
 -              dev_err(&device->dev, "Out of memory\n");
 -              rc = -ENOMEM;
 -              goto failed;
 -      }
 +      bank = kzalloc(sizeof(*bank), GFP_KERNEL);
 +      if (!bank)
 +              return -ENOMEM;
  
        device->dev.platform_data = bank;
  
        return 0;
  
  failed:
 -      if (bank != NULL) {
 -              if (bank->irq_id)
 -                      free_irq(bank->irq_id, device);
 -              if (bank->disk != NULL) {
 -                      if (bank->disk->major > 0)
 -                              unregister_blkdev(bank->disk->major,
 -                                              bank->disk->disk_name);
 -                      if (bank->disk->flags & GENHD_FL_UP)
 -                              del_gendisk(bank->disk);
 -                      put_disk(bank->disk);
 -              }
 -              kill_dax(bank->dax_dev);
 -              put_dax(bank->dax_dev);
 -              device->dev.platform_data = NULL;
 -              if (bank->io_addr != 0)
 -                      iounmap((void __iomem *) bank->io_addr);
 -              kfree(bank);
 +      if (bank->irq_id)
 +              free_irq(bank->irq_id, device);
 +      if (bank->disk != NULL) {
 +              if (bank->disk->major > 0)
 +                      unregister_blkdev(bank->disk->major,
 +                                      bank->disk->disk_name);
 +              if (bank->disk->flags & GENHD_FL_UP)
 +                      del_gendisk(bank->disk);
 +              put_disk(bank->disk);
        }
 -
 +      kill_dax(bank->dax_dev);
 +      put_dax(bank->dax_dev);
 +      device->dev.platform_data = NULL;
 +      if (bank->io_addr != 0)
 +              iounmap((void __iomem *) bank->io_addr);
 +      kfree(bank);
        return rc;
  }
  
diff --combined block/bfq-iosched.h
@@@ -71,29 -71,17 +71,29 @@@ struct bfq_service_tree 
   *
   * bfq_sched_data is the basic scheduler queue.  It supports three
   * ioprio_classes, and can be used either as a toplevel queue or as an
 - * intermediate queue on a hierarchical setup.  @next_in_service
 - * points to the active entity of the sched_data service trees that
 - * will be scheduled next. It is used to reduce the number of steps
 - * needed for each hierarchical-schedule update.
 + * intermediate queue in a hierarchical setup.
   *
   * The supported ioprio_classes are the same as in CFQ, in descending
   * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
   * Requests from higher priority queues are served before all the
   * requests from lower priority queues; among requests of the same
   * queue requests are served according to B-WF2Q+.
 - * All the fields are protected by the queue lock of the containing bfqd.
 + *
 + * The schedule is implemented by the service trees, plus the field
 + * @next_in_service, which points to the entity on the active trees
 + * that will be served next, if 1) no changes in the schedule occurs
 + * before the current in-service entity is expired, 2) the in-service
 + * queue becomes idle when it expires, and 3) if the entity pointed by
 + * in_service_entity is not a queue, then the in-service child entity
 + * of the entity pointed by in_service_entity becomes idle on
 + * expiration. This peculiar definition allows for the following
 + * optimization, not yet exploited: while a given entity is still in
 + * service, we already know which is the best candidate for next
 + * service among the other active entitities in the same parent
 + * entity. We can then quickly compare the timestamps of the
 + * in-service entity with those of such best candidate.
 + *
 + * All fields are protected by the lock of the containing bfqd.
   */
  struct bfq_sched_data {
        /* entity in service */
@@@ -360,11 -348,11 +360,11 @@@ struct bfq_io_cq 
        uint64_t blkcg_serial_nr; /* the current blkcg serial */
  #endif
        /*
-        * Snapshot of the idle window before merging; taken to
-        * remember this value while the queue is merged, so as to be
-        * able to restore it in case of split.
+        * Snapshot of the has_short_time flag before merging; taken
+        * to remember its value while the queue is merged, so as to
+        * be able to restore it in case of split.
         */
-       bool saved_idle_window;
+       bool saved_has_short_ttime;
        /*
         * Same purpose as the previous two fields for the I/O bound
         * classification of a queue.
@@@ -638,7 -626,7 +638,7 @@@ enum bfqq_state_flags 
                                     * without idling the device
                                     */
        BFQQF_fifo_expire,      /* FIFO checked in this slice */
-       BFQQF_idle_window,      /* slice idling enabled */
+       BFQQF_has_short_ttime,  /* queue has a short think time */
        BFQQF_sync,             /* synchronous queue */
        BFQQF_IO_bound,         /*
                                 * bfqq has timed-out at least once
@@@ -667,7 -655,7 +667,7 @@@ BFQ_BFQQ_FNS(busy)
  BFQ_BFQQ_FNS(wait_request);
  BFQ_BFQQ_FNS(non_blocking_wait_rq);
  BFQ_BFQQ_FNS(fifo_expire);
- BFQ_BFQQ_FNS(idle_window);
+ BFQ_BFQQ_FNS(has_short_ttime);
  BFQ_BFQQ_FNS(sync);
  BFQ_BFQQ_FNS(IO_bound);
  BFQ_BFQQ_FNS(in_large_burst);
@@@ -929,13 -917,16 +929,16 @@@ void bfq_add_bfqq_busy(struct bfq_data 
  struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
  
  #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)        do {                    \
-       blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
-                       bfq_bfqq_sync((bfqq)) ? 'S' : 'A',              \
-                       bfqq_group(bfqq)->blkg_path, ##args);           \
+       blk_add_cgroup_trace_msg((bfqd)->queue,                         \
+                       bfqg_to_blkg(bfqq_group(bfqq))->blkcg,          \
+                       "bfq%d%c " fmt, (bfqq)->pid,                    \
+                       bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args);     \
  } while (0)
  
- #define bfq_log_bfqg(bfqd, bfqg, fmt, args...)        \
-       blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
+ #define bfq_log_bfqg(bfqd, bfqg, fmt, args...)        do {                    \
+       blk_add_cgroup_trace_msg((bfqd)->queue,                         \
+               bfqg_to_blkg(bfqg)->blkcg, fmt, ##args);                \
+ } while (0)
  
  #else /* CONFIG_BFQ_GROUP_IOSCHED */
  
diff --combined block/bio-integrity.c
@@@ -146,7 -146,7 +146,7 @@@ int bio_integrity_add_page(struct bio *
        iv = bip->bip_vec + bip->bip_vcnt;
  
        if (bip->bip_vcnt &&
-           bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+           bvec_gap_to_prev(bio->bi_disk->queue,
                             &bip->bip_vec[bip->bip_vcnt - 1], offset))
                return 0;
  
@@@ -190,7 -190,7 +190,7 @@@ static inline unsigned int bio_integrit
  static blk_status_t bio_integrity_process(struct bio *bio,
                struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
  {
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
        struct blk_integrity_iter iter;
        struct bvec_iter bviter;
        struct bio_vec bv;
        void *prot_buf = page_address(bip->bip_vec->bv_page) +
                bip->bip_vec->bv_offset;
  
-       iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
+       iter.disk_name = bio->bi_disk->disk_name;
        iter.interval = 1 << bi->interval_exp;
        iter.seed = proc_iter->bi_sector;
        iter.prot_buf = prot_buf;
  bool bio_integrity_prep(struct bio *bio)
  {
        struct bio_integrity_payload *bip;
-       struct blk_integrity *bi;
-       struct request_queue *q;
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+       struct request_queue *q = bio->bi_disk->queue;
        void *buf;
        unsigned long start, end;
        unsigned int len, nr_pages;
        unsigned int intervals;
        blk_status_t status;
  
-       bi = bdev_get_integrity(bio->bi_bdev);
-       q = bdev_get_queue(bio->bi_bdev);
+       if (!bi)
+               return true;
        if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
                return true;
  
        if (bio_integrity(bio))
                return true;
  
-       if (bi == NULL)
-               return true;
        if (bio_data_dir(bio) == READ) {
                if (!bi->profile->verify_fn ||
                    !(bi->flags & BLK_INTEGRITY_VERIFY))
@@@ -354,7 -352,7 +352,7 @@@ static void bio_integrity_verify_fn(str
        struct bio_integrity_payload *bip =
                container_of(work, struct bio_integrity_payload, bip_work);
        struct bio *bio = bip->bip_bio;
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
        struct bvec_iter iter = bio->bi_iter;
  
        /*
   */
  bool __bio_integrity_endio(struct bio *bio)
  {
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
 +      struct bio_integrity_payload *bip = bio_integrity(bio);
  
        if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
 -          bi->profile->verify_fn) {
 -              struct bio_integrity_payload *bip = bio_integrity(bio);
 -
 +          (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) {
                INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
                queue_work(kintegrityd_wq, &bip->bip_work);
                return false;
  void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
  {
        struct bio_integrity_payload *bip = bio_integrity(bio);
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
        unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
  
        bip->bip_iter.bi_sector += bytes_done >> 9;
@@@ -430,7 -429,7 +428,7 @@@ EXPORT_SYMBOL(bio_integrity_advance)
  void bio_integrity_trim(struct bio *bio)
  {
        struct bio_integrity_payload *bip = bio_integrity(bio);
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
  
        bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
  }
diff --combined block/blk-mq-debugfs.c
@@@ -48,8 -48,6 +48,6 @@@ static int blk_flags_show(struct seq_fi
  static const char *const blk_queue_flag_name[] = {
        QUEUE_FLAG_NAME(QUEUED),
        QUEUE_FLAG_NAME(STOPPED),
-       QUEUE_FLAG_NAME(SYNCFULL),
-       QUEUE_FLAG_NAME(ASYNCFULL),
        QUEUE_FLAG_NAME(DYING),
        QUEUE_FLAG_NAME(BYPASS),
        QUEUE_FLAG_NAME(BIDI),
@@@ -75,8 -73,6 +73,8 @@@
        QUEUE_FLAG_NAME(STATS),
        QUEUE_FLAG_NAME(POLL_STATS),
        QUEUE_FLAG_NAME(REGISTERED),
 +      QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
 +      QUEUE_FLAG_NAME(QUIESCED),
  };
  #undef QUEUE_FLAG_NAME
  
@@@ -267,7 -263,6 +265,7 @@@ static const char *const cmd_flag_name[
        CMD_FLAG_NAME(RAHEAD),
        CMD_FLAG_NAME(BACKGROUND),
        CMD_FLAG_NAME(NOUNMAP),
 +      CMD_FLAG_NAME(NOWAIT),
  };
  #undef CMD_FLAG_NAME
  
@@@ -744,7 -739,7 +742,7 @@@ static int blk_mq_debugfs_release(struc
                return seq_release(inode, file);
  }
  
- const struct file_operations blk_mq_debugfs_fops = {
static const struct file_operations blk_mq_debugfs_fops = {
        .open           = blk_mq_debugfs_open,
        .read           = seq_read,
        .write          = blk_mq_debugfs_write,
diff --combined block/blk-mq.c
@@@ -83,6 -83,41 +83,41 @@@ static void blk_mq_hctx_clear_pending(s
        sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
  }
  
+ struct mq_inflight {
+       struct hd_struct *part;
+       unsigned int *inflight;
+ };
+ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+                                 struct request *rq, void *priv,
+                                 bool reserved)
+ {
+       struct mq_inflight *mi = priv;
+       if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
+           !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+               /*
+                * index[0] counts the specific partition that was asked
+                * for. index[1] counts the ones that are active on the
+                * whole device, so increment that if mi->part is indeed
+                * a partition, and not a whole device.
+                */
+               if (rq->part == mi->part)
+                       mi->inflight[0]++;
+               if (mi->part->partno)
+                       mi->inflight[1]++;
+       }
+ }
+ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
+                     unsigned int inflight[2])
+ {
+       struct mq_inflight mi = { .part = part, .inflight = inflight, };
+       inflight[0] = inflight[1] = 0;
+       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+ }
  void blk_freeze_queue_start(struct request_queue *q)
  {
        int freeze_depth;
@@@ -301,12 -336,11 +336,12 @@@ static struct request *blk_mq_get_reque
        struct elevator_queue *e = q->elevator;
        struct request *rq;
        unsigned int tag;
 +      struct blk_mq_ctx *local_ctx = NULL;
  
        blk_queue_enter_live(q);
        data->q = q;
        if (likely(!data->ctx))
 -              data->ctx = blk_mq_get_ctx(q);
 +              data->ctx = local_ctx = blk_mq_get_ctx(q);
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
        if (op & REQ_NOWAIT)
  
        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_TAG_FAIL) {
 +              if (local_ctx) {
 +                      blk_mq_put_ctx(local_ctx);
 +                      data->ctx = NULL;
 +              }
                blk_queue_exit(q);
                return NULL;
        }
@@@ -360,13 -390,13 +395,13 @@@ struct request *blk_mq_alloc_request(st
                return ERR_PTR(ret);
  
        rq = blk_mq_get_request(q, NULL, op, &alloc_data);
 -
 -      blk_mq_put_ctx(alloc_data.ctx);
        blk_queue_exit(q);
  
        if (!rq)
                return ERR_PTR(-EWOULDBLOCK);
  
 +      blk_mq_put_ctx(alloc_data.ctx);
 +
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
@@@ -411,6 -441,7 +446,6 @@@ struct request *blk_mq_alloc_request_hc
        alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
  
        rq = blk_mq_get_request(q, NULL, op, &alloc_data);
 -
        blk_queue_exit(q);
  
        if (!rq)
@@@ -624,11 -655,10 +659,10 @@@ static void blk_mq_requeue_work(struct 
                container_of(work, struct request_queue, requeue_work.work);
        LIST_HEAD(rq_list);
        struct request *rq, *next;
-       unsigned long flags;
  
-       spin_lock_irqsave(&q->requeue_lock, flags);
+       spin_lock_irq(&q->requeue_lock);
        list_splice_init(&q->requeue_list, &rq_list);
-       spin_unlock_irqrestore(&q->requeue_lock, flags);
+       spin_unlock_irq(&q->requeue_lock);
  
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
                if (!(rq->rq_flags & RQF_SOFTBARRIER))
@@@ -683,8 -713,8 +717,8 @@@ EXPORT_SYMBOL(blk_mq_kick_requeue_list)
  void blk_mq_delay_kick_requeue_list(struct request_queue *q,
                                    unsigned long msecs)
  {
 -      kblockd_schedule_delayed_work(&q->requeue_work,
 -                                    msecs_to_jiffies(msecs));
 +      kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
 +                                  msecs_to_jiffies(msecs));
  }
  EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
  
@@@ -1102,9 -1132,19 +1136,19 @@@ static void __blk_mq_run_hw_queue(struc
  {
        int srcu_idx;
  
+       /*
+        * We should be running this queue from one of the CPUs that
+        * are mapped to it.
+        */
        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
                cpu_online(hctx->next_cpu));
  
+       /*
+        * We can't run the queue inline with ints disabled. Ensure that
+        * we catch bad users of this early.
+        */
+       WARN_ON_ONCE(in_interrupt());
        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
                rcu_read_lock();
                blk_mq_sched_dispatch_requests(hctx);
@@@ -1218,7 -1258,7 +1262,7 @@@ EXPORT_SYMBOL(blk_mq_queue_stopped)
  /*
   * This function is often used for pausing .queue_rq() by driver when
   * there isn't enough resource or some conditions aren't satisfied, and
-  * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+  * BLK_STS_RESOURCE is usually returned.
   *
   * We do not guarantee that dispatch can be drained or blocked
   * after blk_mq_stop_hw_queue() returns. Please use
@@@ -1235,7 -1275,7 +1279,7 @@@ EXPORT_SYMBOL(blk_mq_stop_hw_queue)
  /*
   * This function is often used for pausing .queue_rq() by driver when
   * there isn't enough resource or some conditions aren't satisfied, and
-  * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+  * BLK_STS_RESOURCE is usually returned.
   *
   * We do not guarantee that dispatch can be drained or blocked
   * after blk_mq_stop_hw_queues() returns. Please use
diff --combined block/blk-throttle.c
@@@ -373,23 -373,13 +373,21 @@@ static unsigned int tg_iops_limit(struc
        if (likely(!blk_trace_note_message_enabled(__td->queue)))       \
                break;                                                  \
        if ((__tg)) {                                                   \
-               char __pbuf[128];                                       \
-                                                                       \
-               blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));    \
-               blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
+               blk_add_cgroup_trace_msg(__td->queue,                   \
+                       tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
        } else {                                                        \
                blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);  \
        }                                                               \
  } while (0)
  
 +static inline unsigned int throtl_bio_data_size(struct bio *bio)
 +{
 +      /* assume it's one sector */
 +      if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
 +              return 512;
 +      return bio->bi_iter.bi_size;
 +}
 +
  static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  {
        INIT_LIST_HEAD(&qn->node);
@@@ -942,7 -932,6 +940,7 @@@ static bool tg_with_in_bps_limit(struc
        bool rw = bio_data_dir(bio);
        u64 bytes_allowed, extra_bytes, tmp;
        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 +      unsigned int bio_size = throtl_bio_data_size(bio);
  
        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  
        do_div(tmp, HZ);
        bytes_allowed = tmp;
  
 -      if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
 +      if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
                if (wait)
                        *wait = 0;
                return true;
        }
  
        /* Calc approx time to dispatch */
 -      extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
 +      extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
        jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
  
        if (!jiffy_wait)
@@@ -1043,12 -1032,11 +1041,12 @@@ static bool tg_may_dispatch(struct thro
  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  {
        bool rw = bio_data_dir(bio);
 +      unsigned int bio_size = throtl_bio_data_size(bio);
  
        /* Charge the bio to the group */
 -      tg->bytes_disp[rw] += bio->bi_iter.bi_size;
 +      tg->bytes_disp[rw] += bio_size;
        tg->io_disp[rw]++;
 -      tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
 +      tg->last_bytes_disp[rw] += bio_size;
        tg->last_io_disp[rw]++;
  
        /*
@@@ -2114,14 -2102,9 +2112,9 @@@ static inline void throtl_update_latenc
  static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
  {
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       int ret;
-       ret = bio_associate_current(bio);
-       if (ret == 0 || ret == -EBUSY)
+       if (bio->bi_css)
                bio->bi_cg_private = tg;
        blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
- #else
-       bio_associate_current(bio);
  #endif
  }
  
diff --combined block/genhd.c
@@@ -45,6 -45,52 +45,52 @@@ static void disk_add_events(struct gend
  static void disk_del_events(struct gendisk *disk);
  static void disk_release_events(struct gendisk *disk);
  
+ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+ {
+       if (q->mq_ops)
+               return;
+       atomic_inc(&part->in_flight[rw]);
+       if (part->partno)
+               atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+ }
+ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+ {
+       if (q->mq_ops)
+               return;
+       atomic_dec(&part->in_flight[rw]);
+       if (part->partno)
+               atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+ }
+ void part_in_flight(struct request_queue *q, struct hd_struct *part,
+                   unsigned int inflight[2])
+ {
+       if (q->mq_ops) {
+               blk_mq_in_flight(q, part, inflight);
+               return;
+       }
+       inflight[0] = atomic_read(&part->in_flight[0]) +
+                       atomic_read(&part->in_flight[1]);
+       if (part->partno) {
+               part = &part_to_disk(part)->part0;
+               inflight[1] = atomic_read(&part->in_flight[0]) +
+                               atomic_read(&part->in_flight[1]);
+       }
+ }
+ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
+ {
+       struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
+       if (unlikely(partno < 0 || partno >= ptbl->len))
+               return NULL;
+       return rcu_dereference(ptbl->part[partno]);
+ }
  /**
   * disk_get_part - get partition
   * @disk: disk to look partition from
   */
  struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
  {
-       struct hd_struct *part = NULL;
-       struct disk_part_tbl *ptbl;
-       if (unlikely(partno < 0))
-               return NULL;
+       struct hd_struct *part;
  
        rcu_read_lock();
-       ptbl = rcu_dereference(disk->part_tbl);
-       if (likely(partno < ptbl->len)) {
-               part = rcu_dereference(ptbl->part[partno]);
-               if (part)
-                       get_device(part_to_dev(part));
-       }
+       part = __disk_get_part(disk, partno);
+       if (part)
+               get_device(part_to_dev(part));
        rcu_read_unlock();
  
        return part;
@@@ -242,7 -279,6 +279,7 @@@ EXPORT_SYMBOL_GPL(disk_map_sector_rcu)
   * Can be deleted altogether. Later.
   *
   */
 +#define BLKDEV_MAJOR_HASH_SIZE 255
  static struct blk_major_name {
        struct blk_major_name *next;
        int major;
@@@ -260,11 -296,12 +297,11 @@@ void blkdev_show(struct seq_file *seqf
  {
        struct blk_major_name *dp;
  
 -      if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 -              mutex_lock(&block_class_lock);
 -              for (dp = major_names[offset]; dp; dp = dp->next)
 +      mutex_lock(&block_class_lock);
 +      for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
 +              if (dp->major == offset)
                        seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
 -              mutex_unlock(&block_class_lock);
 -      }
 +      mutex_unlock(&block_class_lock);
  }
  #endif /* CONFIG_PROC_FS */
  
@@@ -309,14 -346,6 +346,14 @@@ int register_blkdev(unsigned int major
                ret = major;
        }
  
 +      if (major >= BLKDEV_MAJOR_MAX) {
 +              pr_err("register_blkdev: major requested (%d) is greater than the maximum (%d) for %s\n",
 +                     major, BLKDEV_MAJOR_MAX, name);
 +
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
        p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
        if (p == NULL) {
                ret = -ENOMEM;
@@@ -1098,12 -1127,13 +1135,13 @@@ static const struct attribute_group *di
   * original ptbl is freed using RCU callback.
   *
   * LOCKING:
-  * Matching bd_mutx locked.
+  * Matching bd_mutex locked or the caller is the only user of @disk.
   */
  static void disk_replace_part_tbl(struct gendisk *disk,
                                  struct disk_part_tbl *new_ptbl)
  {
-       struct disk_part_tbl *old_ptbl = disk->part_tbl;
+       struct disk_part_tbl *old_ptbl =
+               rcu_dereference_protected(disk->part_tbl, 1);
  
        rcu_assign_pointer(disk->part_tbl, new_ptbl);
  
   * uses RCU to allow unlocked dereferencing for stats and other stuff.
   *
   * LOCKING:
-  * Matching bd_mutex locked, might sleep.
+  * Matching bd_mutex locked or the caller is the only user of @disk.
+  * Might sleep.
   *
   * RETURNS:
   * 0 on success, -errno on failure.
   */
  int disk_expand_part_tbl(struct gendisk *disk, int partno)
  {
-       struct disk_part_tbl *old_ptbl = disk->part_tbl;
+       struct disk_part_tbl *old_ptbl =
+               rcu_dereference_protected(disk->part_tbl, 1);
        struct disk_part_tbl *new_ptbl;
        int len = old_ptbl ? old_ptbl->len : 0;
        int i, target;
@@@ -1212,6 -1244,7 +1252,7 @@@ static int diskstats_show(struct seq_fi
        struct disk_part_iter piter;
        struct hd_struct *hd;
        char buf[BDEVNAME_SIZE];
+       unsigned int inflight[2];
        int cpu;
  
        /*
        disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
        while ((hd = disk_part_iter_next(&piter))) {
                cpu = part_stat_lock();
-               part_round_stats(cpu, hd);
+               part_round_stats(gp->queue, cpu, hd);
                part_stat_unlock();
+               part_in_flight(gp->queue, hd, inflight);
                seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
                           "%u %lu %lu %lu %u %u %u %u\n",
                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
                           part_stat_read(hd, merges[WRITE]),
                           part_stat_read(hd, sectors[WRITE]),
                           jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
-                          part_in_flight(hd),
+                          inflight[0],
                           jiffies_to_msecs(part_stat_read(hd, io_ticks)),
                           jiffies_to_msecs(part_stat_read(hd, time_in_queue))
                        );
@@@ -1321,6 -1355,14 +1363,14 @@@ EXPORT_SYMBOL(alloc_disk)
  struct gendisk *alloc_disk_node(int minors, int node_id)
  {
        struct gendisk *disk;
+       struct disk_part_tbl *ptbl;
+       if (minors > DISK_MAX_PARTS) {
+               printk(KERN_ERR
+                       "block: can't allocated more than %d partitions\n",
+                       DISK_MAX_PARTS);
+               minors = DISK_MAX_PARTS;
+       }
  
        disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
        if (disk) {
                        kfree(disk);
                        return NULL;
                }
-               disk->part_tbl->part[0] = &disk->part0;
+               ptbl = rcu_dereference_protected(disk->part_tbl, 1);
+               rcu_assign_pointer(ptbl->part[0], &disk->part0);
  
                /*
                 * set_capacity() and get_capacity() currently don't use
diff --combined drivers/block/Kconfig
@@@ -17,6 -17,7 +17,7 @@@ if BLK_DE
  
  config BLK_DEV_NULL_BLK
        tristate "Null test block driver"
+       depends on CONFIGFS_FS
  
  config BLK_DEV_FD
        tristate "Normal floppy disk support"
@@@ -470,7 -471,7 +471,7 @@@ config VIRTIO_BL
        depends on VIRTIO
        ---help---
          This is the virtual block driver for virtio.  It can be used with
 -          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
 +          QEMU based VMMs (like KVM or Xen).  Say Y or M.
  
  config VIRTIO_BLK_SCSI
        bool "SCSI passthrough request for the Virtio block driver"
diff --combined drivers/block/brd.c
@@@ -294,14 -294,13 +294,13 @@@ out
  
  static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
  {
-       struct block_device *bdev = bio->bi_bdev;
-       struct brd_device *brd = bdev->bd_disk->private_data;
+       struct brd_device *brd = bio->bi_disk->private_data;
        struct bio_vec bvec;
        sector_t sector;
        struct bvec_iter iter;
  
        sector = bio->bi_iter.bi_sector;
-       if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
+       if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
                goto io_error;
  
        bio_for_each_segment(bvec, bio, iter) {
@@@ -326,11 -325,7 +325,11 @@@ static int brd_rw_page(struct block_dev
                       struct page *page, bool is_write)
  {
        struct brd_device *brd = bdev->bd_disk->private_data;
 -      int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
 +      int err;
 +
 +      if (PageTransHuge(page))
 +              return -ENOTSUPP;
 +      err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
        page_endio(page, is_write, err);
        return err;
  }
diff --combined drivers/block/loop.c
@@@ -221,7 -221,8 +221,7 @@@ static void __loop_update_dio(struct lo
  }
  
  static int
 -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit,
 -               loff_t logical_blocksize)
 +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
  {
        loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
        sector_t x = (sector_t)size;
                lo->lo_offset = offset;
        if (lo->lo_sizelimit != sizelimit)
                lo->lo_sizelimit = sizelimit;
 -      if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) {
 -              lo->lo_logical_blocksize = logical_blocksize;
 -              blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize);
 -              blk_queue_logical_block_size(lo->lo_queue,
 -                                           lo->lo_logical_blocksize);
 -      }
        set_capacity(lo->lo_disk, x);
        bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
        /* let user-space know about the new size */
@@@ -813,6 -820,7 +813,6 @@@ static void loop_config_discard(struct 
        struct file *file = lo->lo_backing_file;
        struct inode *inode = file->f_mapping->host;
        struct request_queue *q = lo->lo_queue;
 -      int lo_bits = 9;
  
        /*
         * We use punch hole to reclaim the free space used by the
  
        q->limits.discard_granularity = inode->i_sb->s_blocksize;
        q->limits.discard_alignment = 0;
 -      if (lo->lo_flags & LO_FLAGS_BLOCKSIZE)
 -              lo_bits = blksize_bits(lo->lo_logical_blocksize);
  
 -      blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits);
 -      blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits);
 +      blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
 +      blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
  }
  
@@@ -928,6 -938,7 +928,6 @@@ static int loop_set_fd(struct loop_devi
  
        lo->use_dio = false;
        lo->lo_blocksize = lo_blocksize;
 -      lo->lo_logical_blocksize = 512;
        lo->lo_device = bdev;
        lo->lo_flags = lo_flags;
        lo->lo_backing_file = file;
@@@ -1093,6 -1104,7 +1093,6 @@@ loop_set_status(struct loop_device *lo
        int err;
        struct loop_func_table *xfer;
        kuid_t uid = current_uid();
 -      int lo_flags = lo->lo_flags;
  
        if (lo->lo_encrypt_key_size &&
            !uid_eq(lo->lo_key_owner, uid) &&
        if (err)
                goto exit;
  
 -      if (info->lo_flags & LO_FLAGS_BLOCKSIZE) {
 -              if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE))
 -                      lo->lo_logical_blocksize = 512;
 -              lo->lo_flags |= LO_FLAGS_BLOCKSIZE;
 -              if (LO_INFO_BLOCKSIZE(info) != 512 &&
 -                  LO_INFO_BLOCKSIZE(info) != 1024 &&
 -                  LO_INFO_BLOCKSIZE(info) != 2048 &&
 -                  LO_INFO_BLOCKSIZE(info) != 4096)
 -                      return -EINVAL;
 -              if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize)
 -                      return -EINVAL;
 -      }
 -
        if (lo->lo_offset != info->lo_offset ||
 -          lo->lo_sizelimit != info->lo_sizelimit ||
 -          lo->lo_flags != lo_flags ||
 -          ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) &&
 -           lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) {
 -              if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit,
 -                                   LO_INFO_BLOCKSIZE(info))) {
 +          lo->lo_sizelimit != info->lo_sizelimit) {
 +              if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
                        err = -EFBIG;
                        goto exit;
                }
@@@ -1319,7 -1348,8 +1319,7 @@@ static int loop_set_capacity(struct loo
        if (unlikely(lo->lo_state != Lo_bound))
                return -ENXIO;
  
 -      return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit,
 -                              lo->lo_logical_blocksize);
 +      return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
  }
  
  static int loop_set_dio(struct loop_device *lo, unsigned long arg)
@@@ -1966,10 -1996,6 +1966,6 @@@ static int __init loop_init(void
        struct loop_device *lo;
        int err;
  
-       err = misc_register(&loop_misc);
-       if (err < 0)
-               return err;
        part_shift = 0;
        if (max_part > 0) {
                part_shift = fls(max_part);
  
        if ((1UL << part_shift) > DISK_MAX_PARTS) {
                err = -EINVAL;
-               goto misc_out;
+               goto err_out;
        }
  
        if (max_loop > 1UL << (MINORBITS - part_shift)) {
                err = -EINVAL;
-               goto misc_out;
+               goto err_out;
        }
  
        /*
                range = 1UL << MINORBITS;
        }
  
+       err = misc_register(&loop_misc);
+       if (err < 0)
+               goto err_out;
        if (register_blkdev(LOOP_MAJOR, "loop")) {
                err = -EIO;
                goto misc_out;
  
  misc_out:
        misc_deregister(&loop_misc);
+ err_out:
        return err;
  }
  
diff --combined drivers/block/null_blk.c
@@@ -1,3 -1,7 +1,7 @@@
+ /*
+  * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
+  * Shaohua Li <shli@fb.com>
+  */
  #include <linux/module.h>
  
  #include <linux/moduleparam.h>
  #include <linux/blk-mq.h>
  #include <linux/hrtimer.h>
  #include <linux/lightnvm.h>
+ #include <linux/configfs.h>
+ #include <linux/badblocks.h>
+ #define SECTOR_SHIFT          9
+ #define PAGE_SECTORS_SHIFT    (PAGE_SHIFT - SECTOR_SHIFT)
+ #define PAGE_SECTORS          (1 << PAGE_SECTORS_SHIFT)
+ #define SECTOR_SIZE           (1 << SECTOR_SHIFT)
+ #define SECTOR_MASK           (PAGE_SECTORS - 1)
+ #define FREE_BATCH            16
+ #define TICKS_PER_SEC         50ULL
+ #define TIMER_INTERVAL                (NSEC_PER_SEC / TICKS_PER_SEC)
+ static inline u64 mb_per_tick(int mbps)
+ {
+       return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
+ }
  
  struct nullb_cmd {
        struct list_head list;
        struct llist_node ll_list;
 -      struct call_single_data csd;
 +      call_single_data_t csd;
        struct request *rq;
        struct bio *bio;
        unsigned int tag;
        struct nullb_queue *nq;
        struct hrtimer timer;
+       blk_status_t error;
  };
  
  struct nullb_queue {
        unsigned long *tag_map;
        wait_queue_head_t wait;
        unsigned int queue_depth;
+       struct nullb_device *dev;
  
        struct nullb_cmd *cmds;
  };
  
+ /*
+  * Status flags for nullb_device.
+  *
+  * CONFIGURED:        Device has been configured and turned on. Cannot reconfigure.
+  * UP:                Device is currently on and visible in userspace.
+  * THROTTLED: Device is being throttled.
+  * CACHE:     Device is using a write-back cache.
+  */
+ enum nullb_device_flags {
+       NULLB_DEV_FL_CONFIGURED = 0,
+       NULLB_DEV_FL_UP         = 1,
+       NULLB_DEV_FL_THROTTLED  = 2,
+       NULLB_DEV_FL_CACHE      = 3,
+ };
+ /*
+  * nullb_page is a page in memory for nullb devices.
+  *
+  * @page:     The page holding the data.
+  * @bitmap:   The bitmap represents which sector in the page has data.
+  *            Each bit represents one block size. For example, sector 8
+  *            will use the 7th bit
+  * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
+  * page is being flushing to storage. FREE means the cache page is freed and
+  * should be skipped from flushing to storage. Please see
+  * null_make_cache_space
+  */
+ struct nullb_page {
+       struct page *page;
+       unsigned long bitmap;
+ };
+ #define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
+ #define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)
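
For orientation, the data and cache radix trees introduced below are indexed by page, and each nullb_page's bitmap is indexed by the sector offset within that page (the top two bits being the LOCK/FREE flags above). A standalone user-space sketch of the mapping, assuming 4 KiB pages and the 512-byte sectors implied by SECTOR_SHIFT:

#include <stdio.h>

#define SECTOR_SHIFT            9
#define DEMO_PAGE_SHIFT         12                      /* assume 4 KiB pages */
#define PAGE_SECTORS_SHIFT      (DEMO_PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS            (1 << PAGE_SECTORS_SHIFT)
#define SECTOR_MASK             (PAGE_SECTORS - 1)

int main(void)
{
        unsigned long long sector = 21;         /* arbitrary example sector */

        /* Radix-tree index: which nullb_page holds this sector. */
        unsigned long long idx = sector >> PAGE_SECTORS_SHIFT;
        /* Bit within nullb_page->bitmap: which slot inside that page has data. */
        unsigned int bit = (unsigned int)(sector & SECTOR_MASK);

        printf("sector %llu -> page index %llu, bitmap bit %u\n",
               sector, idx, bit);
        return 0;
}
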
+ struct nullb_device {
+       struct nullb *nullb;
+       struct config_item item;
+       struct radix_tree_root data; /* data stored in the disk */
+       struct radix_tree_root cache; /* disk cache data */
+       unsigned long flags; /* device flags */
+       unsigned int curr_cache;
+       struct badblocks badblocks;
+       unsigned long size; /* device size in MB */
+       unsigned long completion_nsec; /* time in ns to complete a request */
+       unsigned long cache_size; /* disk cache size in MB */
+       unsigned int submit_queues; /* number of submission queues */
+       unsigned int home_node; /* home node for the device */
+       unsigned int queue_mode; /* block interface */
+       unsigned int blocksize; /* block size */
+       unsigned int irqmode; /* IRQ completion handler */
+       unsigned int hw_queue_depth; /* queue depth */
+       unsigned int index; /* index of the disk, only valid with a disk */
+       unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+       bool use_lightnvm; /* register as a LightNVM device */
+       bool blocking; /* blocking blk-mq device */
+       bool use_per_node_hctx; /* use per-node allocation for hardware context */
+       bool power; /* power on/off the device */
+       bool memory_backed; /* if data is stored in memory */
+       bool discard; /* if support discard */
+ };
  struct nullb {
+       struct nullb_device *dev;
        struct list_head list;
        unsigned int index;
        struct request_queue *q;
        struct nvm_dev *ndev;
        struct blk_mq_tag_set *tag_set;
        struct blk_mq_tag_set __tag_set;
-       struct hrtimer timer;
        unsigned int queue_depth;
+       atomic_long_t cur_bytes;
+       struct hrtimer bw_timer;
+       unsigned long cache_flush_pos;
        spinlock_t lock;
  
        struct nullb_queue *queues;
  static LIST_HEAD(nullb_list);
  static struct mutex lock;
  static int null_major;
- static int nullb_indexes;
+ static DEFINE_IDA(nullb_indexes);
  static struct kmem_cache *ppa_cache;
  static struct blk_mq_tag_set tag_set;
  
@@@ -65,15 -154,15 +154,15 @@@ enum 
        NULL_Q_MQ               = 2,
  };
  
- static int submit_queues;
- module_param(submit_queues, int, S_IRUGO);
+ static int g_submit_queues = 1;
+ module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
  MODULE_PARM_DESC(submit_queues, "Number of submission queues");
  
- static int home_node = NUMA_NO_NODE;
- module_param(home_node, int, S_IRUGO);
+ static int g_home_node = NUMA_NO_NODE;
+ module_param_named(home_node, g_home_node, int, S_IRUGO);
  MODULE_PARM_DESC(home_node, "Home node for the device");
  
- static int queue_mode = NULL_Q_MQ;
+ static int g_queue_mode = NULL_Q_MQ;
  
  static int null_param_store_val(const char *str, int *val, int min, int max)
  {
  
  static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
  {
-       return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+       return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
  }
  
  static const struct kernel_param_ops null_queue_mode_param_ops = {
        .get    = param_get_int,
  };
  
- device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO);
+ device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO);
  MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
  
- static int gb = 250;
- module_param(gb, int, S_IRUGO);
+ static int g_gb = 250;
+ module_param_named(gb, g_gb, int, S_IRUGO);
  MODULE_PARM_DESC(gb, "Size in GB");
  
- static int bs = 512;
- module_param(bs, int, S_IRUGO);
+ static int g_bs = 512;
+ module_param_named(bs, g_bs, int, S_IRUGO);
  MODULE_PARM_DESC(bs, "Block size (in bytes)");
  
  static int nr_devices = 1;
  module_param(nr_devices, int, S_IRUGO);
  MODULE_PARM_DESC(nr_devices, "Number of devices to register");
  
- static bool use_lightnvm;
- module_param(use_lightnvm, bool, S_IRUGO);
+ static bool g_use_lightnvm;
+ module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
  MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
  
- static bool blocking;
- module_param(blocking, bool, S_IRUGO);
+ static bool g_blocking;
+ module_param_named(blocking, g_blocking, bool, S_IRUGO);
  MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
  
  static bool shared_tags;
  module_param(shared_tags, bool, S_IRUGO);
  MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
  
- static int irqmode = NULL_IRQ_SOFTIRQ;
+ static int g_irqmode = NULL_IRQ_SOFTIRQ;
  
  static int null_set_irqmode(const char *str, const struct kernel_param *kp)
  {
-       return null_param_store_val(str, &irqmode, NULL_IRQ_NONE,
+       return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
                                        NULL_IRQ_TIMER);
  }
  
@@@ -140,21 -229,358 +229,358 @@@ static const struct kernel_param_ops nu
        .get    = param_get_int,
  };
  
- device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
+ device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO);
  MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
  
- static unsigned long completion_nsec = 10000;
- module_param(completion_nsec, ulong, S_IRUGO);
+ static unsigned long g_completion_nsec = 10000;
+ module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO);
  MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
  
- static int hw_queue_depth = 64;
- module_param(hw_queue_depth, int, S_IRUGO);
+ static int g_hw_queue_depth = 64;
+ module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO);
  MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
  
- static bool use_per_node_hctx = false;
- module_param(use_per_node_hctx, bool, S_IRUGO);
+ static bool g_use_per_node_hctx;
+ module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO);
  MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
  
+ static struct nullb_device *null_alloc_dev(void);
+ static void null_free_dev(struct nullb_device *dev);
+ static void null_del_dev(struct nullb *nullb);
+ static int null_add_dev(struct nullb_device *dev);
+ static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
+ static inline struct nullb_device *to_nullb_device(struct config_item *item)
+ {
+       return item ? container_of(item, struct nullb_device, item) : NULL;
+ }
+ static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
+ {
+       return snprintf(page, PAGE_SIZE, "%u\n", val);
+ }
+ static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
+       char *page)
+ {
+       return snprintf(page, PAGE_SIZE, "%lu\n", val);
+ }
+ static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
+ {
+       return snprintf(page, PAGE_SIZE, "%u\n", val);
+ }
+ static ssize_t nullb_device_uint_attr_store(unsigned int *val,
+       const char *page, size_t count)
+ {
+       unsigned int tmp;
+       int result;
+       result = kstrtouint(page, 0, &tmp);
+       if (result)
+               return result;
+       *val = tmp;
+       return count;
+ }
+ static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
+       const char *page, size_t count)
+ {
+       int result;
+       unsigned long tmp;
+       result = kstrtoul(page, 0, &tmp);
+       if (result)
+               return result;
+       *val = tmp;
+       return count;
+ }
+ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
+       size_t count)
+ {
+       bool tmp;
+       int result;
+       result = kstrtobool(page,  &tmp);
+       if (result)
+               return result;
+       *val = tmp;
+       return count;
+ }
+ /* The following macro should only be used with TYPE = {uint, ulong, bool}. */
+ #define NULLB_DEVICE_ATTR(NAME, TYPE)                                         \
+ static ssize_t                                                                        \
+ nullb_device_##NAME##_show(struct config_item *item, char *page)              \
+ {                                                                             \
+       return nullb_device_##TYPE##_attr_show(                                 \
+                               to_nullb_device(item)->NAME, page);             \
+ }                                                                             \
+ static ssize_t                                                                        \
+ nullb_device_##NAME##_store(struct config_item *item, const char *page,               \
+                           size_t count)                                       \
+ {                                                                             \
+       if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags))   \
+               return -EBUSY;                                                  \
+       return nullb_device_##TYPE##_attr_store(                                \
+                       &to_nullb_device(item)->NAME, page, count);             \
+ }                                                                             \
+ CONFIGFS_ATTR(nullb_device_, NAME);
+ NULLB_DEVICE_ATTR(size, ulong);
+ NULLB_DEVICE_ATTR(completion_nsec, ulong);
+ NULLB_DEVICE_ATTR(submit_queues, uint);
+ NULLB_DEVICE_ATTR(home_node, uint);
+ NULLB_DEVICE_ATTR(queue_mode, uint);
+ NULLB_DEVICE_ATTR(blocksize, uint);
+ NULLB_DEVICE_ATTR(irqmode, uint);
+ NULLB_DEVICE_ATTR(hw_queue_depth, uint);
+ NULLB_DEVICE_ATTR(index, uint);
+ NULLB_DEVICE_ATTR(use_lightnvm, bool);
+ NULLB_DEVICE_ATTR(blocking, bool);
+ NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
+ NULLB_DEVICE_ATTR(memory_backed, bool);
+ NULLB_DEVICE_ATTR(discard, bool);
+ NULLB_DEVICE_ATTR(mbps, uint);
+ NULLB_DEVICE_ATTR(cache_size, ulong);
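
As a reading aid, expanding NULLB_DEVICE_ATTR() by hand for one attribute (blocksize, picked only as an example) gives roughly the following pair of configfs handlers; this is the mechanical macro expansion, not additional driver code:

static ssize_t
nullb_device_blocksize_show(struct config_item *item, char *page)
{
        return nullb_device_uint_attr_show(to_nullb_device(item)->blocksize,
                                           page);
}

static ssize_t
nullb_device_blocksize_store(struct config_item *item, const char *page,
                             size_t count)
{
        /* Attributes are frozen once the device has been configured. */
        if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags))
                return -EBUSY;
        return nullb_device_uint_attr_store(&to_nullb_device(item)->blocksize,
                                            page, count);
}
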
+ static ssize_t nullb_device_power_show(struct config_item *item, char *page)
+ {
+       return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
+ }
+ static ssize_t nullb_device_power_store(struct config_item *item,
+                                    const char *page, size_t count)
+ {
+       struct nullb_device *dev = to_nullb_device(item);
+       bool newp = false;
+       ssize_t ret;
+       ret = nullb_device_bool_attr_store(&newp, page, count);
+       if (ret < 0)
+               return ret;
+       if (!dev->power && newp) {
+               if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
+                       return count;
+               if (null_add_dev(dev)) {
+                       clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+                       return -ENOMEM;
+               }
+               set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+               dev->power = newp;
+       } else if (dev->power && !newp) {
+               mutex_lock(&lock);
+               dev->power = newp;
+               null_del_dev(dev->nullb);
+               mutex_unlock(&lock);
+               clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+       }
+       return count;
+ }
+ CONFIGFS_ATTR(nullb_device_, power);
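
Putting the power attribute together with the group make_item/drop_item operations defined further down: a device is created by making a directory under the nullb configfs subsystem, tuned through its attribute files, and brought up by writing 1 to power (which calls null_add_dev()). A hedged user-space sketch; the /sys/kernel/config mount point and the nullb0 directory name are assumptions for illustration, not values dictated by the driver:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Assumed paths: configfs mounted at /sys/kernel/config, device named nullb0. */
#define DEV_DIR "/sys/kernel/config/nullb/nullb0"

static int write_attr(const char *name, const char *val)
{
        char path[256];
        int fd;

        snprintf(path, sizeof(path), DEV_DIR "/%s", name);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        /* mkdir triggers the subsystem's make_item and allocates a nullb_device */
        if (mkdir(DEV_DIR, 0755) < 0 && errno != EEXIST)
                perror("mkdir");

        write_attr("size", "1024");             /* backing size in MB */
        write_attr("memory_backed", "1");       /* actually store the data */
        if (write_attr("power", "1") < 0)       /* triggers null_add_dev() */
                perror("power");
        return 0;
}
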
+ static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
+ {
+       struct nullb_device *t_dev = to_nullb_device(item);
+       return badblocks_show(&t_dev->badblocks, page, 0);
+ }
+ static ssize_t nullb_device_badblocks_store(struct config_item *item,
+                                    const char *page, size_t count)
+ {
+       struct nullb_device *t_dev = to_nullb_device(item);
+       char *orig, *buf, *tmp;
+       u64 start, end;
+       int ret;
+       orig = kstrndup(page, count, GFP_KERNEL);
+       if (!orig)
+               return -ENOMEM;
+       buf = strstrip(orig);
+       ret = -EINVAL;
+       if (buf[0] != '+' && buf[0] != '-')
+               goto out;
+       tmp = strchr(&buf[1], '-');
+       if (!tmp)
+               goto out;
+       *tmp = '\0';
+       ret = kstrtoull(buf + 1, 0, &start);
+       if (ret)
+               goto out;
+       ret = kstrtoull(tmp + 1, 0, &end);
+       if (ret)
+               goto out;
+       ret = -EINVAL;
+       if (start > end)
+               goto out;
+       /* enable badblocks */
+       cmpxchg(&t_dev->badblocks.shift, -1, 0);
+       if (buf[0] == '+')
+               ret = badblocks_set(&t_dev->badblocks, start,
+                       end - start + 1, 1);
+       else
+               ret = badblocks_clear(&t_dev->badblocks, start,
+                       end - start + 1);
+       if (ret == 0)
+               ret = count;
+ out:
+       kfree(orig);
+       return ret;
+ }
+ CONFIGFS_ATTR(nullb_device_, badblocks);
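
The badblocks store handler above accepts one range per write: "+<start>-<end>" marks the sectors bad and "-<start>-<end>" clears them, with both bounds parsed by kstrtoull(). A small user-space sketch of exercising it, reusing the assumed configfs path from the earlier example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Assumed path: configfs mount point and device directory name. */
        const char *attr = "/sys/kernel/config/nullb/nullb0/badblocks";
        const char *range = "+0-7";             /* mark sectors 0..7 bad */
        int fd = open(attr, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, range, strlen(range)) < 0)
                perror("write");
        close(fd);
        return 0;
}
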
+ static struct configfs_attribute *nullb_device_attrs[] = {
+       &nullb_device_attr_size,
+       &nullb_device_attr_completion_nsec,
+       &nullb_device_attr_submit_queues,
+       &nullb_device_attr_home_node,
+       &nullb_device_attr_queue_mode,
+       &nullb_device_attr_blocksize,
+       &nullb_device_attr_irqmode,
+       &nullb_device_attr_hw_queue_depth,
+       &nullb_device_attr_index,
+       &nullb_device_attr_use_lightnvm,
+       &nullb_device_attr_blocking,
+       &nullb_device_attr_use_per_node_hctx,
+       &nullb_device_attr_power,
+       &nullb_device_attr_memory_backed,
+       &nullb_device_attr_discard,
+       &nullb_device_attr_mbps,
+       &nullb_device_attr_cache_size,
+       &nullb_device_attr_badblocks,
+       NULL,
+ };
+ static void nullb_device_release(struct config_item *item)
+ {
+       struct nullb_device *dev = to_nullb_device(item);
+       badblocks_exit(&dev->badblocks);
+       null_free_device_storage(dev, false);
+       null_free_dev(dev);
+ }
+ static struct configfs_item_operations nullb_device_ops = {
+       .release        = nullb_device_release,
+ };
+ static struct config_item_type nullb_device_type = {
+       .ct_item_ops    = &nullb_device_ops,
+       .ct_attrs       = nullb_device_attrs,
+       .ct_owner       = THIS_MODULE,
+ };
+ static struct
+ config_item *nullb_group_make_item(struct config_group *group, const char *name)
+ {
+       struct nullb_device *dev;
+       dev = null_alloc_dev();
+       if (!dev)
+               return ERR_PTR(-ENOMEM);
+       config_item_init_type_name(&dev->item, name, &nullb_device_type);
+       return &dev->item;
+ }
+ static void
+ nullb_group_drop_item(struct config_group *group, struct config_item *item)
+ {
+       struct nullb_device *dev = to_nullb_device(item);
+       if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
+               mutex_lock(&lock);
+               dev->power = false;
+               null_del_dev(dev->nullb);
+               mutex_unlock(&lock);
+       }
+       config_item_put(item);
+ }
+ static ssize_t memb_group_features_show(struct config_item *item, char *page)
+ {
+       return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
+ }
+ CONFIGFS_ATTR_RO(memb_group_, features);
+ static struct configfs_attribute *nullb_group_attrs[] = {
+       &memb_group_attr_features,
+       NULL,
+ };
+ static struct configfs_group_operations nullb_group_ops = {
+       .make_item      = nullb_group_make_item,
+       .drop_item      = nullb_group_drop_item,
+ };
+ static struct config_item_type nullb_group_type = {
+       .ct_group_ops   = &nullb_group_ops,
+       .ct_attrs       = nullb_group_attrs,
+       .ct_owner       = THIS_MODULE,
+ };
+ static struct configfs_subsystem nullb_subsys = {
+       .su_group = {
+               .cg_item = {
+                       .ci_namebuf = "nullb",
+                       .ci_type = &nullb_group_type,
+               },
+       },
+ };
+ static inline int null_cache_active(struct nullb *nullb)
+ {
+       return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+ }
+ static struct nullb_device *null_alloc_dev(void)
+ {
+       struct nullb_device *dev;
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return NULL;
+       INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
+       INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
+       if (badblocks_init(&dev->badblocks, 0)) {
+               kfree(dev);
+               return NULL;
+       }
+       dev->size = g_gb * 1024;
+       dev->completion_nsec = g_completion_nsec;
+       dev->submit_queues = g_submit_queues;
+       dev->home_node = g_home_node;
+       dev->queue_mode = g_queue_mode;
+       dev->blocksize = g_bs;
+       dev->irqmode = g_irqmode;
+       dev->hw_queue_depth = g_hw_queue_depth;
+       dev->use_lightnvm = g_use_lightnvm;
+       dev->blocking = g_blocking;
+       dev->use_per_node_hctx = g_use_per_node_hctx;
+       return dev;
+ }
+ static void null_free_dev(struct nullb_device *dev)
+ {
+       kfree(dev);
+ }
  static void put_tag(struct nullb_queue *nq, unsigned int tag)
  {
        clear_bit_unlock(tag, nq->tag_map);
@@@ -193,7 -619,7 +619,7 @@@ static struct nullb_cmd *__alloc_cmd(st
                cmd = &nq->cmds[tag];
                cmd->tag = tag;
                cmd->nq = nq;
-               if (irqmode == NULL_IRQ_TIMER) {
+               if (nq->dev->irqmode == NULL_IRQ_TIMER) {
                        hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
                                     HRTIMER_MODE_REL);
                        cmd->timer.function = null_cmd_timer_expired;
@@@ -229,19 -655,21 +655,21 @@@ static struct nullb_cmd *alloc_cmd(stru
  static void end_cmd(struct nullb_cmd *cmd)
  {
        struct request_queue *q = NULL;
+       int queue_mode = cmd->nq->dev->queue_mode;
  
        if (cmd->rq)
                q = cmd->rq->q;
  
        switch (queue_mode)  {
        case NULL_Q_MQ:
-               blk_mq_end_request(cmd->rq, BLK_STS_OK);
+               blk_mq_end_request(cmd->rq, cmd->error);
                return;
        case NULL_Q_RQ:
                INIT_LIST_HEAD(&cmd->rq->queuelist);
-               blk_end_request_all(cmd->rq, BLK_STS_OK);
+               blk_end_request_all(cmd->rq, cmd->error);
                break;
        case NULL_Q_BIO:
+               cmd->bio->bi_status = cmd->error;
                bio_endio(cmd->bio);
                break;
        }
@@@ -267,25 -695,582 +695,582 @@@ static enum hrtimer_restart null_cmd_ti
  
  static void null_cmd_end_timer(struct nullb_cmd *cmd)
  {
-       ktime_t kt = completion_nsec;
+       ktime_t kt = cmd->nq->dev->completion_nsec;
  
        hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
  }
  
  static void null_softirq_done_fn(struct request *rq)
  {
-       if (queue_mode == NULL_Q_MQ)
+       struct nullb *nullb = rq->q->queuedata;
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
                end_cmd(blk_mq_rq_to_pdu(rq));
        else
                end_cmd(rq->special);
  }
  
- static inline void null_handle_cmd(struct nullb_cmd *cmd)
+ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
+ {
+       struct nullb_page *t_page;
+       t_page = kmalloc(sizeof(struct nullb_page), gfp_flags);
+       if (!t_page)
+               goto out;
+       t_page->page = alloc_pages(gfp_flags, 0);
+       if (!t_page->page)
+               goto out_freepage;
+       t_page->bitmap = 0;
+       return t_page;
+ out_freepage:
+       kfree(t_page);
+ out:
+       return NULL;
+ }
+ static void null_free_page(struct nullb_page *t_page)
+ {
+       __set_bit(NULLB_PAGE_FREE, &t_page->bitmap);
+       if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap))
+               return;
+       __free_page(t_page->page);
+       kfree(t_page);
+ }
+ static void null_free_sector(struct nullb *nullb, sector_t sector,
+       bool is_cache)
+ {
+       unsigned int sector_bit;
+       u64 idx;
+       struct nullb_page *t_page, *ret;
+       struct radix_tree_root *root;
+       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       sector_bit = (sector & SECTOR_MASK);
+       t_page = radix_tree_lookup(root, idx);
+       if (t_page) {
+               __clear_bit(sector_bit, &t_page->bitmap);
+               if (!t_page->bitmap) {
+                       ret = radix_tree_delete_item(root, idx, t_page);
+                       WARN_ON(ret != t_page);
+                       null_free_page(ret);
+                       if (is_cache)
+                               nullb->dev->curr_cache -= PAGE_SIZE;
+               }
+       }
+ }
+ static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
+       struct nullb_page *t_page, bool is_cache)
+ {
+       struct radix_tree_root *root;
+       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+       if (radix_tree_insert(root, idx, t_page)) {
+               null_free_page(t_page);
+               t_page = radix_tree_lookup(root, idx);
+               WARN_ON(!t_page || t_page->page->index != idx);
+       } else if (is_cache)
+               nullb->dev->curr_cache += PAGE_SIZE;
+       return t_page;
+ }
+ static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
+ {
+       unsigned long pos = 0;
+       int nr_pages;
+       struct nullb_page *ret, *t_pages[FREE_BATCH];
+       struct radix_tree_root *root;
+       root = is_cache ? &dev->cache : &dev->data;
+       do {
+               int i;
+               nr_pages = radix_tree_gang_lookup(root,
+                               (void **)t_pages, pos, FREE_BATCH);
+               for (i = 0; i < nr_pages; i++) {
+                       pos = t_pages[i]->page->index;
+                       ret = radix_tree_delete_item(root, pos, t_pages[i]);
+                       WARN_ON(ret != t_pages[i]);
+                       null_free_page(ret);
+               }
+               pos++;
+       } while (nr_pages == FREE_BATCH);
+       if (is_cache)
+               dev->curr_cache = 0;
+ }
+ static struct nullb_page *__null_lookup_page(struct nullb *nullb,
+       sector_t sector, bool for_write, bool is_cache)
+ {
+       unsigned int sector_bit;
+       u64 idx;
+       struct nullb_page *t_page;
+       struct radix_tree_root *root;
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       sector_bit = (sector & SECTOR_MASK);
+       root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+       t_page = radix_tree_lookup(root, idx);
+       WARN_ON(t_page && t_page->page->index != idx);
+       if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap)))
+               return t_page;
+       return NULL;
+ }
+ static struct nullb_page *null_lookup_page(struct nullb *nullb,
+       sector_t sector, bool for_write, bool ignore_cache)
+ {
+       struct nullb_page *page = NULL;
+       if (!ignore_cache)
+               page = __null_lookup_page(nullb, sector, for_write, true);
+       if (page)
+               return page;
+       return __null_lookup_page(nullb, sector, for_write, false);
+ }
+ static struct nullb_page *null_insert_page(struct nullb *nullb,
+       sector_t sector, bool ignore_cache)
+ {
+       u64 idx;
+       struct nullb_page *t_page;
+       t_page = null_lookup_page(nullb, sector, true, ignore_cache);
+       if (t_page)
+               return t_page;
+       spin_unlock_irq(&nullb->lock);
+       t_page = null_alloc_page(GFP_NOIO);
+       if (!t_page)
+               goto out_lock;
+       if (radix_tree_preload(GFP_NOIO))
+               goto out_freepage;
+       spin_lock_irq(&nullb->lock);
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       t_page->page->index = idx;
+       t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
+       radix_tree_preload_end();
+       return t_page;
+ out_freepage:
+       null_free_page(t_page);
+ out_lock:
+       spin_lock_irq(&nullb->lock);
+       return null_lookup_page(nullb, sector, true, ignore_cache);
+ }
+ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
+ {
+       int i;
+       unsigned int offset;
+       u64 idx;
+       struct nullb_page *t_page, *ret;
+       void *dst, *src;
+       idx = c_page->page->index;
+       t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
+       __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap);
+       if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) {
+               null_free_page(c_page);
+               if (t_page && t_page->bitmap == 0) {
+                       ret = radix_tree_delete_item(&nullb->dev->data,
+                               idx, t_page);
+                       null_free_page(t_page);
+               }
+               return 0;
+       }
+       if (!t_page)
+               return -ENOMEM;
+       src = kmap_atomic(c_page->page);
+       dst = kmap_atomic(t_page->page);
+       for (i = 0; i < PAGE_SECTORS;
+                       i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
+               if (test_bit(i, &c_page->bitmap)) {
+                       offset = (i << SECTOR_SHIFT);
+                       memcpy(dst + offset, src + offset,
+                               nullb->dev->blocksize);
+                       __set_bit(i, &t_page->bitmap);
+               }
+       }
+       kunmap_atomic(dst);
+       kunmap_atomic(src);
+       ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
+       null_free_page(ret);
+       nullb->dev->curr_cache -= PAGE_SIZE;
+       return 0;
+ }
+ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
  {
+       int i, err, nr_pages;
+       struct nullb_page *c_pages[FREE_BATCH];
+       unsigned long flushed = 0, one_round;
+ again:
+       if ((nullb->dev->cache_size * 1024 * 1024) >
+            nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
+               return 0;
+       nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
+                       (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
+       /*
+        * null_flush_cache_page() can drop the lock before it is done with
+        * the c_pages, so to avoid a race we don't allow those pages to be freed.
+        */
+       for (i = 0; i < nr_pages; i++) {
+               nullb->cache_flush_pos = c_pages[i]->page->index;
+               /*
+                * This page is already being flushed to disk by another
+                * thread, so skip it.
+                */
+               if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap))
+                       c_pages[i] = NULL;
+               else
+                       __set_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap);
+       }
+       one_round = 0;
+       for (i = 0; i < nr_pages; i++) {
+               if (c_pages[i] == NULL)
+                       continue;
+               err = null_flush_cache_page(nullb, c_pages[i]);
+               if (err)
+                       return err;
+               one_round++;
+       }
+       flushed += one_round << PAGE_SHIFT;
+       if (n > flushed) {
+               if (nr_pages == 0)
+                       nullb->cache_flush_pos = 0;
+               if (one_round == 0) {
+                       /* give other threads a chance */
+                       spin_unlock_irq(&nullb->lock);
+                       spin_lock_irq(&nullb->lock);
+               }
+               goto again;
+       }
+       return 0;
+ }
+ static int copy_to_nullb(struct nullb *nullb, struct page *source,
+       unsigned int off, sector_t sector, size_t n, bool is_fua)
+ {
+       size_t temp, count = 0;
+       unsigned int offset;
+       struct nullb_page *t_page;
+       void *dst, *src;
+       while (count < n) {
+               temp = min_t(size_t, nullb->dev->blocksize, n - count);
+               if (null_cache_active(nullb) && !is_fua)
+                       null_make_cache_space(nullb, PAGE_SIZE);
+               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+               t_page = null_insert_page(nullb, sector,
+                       !null_cache_active(nullb) || is_fua);
+               if (!t_page)
+                       return -ENOSPC;
+               src = kmap_atomic(source);
+               dst = kmap_atomic(t_page->page);
+               memcpy(dst + offset, src + off + count, temp);
+               kunmap_atomic(dst);
+               kunmap_atomic(src);
+               __set_bit(sector & SECTOR_MASK, &t_page->bitmap);
+               if (is_fua)
+                       null_free_sector(nullb, sector, true);
+               count += temp;
+               sector += temp >> SECTOR_SHIFT;
+       }
+       return 0;
+ }
+ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+       unsigned int off, sector_t sector, size_t n)
+ {
+       size_t temp, count = 0;
+       unsigned int offset;
+       struct nullb_page *t_page;
+       void *dst, *src;
+       while (count < n) {
+               temp = min_t(size_t, nullb->dev->blocksize, n - count);
+               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+               t_page = null_lookup_page(nullb, sector, false,
+                       !null_cache_active(nullb));
+               dst = kmap_atomic(dest);
+               if (!t_page) {
+                       memset(dst + off + count, 0, temp);
+                       goto next;
+               }
+               src = kmap_atomic(t_page->page);
+               memcpy(dst + off + count, src + offset, temp);
+               kunmap_atomic(src);
+ next:
+               kunmap_atomic(dst);
+               count += temp;
+               sector += temp >> SECTOR_SHIFT;
+       }
+       return 0;
+ }
+ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
+ {
+       size_t temp;
+       spin_lock_irq(&nullb->lock);
+       while (n > 0) {
+               temp = min_t(size_t, n, nullb->dev->blocksize);
+               null_free_sector(nullb, sector, false);
+               if (null_cache_active(nullb))
+                       null_free_sector(nullb, sector, true);
+               sector += temp >> SECTOR_SHIFT;
+               n -= temp;
+       }
+       spin_unlock_irq(&nullb->lock);
+ }
+ static int null_handle_flush(struct nullb *nullb)
+ {
+       int err;
+       if (!null_cache_active(nullb))
+               return 0;
+       spin_lock_irq(&nullb->lock);
+       while (true) {
+               err = null_make_cache_space(nullb,
+                       nullb->dev->cache_size * 1024 * 1024);
+               if (err || nullb->dev->curr_cache == 0)
+                       break;
+       }
+       WARN_ON(!radix_tree_empty(&nullb->dev->cache));
+       spin_unlock_irq(&nullb->lock);
+       return err;
+ }
+ static int null_transfer(struct nullb *nullb, struct page *page,
+       unsigned int len, unsigned int off, bool is_write, sector_t sector,
+       bool is_fua)
+ {
+       int err = 0;
+       if (!is_write) {
+               err = copy_from_nullb(nullb, page, off, sector, len);
+               flush_dcache_page(page);
+       } else {
+               flush_dcache_page(page);
+               err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+       }
+       return err;
+ }
+ static int null_handle_rq(struct nullb_cmd *cmd)
+ {
+       struct request *rq = cmd->rq;
+       struct nullb *nullb = cmd->nq->dev->nullb;
+       int err;
+       unsigned int len;
+       sector_t sector;
+       struct req_iterator iter;
+       struct bio_vec bvec;
+       sector = blk_rq_pos(rq);
+       if (req_op(rq) == REQ_OP_DISCARD) {
+               null_handle_discard(nullb, sector, blk_rq_bytes(rq));
+               return 0;
+       }
+       spin_lock_irq(&nullb->lock);
+       rq_for_each_segment(bvec, rq, iter) {
+               len = bvec.bv_len;
+               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+                                    op_is_write(req_op(rq)), sector,
+                                    req_op(rq) & REQ_FUA);
+               if (err) {
+                       spin_unlock_irq(&nullb->lock);
+                       return err;
+               }
+               sector += len >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+       return 0;
+ }
+ static int null_handle_bio(struct nullb_cmd *cmd)
+ {
+       struct bio *bio = cmd->bio;
+       struct nullb *nullb = cmd->nq->dev->nullb;
+       int err;
+       unsigned int len;
+       sector_t sector;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
+       sector = bio->bi_iter.bi_sector;
+       if (bio_op(bio) == REQ_OP_DISCARD) {
+               null_handle_discard(nullb, sector,
+                       bio_sectors(bio) << SECTOR_SHIFT);
+               return 0;
+       }
+       spin_lock_irq(&nullb->lock);
+       bio_for_each_segment(bvec, bio, iter) {
+               len = bvec.bv_len;
+               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+                                    op_is_write(bio_op(bio)), sector,
+                                    bio_op(bio) & REQ_FUA);
+               if (err) {
+                       spin_unlock_irq(&nullb->lock);
+                       return err;
+               }
+               sector += len >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+       return 0;
+ }
+ static void null_stop_queue(struct nullb *nullb)
+ {
+       struct request_queue *q = nullb->q;
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
+               blk_mq_stop_hw_queues(q);
+       else {
+               spin_lock_irq(q->queue_lock);
+               blk_stop_queue(q);
+               spin_unlock_irq(q->queue_lock);
+       }
+ }
+ static void null_restart_queue_async(struct nullb *nullb)
+ {
+       struct request_queue *q = nullb->q;
+       unsigned long flags;
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
+               blk_mq_start_stopped_hw_queues(q, true);
+       else {
+               spin_lock_irqsave(q->queue_lock, flags);
+               blk_start_queue_async(q);
+               spin_unlock_irqrestore(q->queue_lock, flags);
+       }
+ }
+ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
+ {
+       struct nullb_device *dev = cmd->nq->dev;
+       struct nullb *nullb = dev->nullb;
+       int err = 0;
+       if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
+               struct request *rq = cmd->rq;
+               if (!hrtimer_active(&nullb->bw_timer))
+                       hrtimer_restart(&nullb->bw_timer);
+               if (atomic_long_sub_return(blk_rq_bytes(rq),
+                               &nullb->cur_bytes) < 0) {
+                       null_stop_queue(nullb);
+                       /* the bw timer may have refilled the budget; recheck */
+                       if (atomic_long_read(&nullb->cur_bytes) > 0)
+                               null_restart_queue_async(nullb);
+                       if (dev->queue_mode == NULL_Q_RQ) {
+                               struct request_queue *q = nullb->q;
+                               spin_lock_irq(q->queue_lock);
+                               rq->rq_flags |= RQF_DONTPREP;
+                               blk_requeue_request(q, rq);
+                               spin_unlock_irq(q->queue_lock);
+                               return BLK_STS_OK;
+                       } else
+                               /* requeue request */
+                               return BLK_STS_RESOURCE;
+               }
+       }
+       if (nullb->dev->badblocks.shift != -1) {
+               int bad_sectors;
+               sector_t sector, size, first_bad;
+               bool is_flush = true;
+               if (dev->queue_mode == NULL_Q_BIO &&
+                               bio_op(cmd->bio) != REQ_OP_FLUSH) {
+                       is_flush = false;
+                       sector = cmd->bio->bi_iter.bi_sector;
+                       size = bio_sectors(cmd->bio);
+               }
+               if (dev->queue_mode != NULL_Q_BIO &&
+                               req_op(cmd->rq) != REQ_OP_FLUSH) {
+                       is_flush = false;
+                       sector = blk_rq_pos(cmd->rq);
+                       size = blk_rq_sectors(cmd->rq);
+               }
+               if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
+                               size, &first_bad, &bad_sectors)) {
+                       cmd->error = BLK_STS_IOERR;
+                       goto out;
+               }
+       }
+       if (dev->memory_backed) {
+               if (dev->queue_mode == NULL_Q_BIO) {
+                       if (bio_op(cmd->bio) == REQ_OP_FLUSH)
+                               err = null_handle_flush(nullb);
+                       else
+                               err = null_handle_bio(cmd);
+               } else {
+                       if (req_op(cmd->rq) == REQ_OP_FLUSH)
+                               err = null_handle_flush(nullb);
+                       else
+                               err = null_handle_rq(cmd);
+               }
+       }
+       cmd->error = errno_to_blk_status(err);
+ out:
        /* Complete IO by inline, softirq or timer */
-       switch (irqmode) {
+       switch (dev->irqmode) {
        case NULL_IRQ_SOFTIRQ:
-               switch (queue_mode)  {
+               switch (dev->queue_mode)  {
                case NULL_Q_MQ:
                        blk_mq_complete_request(cmd->rq);
                        break;
                null_cmd_end_timer(cmd);
                break;
        }
+       return BLK_STS_OK;
+ }
+ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
+ {
+       struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
+       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+       unsigned int mbps = nullb->dev->mbps;
+       if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
+               return HRTIMER_NORESTART;
+       atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
+       null_restart_queue_async(nullb);
+       hrtimer_forward_now(&nullb->bw_timer, timer_interval);
+       return HRTIMER_RESTART;
+ }
+ static void nullb_setup_bwtimer(struct nullb *nullb)
+ {
+       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+       hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       nullb->bw_timer.function = nullb_bwtimer_fn;
+       atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
+       hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
  }
  
  static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
@@@ -366,20 -1379,20 +1379,20 @@@ static blk_status_t null_queue_rq(struc
                         const struct blk_mq_queue_data *bd)
  {
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+       struct nullb_queue *nq = hctx->driver_data;
  
        might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
  
-       if (irqmode == NULL_IRQ_TIMER) {
+       if (nq->dev->irqmode == NULL_IRQ_TIMER) {
                hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                cmd->timer.function = null_cmd_timer_expired;
        }
        cmd->rq = bd->rq;
-       cmd->nq = hctx->driver_data;
+       cmd->nq = nq;
  
        blk_mq_start_request(bd->rq);
  
-       null_handle_cmd(cmd);
-       return BLK_STS_OK;
+       return null_handle_cmd(cmd);
  }
  
  static const struct blk_mq_ops null_mq_ops = {
@@@ -438,7 -1451,8 +1451,8 @@@ static int null_lnvm_submit_io(struct n
  
  static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
  {
-       sector_t size = gb * 1024 * 1024 * 1024ULL;
+       struct nullb *nullb = dev->q->queuedata;
+       sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
        sector_t blksize;
        struct nvm_id_group *grp;
  
        id->ppaf.ch_offset = 56;
        id->ppaf.ch_len = 8;
  
-       sector_div(size, bs); /* convert size to pages */
+       sector_div(size, nullb->dev->blocksize); /* convert size to pages */
        size >>= 8; /* convert size to pages per block */
        grp = &id->grp;
        grp->mtype = 0;
        grp->num_blk = blksize;
        grp->num_pln = 1;
  
-       grp->fpg_sz = bs;
-       grp->csecs = bs;
+       grp->fpg_sz = nullb->dev->blocksize;
+       grp->csecs = nullb->dev->blocksize;
        grp->trdt = 25000;
        grp->trdm = 25000;
        grp->tprt = 500000;
        grp->tbet = 1500000;
        grp->tbem = 1500000;
        grp->mpos = 0x010101; /* single plane rwe */
-       grp->cpar = hw_queue_depth;
+       grp->cpar = nullb->dev->hw_queue_depth;
  
        return 0;
  }
@@@ -568,19 -1582,44 +1582,44 @@@ static void null_nvm_unregister(struct 
  
  static void null_del_dev(struct nullb *nullb)
  {
+       struct nullb_device *dev = nullb->dev;
+       ida_simple_remove(&nullb_indexes, nullb->index);
        list_del_init(&nullb->list);
  
-       if (use_lightnvm)
+       if (dev->use_lightnvm)
                null_nvm_unregister(nullb);
        else
                del_gendisk(nullb->disk);
+       if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
+               hrtimer_cancel(&nullb->bw_timer);
+               atomic_long_set(&nullb->cur_bytes, LONG_MAX);
+               null_restart_queue_async(nullb);
+       }
        blk_cleanup_queue(nullb->q);
-       if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+       if (dev->queue_mode == NULL_Q_MQ &&
+           nullb->tag_set == &nullb->__tag_set)
                blk_mq_free_tag_set(nullb->tag_set);
-       if (!use_lightnvm)
+       if (!dev->use_lightnvm)
                put_disk(nullb->disk);
        cleanup_queues(nullb);
+       if (null_cache_active(nullb))
+               null_free_device_storage(nullb->dev, true);
        kfree(nullb);
+       dev->nullb = NULL;
+ }
+ static void null_config_discard(struct nullb *nullb)
+ {
+       if (nullb->dev->discard == false)
+               return;
+       nullb->q->limits.discard_granularity = nullb->dev->blocksize;
+       nullb->q->limits.discard_alignment = nullb->dev->blocksize;
+       blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q);
  }
  
  static int null_open(struct block_device *bdev, fmode_t mode)
@@@ -605,6 -1644,7 +1644,7 @@@ static void null_init_queue(struct null
  
        init_waitqueue_head(&nq->wait);
        nq->queue_depth = nullb->queue_depth;
+       nq->dev = nullb->dev;
  }
  
  static void null_init_queues(struct nullb *nullb)
@@@ -652,13 -1692,13 +1692,13 @@@ static int setup_commands(struct nullb_
  
  static int setup_queues(struct nullb *nullb)
  {
-       nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
-                                                               GFP_KERNEL);
+       nullb->queues = kzalloc(nullb->dev->submit_queues *
+               sizeof(struct nullb_queue), GFP_KERNEL);
        if (!nullb->queues)
                return -ENOMEM;
  
        nullb->nr_queues = 0;
-       nullb->queue_depth = hw_queue_depth;
+       nullb->queue_depth = nullb->dev->hw_queue_depth;
  
        return 0;
  }
@@@ -668,7 -1708,7 +1708,7 @@@ static int init_driver_queues(struct nu
        struct nullb_queue *nq;
        int i, ret = 0;
  
-       for (i = 0; i < submit_queues; i++) {
+       for (i = 0; i < nullb->dev->submit_queues; i++) {
                nq = &nullb->queues[i];
  
                null_init_queue(nullb, nq);
@@@ -686,10 -1726,10 +1726,10 @@@ static int null_gendisk_register(struc
        struct gendisk *disk;
        sector_t size;
  
-       disk = nullb->disk = alloc_disk_node(1, home_node);
+       disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
        if (!disk)
                return -ENOMEM;
-       size = gb * 1024 * 1024 * 1024ULL;
+       size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
        set_capacity(disk, size >> 9);
  
        disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
        return 0;
  }
  
- static int null_init_tag_set(struct blk_mq_tag_set *set)
+ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
  {
        set->ops = &null_mq_ops;
-       set->nr_hw_queues = submit_queues;
-       set->queue_depth = hw_queue_depth;
-       set->numa_node = home_node;
+       set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
+                                               g_submit_queues;
+       set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
+                                               g_hw_queue_depth;
+       set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
        set->cmd_size   = sizeof(struct nullb_cmd);
        set->flags = BLK_MQ_F_SHOULD_MERGE;
        set->driver_data = NULL;
  
-       if (blocking)
+       if ((nullb && nullb->dev->blocking) || g_blocking)
                set->flags |= BLK_MQ_F_BLOCKING;
  
        return blk_mq_alloc_tag_set(set);
  }
  
- static int null_add_dev(void)
+ static void null_validate_conf(struct nullb_device *dev)
+ {
+       dev->blocksize = round_down(dev->blocksize, 512);
+       dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
+       if (dev->use_lightnvm && dev->blocksize != 4096)
+               dev->blocksize = 4096;
+       if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
+               dev->queue_mode = NULL_Q_MQ;
+       if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
+               if (dev->submit_queues != nr_online_nodes)
+                       dev->submit_queues = nr_online_nodes;
+       } else if (dev->submit_queues > nr_cpu_ids)
+               dev->submit_queues = nr_cpu_ids;
+       else if (dev->submit_queues == 0)
+               dev->submit_queues = 1;
+       dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
+       dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+       /* Memory backing allocates memory in the I/O path, so make the queue blocking */
+       if (dev->memory_backed)
+               dev->blocking = true;
+       else /* cache is meaningless */
+               dev->cache_size = 0;
+       dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
+                                               dev->cache_size);
+       dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
+       /* bio-based queues cannot be stopped, so bandwidth throttling is unsupported */
+       if (dev->queue_mode == NULL_Q_BIO)
+               dev->mbps = 0;
+ }
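
null_validate_conf() silently fixes up inconsistent settings instead of rejecting them. As a worked example of the blocksize clamp alone, here is a user-space mirror, assuming the usual round_down()/clamp_t() semantics for power-of-two alignment:

#include <stdio.h>

/* User-space stand-ins for the kernel helpers used above (y must be a power of 2). */
#define round_down(x, y)        ((x) & ~((y) - 1))
#define clamp_t(type, v, lo, hi) \
        ((type)(v) < (type)(lo) ? (type)(lo) : \
         ((type)(v) > (type)(hi) ? (type)(hi) : (type)(v)))

int main(void)
{
        unsigned int blocksize = 1000;          /* an odd user-supplied value */

        blocksize = round_down(blocksize, 512);                   /* -> 512 */
        blocksize = clamp_t(unsigned int, blocksize, 512, 4096);  /* stays 512 */
        printf("effective blocksize: %u\n", blocksize);
        return 0;
}
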
+ static int null_add_dev(struct nullb_device *dev)
  {
        struct nullb *nullb;
        int rv;
  
-       nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
+       null_validate_conf(dev);
+       nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
        if (!nullb) {
                rv = -ENOMEM;
                goto out;
        }
+       nullb->dev = dev;
+       dev->nullb = nullb;
  
        spin_lock_init(&nullb->lock);
  
-       if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
-               submit_queues = nr_online_nodes;
        rv = setup_queues(nullb);
        if (rv)
                goto out_free_nullb;
  
-       if (queue_mode == NULL_Q_MQ) {
+       if (dev->queue_mode == NULL_Q_MQ) {
                if (shared_tags) {
                        nullb->tag_set = &tag_set;
                        rv = 0;
                } else {
                        nullb->tag_set = &nullb->__tag_set;
-                       rv = null_init_tag_set(nullb->tag_set);
+                       rv = null_init_tag_set(nullb, nullb->tag_set);
                }
  
                if (rv)
                        goto out_cleanup_tags;
                }
                null_init_queues(nullb);
-       } else if (queue_mode == NULL_Q_BIO) {
-               nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+       } else if (dev->queue_mode == NULL_Q_BIO) {
+               nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
                if (!nullb->q) {
                        rv = -ENOMEM;
                        goto out_cleanup_queues;
                if (rv)
                        goto out_cleanup_blk_queue;
        } else {
-               nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+               nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock,
+                                               dev->home_node);
                if (!nullb->q) {
                        rv = -ENOMEM;
                        goto out_cleanup_queues;
                        goto out_cleanup_blk_queue;
        }
  
+       if (dev->mbps) {
+               set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
+               nullb_setup_bwtimer(nullb);
+       }
+       if (dev->cache_size > 0) {
+               set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+               blk_queue_write_cache(nullb->q, true, true);
+               blk_queue_flush_queueable(nullb->q, true);
+       }
        nullb->q->queuedata = nullb;
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
        queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
  
        mutex_lock(&lock);
-       nullb->index = nullb_indexes++;
+       nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+       dev->index = nullb->index;
        mutex_unlock(&lock);
  
-       blk_queue_logical_block_size(nullb->q, bs);
-       blk_queue_physical_block_size(nullb->q, bs);
+       blk_queue_logical_block_size(nullb->q, dev->blocksize);
+       blk_queue_physical_block_size(nullb->q, dev->blocksize);
+       null_config_discard(nullb);
  
        sprintf(nullb->disk_name, "nullb%d", nullb->index);
  
-       if (use_lightnvm)
+       if (dev->use_lightnvm)
                rv = null_nvm_register(nullb);
        else
                rv = null_gendisk_register(nullb);
  out_cleanup_blk_queue:
        blk_cleanup_queue(nullb->q);
  out_cleanup_tags:
-       if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+       if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
                blk_mq_free_tag_set(nullb->tag_set);
  out_cleanup_queues:
        cleanup_queues(nullb);
@@@ -825,51 -1917,63 +1917,63 @@@ static int __init null_init(void
        int ret = 0;
        unsigned int i;
        struct nullb *nullb;
+       struct nullb_device *dev;
+       /* make sure nullb_page.bitmap has one bit per sector plus the two flag bits */
+       if (sizeof(unsigned long) * 8 - 2 < (PAGE_SIZE >> SECTOR_SHIFT))
+               return -EINVAL;
  
-       if (bs > PAGE_SIZE) {
+       if (g_bs > PAGE_SIZE) {
                pr_warn("null_blk: invalid block size\n");
                pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
-               bs = PAGE_SIZE;
+               g_bs = PAGE_SIZE;
        }
  
-       if (use_lightnvm && bs != 4096) {
+       if (g_use_lightnvm && g_bs != 4096) {
                pr_warn("null_blk: LightNVM only supports 4k block size\n");
                pr_warn("null_blk: defaults block size to 4k\n");
-               bs = 4096;
+               g_bs = 4096;
        }
  
-       if (use_lightnvm && queue_mode != NULL_Q_MQ) {
+       if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
                pr_warn("null_blk: LightNVM only supported for blk-mq\n");
                pr_warn("null_blk: defaults queue mode to blk-mq\n");
-               queue_mode = NULL_Q_MQ;
+               g_queue_mode = NULL_Q_MQ;
        }
  
-       if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
-               if (submit_queues < nr_online_nodes) {
-                       pr_warn("null_blk: submit_queues param is set to %u.",
+       if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+               if (g_submit_queues != nr_online_nodes) {
+                       pr_warn("null_blk: submit_queues param is set to %u.\n",
                                                        nr_online_nodes);
-                       submit_queues = nr_online_nodes;
+                       g_submit_queues = nr_online_nodes;
                }
-       } else if (submit_queues > nr_cpu_ids)
-               submit_queues = nr_cpu_ids;
-       else if (!submit_queues)
-               submit_queues = 1;
+       } else if (g_submit_queues > nr_cpu_ids)
+               g_submit_queues = nr_cpu_ids;
+       else if (g_submit_queues <= 0)
+               g_submit_queues = 1;
  
-       if (queue_mode == NULL_Q_MQ && shared_tags) {
-               ret = null_init_tag_set(&tag_set);
+       if (g_queue_mode == NULL_Q_MQ && shared_tags) {
+               ret = null_init_tag_set(NULL, &tag_set);
                if (ret)
                        return ret;
        }
  
+       config_group_init(&nullb_subsys.su_group);
+       mutex_init(&nullb_subsys.su_mutex);
+       ret = configfs_register_subsystem(&nullb_subsys);
+       if (ret)
+               goto err_tagset;
        mutex_init(&lock);
  
        null_major = register_blkdev(0, "nullb");
        if (null_major < 0) {
                ret = null_major;
-               goto err_tagset;
+               goto err_conf;
        }
  
-       if (use_lightnvm) {
+       if (g_use_lightnvm) {
                ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
                                                                0, 0, NULL);
                if (!ppa_cache) {
        }
  
        for (i = 0; i < nr_devices; i++) {
-               ret = null_add_dev();
-               if (ret)
+               dev = null_alloc_dev();
+               if (!dev)
+                       goto err_dev;
+               ret = null_add_dev(dev);
+               if (ret) {
+                       null_free_dev(dev);
                        goto err_dev;
+               }
        }
  
        pr_info("null: module loaded\n");
  err_dev:
        while (!list_empty(&nullb_list)) {
                nullb = list_entry(nullb_list.next, struct nullb, list);
+               dev = nullb->dev;
                null_del_dev(nullb);
+               null_free_dev(dev);
        }
        kmem_cache_destroy(ppa_cache);
  err_ppa:
        unregister_blkdev(null_major, "nullb");
+ err_conf:
+       configfs_unregister_subsystem(&nullb_subsys);
  err_tagset:
-       if (queue_mode == NULL_Q_MQ && shared_tags)
+       if (g_queue_mode == NULL_Q_MQ && shared_tags)
                blk_mq_free_tag_set(&tag_set);
        return ret;
  }
@@@ -906,16 -2019,22 +2019,22 @@@ static void __exit null_exit(void
  {
        struct nullb *nullb;
  
+       configfs_unregister_subsystem(&nullb_subsys);
        unregister_blkdev(null_major, "nullb");
  
        mutex_lock(&lock);
        while (!list_empty(&nullb_list)) {
+               struct nullb_device *dev;
                nullb = list_entry(nullb_list.next, struct nullb, list);
+               dev = nullb->dev;
                null_del_dev(nullb);
+               null_free_dev(dev);
        }
        mutex_unlock(&lock);
  
-       if (queue_mode == NULL_Q_MQ && shared_tags)
+       if (g_queue_mode == NULL_Q_MQ && shared_tags)
                blk_mq_free_tag_set(&tag_set);
  
        kmem_cache_destroy(ppa_cache);
  module_init(null_init);
  module_exit(null_exit);
  
- MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
+ MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
  MODULE_LICENSE("GPL");
@@@ -265,7 -265,7 +265,7 @@@ static blk_status_t virtio_queue_rq(str
        }
  
        spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
-       if (req_op(req) == REQ_OP_SCSI_IN || req_op(req) == REQ_OP_SCSI_OUT)
+       if (blk_rq_is_scsi(req))
                err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num);
        else
                err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
@@@ -381,7 -381,6 +381,7 @@@ static void virtblk_config_changed_work
        struct request_queue *q = vblk->disk->queue;
        char cap_str_2[10], cap_str_10[10];
        char *envp[] = { "RESIZE=1", NULL };
 +      unsigned long long nblocks;
        u64 capacity;
  
        /* Host must always specify the capacity. */
                capacity = (sector_t)-1;
        }
  
 -      string_get_size(capacity, queue_logical_block_size(q),
 +      nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
 +
 +      string_get_size(nblocks, queue_logical_block_size(q),
                        STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
 -      string_get_size(capacity, queue_logical_block_size(q),
 +      string_get_size(nblocks, queue_logical_block_size(q),
                        STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
  
        dev_notice(&vdev->dev,
 -                "new size: %llu %d-byte logical blocks (%s/%s)\n",
 -                (unsigned long long)capacity,
 -                queue_logical_block_size(q),
 -                cap_str_10, cap_str_2);
 +                 "new size: %llu %d-byte logical blocks (%s/%s)\n",
 +                 nblocks,
 +                 queue_logical_block_size(q),
 +                 cap_str_10,
 +                 cap_str_2);
  
        set_capacity(vblk->disk, capacity);
        revalidate_disk(vblk->disk);
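
The hunk above converts the capacity, which virtio-blk reports in 512-byte sectors, into a count of logical blocks before printing it, rounding up so a partial trailing block is still counted. A standalone sketch of the arithmetic (DIV_ROUND_UP_ULL is re-implemented here just for the example; the sizes are made up):

    #include <stdio.h>

    /* Round-up division, as the kernel macro does. */
    #define DIV_ROUND_UP_ULL(n, d) (((n) + (d) - 1) / (unsigned long long)(d))

    int main(void)
    {
            unsigned long long capacity = 41943041; /* 512-byte sectors: 20 GiB plus one sector */
            unsigned int lbs = 4096;                /* logical block size in bytes */

            /* lbs >> 9 is the number of 512-byte sectors per logical block */
            unsigned long long nblocks = DIV_ROUND_UP_ULL(capacity, lbs >> 9);

            printf("new size: %llu %u-byte logical blocks\n", nblocks, lbs);
            return 0;
    }
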
@@@ -244,7 -244,6 +244,7 @@@ static int xen_blkif_disconnect(struct 
  {
        struct pending_req *req, *n;
        unsigned int j, r;
 +      bool busy = false;
  
        for (r = 0; r < blkif->nr_rings; r++) {
                struct xen_blkif_ring *ring = &blkif->rings[r];
                 * don't have any discard_io or other_io requests. So, checking
                 * for inflight IO is enough.
                 */
 -              if (atomic_read(&ring->inflight) > 0)
 -                      return -EBUSY;
 +              if (atomic_read(&ring->inflight) > 0) {
 +                      busy = true;
 +                      continue;
 +              }
  
                if (ring->irq) {
                        unbind_from_irqhandler(ring->irq, ring);
                WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
                ring->active = false;
        }
 +      if (busy)
 +              return -EBUSY;
 +
        blkif->nr_ring_pages = 0;
        /*
         * blkif->rings was allocated in connect_ring, so we should free it in
@@@ -816,7 -810,8 +816,8 @@@ static void frontend_changed(struct xen
                xenbus_switch_state(dev, XenbusStateClosed);
                if (xenbus_dev_is_online(dev))
                        break;
-               /* fall through if not online */
+               /* fall through */
+               /* if not online */
        case XenbusStateUnknown:
                /* implies xen_blkif_disconnect() via xen_blkbk_remove() */
                device_unregister(&dev->dev);
@@@ -2075,9 -2075,9 +2075,9 @@@ static int blkfront_resume(struct xenbu
                        /*
                         * Get the bios in the request so we can re-queue them.
                         */
 -                      if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
 -                          req_op(shadow[i].request) == REQ_OP_DISCARD ||
 -                          req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
 +                      if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
 +                          req_op(shadow[j].request) == REQ_OP_DISCARD ||
 +                          req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
                            shadow[j].request->cmd_flags & REQ_FUA) {
                                /*
                                 * Flush operations don't contain bios, so
@@@ -2456,7 -2456,7 +2456,7 @@@ static void blkback_changed(struct xenb
        case XenbusStateClosed:
                if (dev->state == XenbusStateClosed)
                        break;
-               /* Missed the backend's Closing state -- fallthrough */
+               /* fall through */
        case XenbusStateClosing:
                if (info)
                        blkfront_closing(info);
@@@ -270,349 -270,6 +270,349 @@@ static ssize_t mem_used_max_store(struc
        return len;
  }
  
-       bio->bi_bdev = zram->bdev;
 +#ifdef CONFIG_ZRAM_WRITEBACK
 +static bool zram_wb_enabled(struct zram *zram)
 +{
 +      return zram->backing_dev;
 +}
 +
 +static void reset_bdev(struct zram *zram)
 +{
 +      struct block_device *bdev;
 +
 +      if (!zram_wb_enabled(zram))
 +              return;
 +
 +      bdev = zram->bdev;
 +      if (zram->old_block_size)
 +              set_blocksize(bdev, zram->old_block_size);
 +      blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 +      /* hope filp_close flushes all of the IO */
 +      filp_close(zram->backing_dev, NULL);
 +      zram->backing_dev = NULL;
 +      zram->old_block_size = 0;
 +      zram->bdev = NULL;
 +
 +      kvfree(zram->bitmap);
 +      zram->bitmap = NULL;
 +}
 +
 +static ssize_t backing_dev_show(struct device *dev,
 +              struct device_attribute *attr, char *buf)
 +{
 +      struct zram *zram = dev_to_zram(dev);
 +      struct file *file = zram->backing_dev;
 +      char *p;
 +      ssize_t ret;
 +
 +      down_read(&zram->init_lock);
 +      if (!zram_wb_enabled(zram)) {
 +              memcpy(buf, "none\n", 5);
 +              up_read(&zram->init_lock);
 +              return 5;
 +      }
 +
 +      p = file_path(file, buf, PAGE_SIZE - 1);
 +      if (IS_ERR(p)) {
 +              ret = PTR_ERR(p);
 +              goto out;
 +      }
 +
 +      ret = strlen(p);
 +      memmove(buf, p, ret);
 +      buf[ret++] = '\n';
 +out:
 +      up_read(&zram->init_lock);
 +      return ret;
 +}
 +
 +static ssize_t backing_dev_store(struct device *dev,
 +              struct device_attribute *attr, const char *buf, size_t len)
 +{
 +      char *file_name;
 +      struct file *backing_dev = NULL;
 +      struct inode *inode;
 +      struct address_space *mapping;
 +      unsigned int bitmap_sz, old_block_size = 0;
 +      unsigned long nr_pages, *bitmap = NULL;
 +      struct block_device *bdev = NULL;
 +      int err;
 +      struct zram *zram = dev_to_zram(dev);
 +
 +      file_name = kmalloc(PATH_MAX, GFP_KERNEL);
 +      if (!file_name)
 +              return -ENOMEM;
 +
 +      down_write(&zram->init_lock);
 +      if (init_done(zram)) {
 +              pr_info("Can't setup backing device for initialized device\n");
 +              err = -EBUSY;
 +              goto out;
 +      }
 +
 +      strlcpy(file_name, buf, len);
 +
 +      backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
 +      if (IS_ERR(backing_dev)) {
 +              err = PTR_ERR(backing_dev);
 +              backing_dev = NULL;
 +              goto out;
 +      }
 +
 +      mapping = backing_dev->f_mapping;
 +      inode = mapping->host;
 +
 +      /* Only block devices are supported at the moment */
 +      if (!S_ISBLK(inode->i_mode)) {
 +              err = -ENOTBLK;
 +              goto out;
 +      }
 +
 +      bdev = bdgrab(I_BDEV(inode));
 +      err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
 +      if (err < 0)
 +              goto out;
 +
 +      nr_pages = i_size_read(inode) >> PAGE_SHIFT;
 +      bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
 +      bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
 +      if (!bitmap) {
 +              err = -ENOMEM;
 +              goto out;
 +      }
 +
 +      old_block_size = block_size(bdev);
 +      err = set_blocksize(bdev, PAGE_SIZE);
 +      if (err)
 +              goto out;
 +
 +      reset_bdev(zram);
 +      spin_lock_init(&zram->bitmap_lock);
 +
 +      zram->old_block_size = old_block_size;
 +      zram->bdev = bdev;
 +      zram->backing_dev = backing_dev;
 +      zram->bitmap = bitmap;
 +      zram->nr_pages = nr_pages;
 +      up_write(&zram->init_lock);
 +
 +      pr_info("setup backing device %s\n", file_name);
 +      kfree(file_name);
 +
 +      return len;
 +out:
 +      if (bitmap)
 +              kvfree(bitmap);
 +
 +      if (bdev)
 +              blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 +
 +      if (backing_dev)
 +              filp_close(backing_dev, NULL);
 +
 +      up_write(&zram->init_lock);
 +
 +      kfree(file_name);
 +
 +      return err;
 +}
 +
 +static unsigned long get_entry_bdev(struct zram *zram)
 +{
 +      unsigned long entry;
 +
 +      spin_lock(&zram->bitmap_lock);
 +      /* skip bit 0 so a valid entry is never confused with zram.handle == 0 */
 +      entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
 +      if (entry == zram->nr_pages) {
 +              spin_unlock(&zram->bitmap_lock);
 +              return 0;
 +      }
 +
 +      set_bit(entry, zram->bitmap);
 +      spin_unlock(&zram->bitmap_lock);
 +
 +      return entry;
 +}
 +
 +static void put_entry_bdev(struct zram *zram, unsigned long entry)
 +{
 +      int was_set;
 +
 +      spin_lock(&zram->bitmap_lock);
 +      was_set = test_and_clear_bit(entry, zram->bitmap);
 +      spin_unlock(&zram->bitmap_lock);
 +      WARN_ON_ONCE(!was_set);
 +}
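
get_entry_bdev() and put_entry_bdev() above implement a small slot allocator over a bitmap of page-sized slots on the backing device, reserving slot 0 so a valid slot number can never be mistaken for the "no entry" handle value. A userspace model of that logic (locking omitted; the real code holds zram->bitmap_lock):

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_SLOTS 64
    static bool bitmap[NR_SLOTS];           /* true == slot in use */

    static unsigned long get_entry(void)
    {
            unsigned long i;

            for (i = 1; i < NR_SLOTS; i++) {        /* slot 0 is reserved on purpose */
                    if (!bitmap[i]) {
                            bitmap[i] = true;
                            return i;
                    }
            }
            return 0;                               /* 0 means "no free slot" */
    }

    static void put_entry(unsigned long entry)
    {
            if (!bitmap[entry])
                    fprintf(stderr, "double free of slot %lu\n", entry);
            bitmap[entry] = false;
    }

    int main(void)
    {
            unsigned long a = get_entry();
            unsigned long b = get_entry();

            printf("allocated slots %lu and %lu\n", a, b);  /* 1 and 2 */
            put_entry(a);
            put_entry(b);
            return 0;
    }
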
 +
 +void zram_page_end_io(struct bio *bio)
 +{
 +      struct page *page = bio->bi_io_vec[0].bv_page;
 +
 +      page_endio(page, op_is_write(bio_op(bio)),
 +                      blk_status_to_errno(bio->bi_status));
 +      bio_put(bio);
 +}
 +
 +/*
 + * Returns 1 if the submission is successful, or a negative errno on failure.
 + */
 +static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
 +                      unsigned long entry, struct bio *parent)
 +{
 +      struct bio *bio;
 +
 +      bio = bio_alloc(GFP_ATOMIC, 1);
 +      if (!bio)
 +              return -ENOMEM;
 +
 +      bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
-       bio->bi_bdev = zram->bdev;
++      bio_set_dev(bio, zram->bdev);
 +      if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
 +              bio_put(bio);
 +              return -EIO;
 +      }
 +
 +      if (!parent) {
 +              bio->bi_opf = REQ_OP_READ;
 +              bio->bi_end_io = zram_page_end_io;
 +      } else {
 +              bio->bi_opf = parent->bi_opf;
 +              bio_chain(bio, parent);
 +      }
 +
 +      submit_bio(bio);
 +      return 1;
 +}
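
The bi_sector computation above maps a page-sized writeback slot to its starting 512-byte sector on the backing device. A tiny sketch of the conversion, assuming a 4096-byte page:

    #include <stdio.h>

    /* Slot-to-sector mapping: each writeback slot is one page, and bi_sector
     * counts 512-byte sectors, so slot N starts at sector N * (PAGE_SIZE >> 9). */
    int main(void)
    {
            unsigned long page_size = 4096;
            unsigned long entry = 3;                        /* hypothetical slot index */
            unsigned long long sector = entry * (page_size >> 9);

            printf("slot %lu -> sector %llu (byte offset %llu)\n",
                   entry, sector, sector * 512ULL);
            return 0;
    }
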
 +
 +struct zram_work {
 +      struct work_struct work;
 +      struct zram *zram;
 +      unsigned long entry;
 +      struct bio *bio;
 +};
 +
 +#if PAGE_SIZE != 4096
 +static void zram_sync_read(struct work_struct *work)
 +{
 +      struct bio_vec bvec;
 +      struct zram_work *zw = container_of(work, struct zram_work, work);
 +      struct zram *zram = zw->zram;
 +      unsigned long entry = zw->entry;
 +      struct bio *bio = zw->bio;
 +
 +      read_from_bdev_async(zram, &bvec, entry, bio);
 +}
 +
 +/*
 + * The block layer wants only one ->make_request_fn active at a time,
 + * so chaining this IO to the parent IO in the same context would
 + * deadlock. To avoid that, the read is issued from worker thread context.
 + */
 +static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
 +                              unsigned long entry, struct bio *bio)
 +{
 +      struct zram_work work;
 +
 +      work.zram = zram;
 +      work.entry = entry;
 +      work.bio = bio;
 +
 +      INIT_WORK_ONSTACK(&work.work, zram_sync_read);
 +      queue_work(system_unbound_wq, &work.work);
 +      flush_work(&work.work);
 +      destroy_work_on_stack(&work.work);
 +
 +      return 1;
 +}
 +#else
 +static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
 +                              unsigned long entry, struct bio *bio)
 +{
 +      WARN_ON(1);
 +      return -EIO;
 +}
 +#endif
 +
 +static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
 +                      unsigned long entry, struct bio *parent, bool sync)
 +{
 +      if (sync)
 +              return read_from_bdev_sync(zram, bvec, entry, parent);
 +      else
 +              return read_from_bdev_async(zram, bvec, entry, parent);
 +}
 +
 +static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
 +                                      u32 index, struct bio *parent,
 +                                      unsigned long *pentry)
 +{
 +      struct bio *bio;
 +      unsigned long entry;
 +
 +      bio = bio_alloc(GFP_ATOMIC, 1);
 +      if (!bio)
 +              return -ENOMEM;
 +
 +      entry = get_entry_bdev(zram);
 +      if (!entry) {
 +              bio_put(bio);
 +              return -ENOSPC;
 +      }
 +
 +      bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
++      bio_set_dev(bio, zram->bdev);
 +      if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
 +                                      bvec->bv_offset)) {
 +              bio_put(bio);
 +              put_entry_bdev(zram, entry);
 +              return -EIO;
 +      }
 +
 +      if (!parent) {
 +              bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
 +              bio->bi_end_io = zram_page_end_io;
 +      } else {
 +              bio->bi_opf = parent->bi_opf;
 +              bio_chain(bio, parent);
 +      }
 +
 +      submit_bio(bio);
 +      *pentry = entry;
 +
 +      return 0;
 +}
 +
 +static void zram_wb_clear(struct zram *zram, u32 index)
 +{
 +      unsigned long entry;
 +
 +      zram_clear_flag(zram, index, ZRAM_WB);
 +      entry = zram_get_element(zram, index);
 +      zram_set_element(zram, index, 0);
 +      put_entry_bdev(zram, entry);
 +}
 +
 +#else
 +static bool zram_wb_enabled(struct zram *zram) { return false; }
 +static inline void reset_bdev(struct zram *zram) {};
 +static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
 +                                      u32 index, struct bio *parent,
 +                                      unsigned long *pentry)
 +{
 +      return -EIO;
 +}
 +
 +static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
 +                      unsigned long entry, struct bio *parent, bool sync)
 +{
 +      return -EIO;
 +}
 +static void zram_wb_clear(struct zram *zram, u32 index) {}
 +#endif
 +
 +
  /*
   * We switched to per-cpu streams and this attr is not needed anymore.
   * However, we will keep it around for some time, because:
@@@ -651,7 -308,7 +651,7 @@@ static ssize_t comp_algorithm_store(str
                struct device_attribute *attr, const char *buf, size_t len)
  {
        struct zram *zram = dev_to_zram(dev);
 -      char compressor[CRYPTO_MAX_ALG_NAME];
 +      char compressor[ARRAY_SIZE(zram->compressor)];
        size_t sz;
  
        strlcpy(compressor, buf, sizeof(compressor));
                return -EBUSY;
        }
  
 -      strlcpy(zram->compressor, compressor, sizeof(compressor));
 +      strcpy(zram->compressor, compressor);
        up_write(&zram->init_lock);
        return len;
  }
@@@ -796,6 -453,30 +796,6 @@@ static bool zram_same_page_read(struct 
        return false;
  }
  
 -static bool zram_same_page_write(struct zram *zram, u32 index,
 -                                      struct page *page)
 -{
 -      unsigned long element;
 -      void *mem = kmap_atomic(page);
 -
 -      if (page_same_filled(mem, &element)) {
 -              kunmap_atomic(mem);
 -              /* Free memory associated with this sector now. */
 -              zram_slot_lock(zram, index);
 -              zram_free_page(zram, index);
 -              zram_set_flag(zram, index, ZRAM_SAME);
 -              zram_set_element(zram, index, element);
 -              zram_slot_unlock(zram, index);
 -
 -              atomic64_inc(&zram->stats.same_pages);
 -              atomic64_inc(&zram->stats.pages_stored);
 -              return true;
 -      }
 -      kunmap_atomic(mem);
 -
 -      return false;
 -}
 -
  static void zram_meta_free(struct zram *zram, u64 disksize)
  {
        size_t num_pages = disksize >> PAGE_SHIFT;
@@@ -834,13 -515,7 +834,13 @@@ static bool zram_meta_alloc(struct zra
   */
  static void zram_free_page(struct zram *zram, size_t index)
  {
 -      unsigned long handle = zram_get_handle(zram, index);
 +      unsigned long handle;
 +
 +      if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
 +              zram_wb_clear(zram, index);
 +              atomic64_dec(&zram->stats.pages_stored);
 +              return;
 +      }
  
        /*
         * No memory is allocated for same element filled pages.
                return;
        }
  
 +      handle = zram_get_handle(zram, index);
        if (!handle)
                return;
  
        zram_set_obj_size(zram, index, 0);
  }
  
 -static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 +static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
 +                              struct bio *bio, bool partial_io)
  {
        int ret;
        unsigned long handle;
        unsigned int size;
        void *src, *dst;
  
 +      if (zram_wb_enabled(zram)) {
 +              zram_slot_lock(zram, index);
 +              if (zram_test_flag(zram, index, ZRAM_WB)) {
 +                      struct bio_vec bvec;
 +
 +                      zram_slot_unlock(zram, index);
 +
 +                      bvec.bv_page = page;
 +                      bvec.bv_len = PAGE_SIZE;
 +                      bvec.bv_offset = 0;
 +                      return read_from_bdev(zram, &bvec,
 +                                      zram_get_element(zram, index),
 +                                      bio, partial_io);
 +              }
 +              zram_slot_unlock(zram, index);
 +      }
 +
        if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
                return 0;
  
  }
  
  static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 -                              u32 index, int offset)
 +                              u32 index, int offset, struct bio *bio)
  {
        int ret;
        struct page *page;
                        return -ENOMEM;
        }
  
 -      ret = zram_decompress_page(zram, page, index);
 +      ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
        if (unlikely(ret))
                goto out;
  
@@@ -957,57 -613,30 +957,57 @@@ out
        return ret;
  }
  
 -static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
 -                      struct page *page,
 -                      unsigned long *out_handle, unsigned int *out_comp_len)
 +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 +                              u32 index, struct bio *bio)
  {
 -      int ret;
 -      unsigned int comp_len;
 -      void *src;
 +      int ret = 0;
        unsigned long alloced_pages;
        unsigned long handle = 0;
 +      unsigned int comp_len = 0;
 +      void *src, *dst, *mem;
 +      struct zcomp_strm *zstrm;
 +      struct page *page = bvec->bv_page;
 +      unsigned long element = 0;
 +      enum zram_pageflags flags = 0;
 +      bool allow_wb = true;
 +
 +      mem = kmap_atomic(page);
 +      if (page_same_filled(mem, &element)) {
 +              kunmap_atomic(mem);
 +              /* Free memory associated with this sector now. */
 +              flags = ZRAM_SAME;
 +              atomic64_inc(&zram->stats.same_pages);
 +              goto out;
 +      }
 +      kunmap_atomic(mem);
  
  compress_again:
 +      zstrm = zcomp_stream_get(zram->comp);
        src = kmap_atomic(page);
 -      ret = zcomp_compress(*zstrm, src, &comp_len);
 +      ret = zcomp_compress(zstrm, src, &comp_len);
        kunmap_atomic(src);
  
        if (unlikely(ret)) {
 +              zcomp_stream_put(zram->comp);
                pr_err("Compression failed! err=%d\n", ret);
 -              if (handle)
 -                      zs_free(zram->mem_pool, handle);
 +              zs_free(zram->mem_pool, handle);
                return ret;
        }
  
 -      if (unlikely(comp_len > max_zpage_size))
 +      if (unlikely(comp_len > max_zpage_size)) {
 +              if (zram_wb_enabled(zram) && allow_wb) {
 +                      zcomp_stream_put(zram->comp);
 +                      ret = write_to_bdev(zram, bvec, index, bio, &element);
 +                      if (!ret) {
 +                              flags = ZRAM_WB;
 +                              ret = 1;
 +                              goto out;
 +                      }
 +                      allow_wb = false;
 +                      goto compress_again;
 +              }
                comp_len = PAGE_SIZE;
 +      }
  
        /*
         * handle allocation has 2 paths:
                handle = zs_malloc(zram->mem_pool, comp_len,
                                GFP_NOIO | __GFP_HIGHMEM |
                                __GFP_MOVABLE);
 -              *zstrm = zcomp_stream_get(zram->comp);
                if (handle)
                        goto compress_again;
                return -ENOMEM;
        update_used_max(zram, alloced_pages);
  
        if (zram->limit_pages && alloced_pages > zram->limit_pages) {
 +              zcomp_stream_put(zram->comp);
                zs_free(zram->mem_pool, handle);
                return -ENOMEM;
        }
  
 -      *out_handle = handle;
 -      *out_comp_len = comp_len;
 -      return 0;
 -}
 -
 -static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
 -{
 -      int ret;
 -      unsigned long handle;
 -      unsigned int comp_len;
 -      void *src, *dst;
 -      struct zcomp_strm *zstrm;
 -      struct page *page = bvec->bv_page;
 -
 -      if (zram_same_page_write(zram, index, page))
 -              return 0;
 -
 -      zstrm = zcomp_stream_get(zram->comp);
 -      ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
 -      if (ret) {
 -              zcomp_stream_put(zram->comp);
 -              return ret;
 -      }
 -
        dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
  
        src = zstrm->buffer;
  
        zcomp_stream_put(zram->comp);
        zs_unmap_object(zram->mem_pool, handle);
 -
 +      atomic64_add(comp_len, &zram->stats.compr_data_size);
 +out:
        /*
         * Free memory associated with this sector
         * before overwriting unused sectors.
         */
        zram_slot_lock(zram, index);
        zram_free_page(zram, index);
 -      zram_set_handle(zram, index, handle);
 -      zram_set_obj_size(zram, index, comp_len);
 +
 +      if (flags) {
 +              zram_set_flag(zram, index, flags);
 +              zram_set_element(zram, index, element);
 +      } else {
 +              zram_set_handle(zram, index, handle);
 +              zram_set_obj_size(zram, index, comp_len);
 +      }
        zram_slot_unlock(zram, index);
  
        /* Update stats */
 -      atomic64_add(comp_len, &zram->stats.compr_data_size);
        atomic64_inc(&zram->stats.pages_stored);
 -      return 0;
 +      return ret;
  }
  
  static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
 -                              u32 index, int offset)
 +                              u32 index, int offset, struct bio *bio)
  {
        int ret;
        struct page *page = NULL;
                if (!page)
                        return -ENOMEM;
  
 -              ret = zram_decompress_page(zram, page, index);
 +              ret = __zram_bvec_read(zram, page, index, bio, true);
                if (ret)
                        goto out;
  
                vec.bv_offset = 0;
        }
  
 -      ret = __zram_bvec_write(zram, &vec, index);
 +      ret = __zram_bvec_write(zram, &vec, index, bio);
  out:
        if (is_partial_io(bvec))
                __free_page(page);
@@@ -1161,33 -808,29 +1161,34 @@@ static void zram_bio_discard(struct zra
        }
  }
  
 +/*
 + * Returns a negative errno if something goes wrong. Otherwise:
 + * Returns 0 if the IO request was completed synchronously.
 + * Returns 1 if the IO request was successfully submitted (asynchronously).
 + */
  static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 -                      int offset, bool is_write)
 +                      int offset, bool is_write, struct bio *bio)
  {
        unsigned long start_time = jiffies;
        int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
+       struct request_queue *q = zram->disk->queue;
        int ret;
  
-       generic_start_io_acct(rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+       generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
                        &zram->disk->part0);
  
        if (!is_write) {
                atomic64_inc(&zram->stats.num_reads);
 -              ret = zram_bvec_read(zram, bvec, index, offset);
 +              ret = zram_bvec_read(zram, bvec, index, offset, bio);
                flush_dcache_page(bvec->bv_page);
        } else {
                atomic64_inc(&zram->stats.num_writes);
 -              ret = zram_bvec_write(zram, bvec, index, offset);
 +              ret = zram_bvec_write(zram, bvec, index, offset, bio);
        }
  
-       generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
+       generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
  
 -      if (unlikely(ret)) {
 +      if (unlikely(ret < 0)) {
                if (!is_write)
                        atomic64_inc(&zram->stats.failed_reads);
                else
@@@ -1226,7 -869,7 +1227,7 @@@ static void __zram_make_request(struct 
                        bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
                                                        unwritten);
                        if (zram_bvec_rw(zram, &bv, index, offset,
 -                                      op_is_write(bio_op(bio))) < 0)
 +                                      op_is_write(bio_op(bio)), bio) < 0)
                                goto out;
  
                        bv.bv_offset += bv.bv_len;
@@@ -1280,18 -923,16 +1281,18 @@@ static void zram_slot_free_notify(struc
  static int zram_rw_page(struct block_device *bdev, sector_t sector,
                       struct page *page, bool is_write)
  {
 -      int offset, err = -EIO;
 +      int offset, ret;
        u32 index;
        struct zram *zram;
        struct bio_vec bv;
  
 +      if (PageTransHuge(page))
 +              return -ENOTSUPP;
        zram = bdev->bd_disk->private_data;
  
        if (!valid_io_request(zram, sector, PAGE_SIZE)) {
                atomic64_inc(&zram->stats.invalid_io);
 -              err = -EINVAL;
 +              ret = -EINVAL;
                goto out;
        }
  
        bv.bv_len = PAGE_SIZE;
        bv.bv_offset = 0;
  
 -      err = zram_bvec_rw(zram, &bv, index, offset, is_write);
 +      ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
  out:
        /*
         * If I/O fails, just return the error (i.e., non-zero) without
         * ending the page here; bio->bi_end_io does the work needed to
         * handle the error (e.g., SetPageError, set_page_dirty and so on).
         */
 -      if (err == 0)
 +      if (unlikely(ret < 0))
 +              return ret;
 +
 +      switch (ret) {
 +      case 0:
                page_endio(page, is_write, 0);
 -      return err;
 +              break;
 +      case 1:
 +              ret = 0;
 +              break;
 +      default:
 +              WARN_ON(1);
 +      }
 +      return ret;
  }
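
zram_rw_page() above now distinguishes three outcomes from the rw path: a negative errno, 0 for a request completed synchronously, and 1 for a request that was submitted and will be completed asynchronously. A userspace sketch of how such a caller handles the convention (do_rw() is a made-up stand-in for zram_bvec_rw()):

    #include <stdio.h>

    static int do_rw(int mode)
    {
            return mode;    /* pretend: -1 error, 0 sync done, 1 async submitted */
    }

    static int rw_page(int mode)
    {
            int ret = do_rw(mode);

            if (ret < 0)
                    return ret;     /* propagate the error, nobody ends the page here */
            if (ret == 0)
                    printf("sync: end the page I/O here\n");
            else
                    printf("async: the completion handler will end the page I/O\n");
            return 0;
    }

    int main(void)
    {
            rw_page(0);
            rw_page(1);
            printf("error path: %d\n", rw_page(-1));
            return 0;
    }
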
  
  static void zram_reset_device(struct zram *zram)
        zram_meta_free(zram, disksize);
        memset(&zram->stats, 0, sizeof(zram->stats));
        zcomp_destroy(comp);
 +      reset_bdev(zram);
  }
  
  static ssize_t disksize_store(struct device *dev,
@@@ -1480,9 -1109,6 +1481,9 @@@ static DEVICE_ATTR_WO(mem_limit)
  static DEVICE_ATTR_WO(mem_used_max);
  static DEVICE_ATTR_RW(max_comp_streams);
  static DEVICE_ATTR_RW(comp_algorithm);
 +#ifdef CONFIG_ZRAM_WRITEBACK
 +static DEVICE_ATTR_RW(backing_dev);
 +#endif
  
  static struct attribute *zram_disk_attrs[] = {
        &dev_attr_disksize.attr,
        &dev_attr_mem_used_max.attr,
        &dev_attr_max_comp_streams.attr,
        &dev_attr_comp_algorithm.attr,
 +#ifdef CONFIG_ZRAM_WRITEBACK
 +      &dev_attr_backing_dev.attr,
 +#endif
        &dev_attr_io_stat.attr,
        &dev_attr_mm_stat.attr,
        &dev_attr_debug_stat.attr,
diff --combined drivers/md/dm-crypt.c
@@@ -758,8 -758,9 +758,8 @@@ static int crypt_iv_tcw_whitening(struc
        int i, r;
  
        /* xor whitening with sector number */
 -      memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
 -      crypto_xor(buf, (u8 *)&sector, 8);
 -      crypto_xor(&buf[8], (u8 *)&sector, 8);
 +      crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
 +      crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
  
        /* calculate crc32 for every 32bit part and xor it */
        desc->tfm = tcw->crc32_tfm;
@@@ -804,10 -805,10 +804,10 @@@ static int crypt_iv_tcw_gen(struct cryp
        }
  
        /* Calculate IV */
 -      memcpy(iv, tcw->iv_seed, cc->iv_size);
 -      crypto_xor(iv, (u8 *)&sector, 8);
 +      crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)&sector, 8);
        if (cc->iv_size > 8)
 -              crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
 +              crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
 +                             cc->iv_size - 8);
  
        return r;
  }
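
The crypto_xor_cpy() conversions above replace a memcpy() followed by an in-place crypto_xor() with a single pass that writes dst[i] = a[i] ^ b[i]. A userspace model of that semantic (not the kernel implementation), using the sector number as the second operand the way the IV code does:

    #include <stdio.h>
    #include <stddef.h>

    /* dst = a XOR b, written once, instead of memcpy(dst, a) + xor-in-place(dst, b). */
    static void xor_cpy(unsigned char *dst, const unsigned char *a,
                        const unsigned char *b, size_t n)
    {
            size_t i;

            for (i = 0; i < n; i++)
                    dst[i] = a[i] ^ b[i];
    }

    int main(void)
    {
            unsigned char seed[8] = { 0xde, 0xad, 0xbe, 0xef, 0, 0, 0, 0 };
            unsigned long long sector = 42;         /* plays the role of the sector number */
            unsigned char iv[8];

            xor_cpy(iv, seed, (const unsigned char *)&sector, sizeof(iv));
            /* on a little-endian machine: 0xde ^ 0x2a = 0xf4 */
            printf("iv[0]=%02x\n", iv[0]);
            return 0;
    }
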
@@@ -932,9 -933,6 +932,6 @@@ static int dm_crypt_integrity_io_alloc(
        bip->bip_iter.bi_size = tag_len;
        bip->bip_iter.bi_sector = io->cc->start + io->sector;
  
-       /* We own the metadata, do not let bio_free to release it */
-       bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
        ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
                                     tag_len, offset_in_page(io->integrity_metadata));
        if (unlikely(ret != tag_len))
@@@ -1546,7 -1544,7 +1543,7 @@@ static void clone_init(struct dm_crypt_
  
        clone->bi_private = io;
        clone->bi_end_io  = crypt_endio;
-       clone->bi_bdev    = cc->dev->bdev;
+       bio_set_dev(clone, cc->dev->bdev);
        clone->bi_opf     = io->base_bio->bi_opf;
  }
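
Throughout this merge, direct bi_bdev assignments are replaced by bio_set_dev(). Judging from the bi_disk and bi_partno comparisons in the raid56 hunks later in this diff, the helper records the owning gendisk and the partition number taken from the block_device. The sketch below is a hedged approximation using simplified stand-in types, not the kernel definitions:

    #include <stdio.h>

    struct gendisk { const char *name; };
    struct block_device { struct gendisk *bd_disk; unsigned int bd_partno; };
    struct bio { struct gendisk *bi_disk; unsigned int bi_partno; };

    /* Approximation of what bio_set_dev() amounts to in this series. */
    static void bio_set_dev(struct bio *bio, struct block_device *bdev)
    {
            bio->bi_disk = bdev->bd_disk;
            bio->bi_partno = bdev->bd_partno;
    }

    int main(void)
    {
            struct gendisk disk = { "sda" };
            struct block_device part = { &disk, 2 };
            struct bio bio;

            bio_set_dev(&bio, &part);
            printf("%s, partition %u\n", bio.bi_disk->name, bio.bi_partno);
            return 0;
    }
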
  
@@@ -2795,7 -2793,7 +2792,7 @@@ static int crypt_map(struct dm_target *
         */
        if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
            bio_op(bio) == REQ_OP_DISCARD)) {
-               bio->bi_bdev = cc->dev->bdev;
+               bio_set_dev(bio, cc->dev->bdev);
                if (bio_sectors(bio))
                        bio->bi_iter.bi_sector = cc->start +
                                dm_target_offset(ti, bio->bi_iter.bi_sector);
diff --combined drivers/md/dm-mpath.c
@@@ -504,6 -504,7 +504,6 @@@ static int multipath_clone_and_map(stru
                if (queue_dying) {
                        atomic_inc(&m->pg_init_in_progress);
                        activate_or_offline_path(pgpath);
 -                      return DM_MAPIO_REQUEUE;
                }
                return DM_MAPIO_DELAY_REQUEUE;
        }
@@@ -565,7 -566,7 +565,7 @@@ static int __multipath_map_bio(struct m
        mpio->nr_bytes = nr_bytes;
  
        bio->bi_status = 0;
-       bio->bi_bdev = pgpath->path.dev->bdev;
+       bio_set_dev(bio, pgpath->path.dev->bdev);
        bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
  
        if (pgpath->pg->ps.type->start_io)
@@@ -1457,6 -1458,7 +1457,6 @@@ static int noretry_error(blk_status_t e
        case BLK_STS_TARGET:
        case BLK_STS_NEXUS:
        case BLK_STS_MEDIUM:
 -      case BLK_STS_RESOURCE:
                return 1;
        }
  
diff --combined drivers/md/dm.c
  
  #define DM_MSG_PREFIX "core"
  
 -#ifdef CONFIG_PRINTK
 -/*
 - * ratelimit state to be used in DMXXX_LIMIT().
 - */
 -DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
 -                     DEFAULT_RATELIMIT_INTERVAL,
 -                     DEFAULT_RATELIMIT_BURST);
 -EXPORT_SYMBOL(dm_ratelimit_state);
 -#endif
 -
  /*
   * Cookies are numeric values sent with CHANGE and REMOVE
   * uevents while resuming, removing or renaming the device.
@@@ -510,7 -520,7 +510,7 @@@ static void start_io_acct(struct dm_io 
        io->start_time = jiffies;
  
        cpu = part_stat_lock();
-       part_round_stats(cpu, &dm_disk(md)->part0);
+       part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
        part_stat_unlock();
        atomic_set(&dm_disk(md)->part0.in_flight[rw],
                atomic_inc_return(&md->pending[rw]));
@@@ -529,7 -539,7 +529,7 @@@ static void end_io_acct(struct dm_io *i
        int pending;
        int rw = bio_data_dir(bio);
  
-       generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
+       generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
  
        if (unlikely(dm_stats_used(&md->stats)))
                dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@@ -841,10 -851,10 +841,10 @@@ static void clone_endio(struct bio *bio
  
        if (unlikely(error == BLK_STS_TARGET)) {
                if (bio_op(bio) == REQ_OP_WRITE_SAME &&
-                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+                   !bio->bi_disk->queue->limits.max_write_same_sectors)
                        disable_write_same(md);
                if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
-                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+                   !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
                        disable_write_zeroes(md);
        }
  
@@@ -1205,8 -1215,8 +1205,8 @@@ static void __map_bio(struct dm_target_
                break;
        case DM_MAPIO_REMAPPED:
                /* the bio has been remapped so dispatch it */
-               trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
-                                     tio->io->bio->bi_bdev->bd_dev, sector);
+               trace_block_bio_remap(clone->bi_disk->queue, clone,
+                                     bio_dev(tio->io->bio), sector);
                generic_make_request(clone);
                break;
        case DM_MAPIO_KILL:
@@@ -1513,7 -1523,7 +1513,7 @@@ static void __split_and_process_bio(str
        }
  
        /* drop the extra reference count */
 -      dec_pending(ci.io, error);
 +      dec_pending(ci.io, errno_to_blk_status(error));
  }
  /*-----------------------------------------------------------------
   * CRUD END
@@@ -1532,7 -1542,7 +1532,7 @@@ static blk_qc_t dm_make_request(struct 
  
        map = dm_get_live_table(md, &srcu_idx);
  
-       generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
+       generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
  
        /* if we're suspended, we have to queue this io for later */
        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
@@@ -1786,7 -1796,7 +1786,7 @@@ static struct mapped_device *alloc_dev(
                goto bad;
  
        bio_init(&md->flush_bio, NULL, 0);
-       md->flush_bio.bi_bdev = md->bdev;
+       bio_set_dev(&md->flush_bio, md->bdev);
        md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
  
        dm_stats_init(&md->stats);
diff --combined drivers/md/md.c
@@@ -422,7 -422,7 +422,7 @@@ static void submit_flushes(struct work_
                        bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
                        bi->bi_end_io = md_end_flush;
                        bi->bi_private = rdev;
-                       bi->bi_bdev = rdev->bdev;
+                       bio_set_dev(bi, rdev->bdev);
                        bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
                        atomic_inc(&mddev->flush_pending);
                        submit_bio(bi);
@@@ -772,7 -772,7 +772,7 @@@ void md_super_write(struct mddev *mddev
  
        atomic_inc(&rdev->nr_pending);
  
-       bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
+       bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
        bio->bi_iter.bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
@@@ -803,8 -803,10 +803,10 @@@ int sync_page_io(struct md_rdev *rdev, 
        struct bio *bio = md_bio_alloc_sync(rdev->mddev);
        int ret;
  
-       bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
-               rdev->meta_bdev : rdev->bdev;
+       if (metadata_op && rdev->meta_bdev)
+               bio_set_dev(bio, rdev->meta_bdev);
+       else
+               bio_set_dev(bio, rdev->bdev);
        bio_set_op_attrs(bio, op, op_flags);
        if (metadata_op)
                bio->bi_iter.bi_sector = sector + rdev->sb_start;
@@@ -7996,7 -7998,7 +7998,7 @@@ bool md_write_start(struct mddev *mddev
        if (mddev->safemode == 1)
                mddev->safemode = 0;
        /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
 -      if (mddev->in_sync || !mddev->sync_checkers) {
 +      if (mddev->in_sync || mddev->sync_checkers) {
                spin_lock(&mddev->lock);
                if (mddev->in_sync) {
                        mddev->in_sync = 0;
@@@ -8656,9 -8658,6 +8658,9 @@@ void md_check_recovery(struct mddev *md
        if (mddev_trylock(mddev)) {
                int spares = 0;
  
 +              if (!mddev->external && mddev->safemode == 1)
 +                      mddev->safemode = 0;
 +
                if (mddev->ro) {
                        struct md_rdev *rdev;
                        if (!mddev->external && mddev->in_sync)
diff --combined drivers/md/raid5-cache.c
@@@ -236,10 -236,9 +236,10 @@@ struct r5l_io_unit 
        bool need_split_bio;
        struct bio *split_bio;
  
 -      unsigned int has_flush:1;      /* include flush request */
 -      unsigned int has_fua:1;        /* include fua request */
 -      unsigned int has_null_flush:1; /* include empty flush request */
 +      unsigned int has_flush:1;               /* include flush request */
 +      unsigned int has_fua:1;                 /* include fua request */
 +      unsigned int has_null_flush:1;          /* include null flush request */
 +      unsigned int has_flush_payload:1;       /* include flush payload  */
        /*
         * io isn't sent yet, flush/fua request can only be submitted till it's
         * the first IO in running_ios list
@@@ -572,8 -571,6 +572,8 @@@ static void r5l_log_endio(struct bio *b
        struct r5l_io_unit *io_deferred;
        struct r5l_log *log = io->log;
        unsigned long flags;
 +      bool has_null_flush;
 +      bool has_flush_payload;
  
        if (bio->bi_status)
                md_error(log->rdev->mddev, log->rdev);
  
        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 +
 +      /*
 +       * If the io does not have a null_flush or flush payload, it is
 +       * not safe to access it after releasing io_list_lock. Therefore,
 +       * these flags must be read while the lock is still held.
 +       */
 +      has_null_flush = io->has_null_flush;
 +      has_flush_payload = io->has_flush_payload;
 +
        if (log->need_cache_flush && !list_empty(&io->stripe_list))
                r5l_move_to_end_ios(log);
        else
        if (log->need_cache_flush)
                md_wakeup_thread(log->rdev->mddev->thread);
  
 -      if (io->has_null_flush) {
 +      /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
 +      if (has_null_flush) {
                struct bio *bi;
  
                WARN_ON(bio_list_empty(&io->flush_barriers));
                while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
                        bio_endio(bi);
 -                      atomic_dec(&io->pending_stripe);
 +                      if (atomic_dec_and_test(&io->pending_stripe)) {
 +                              __r5l_stripe_write_finished(io);
 +                              return;
 +                      }
                }
        }
 -
 -      /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
 -      if (atomic_read(&io->pending_stripe) == 0)
 -              __r5l_stripe_write_finished(io);
 +      /* decrease pending_stripe for flush payload */
 +      if (has_flush_payload)
 +              if (atomic_dec_and_test(&io->pending_stripe))
 +                      __r5l_stripe_write_finished(io);
  }
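
The endio path above relies on the usual refcount-release pattern: every holder calls atomic_dec_and_test(), and only the caller that drops the count to zero runs the finalizer, here __r5l_stripe_write_finished(). A standalone C11 sketch of that pattern:

    #include <stdio.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int pending;

    /* Returns true only for the caller that drops the count to zero. */
    static bool dec_and_test(atomic_int *v)
    {
            return atomic_fetch_sub(v, 1) == 1;
    }

    static void finish(void)
    {
            printf("last reference dropped, finishing the io_unit\n");
    }

    int main(void)
    {
            int i;

            atomic_store(&pending, 3);      /* three outstanding references */
            for (i = 0; i < 3; i++)
                    if (dec_and_test(&pending))
                            finish();       /* runs exactly once */
            return 0;
    }
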
  
  static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
@@@ -745,7 -728,7 +745,7 @@@ static struct bio *r5l_bio_alloc(struc
        struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
  
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-       bio->bi_bdev = log->rdev->bdev;
+       bio_set_dev(bio, log->rdev->bdev);
        bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
  
        return bio;
@@@ -898,11 -881,6 +898,11 @@@ static void r5l_append_flush_payload(st
        payload->size = cpu_to_le32(sizeof(__le64));
        payload->flush_stripes[0] = cpu_to_le64(sect);
        io->meta_offset += meta_size;
 +      /* multiple flush payloads count as one pending_stripe */
 +      if (!io->has_flush_payload) {
 +              io->has_flush_payload = 1;
 +              atomic_inc(&io->pending_stripe);
 +      }
        mutex_unlock(&log->io_mutex);
  }
  
@@@ -1313,7 -1291,7 +1313,7 @@@ void r5l_flush_stripe_to_raid(struct r5
        if (!do_flush)
                return;
        bio_reset(&log->flush_bio);
-       log->flush_bio.bi_bdev = log->rdev->bdev;
+       bio_set_dev(&log->flush_bio, log->rdev->bdev);
        log->flush_bio.bi_end_io = r5l_log_flush_endio;
        log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
        submit_bio(&log->flush_bio);
@@@ -1691,7 -1669,7 +1691,7 @@@ static int r5l_recovery_fetch_ra_pool(s
                                      sector_t offset)
  {
        bio_reset(ctx->ra_bio);
-       ctx->ra_bio->bi_bdev = log->rdev->bdev;
+       bio_set_dev(ctx->ra_bio, log->rdev->bdev);
        bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
        ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
  
@@@ -2562,32 -2540,23 +2562,32 @@@ static ssize_t r5c_journal_mode_show(st
   */
  int r5c_journal_mode_set(struct mddev *mddev, int mode)
  {
 -      struct r5conf *conf = mddev->private;
 -      struct r5l_log *log = conf->log;
 -
 -      if (!log)
 -              return -ENODEV;
 +      struct r5conf *conf;
 +      int err;
  
        if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
            mode > R5C_JOURNAL_MODE_WRITE_BACK)
                return -EINVAL;
  
 +      err = mddev_lock(mddev);
 +      if (err)
 +              return err;
 +      conf = mddev->private;
 +      if (!conf || !conf->log) {
 +              mddev_unlock(mddev);
 +              return -ENODEV;
 +      }
 +
        if (raid5_calc_degraded(conf) > 0 &&
 -          mode == R5C_JOURNAL_MODE_WRITE_BACK)
 +          mode == R5C_JOURNAL_MODE_WRITE_BACK) {
 +              mddev_unlock(mddev);
                return -EINVAL;
 +      }
  
        mddev_suspend(mddev);
        conf->log->r5c_journal_mode = mode;
        mddev_resume(mddev);
 +      mddev_unlock(mddev);
  
        pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
                 mdname(mddev), mode, r5c_journal_mode_str[mode]);
diff --combined drivers/nvme/host/core.c
@@@ -336,7 -336,7 +336,7 @@@ static int nvme_get_stream_params(struc
  
        c.directive.opcode = nvme_admin_directive_recv;
        c.directive.nsid = cpu_to_le32(nsid);
 -      c.directive.numd = cpu_to_le32(sizeof(*s));
 +      c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
        c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
        c.directive.dtype = NVME_DIR_STREAMS;
  
@@@ -613,11 -613,7 +613,7 @@@ int __nvme_submit_user_cmd(struct reque
  
                if (!disk)
                        goto submit;
-               bio->bi_bdev = bdget_disk(disk, 0);
-               if (!bio->bi_bdev) {
-                       ret = -ENODEV;
-                       goto out_unmap;
-               }
+               bio->bi_disk = disk;
  
                if (meta_buffer && meta_len) {
                        struct bio_integrity_payload *bip;
   out_free_meta:
        kfree(meta);
   out_unmap:
-       if (bio) {
-               if (disk && bio->bi_bdev)
-                       bdput(bio->bi_bdev);
+       if (bio)
                blk_rq_unmap_user(bio);
-       }
   out:
        blk_mq_free_request(req);
        return ret;
@@@ -1509,7 -1502,7 +1502,7 @@@ static void nvme_set_queue_limits(struc
        blk_queue_write_cache(q, vwc, vwc);
  }
  
 -static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 +static int nvme_configure_apst(struct nvme_ctrl *ctrl)
  {
        /*
         * APST (Autonomous Power State Transition) lets us program a
         * then don't do anything.
         */
        if (!ctrl->apsta)
 -              return;
 +              return 0;
  
        if (ctrl->npss > 31) {
                dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
 -              return;
 +              return 0;
        }
  
        table = kzalloc(sizeof(*table), GFP_KERNEL);
        if (!table)
 -              return;
 +              return 0;
  
        if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
                /* Turn off APST. */
                dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
  
        kfree(table);
 +      return ret;
  }
  
  static void nvme_set_latency_tolerance(struct device *dev, s32 val)
@@@ -1836,16 -1828,13 +1829,16 @@@ int nvme_init_identify(struct nvme_ctr
                 * In fabrics we need to verify the cntlid matches the
                 * admin connect
                 */
 -              if (ctrl->cntlid != le16_to_cpu(id->cntlid))
 +              if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
                        ret = -EINVAL;
 +                      goto out_free;
 +              }
  
                if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
                        dev_err(ctrl->device,
                                "keep-alive support is mandatory for fabrics\n");
                        ret = -EINVAL;
 +                      goto out_free;
                }
        } else {
                ctrl->cntlid = le16_to_cpu(id->cntlid);
        else if (!ctrl->apst_enabled && prev_apst_enabled)
                dev_pm_qos_hide_latency_tolerance(ctrl->device);
  
 -      nvme_configure_apst(ctrl);
 -      nvme_configure_directives(ctrl);
 +      ret = nvme_configure_apst(ctrl);
 +      if (ret < 0)
 +              return ret;
 +
 +      ret = nvme_configure_directives(ctrl);
 +      if (ret < 0)
 +              return ret;
  
        ctrl->identified = true;
  
 +      return 0;
 +
 +out_free:
 +      kfree(id);
        return ret;
  }
  EXPORT_SYMBOL_GPL(nvme_init_identify);
@@@ -2017,11 -1997,9 +2010,11 @@@ static ssize_t wwid_show(struct device 
        if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
                return sprintf(buf, "eui.%8phN\n", ns->eui);
  
 -      while (ctrl->serial[serial_len - 1] == ' ')
 +      while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
 +                                ctrl->serial[serial_len - 1] == '\0'))
                serial_len--;
 -      while (ctrl->model[model_len - 1] == ' ')
 +      while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
 +                               ctrl->model[model_len - 1] == '\0'))
                model_len--;
  
        return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
diff --combined drivers/nvme/host/rdma.c
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/string.h>
  #include <linux/atomic.h>
  #include <linux/blk-mq.h>
 +#include <linux/blk-mq-rdma.h>
  #include <linux/types.h>
  #include <linux/list.h>
  #include <linux/mutex.h>
@@@ -464,10 -463,14 +464,10 @@@ static int nvme_rdma_create_queue_ib(st
        ibdev = queue->device->dev;
  
        /*
 -       * The admin queue is barely used once the controller is live, so don't
 -       * bother to spread it out.
 +       * Spread I/O queue completion vectors according to their queue index.
 +       * Admin queues can always go on completion vector 0.
         */
 -      if (idx == 0)
 -              comp_vector = 0;
 -      else
 -              comp_vector = idx % ibdev->num_comp_vectors;
 -
 +      comp_vector = idx == 0 ? idx : idx - 1;
  
        /* +1 for ib_stop_cq */
        queue->ib_cq = ib_alloc_cq(ibdev, queue,
@@@ -608,20 -611,10 +608,20 @@@ out_free_queues
  static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
  {
        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
 +      struct ib_device *ibdev = ctrl->device->dev;
        unsigned int nr_io_queues;
        int i, ret;
  
        nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
 +
 +      /*
 +       * We map queues according to the device IRQ vectors for
 +       * optimal locality, so we don't need more queues than
 +       * completion vectors.
 +       */
 +      nr_io_queues = min_t(unsigned int, nr_io_queues,
 +                              ibdev->num_comp_vectors);
 +
        ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
        if (ret)
                return ret;
@@@ -711,14 -704,16 +711,16 @@@ static void nvme_rdma_reconnect_ctrl_wo
        if (ctrl->ctrl.queue_count > 1) {
                nvme_rdma_free_io_queues(ctrl);
  
-               ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+               ret = blk_mq_reinit_tagset(&ctrl->tag_set,
+                                          nvme_rdma_reinit_request);
                if (ret)
                        goto requeue;
        }
  
        nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
  
-       ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
+       ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set,
+                                  nvme_rdma_reinit_request);
        if (ret)
                goto requeue;
  
@@@ -927,11 -922,7 +929,11 @@@ static int nvme_rdma_map_sg_fr(struct n
        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
        int nr;
  
 -      nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
 +      /*
 +       * Align the MR to a 4K page size to match the ctrl page size and
 +       * the block virtual boundary.
 +       */
 +      nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
        if (nr < count) {
                if (nr < 0)
                        return nr;
@@@ -1509,23 -1500,14 +1511,22 @@@ static void nvme_rdma_complete_rq(struc
        nvme_complete_rq(rq);
  }
  
 +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 +{
 +      struct nvme_rdma_ctrl *ctrl = set->driver_data;
 +
 +      return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
 +}
 +
  static const struct blk_mq_ops nvme_rdma_mq_ops = {
        .queue_rq       = nvme_rdma_queue_rq,
        .complete       = nvme_rdma_complete_rq,
        .init_request   = nvme_rdma_init_request,
        .exit_request   = nvme_rdma_exit_request,
-       .reinit_request = nvme_rdma_reinit_request,
        .init_hctx      = nvme_rdma_init_hctx,
        .poll           = nvme_rdma_poll,
        .timeout        = nvme_rdma_timeout,
 +      .map_queues     = nvme_rdma_map_queues,
  };
  
  static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
        .complete       = nvme_rdma_complete_rq,
        .init_request   = nvme_rdma_init_request,
        .exit_request   = nvme_rdma_exit_request,
-       .reinit_request = nvme_rdma_reinit_request,
        .init_hctx      = nvme_rdma_init_admin_hctx,
        .timeout        = nvme_rdma_timeout,
  };
@@@ -1602,7 -1583,7 +1602,7 @@@ static int nvme_rdma_configure_admin_qu
                goto out_cleanup_queue;
  
        ctrl->ctrl.max_hw_sectors =
 -              (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
 +              (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
  
        error = nvme_init_identify(&ctrl->ctrl);
        if (error)
@@@ -1731,7 -1712,8 +1731,8 @@@ static void nvme_rdma_reset_ctrl_work(s
        }
  
        if (ctrl->ctrl.queue_count > 1) {
-               ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+               ret = blk_mq_reinit_tagset(&ctrl->tag_set,
+                                          nvme_rdma_reinit_request);
                if (ret)
                        goto del_dead_ctrl;
  
@@@ -1965,6 -1947,10 +1966,6 @@@ static struct nvmf_transport_ops nvme_r
        .create_ctrl    = nvme_rdma_create_ctrl,
  };
  
 -static void nvme_rdma_add_one(struct ib_device *ib_device)
 -{
 -}
 -
  static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
  {
        struct nvme_rdma_ctrl *ctrl;
  
  static struct ib_client nvme_rdma_ib_client = {
        .name   = "nvme_rdma",
 -      .add = nvme_rdma_add_one,
        .remove = nvme_rdma_remove_one
  };
  
diff --combined fs/btrfs/disk-io.c
@@@ -3499,7 -3499,7 +3499,7 @@@ static void write_dev_flush(struct btrf
  
        bio_reset(bio);
        bio->bi_end_io = btrfs_end_empty_barrier;
-       bio->bi_bdev = device->bdev;
+       bio_set_dev(bio, device->bdev);
        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
        init_completion(&device->flush_wait);
        bio->bi_private = &device->flush_wait;
@@@ -3516,7 -3516,7 +3516,7 @@@ static blk_status_t wait_dev_flush(stru
        struct bio *bio = device->flush_bio;
  
        if (!device->flush_bio_sent)
 -              return 0;
 +              return BLK_STS_OK;
  
        device->flush_bio_sent = 0;
        wait_for_completion_io(&device->flush_wait);
@@@ -3563,7 -3563,7 +3563,7 @@@ static int barrier_all_devices(struct b
                        continue;
  
                write_dev_flush(dev);
 -              dev->last_flush_error = 0;
 +              dev->last_flush_error = BLK_STS_OK;
        }
  
        /* wait for all the barriers */
diff --combined fs/btrfs/raid56.c
@@@ -905,7 -905,7 +905,7 @@@ static void raid_write_end_io(struct bi
        if (!atomic_dec_and_test(&rbio->stripes_pending))
                return;
  
 -      err = 0;
 +      err = BLK_STS_OK;
  
        /* OK, we have read all the stripes we need to. */
        max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
@@@ -1090,7 -1090,8 +1090,8 @@@ static int rbio_add_io_page(struct btrf
                 */
                if (last_end == disk_start && stripe->dev->bdev &&
                    !last->bi_status &&
-                   last->bi_bdev == stripe->dev->bdev) {
+                   last->bi_disk == stripe->dev->bdev->bd_disk &&
+                   last->bi_partno == stripe->dev->bdev->bd_partno) {
                        ret = bio_add_page(last, page, PAGE_SIZE, 0);
                        if (ret == PAGE_SIZE)
                                return 0;
        /* put a new bio on the list */
        bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
        bio->bi_iter.bi_size = 0;
-       bio->bi_bdev = stripe->dev->bdev;
+       bio_set_dev(bio, stripe->dev->bdev);
        bio->bi_iter.bi_sector = disk_start >> 9;
  
        bio_add_page(bio, page, PAGE_SIZE, 0);
@@@ -1324,7 -1325,7 +1325,7 @@@ write_data
        return;
  
  cleanup:
 -      rbio_orig_end_io(rbio, -EIO);
 +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
  }
  
  /*
@@@ -1347,7 -1348,8 +1348,8 @@@ static int find_bio_stripe(struct btrfs
                stripe_start = stripe->physical;
                if (physical >= stripe_start &&
                    physical < stripe_start + rbio->stripe_len &&
-                   bio->bi_bdev == stripe->dev->bdev) {
+                   bio->bi_disk == stripe->dev->bdev->bd_disk &&
+                   bio->bi_partno == stripe->dev->bdev->bd_partno) {
                        return i;
                }
        }
@@@ -1475,7 -1477,7 +1477,7 @@@ static void raid_rmw_end_io(struct bio 
  
  cleanup:
  
 -      rbio_orig_end_io(rbio, -EIO);
 +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
  }
  
  static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
@@@ -1579,7 -1581,7 +1581,7 @@@ static int raid56_rmw_stripe(struct btr
        return 0;
  
  cleanup:
 -      rbio_orig_end_io(rbio, -EIO);
 +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
        return -EIO;
  
  finish:
@@@ -1795,12 -1797,12 +1797,12 @@@ static void __raid_recover_end_io(struc
        void **pointers;
        int faila = -1, failb = -1;
        struct page *page;
 -      int err;
 +      blk_status_t err;
        int i;
  
        pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
        if (!pointers) {
 -              err = -ENOMEM;
 +              err = BLK_STS_RESOURCE;
                goto cleanup_io;
        }
  
                                         * a bad data or Q stripe.
                                         * TODO, we should redo the xor here.
                                         */
 -                                      err = -EIO;
 +                                      err = BLK_STS_IOERR;
                                        goto cleanup;
                                }
                                /*
                        if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
                                if (rbio->bbio->raid_map[faila] ==
                                    RAID5_P_STRIPE) {
 -                                      err = -EIO;
 +                                      err = BLK_STS_IOERR;
                                        goto cleanup;
                                }
                                /*
@@@ -1954,13 -1956,13 +1956,13 @@@ pstripe
                }
        }
  
 -      err = 0;
 +      err = BLK_STS_OK;
  cleanup:
        kfree(pointers);
  
  cleanup_io:
        if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
 -              if (err == 0)
 +              if (err == BLK_STS_OK)
                        cache_rbio_pages(rbio);
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
                rbio_orig_end_io(rbio, err);
        } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
                rbio_orig_end_io(rbio, err);
 -      } else if (err == 0) {
 +      } else if (err == BLK_STS_OK) {
                rbio->faila = -1;
                rbio->failb = -1;
  
@@@ -2005,7 -2007,7 +2007,7 @@@ static void raid_recover_end_io(struct 
                return;
  
        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
 -              rbio_orig_end_io(rbio, -EIO);
 +              rbio_orig_end_io(rbio, BLK_STS_IOERR);
        else
                __raid_recover_end_io(rbio);
  }
@@@ -2104,7 -2106,7 +2106,7 @@@ out
  cleanup:
        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
            rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
 -              rbio_orig_end_io(rbio, -EIO);
 +              rbio_orig_end_io(rbio, BLK_STS_IOERR);
        return -EIO;
  }
  
@@@ -2431,7 -2433,7 +2433,7 @@@ submit_write
        nr_data = bio_list_size(&bio_list);
        if (!nr_data) {
                /* Every parity is right */
 -              rbio_orig_end_io(rbio, 0);
 +              rbio_orig_end_io(rbio, BLK_STS_OK);
                return;
        }
  
        return;
  
  cleanup:
 -      rbio_orig_end_io(rbio, -EIO);
 +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
  }
  
  static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@@ -2519,7 -2521,7 +2521,7 @@@ static void validate_rbio_for_parity_sc
        return;
  
  cleanup:
 -      rbio_orig_end_io(rbio, -EIO);
 +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
  }
  
  /*
@@@ -2633,7 -2635,7 +2635,7 @@@ static void raid56_parity_scrub_stripe(
        return;
  
  cleanup:
 -      rbio_orig_end_io(rbio, -EIO);
 +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
        return;
  
  finish:
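
The raid56 hunks above all make the same substitution: completion paths hand rbio_orig_end_io() a blk_status_t (BLK_STS_OK/BLK_STS_IOERR/BLK_STS_RESOURCE) instead of a negative errno. A minimal sketch of that convention, using a hypothetical endio helper rather than the btrfs code itself; errno_to_blk_status() and the BLK_STS_* codes are the existing block-layer interfaces:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical completion helper: store a status code, not an errno. */
static void example_end_io(struct bio *bio, int error)
{
	/* 0 -> BLK_STS_OK, -EIO -> BLK_STS_IOERR, -ENOMEM -> BLK_STS_RESOURCE */
	bio->bi_status = errno_to_blk_status(error);
	bio_endio(bio);
}
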
diff --combined fs/btrfs/volumes.c
@@@ -6188,7 -6188,7 +6188,7 @@@ static void submit_stripe_bio(struct bt
                rcu_read_unlock();
        }
  #endif
-       bio->bi_bdev = dev->bdev;
+       bio_set_dev(bio, dev->bdev);
  
        btrfs_bio_counter_inc_noblocked(fs_info);
  
@@@ -6212,8 -6212,8 +6212,8 @@@ static void bbio_error(struct btrfs_bi
        }
  }
  
 -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 -                int mirror_num, int async_submit)
 +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 +                         int mirror_num, int async_submit)
  {
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
                                &map_length, &bbio, mirror_num, 1);
        if (ret) {
                btrfs_bio_counter_dec(fs_info);
 -              return ret;
 +              return errno_to_blk_status(ret);
        }
  
        total_devs = bbio->num_stripes;
                }
  
                btrfs_bio_counter_dec(fs_info);
 -              return ret;
 +              return errno_to_blk_status(ret);
        }
  
        if (map_length < length) {
                                  dev_nr, async_submit);
        }
        btrfs_bio_counter_dec(fs_info);
 -      return 0;
 +      return BLK_STS_OK;
  }
  
  struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
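
Callers of btrfs_map_bio() now receive a blk_status_t and can put it on the bio directly instead of translating errnos back and forth. A hedged sketch of such a caller (example_submit_one and its argument plumbing are assumptions; only btrfs_map_bio(), bi_status and bio_endio() come from the code above):

/* Illustrative only; the real callers live in btrfs' submit paths. */
static void example_submit_one(struct btrfs_fs_info *fs_info, struct bio *bio,
			       int mirror_num, int async_submit)
{
	blk_status_t status;

	status = btrfs_map_bio(fs_info, bio, mirror_num, async_submit);
	if (status) {
		bio->bi_status = status;	/* propagate as-is, no errno round trip */
		bio_endio(bio);
	}
}
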
diff --combined fs/buffer.c
@@@ -1627,17 -1627,20 +1627,17 @@@ void clean_bdev_aliases(struct block_de
        struct pagevec pvec;
        pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
        pgoff_t end;
 -      int i;
 +      int i, count;
        struct buffer_head *bh;
        struct buffer_head *head;
  
        end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
        pagevec_init(&pvec, 0);
 -      while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
 -                      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
 -              for (i = 0; i < pagevec_count(&pvec); i++) {
 +      while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
 +              count = pagevec_count(&pvec);
 +              for (i = 0; i < count; i++) {
                        struct page *page = pvec.pages[i];
  
 -                      index = page->index;
 -                      if (index > end)
 -                              break;
                        if (!page_has_buffers(page))
                                continue;
                        /*
@@@ -1667,9 -1670,7 +1667,9 @@@ unlock_page
                }
                pagevec_release(&pvec);
                cond_resched();
 -              index++;
 +              /* End of range already reached? */
 +              if (index > end || !index)
 +                      break;
        }
  }
  EXPORT_SYMBOL(clean_bdev_aliases);
@@@ -3056,7 -3057,7 +3056,7 @@@ void guard_bio_eod(int op, struct bio *
        struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
        unsigned truncated_bytes;
  
-       maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+       maxsector = get_capacity(bio->bi_disk);
        if (!maxsector)
                return;
  
@@@ -3115,7 -3116,7 +3115,7 @@@ static int submit_bh_wbc(int op, int op
        }
  
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
+       bio_set_dev(bio, bh->b_bdev);
        bio->bi_write_hint = write_hint;
  
        bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@@ -3548,10 -3549,10 +3548,10 @@@ page_cache_seek_hole_data(struct inode 
        pagevec_init(&pvec, 0);
  
        do {
 -              unsigned want, nr_pages, i;
 +              unsigned nr_pages, i;
  
 -              want = min_t(unsigned, end - index, PAGEVEC_SIZE);
 -              nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
 +              nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
 +                                              end - 1);
                if (nr_pages == 0)
                        break;
  
                            lastoff < page_offset(page))
                                goto check_range;
  
 -                      /* Searching done if the page index is out of range. */
 -                      if (page->index >= end)
 -                              goto not_found;
 -
                        lock_page(page);
                        if (likely(page->mapping == inode->i_mapping) &&
                            page_has_buffers(page)) {
                        unlock_page(page);
                        lastoff = page_offset(page) + PAGE_SIZE;
                }
 -
 -              /* Searching done if fewer pages returned than wanted. */
 -              if (nr_pages < want)
 -                      break;
 -
 -              index = pvec.pages[i - 1]->index + 1;
                pagevec_release(&pvec);
        } while (index < end);
  
diff --combined fs/gfs2/lops.c
@@@ -207,11 -207,8 +207,11 @@@ static void gfs2_end_log_write(struct b
        struct page *page;
        int i;
  
 -      if (bio->bi_status)
 -              fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
 +      if (bio->bi_status) {
 +              fs_err(sdp, "Error %d writing to journal, jid=%u\n",
 +                     bio->bi_status, sdp->sd_jdesc->jd_jid);
 +              wake_up(&sdp->sd_logd_waitq);
 +      }
  
        bio_for_each_segment_all(bvec, bio, i) {
                page = bvec->bv_page;
@@@ -268,7 -265,7 +268,7 @@@ static struct bio *gfs2_log_alloc_bio(s
  
        bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
        bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
-       bio->bi_bdev = sb->s_bdev;
+       bio_set_dev(bio, sb->s_bdev);
        bio->bi_end_io = gfs2_end_log_write;
        bio->bi_private = sdp;
  
diff --combined fs/gfs2/meta_io.c
@@@ -221,7 -221,7 +221,7 @@@ static void gfs2_submit_bhs(int op, in
  
                bio = bio_alloc(GFP_NOIO, num);
                bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-               bio->bi_bdev = bh->b_bdev;
+               bio_set_dev(bio, bh->b_bdev);
                while (num > 0) {
                        bh = *bhs;
                        if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
@@@ -419,9 -419,8 +419,9 @@@ int gfs2_meta_indirect_buffer(struct gf
        if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
                brelse(bh);
                ret = -EIO;
 +      } else {
 +              *bhp = bh;
        }
 -      *bhp = bh;
        return ret;
  }
  
@@@ -453,7 -452,7 +453,7 @@@ struct buffer_head *gfs2_meta_ra(struc
        if (buffer_uptodate(first_bh))
                goto out;
        if (!buffer_locked(first_bh))
 -              ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
 +              ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
  
        dblock++;
        extlen--;
                bh = gfs2_getbuf(gl, dblock, CREATE);
  
                if (!buffer_uptodate(bh) && !buffer_locked(bh))
 -                      ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh);
 +                      ll_rw_block(REQ_OP_READ,
 +                                  REQ_RAHEAD | REQ_META | REQ_PRIO,
 +                                  1, &bh);
                brelse(bh);
                dblock++;
                extlen--;
diff --combined fs/gfs2/ops_fstype.c
@@@ -242,7 -242,7 +242,7 @@@ static int gfs2_read_super(struct gfs2_
  
        bio = bio_alloc(GFP_NOFS, 1);
        bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
-       bio->bi_bdev = sb->s_bdev;
+       bio_set_dev(bio, sb->s_bdev);
        bio_add_page(bio, page, PAGE_SIZE, 0);
  
        bio->bi_end_io = end_bio_io_page;
@@@ -1113,7 -1113,7 +1113,7 @@@ static int fill_super(struct super_bloc
                return error;
        }
  
 -      snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
 +      snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
  
        error = gfs2_sys_fs_add(sdp);
        /*
        }
  
        if (sdp->sd_args.ar_spectator)
 -              snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
 +              snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s",
                         sdp->sd_table_name);
        else
 -              snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
 +              snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u",
                         sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
  
        error = init_inodes(sdp, DO);
@@@ -1388,6 -1388,7 +1388,6 @@@ static void gfs2_kill_sb(struct super_b
        sdp->sd_root_dir = NULL;
        sdp->sd_master_dir = NULL;
        shrink_dcache_sb(sb);
 -      gfs2_delete_debugfs_file(sdp);
        free_percpu(sdp->sd_lkstats);
        kill_block_super(sb);
  }
diff --combined fs/iomap.c
@@@ -278,7 -278,7 +278,7 @@@ iomap_dirty_actor(struct inode *inode, 
                unsigned long bytes;    /* Bytes to write to page */
  
                offset = (pos & (PAGE_SIZE - 1));
 -              bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
 +              bytes = min_t(loff_t, PAGE_SIZE - offset, length);
  
                rpage = __iomap_read_page(inode, pos);
                if (IS_ERR(rpage))
@@@ -373,7 -373,7 +373,7 @@@ iomap_zero_range_actor(struct inode *in
                unsigned offset, bytes;
  
                offset = pos & (PAGE_SIZE - 1); /* Within page */
 -              bytes = min_t(unsigned, PAGE_SIZE - offset, count);
 +              bytes = min_t(loff_t, PAGE_SIZE - offset, count);
  
                if (IS_DAX(inode))
                        status = iomap_dax_zero(pos, offset, bytes, iomap);
@@@ -477,10 -477,10 +477,10 @@@ int iomap_page_mkwrite(struct vm_fault 
  
        set_page_dirty(page);
        wait_for_stable_page(page);
 -      return 0;
 +      return VM_FAULT_LOCKED;
  out_unlock:
        unlock_page(page);
 -      return ret;
 +      return block_page_mkwrite_return(ret);
  }
  EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
  
@@@ -805,7 -805,7 +805,7 @@@ iomap_dio_zero(struct iomap_dio *dio, s
        struct bio *bio;
  
        bio = bio_alloc(GFP_KERNEL, 1);
-       bio->bi_bdev = iomap->bdev;
+       bio_set_dev(bio, iomap->bdev);
        bio->bi_iter.bi_sector =
                iomap->blkno + ((pos - iomap->offset) >> 9);
        bio->bi_private = dio;
@@@ -884,7 -884,7 +884,7 @@@ iomap_dio_actor(struct inode *inode, lo
                        return 0;
  
                bio = bio_alloc(GFP_KERNEL, nr_pages);
-               bio->bi_bdev = iomap->bdev;
+               bio_set_dev(bio, iomap->bdev);
                bio->bi_iter.bi_sector =
                        iomap->blkno + ((pos - iomap->offset) >> 9);
                bio->bi_write_hint = dio->iocb->ki_hint;
diff --combined fs/kernfs/file.c
@@@ -616,7 -616,7 +616,7 @@@ static void kernfs_put_open_node(struc
  
  static int kernfs_fop_open(struct inode *inode, struct file *file)
  {
-       struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+       struct kernfs_node *kn = inode->i_private;
        struct kernfs_root *root = kernfs_root(kn);
        const struct kernfs_ops *ops;
        struct kernfs_open_file *of;
@@@ -768,7 -768,7 +768,7 @@@ static void kernfs_release_file(struct 
  
  static int kernfs_fop_release(struct inode *inode, struct file *filp)
  {
-       struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+       struct kernfs_node *kn = inode->i_private;
        struct kernfs_open_file *of = kernfs_of(filp);
  
        if (kn->flags & KERNFS_HAS_RELEASE) {
@@@ -835,7 -835,7 +835,7 @@@ void kernfs_drain_open_files(struct ker
  static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
  {
        struct kernfs_open_file *of = kernfs_of(filp);
-       struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+       struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
        struct kernfs_open_node *on = kn->attr.open;
  
        if (!kernfs_get_active(kn))
@@@ -895,7 -895,7 +895,7 @@@ repeat
                 * have the matching @file available.  Look up the inodes
                 * and generate the events manually.
                 */
-               inode = ilookup(info->sb, kn->ino);
+               inode = ilookup(info->sb, kn->id.ino);
                if (!inode)
                        continue;
  
                if (parent) {
                        struct inode *p_inode;
  
-                       p_inode = ilookup(info->sb, parent->ino);
+                       p_inode = ilookup(info->sb, parent->id.ino);
                        if (p_inode) {
                                fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
                                         inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
@@@ -997,7 -997,7 +997,7 @@@ struct kernfs_node *__kernfs_create_fil
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
        if (key) {
 -              lockdep_init_map(&kn->dep_map, "s_active", key, 0);
 +              lockdep_init_map(&kn->dep_map, "kn->count", key, 0);
                kn->flags |= KERNFS_LOCKDEP;
        }
  #endif
diff --combined fs/ocfs2/cluster/heartbeat.c
@@@ -505,7 -505,8 +505,7 @@@ static inline void o2hb_bio_wait_dec(st
        }
  }
  
 -static void o2hb_wait_on_io(struct o2hb_region *reg,
 -                          struct o2hb_bio_wait_ctxt *wc)
 +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
  {
        o2hb_bio_wait_dec(wc, 1);
        wait_for_completion(&wc->wc_io_complete);
@@@ -553,7 -554,7 +553,7 @@@ static struct bio *o2hb_setup_one_bio(s
  
        /* Must put everything in 512 byte sectors for the bio... */
        bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
-       bio->bi_bdev = reg->hr_bdev;
+       bio_set_dev(bio, reg->hr_bdev);
        bio->bi_private = wc;
        bio->bi_end_io = o2hb_bio_end_io;
        bio_set_op_attrs(bio, op, op_flags);
@@@ -607,7 -608,7 +607,7 @@@ static int o2hb_read_slots(struct o2hb_
        status = 0;
  
  bail_and_wait:
 -      o2hb_wait_on_io(reg, &wc);
 +      o2hb_wait_on_io(&wc);
        if (wc.wc_error && !status)
                status = wc.wc_error;
  
@@@ -1161,7 -1162,7 +1161,7 @@@ static int o2hb_do_disk_heartbeat(struc
         * before we can go to steady state.  This ensures that
         * people we find in our steady state have seen us.
         */
 -      o2hb_wait_on_io(reg, &write_wc);
 +      o2hb_wait_on_io(&write_wc);
        if (write_wc.wc_error) {
                /* Do not re-arm the write timeout on I/O error - we
                 * can't be sure that the new block ever made it to
@@@ -1274,7 -1275,7 +1274,7 @@@ static int o2hb_thread(void *data
                o2hb_prepare_block(reg, 0);
                ret = o2hb_issue_node_write(reg, &write_wc);
                if (ret == 0)
 -                      o2hb_wait_on_io(reg, &write_wc);
 +                      o2hb_wait_on_io(&write_wc);
                else
                        mlog_errno(ret);
        }
@@@ -2575,6 -2576,22 +2575,6 @@@ void o2hb_unregister_callback(const cha
  }
  EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
  
 -int o2hb_check_node_heartbeating(u8 node_num)
 -{
 -      unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 -
 -      o2hb_fill_node_map(testing_map, sizeof(testing_map));
 -      if (!test_bit(node_num, testing_map)) {
 -              mlog(ML_HEARTBEAT,
 -                   "node (%u) does not have heartbeating enabled.\n",
 -                   node_num);
 -              return 0;
 -      }
 -
 -      return 1;
 -}
 -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
 -
  int o2hb_check_node_heartbeating_no_sem(u8 node_num)
  {
        unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@@ -2609,6 -2626,23 +2609,6 @@@ int o2hb_check_node_heartbeating_from_c
  }
  EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
  
 -/* Makes sure our local node is configured with a node number, and is
 - * heartbeating. */
 -int o2hb_check_local_node_heartbeating(void)
 -{
 -      u8 node_num;
 -
 -      /* if this node was set then we have networking */
 -      node_num = o2nm_this_node();
 -      if (node_num == O2NM_MAX_NODES) {
 -              mlog(ML_HEARTBEAT, "this node has not been configured.\n");
 -              return 0;
 -      }
 -
 -      return o2hb_check_node_heartbeating(node_num);
 -}
 -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
 -
  /*
   * this is just a hack until we get the plumbing which flips file systems
   * read only and drops the hb ref instead of killing the node dead.
diff --combined fs/xfs/xfs_aops.c
@@@ -85,11 -85,11 +85,11 @@@ xfs_find_bdev_for_inode
   * associated buffer_heads, paying attention to the start and end offsets that
   * we need to process on the page.
   *
 - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
 - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
 - * the page at all, as we may be racing with memory reclaim and it can free both
 - * the bufferhead chain and the page as it will see the page as clean and
 - * unused.
 + * Note that we open code the action in end_buffer_async_write here so that we
 + * only have to iterate over the buffers attached to the page once.  This is not
 + * only more efficient, but also ensures that we only call end_page_writeback
 + * at the end of the iteration, and thus avoids the pitfall of having the page
 + * and buffers potentially freed after every call to end_buffer_async_write.
   */
  static void
  xfs_finish_page_writeback(
        struct bio_vec          *bvec,
        int                     error)
  {
 -      unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
 -      struct buffer_head      *head, *bh, *next;
 +      struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
 +      bool                    busy = false;
        unsigned int            off = 0;
 -      unsigned int            bsize;
 +      unsigned long           flags;
  
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
 -      ASSERT(end < PAGE_SIZE);
 +      ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
        ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
  
 -      bh = head = page_buffers(bvec->bv_page);
 -
 -      bsize = bh->b_size;
 +      local_irq_save(flags);
 +      bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
        do {
 -              if (off > end)
 -                      break;
 -              next = bh->b_this_page;
 -              if (off < bvec->bv_offset)
 -                      goto next_bh;
 -              bh->b_end_io(bh, !error);
 -next_bh:
 -              off += bsize;
 -      } while ((bh = next) != head);
 +              if (off >= bvec->bv_offset &&
 +                  off < bvec->bv_offset + bvec->bv_len) {
 +                      ASSERT(buffer_async_write(bh));
 +                      ASSERT(bh->b_end_io == NULL);
 +
 +                      if (error) {
 +                              mark_buffer_write_io_error(bh);
 +                              clear_buffer_uptodate(bh);
 +                              SetPageError(bvec->bv_page);
 +                      } else {
 +                              set_buffer_uptodate(bh);
 +                      }
 +                      clear_buffer_async_write(bh);
 +                      unlock_buffer(bh);
 +              } else if (buffer_async_write(bh)) {
 +                      ASSERT(buffer_locked(bh));
 +                      busy = true;
 +              }
 +              off += bh->b_size;
 +      } while ((bh = bh->b_this_page) != head);
 +      bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
 +      local_irq_restore(flags);
 +
 +      if (!busy)
 +              end_page_writeback(bvec->bv_page);
  }
  
  /*
@@@ -148,10 -133,8 +148,10 @@@ xfs_destroy_ioend
        int                     error)
  {
        struct inode            *inode = ioend->io_inode;
 -      struct bio              *last = ioend->io_bio;
 -      struct bio              *bio, *next;
 +      struct bio              *bio = &ioend->io_inline_bio;
 +      struct bio              *last = ioend->io_bio, *next;
 +      u64                     start = bio->bi_iter.bi_sector;
 +      bool                    quiet = bio_flagged(bio, BIO_QUIET);
  
        for (bio = &ioend->io_inline_bio; bio; bio = next) {
                struct bio_vec  *bvec;
  
                bio_put(bio);
        }
 +
 +      if (unlikely(error && !quiet)) {
 +              xfs_err_ratelimited(XFS_I(inode)->i_mount,
 +                      "writeback error on sector %llu", start);
 +      }
  }
  
  /*
@@@ -445,8 -423,7 +445,8 @@@ xfs_start_buffer_writeback
        ASSERT(!buffer_delay(bh));
        ASSERT(!buffer_unwritten(bh));
  
 -      mark_buffer_async_write(bh);
 +      bh->b_end_io = NULL;
 +      set_buffer_async_write(bh);
        set_buffer_uptodate(bh);
        clear_buffer_dirty(bh);
  }
@@@ -540,7 -517,7 +540,7 @@@ xfs_init_bio_from_bh
        struct buffer_head      *bh)
  {
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
+       bio_set_dev(bio, bh->b_bdev);
  }
  
  static struct xfs_ioend *
diff --combined include/linux/bio.h
  #define BIO_BUG_ON
  #endif
  
 +#ifdef CONFIG_THP_SWAP
 +#if HPAGE_PMD_NR > 256
 +#define BIO_MAX_PAGES         HPAGE_PMD_NR
 +#else
  #define BIO_MAX_PAGES         256
 +#endif
 +#else
 +#define BIO_MAX_PAGES         256
 +#endif
  
  #define bio_prio(bio)                 (bio)->bi_ioprio
  #define bio_set_prio(bio, prio)               ((bio)->bi_ioprio = prio)
@@@ -471,10 -463,11 +471,11 @@@ extern struct bio *bio_copy_kern(struc
  extern void bio_set_pages_dirty(struct bio *bio);
  extern void bio_check_pages_dirty(struct bio *bio);
  
- void generic_start_io_acct(int rw, unsigned long sectors,
-                          struct hd_struct *part);
- void generic_end_io_acct(int rw, struct hd_struct *part,
-                        unsigned long start_time);
+ void generic_start_io_acct(struct request_queue *q, int rw,
+                               unsigned long sectors, struct hd_struct *part);
+ void generic_end_io_acct(struct request_queue *q, int rw,
+                               struct hd_struct *part,
+                               unsigned long start_time);
  
  #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
  # error       "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
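
generic_start_io_acct() and generic_end_io_acct() now take the request_queue the I/O is accounted against. A hedged sketch for a bio-based driver, assuming bi_disk is already set and accounting against part0 of that disk; everything other than the two accounting helpers is illustrative:

#include <linux/bio.h>
#include <linux/genhd.h>
#include <linux/jiffies.h>

static void example_account_bio(struct request_queue *q, struct bio *bio)
{
	struct hd_struct *part = &bio->bi_disk->part0;
	unsigned long start = jiffies;

	generic_start_io_acct(q, bio_data_dir(bio), bio_sectors(bio), part);
	/* ... issue the bio and wait for completion here ... */
	generic_end_io_acct(q, bio_data_dir(bio), part, start);
}
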
@@@ -501,6 -494,24 +502,24 @@@ extern struct bio_vec *bvec_alloc(gfp_t
  extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
  extern unsigned int bvec_nr_vecs(unsigned short idx);
  
+ #define bio_set_dev(bio, bdev)                        \
+ do {                                          \
+       (bio)->bi_disk = (bdev)->bd_disk;       \
+       (bio)->bi_partno = (bdev)->bd_partno;   \
+ } while (0)
+ #define bio_copy_dev(dst, src)                        \
+ do {                                          \
+       (dst)->bi_disk = (src)->bi_disk;        \
+       (dst)->bi_partno = (src)->bi_partno;    \
+ } while (0)
+ #define bio_dev(bio) \
+       disk_devt((bio)->bi_disk)
+ #define bio_devname(bio, buf) \
+       __bdevname(bio_dev(bio), (buf))
  #ifdef CONFIG_BLK_CGROUP
  int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
  int bio_associate_current(struct bio *bio);
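
The helpers above are the replacement for assigning bio->bi_bdev directly, which is what most of the per-driver hunks in this merge do. A small usage sketch under that assumption (example_prep_bio is hypothetical; bio_set_dev(), bio_devname() and BDEVNAME_SIZE are the real interfaces):

#include <linux/bio.h>
#include <linux/genhd.h>

static void example_prep_bio(struct bio *bio, struct block_device *bdev,
			     sector_t sector)
{
	char name[BDEVNAME_SIZE];

	bio_set_dev(bio, bdev);			/* records bi_disk + bi_partno */
	bio->bi_iter.bi_sector = sector;
	pr_debug("prepared bio for %s\n", bio_devname(bio, name));
}
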
diff --combined include/linux/blkdev.h
@@@ -134,7 -134,7 +134,7 @@@ typedef __u32 __bitwise req_flags_t
  struct request {
        struct list_head queuelist;
        union {
 -              struct call_single_data csd;
 +              call_single_data_t csd;
                u64 fifo_time;
        };
  
@@@ -568,6 -568,7 +568,6 @@@ struct request_queue 
  
  #if defined(CONFIG_BLK_DEV_BSG)
        bsg_job_fn              *bsg_job_fn;
 -      int                     bsg_job_size;
        struct bsg_class_device bsg_dev;
  #endif
  
        u64                     write_hints[BLK_MAX_WRITE_HINTS];
  };
  
- #define QUEUE_FLAG_QUEUED     1       /* uses generic tag queueing */
- #define QUEUE_FLAG_STOPPED    2       /* queue is stopped */
- #define       QUEUE_FLAG_SYNCFULL     3       /* read queue has been filled */
- #define QUEUE_FLAG_ASYNCFULL  4       /* write queue has been filled */
- #define QUEUE_FLAG_DYING      5       /* queue being torn down */
- #define QUEUE_FLAG_BYPASS     6       /* act as dumb FIFO queue */
- #define QUEUE_FLAG_BIDI               7       /* queue supports bidi requests */
- #define QUEUE_FLAG_NOMERGES     8     /* disable merge attempts */
- #define QUEUE_FLAG_SAME_COMP  9       /* complete on same CPU-group */
- #define QUEUE_FLAG_FAIL_IO     10     /* fake timeout */
- #define QUEUE_FLAG_STACKABLE   11     /* supports request stacking */
- #define QUEUE_FLAG_NONROT      12     /* non-rotational device (SSD) */
+ #define QUEUE_FLAG_QUEUED     0       /* uses generic tag queueing */
+ #define QUEUE_FLAG_STOPPED    1       /* queue is stopped */
+ #define QUEUE_FLAG_DYING      2       /* queue being torn down */
+ #define QUEUE_FLAG_BYPASS     3       /* act as dumb FIFO queue */
+ #define QUEUE_FLAG_BIDI               4       /* queue supports bidi requests */
+ #define QUEUE_FLAG_NOMERGES     5     /* disable merge attempts */
+ #define QUEUE_FLAG_SAME_COMP  6       /* complete on same CPU-group */
+ #define QUEUE_FLAG_FAIL_IO    7       /* fake timeout */
+ #define QUEUE_FLAG_STACKABLE  8       /* supports request stacking */
+ #define QUEUE_FLAG_NONROT     9       /* non-rotational device (SSD) */
  #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
- #define QUEUE_FLAG_IO_STAT     13     /* do IO stats */
- #define QUEUE_FLAG_DISCARD     14     /* supports DISCARD */
- #define QUEUE_FLAG_NOXMERGES   15     /* No extended merges */
- #define QUEUE_FLAG_ADD_RANDOM  16     /* Contributes to random pool */
- #define QUEUE_FLAG_SECERASE    17     /* supports secure erase */
- #define QUEUE_FLAG_SAME_FORCE  18     /* force complete on same CPU */
- #define QUEUE_FLAG_DEAD        19     /* queue tear-down finished */
- #define QUEUE_FLAG_INIT_DONE   20     /* queue is initialized */
- #define QUEUE_FLAG_NO_SG_MERGE 21     /* don't attempt to merge SG segments*/
- #define QUEUE_FLAG_POLL              22       /* IO polling enabled if set */
- #define QUEUE_FLAG_WC        23       /* Write back caching */
- #define QUEUE_FLAG_FUA               24       /* device supports FUA writes */
- #define QUEUE_FLAG_FLUSH_NQ    25     /* flush not queueuable */
- #define QUEUE_FLAG_DAX         26     /* device supports DAX */
- #define QUEUE_FLAG_STATS       27     /* track rq completion times */
- #define QUEUE_FLAG_POLL_STATS  28     /* collecting stats for hybrid polling */
- #define QUEUE_FLAG_REGISTERED  29     /* queue has been registered to a disk */
- #define QUEUE_FLAG_SCSI_PASSTHROUGH 30        /* queue supports SCSI commands */
- #define QUEUE_FLAG_QUIESCED    31     /* queue has been quiesced */
+ #define QUEUE_FLAG_IO_STAT     10     /* do IO stats */
+ #define QUEUE_FLAG_DISCARD     11     /* supports DISCARD */
+ #define QUEUE_FLAG_NOXMERGES   12     /* No extended merges */
+ #define QUEUE_FLAG_ADD_RANDOM  13     /* Contributes to random pool */
+ #define QUEUE_FLAG_SECERASE    14     /* supports secure erase */
+ #define QUEUE_FLAG_SAME_FORCE  15     /* force complete on same CPU */
+ #define QUEUE_FLAG_DEAD        16     /* queue tear-down finished */
+ #define QUEUE_FLAG_INIT_DONE   17     /* queue is initialized */
+ #define QUEUE_FLAG_NO_SG_MERGE 18     /* don't attempt to merge SG segments*/
+ #define QUEUE_FLAG_POLL              19       /* IO polling enabled if set */
+ #define QUEUE_FLAG_WC        20       /* Write back caching */
+ #define QUEUE_FLAG_FUA               21       /* device supports FUA writes */
+ #define QUEUE_FLAG_FLUSH_NQ    22     /* flush not queueuable */
+ #define QUEUE_FLAG_DAX         23     /* device supports DAX */
+ #define QUEUE_FLAG_STATS       24     /* track rq completion times */
+ #define QUEUE_FLAG_POLL_STATS  25     /* collecting stats for hybrid polling */
+ #define QUEUE_FLAG_REGISTERED  26     /* queue has been registered to a disk */
+ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27        /* queue supports SCSI commands */
+ #define QUEUE_FLAG_QUIESCED    28     /* queue has been quiesced */
  
  #define QUEUE_FLAG_DEFAULT    ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \
diff --combined include/linux/cgroup.h
  #define CGROUP_WEIGHT_DFL             100
  #define CGROUP_WEIGHT_MAX             10000
  
 +/* walk only threadgroup leaders */
 +#define CSS_TASK_ITER_PROCS           (1U << 0)
 +/* walk all threaded css_sets in the domain */
 +#define CSS_TASK_ITER_THREADED                (1U << 1)
 +
  /* a css_task_iter should be treated as an opaque object */
  struct css_task_iter {
        struct cgroup_subsys            *ss;
 +      unsigned int                    flags;
  
        struct list_head                *cset_pos;
        struct list_head                *cset_head;
  
 +      struct list_head                *tcset_pos;
 +      struct list_head                *tcset_head;
 +
        struct list_head                *task_pos;
        struct list_head                *tasks_head;
        struct list_head                *mg_tasks_head;
  
        struct css_set                  *cur_cset;
 +      struct css_set                  *cur_dcset;
        struct task_struct              *cur_task;
        struct list_head                iters_node;     /* css_set->task_iters */
  };
@@@ -139,7 -129,7 +139,7 @@@ struct task_struct *cgroup_taskset_firs
  struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);
  
 -void css_task_iter_start(struct cgroup_subsys_state *css,
 +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
  struct task_struct *css_task_iter_next(struct css_task_iter *it);
  void css_task_iter_end(struct css_task_iter *it);
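
css_task_iter_start() above grows a flags argument; CSS_TASK_ITER_PROCS limits the walk to thread-group leaders, while CSS_TASK_ITER_THREADED walks all threaded css_sets in the domain. A hedged caller sketch, with the printout purely illustrative:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Walk only the thread-group leaders attached to @css. */
static void example_walk_procs(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, CSS_TASK_ITER_PROCS, &it);
	while ((task = css_task_iter_next(&it)))
		pr_info("leader pid %d\n", task_pid_nr(task));
	css_task_iter_end(&it);
}
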
@@@ -398,16 -388,6 +398,16 @@@ static inline void css_put_many(struct 
                percpu_ref_put_many(&css->refcnt, n);
  }
  
 +static inline void cgroup_get(struct cgroup *cgrp)
 +{
 +      css_get(&cgrp->self);
 +}
 +
 +static inline bool cgroup_tryget(struct cgroup *cgrp)
 +{
 +      return css_tryget(&cgrp->self);
 +}
 +
  static inline void cgroup_put(struct cgroup *cgrp)
  {
        css_put(&cgrp->self);
@@@ -520,20 -500,6 +520,20 @@@ static inline struct cgroup *task_cgrou
        return task_css(task, subsys_id)->cgroup;
  }
  
 +static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
 +{
 +      return task_css_set(task)->dfl_cgrp;
 +}
 +
 +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
 +{
 +      struct cgroup_subsys_state *parent_css = cgrp->self.parent;
 +
 +      if (parent_css)
 +              return container_of(parent_css, struct cgroup, self);
 +      return NULL;
 +}
 +
  /**
   * cgroup_is_descendant - test ancestry
   * @cgrp: the cgroup to be tested
@@@ -571,14 -537,13 +571,14 @@@ static inline bool task_under_cgroup_hi
  /* no synchronization, the result can only be used as a hint */
  static inline bool cgroup_is_populated(struct cgroup *cgrp)
  {
 -      return cgrp->populated_cnt;
 +      return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
 +              cgrp->nr_populated_threaded_children;
  }
  
  /* returns ino associated with a cgroup */
  static inline ino_t cgroup_ino(struct cgroup *cgrp)
  {
-       return cgrp->kn->ino;
+       return cgrp->kn->id.ino;
  }
  
  /* cft/css accessors for cftype->write() operation */
@@@ -644,6 -609,13 +644,13 @@@ static inline void cgroup_kthread_ready
        current->no_cgroup_migration = 0;
  }
  
+ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
+ {
+       return &cgrp->kn->id;
+ }
+ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+                                       char *buf, size_t buflen);
  #else /* !CONFIG_CGROUPS */
  
  struct cgroup_subsys_state;
@@@ -666,12 -638,19 +673,19 @@@ static inline int cgroup_init_early(voi
  static inline int cgroup_init(void) { return 0; }
  static inline void cgroup_init_kthreadd(void) {}
  static inline void cgroup_kthread_ready(void) {}
+ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
+ {
+       return NULL;
+ }
  
  static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
  {
        return true;
  }
+ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+       char *buf, size_t buflen) {}
  #endif /* !CONFIG_CGROUPS */
  
  /*
diff --combined include/linux/fs.h
@@@ -72,8 -72,6 +72,8 @@@ extern int leases_enable, lease_break_t
  extern int sysctl_protected_symlinks;
  extern int sysctl_protected_hardlinks;
  
 +typedef __kernel_rwf_t rwf_t;
 +
  struct buffer_head;
  typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
@@@ -429,6 -427,7 +429,7 @@@ struct block_device 
  #endif
        struct block_device *   bd_contains;
        unsigned                bd_block_size;
+       u8                      bd_partno;
        struct hd_struct *      bd_part;
        /* number of times partitions within this device have been opened. */
        unsigned                bd_part_count;
@@@ -909,9 -908,9 +910,9 @@@ static inline struct file *get_file(str
  /* Page cache limit. The filesystems should put that into their s_maxbytes 
     limits, otherwise bad things can happen in VM. */ 
  #if BITS_PER_LONG==32
 -#define MAX_LFS_FILESIZE      (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1)
 +#define MAX_LFS_FILESIZE      ((loff_t)ULONG_MAX << PAGE_SHIFT)
  #elif BITS_PER_LONG==64
 -#define MAX_LFS_FILESIZE      ((loff_t)0x7fffffffffffffffLL)
 +#define MAX_LFS_FILESIZE      ((loff_t)LLONG_MAX)
  #endif
  
  #define FL_POSIX      1
@@@ -1002,6 -1001,7 +1003,6 @@@ struct file_lock 
        unsigned char fl_type;
        unsigned int fl_pid;
        int fl_link_cpu;                /* what cpu's list is this on? */
 -      struct pid *fl_nspid;
        wait_queue_head_t fl_wait;
        struct file *fl_file;
        loff_t fl_start;
@@@ -1269,6 -1269,8 +1270,6 @@@ extern void f_delown(struct file *filp)
  extern pid_t f_getown(struct file *filp);
  extern int send_sigurg(struct fown_struct *fown);
  
 -struct mm_struct;
 -
  /*
   *    Umount options
   */
@@@ -1757,9 -1759,9 +1758,9 @@@ extern ssize_t __vfs_write(struct file 
  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
  extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
  extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
 -              unsigned long, loff_t *, int);
 +              unsigned long, loff_t *, rwf_t);
  extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
 -              unsigned long, loff_t *, int);
 +              unsigned long, loff_t *, rwf_t);
  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
  extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
@@@ -2470,13 -2472,9 +2471,13 @@@ static inline void bd_unlink_disk_holde
  #endif
  
  /* fs/char_dev.c */
 -#define CHRDEV_MAJOR_HASH_SIZE        255
 +#define CHRDEV_MAJOR_MAX 512
  /* Marks the bottom of the first segment of free char majors */
  #define CHRDEV_MAJOR_DYN_END 234
 +/* Marks the top and bottom of the second segment of free char majors */
 +#define CHRDEV_MAJOR_DYN_EXT_START 511
 +#define CHRDEV_MAJOR_DYN_EXT_END 384
 +
  extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
  extern int register_chrdev_region(dev_t, unsigned, const char *);
  extern int __register_chrdev(unsigned int major, unsigned int baseminor,
@@@ -2503,14 -2501,14 +2504,14 @@@ static inline void unregister_chrdev(un
  #define BDEVT_SIZE    10      /* Largest string for MAJ:MIN for blkdev */
  
  #ifdef CONFIG_BLOCK
 -#define BLKDEV_MAJOR_HASH_SIZE        255
 +#define BLKDEV_MAJOR_MAX      512
  extern const char *__bdevname(dev_t, char *buffer);
  extern const char *bdevname(struct block_device *bdev, char *buffer);
  extern struct block_device *lookup_bdev(const char *);
  extern void blkdev_show(struct seq_file *,off_t);
  
  #else
 -#define BLKDEV_MAJOR_HASH_SIZE        0
 +#define BLKDEV_MAJOR_MAX      0
  #endif
  
  extern void init_special_inode(struct inode *, umode_t, dev_t);
@@@ -2542,19 -2540,12 +2543,19 @@@ extern int invalidate_inode_pages2_rang
  extern int write_inode_now(struct inode *, int);
  extern int filemap_fdatawrite(struct address_space *);
  extern int filemap_flush(struct address_space *);
 -extern int filemap_fdatawait(struct address_space *);
  extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
  extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                   loff_t lend);
 +
 +static inline int filemap_fdatawait(struct address_space *mapping)
 +{
 +      return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
 +}
 +
  extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
                                  loff_t lend);
 +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
 +                                              loff_t lend);
  extern int filemap_write_and_wait(struct address_space *mapping);
  extern int filemap_write_and_wait_range(struct address_space *mapping,
                                        loff_t lstart, loff_t lend);
@@@ -2563,19 -2554,12 +2564,19 @@@ extern int __filemap_fdatawrite_range(s
  extern int filemap_fdatawrite_range(struct address_space *mapping,
                                loff_t start, loff_t end);
  extern int filemap_check_errors(struct address_space *mapping);
 -
  extern void __filemap_set_wb_err(struct address_space *mapping, int err);
 +
 +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
 +                                              loff_t lend);
  extern int __must_check file_check_and_advance_wb_err(struct file *file);
  extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);
  
 +static inline int file_write_and_wait(struct file *file)
 +{
 +      return file_write_and_wait_range(file, 0, LLONG_MAX);
 +}
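
file_write_and_wait() above is simply file_write_and_wait_range(file, 0, LLONG_MAX); both fold any errseq_t writeback error into their return value. A hedged sketch of a filesystem ->fsync() built on the ranged form (example_fsync and the metadata step are assumptions, not code from this merge):

static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	int err = file_write_and_wait_range(file, start, end);

	if (err)
		return err;
	/* a real implementation would flush its own journal/metadata here */
	return 0;
}
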
 +
  /**
   * filemap_set_wb_err - set a writeback error on an address_space
   * @mapping: mapping in which to set writeback error
   * When a writeback error occurs, most filesystems will want to call
   * filemap_set_wb_err to record the error in the mapping so that it will be
   * automatically reported whenever fsync is called on the file.
 - *
 - * FIXME: mention FS_* flag here?
   */
  static inline void filemap_set_wb_err(struct address_space *mapping, int err)
  {
@@@ -2846,7 -2832,6 +2847,7 @@@ static inline void lockdep_annotate_ino
  #endif
  extern void unlock_new_inode(struct inode *);
  extern unsigned int get_next_ino(void);
 +extern void evict_inodes(struct super_block *sb);
  
  extern void __iget(struct inode * inode);
  extern void iget_failed(struct inode *);
@@@ -2890,9 -2875,9 +2891,9 @@@ extern ssize_t generic_file_direct_writ
  extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
  
  ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 -              int flags);
 +              rwf_t flags);
  ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 -              int flags);
 +              rwf_t flags);
  
  /* fs/block_dev.c */
  extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
@@@ -3159,7 -3144,7 +3160,7 @@@ static inline int iocb_flags(struct fil
        return res;
  }
  
 -static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
 +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
  {
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;
diff --combined kernel/cgroup/cgroup.c
@@@ -162,9 -162,6 +162,9 @@@ static u16 cgrp_dfl_inhibit_ss_mask
  /* some controllers are implicitly enabled on the default hierarchy */
  static u16 cgrp_dfl_implicit_ss_mask;
  
 +/* some controllers can be threaded on the default hierarchy */
 +static u16 cgrp_dfl_threaded_ss_mask;
 +
  /* The list of hierarchy roots */
  LIST_HEAD(cgroup_roots);
  static int cgroup_root_count;
@@@ -319,87 -316,13 +319,87 @@@ static void cgroup_idr_remove(struct id
        spin_unlock_bh(&cgroup_idr_lock);
  }
  
 -static struct cgroup *cgroup_parent(struct cgroup *cgrp)
 +static bool cgroup_has_tasks(struct cgroup *cgrp)
  {
 -      struct cgroup_subsys_state *parent_css = cgrp->self.parent;
 +      return cgrp->nr_populated_csets;
 +}
  
 -      if (parent_css)
 -              return container_of(parent_css, struct cgroup, self);
 -      return NULL;
 +bool cgroup_is_threaded(struct cgroup *cgrp)
 +{
 +      return cgrp->dom_cgrp != cgrp;
 +}
 +
 +/* can @cgrp host both domain and threaded children? */
 +static bool cgroup_is_mixable(struct cgroup *cgrp)
 +{
 +      /*
 +       * Root isn't under domain level resource control exempting it from
 +       * the no-internal-process constraint, so it can serve as a thread
 +       * root and a parent of resource domains at the same time.
 +       */
 +      return !cgroup_parent(cgrp);
 +}
 +
 +/* can @cgrp become a thread root? should always be true for a thread root */
 +static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
 +{
 +      /* mixables don't care */
 +      if (cgroup_is_mixable(cgrp))
 +              return true;
 +
 +      /* domain roots can't be nested under threaded */
 +      if (cgroup_is_threaded(cgrp))
 +              return false;
 +
 +      /* can only have either domain or threaded children */
 +      if (cgrp->nr_populated_domain_children)
 +              return false;
 +
 +      /* and no domain controllers can be enabled */
 +      if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
 +              return false;
 +
 +      return true;
 +}
 +
 +/* is @cgrp root of a threaded subtree? */
 +bool cgroup_is_thread_root(struct cgroup *cgrp)
 +{
 +      /* thread root should be a domain */
 +      if (cgroup_is_threaded(cgrp))
 +              return false;
 +
 +      /* a domain w/ threaded children is a thread root */
 +      if (cgrp->nr_threaded_children)
 +              return true;
 +
 +      /*
 +       * A domain which has tasks and explicit threaded controllers
 +       * enabled is a thread root.
 +       */
 +      if (cgroup_has_tasks(cgrp) &&
 +          (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
 +              return true;
 +
 +      return false;
 +}
 +
 +/* a domain which isn't connected to the root w/o breakage can't be used */
 +static bool cgroup_is_valid_domain(struct cgroup *cgrp)
 +{
 +      /* the cgroup itself can be a thread root */
 +      if (cgroup_is_threaded(cgrp))
 +              return false;
 +
 +      /* but the ancestors can't be unless mixable */
 +      while ((cgrp = cgroup_parent(cgrp))) {
 +              if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
 +                      return false;
 +              if (cgroup_is_threaded(cgrp))
 +                      return false;
 +      }
 +
 +      return true;
  }
  
  /* subsystems visibly enabled on a cgroup */
@@@ -408,14 -331,8 +408,14 @@@ static u16 cgroup_control(struct cgrou
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 root_ss_mask = cgrp->root->subsys_mask;
  
 -      if (parent)
 -              return parent->subtree_control;
 +      if (parent) {
 +              u16 ss_mask = parent->subtree_control;
 +
 +              /* threaded cgroups can only have threaded controllers */
 +              if (cgroup_is_threaded(cgrp))
 +                      ss_mask &= cgrp_dfl_threaded_ss_mask;
 +              return ss_mask;
 +      }
  
        if (cgroup_on_dfl(cgrp))
                root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@@ -428,14 -345,8 +428,14 @@@ static u16 cgroup_ss_mask(struct cgrou
  {
        struct cgroup *parent = cgroup_parent(cgrp);
  
 -      if (parent)
 -              return parent->subtree_ss_mask;
 +      if (parent) {
 +              u16 ss_mask = parent->subtree_ss_mask;
 +
 +              /* threaded cgroups can only have threaded controllers */
 +              if (cgroup_is_threaded(cgrp))
 +                      ss_mask &= cgrp_dfl_threaded_ss_mask;
 +              return ss_mask;
 +      }
  
        return cgrp->root->subsys_mask;
  }
@@@ -525,12 -436,22 +525,12 @@@ out_unlock
        return css;
  }
  
 -static void __maybe_unused cgroup_get(struct cgroup *cgrp)
 -{
 -      css_get(&cgrp->self);
 -}
 -
  static void cgroup_get_live(struct cgroup *cgrp)
  {
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        css_get(&cgrp->self);
  }
  
 -static bool cgroup_tryget(struct cgroup *cgrp)
 -{
 -      return css_tryget(&cgrp->self);
 -}
 -
  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
  {
        struct cgroup *cgrp = of->kn->parent->priv;
@@@ -639,11 -560,9 +639,11 @@@ EXPORT_SYMBOL_GPL(of_css)
   */
  struct css_set init_css_set = {
        .refcount               = REFCOUNT_INIT(1),
 +      .dom_cset               = &init_css_set,
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 +      .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
  
  static int css_set_count      = 1;    /* 1 for init_css_set */
  
 +static bool css_set_threaded(struct css_set *cset)
 +{
 +      return cset->dom_cset != cset;
 +}
 +
  /**
   * css_set_populated - does a css_set contain any tasks?
   * @cset: target css_set
@@@ -673,48 -587,39 +673,48 @@@ static bool css_set_populated(struct cs
  }
  
  /**
 - * cgroup_update_populated - updated populated count of a cgroup
 + * cgroup_update_populated - update the populated count of a cgroup
   * @cgrp: the target cgroup
   * @populated: inc or dec populated count
   *
   * One of the css_sets associated with @cgrp is either getting its first
 - * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 - * count is propagated towards root so that a given cgroup's populated_cnt
 - * is zero iff the cgroup and all its descendants don't contain any tasks.
 + * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 + * count is propagated towards root so that a given cgroup's
 + * nr_populated_children is zero iff none of its descendants contain any
 + * tasks.
   *
 - * @cgrp's interface file "cgroup.populated" is zero if
 - * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 - * changes from or to zero, userland is notified that the content of the
 - * interface file has changed.  This can be used to detect when @cgrp and
 - * its descendants become populated or empty.
 + * @cgrp's interface file "cgroup.populated" is zero if both
 + * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 + * 1 otherwise.  When the sum changes from or to zero, userland is notified
 + * that the content of the interface file has changed.  This can be used to
 + * detect when @cgrp and its descendants become populated or empty.
   */
  static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  {
 +      struct cgroup *child = NULL;
 +      int adj = populated ? 1 : -1;
 +
        lockdep_assert_held(&css_set_lock);
  
        do {
 -              bool trigger;
 +              bool was_populated = cgroup_is_populated(cgrp);
  
 -              if (populated)
 -                      trigger = !cgrp->populated_cnt++;
 -              else
 -                      trigger = !--cgrp->populated_cnt;
 +              if (!child) {
 +                      cgrp->nr_populated_csets += adj;
 +              } else {
 +                      if (cgroup_is_threaded(child))
 +                              cgrp->nr_populated_threaded_children += adj;
 +                      else
 +                              cgrp->nr_populated_domain_children += adj;
 +              }
  
 -              if (!trigger)
 +              if (was_populated == cgroup_is_populated(cgrp))
                        break;
  
                cgroup1_check_for_release(cgrp);
                cgroup_file_notify(&cgrp->events_file);
  
 +              child = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
  }
   * @populated: whether @cset is populated or depopulated
   *
   * @cset is either getting the first task or losing the last.  Update the
 - * ->populated_cnt of all associated cgroups accordingly.
 + * populated counters of all associated cgroups accordingly.
   */
  static void css_set_update_populated(struct css_set *cset, bool populated)
  {
   * css_set, @from_cset can be NULL.  If @task is being disassociated
   * instead of moved, @to_cset can be NULL.
   *
 - * This function automatically handles populated_cnt updates and
 + * This function automatically handles populated counter updates and
   * css_task_iter adjustments but the caller is responsible for managing
   * @from_cset and @to_cset's reference counts.
   */
@@@ -832,8 -737,6 +832,8 @@@ void put_css_set_locked(struct css_set 
        if (!refcount_dec_and_test(&cset->refcount))
                return;
  
 +      WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
 +
        /* This css_set is dead. unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
                kfree(link);
        }
  
 +      if (css_set_threaded(cset)) {
 +              list_del(&cset->threaded_csets_node);
 +              put_css_set_locked(cset->dom_cset);
 +      }
 +
        kfree_rcu(cset, rcu_head);
  }
  
@@@ -873,7 -771,6 +873,7 @@@ static bool compare_css_sets(struct css
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
  {
 +      struct cgroup *new_dfl_cgrp;
        struct list_head *l1, *l2;
  
        /*
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;
  
 +
 +      /* @cset's domain should match the default cgroup's */
 +      if (cgroup_on_dfl(new_cgrp))
 +              new_dfl_cgrp = new_cgrp;
 +      else
 +              new_dfl_cgrp = old_cset->dfl_cgrp;
 +
 +      if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
 +              return false;
 +
        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
@@@ -1101,11 -988,9 +1101,11 @@@ static struct css_set *find_css_set(str
        }
  
        refcount_set(&cset->refcount, 1);
 +      cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
 +      INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_preload_node);
  
        spin_unlock_irq(&css_set_lock);
  
 +      /*
 +       * If @cset should be threaded, look up the matching dom_cset and
 +       * link them up.  We first fully initialize @cset then look for the
 +       * dom_cset.  It's simpler this way and safe as @cset is guaranteed
 +       * to stay empty until we return.
 +       */
 +      if (cgroup_is_threaded(cset->dfl_cgrp)) {
 +              struct css_set *dcset;
 +
 +              dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
 +              if (!dcset) {
 +                      put_css_set(cset);
 +                      return NULL;
 +              }
 +
 +              spin_lock_irq(&css_set_lock);
 +              cset->dom_cset = dcset;
 +              list_add_tail(&cset->threaded_csets_node,
 +                            &dcset->threaded_csets);
 +              spin_unlock_irq(&css_set_lock);
 +      }
 +
        return cset;
  }
  
@@@ -1292,8 -1155,6 +1292,8 @@@ static struct cgroup *cset_cgroup_from_
  
        if (cset == &init_css_set) {
                res = &root->cgrp;
 +      } else if (root == &cgrp_dfl_root) {
 +              res = cset->dfl_cgrp;
        } else {
                struct cgrp_cset_link *link;
  
@@@ -1809,9 -1670,6 +1809,9 @@@ static void init_cgroup_housekeeping(st
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
 +      cgrp->dom_cgrp = cgrp;
 +      cgrp->max_descendants = INT_MAX;
 +      cgrp->max_depth = INT_MAX;
  
        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@@ -1879,7 -1737,8 +1879,8 @@@ int cgroup_setup_root(struct cgroup_roo
                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
  
        root->kf_root = kernfs_create_root(kf_sops,
-                                          KERNFS_ROOT_CREATE_DEACTIVATED,
+                                          KERNFS_ROOT_CREATE_DEACTIVATED |
+                                          KERNFS_ROOT_SUPPORT_EXPORTOP,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
@@@ -2148,8 -2007,6 +2149,8 @@@ static void cgroup_migrate_add_task(str
        if (!cset->mg_src_cgrp)
                return;
  
 +      mgctx->tset.nr_tasks++;
 +
        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
@@@ -2238,19 -2095,21 +2239,19 @@@ static int cgroup_migrate_execute(struc
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;
  
 -      /* methods shouldn't be called if no task is actually migrating */
 -      if (list_empty(&tset->src_csets))
 -              return 0;
 -
        /* check that we can legitimately attach to the cgroup */
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ss->can_attach) {
 -                      tset->ssid = ssid;
 -                      ret = ss->can_attach(tset);
 -                      if (ret) {
 -                              failed_ssid = ssid;
 -                              goto out_cancel_attach;
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ss->can_attach) {
 +                              tset->ssid = ssid;
 +                              ret = ss->can_attach(tset);
 +                              if (ret) {
 +                                      failed_ssid = ssid;
 +                                      goto out_cancel_attach;
 +                              }
                        }
 -              }
 -      } while_each_subsys_mask();
 +              } while_each_subsys_mask();
 +      }
  
        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         */
        tset->csets = &tset->dst_csets;
  
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ss->attach) {
 -                      tset->ssid = ssid;
 -                      ss->attach(tset);
 -              }
 -      } while_each_subsys_mask();
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ss->attach) {
 +                              tset->ssid = ssid;
 +                              ss->attach(tset);
 +                      }
 +              } while_each_subsys_mask();
 +      }
  
        ret = 0;
        goto out_release_tset;
  
  out_cancel_attach:
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ssid == failed_ssid)
 -                      break;
 -              if (ss->cancel_attach) {
 -                      tset->ssid = ssid;
 -                      ss->cancel_attach(tset);
 -              }
 -      } while_each_subsys_mask();
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ssid == failed_ssid)
 +                              break;
 +                      if (ss->cancel_attach) {
 +                              tset->ssid = ssid;
 +                              ss->cancel_attach(tset);
 +                      }
 +              } while_each_subsys_mask();
 +      }
  out_release_tset:
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
  }
  
  /**
 - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
 + * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
   * @dst_cgrp: destination cgroup to test
   *
 - * On the default hierarchy, except for the root, subtree_control must be
 - * zero for migration destination cgroups with tasks so that child cgroups
 - * don't compete against tasks.
 + * On the default hierarchy, except for the mixable, (possible) thread root
 + * and threaded cgroups, subtree_control must be zero for migration
 + * destination cgroups with tasks so that child cgroups don't compete
 + * against tasks.
   */
 -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
  {
 -      return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
 -              !dst_cgrp->subtree_control;
 +      /* v1 doesn't have any restriction */
 +      if (!cgroup_on_dfl(dst_cgrp))
 +              return 0;
 +
 +      /* verify @dst_cgrp can host resources */
 +      if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
 +              return -EOPNOTSUPP;
 +
 +      /* mixables don't care */
 +      if (cgroup_is_mixable(dst_cgrp))
 +              return 0;
 +
 +      /*
 +       * If @dst_cgrp is already or can become a thread root or is
 +       * threaded, it doesn't matter.
 +       */
 +      if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
 +              return 0;
 +
 +      /* apply no-internal-process constraint */
 +      if (dst_cgrp->subtree_control)
 +              return -EBUSY;
 +
 +      return 0;
  }
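
  As a rough userspace illustration of how these destination checks surface (the cgroup mount point and path below are assumptions, not part of this patch): a write into cgroup.procs fails with EOPNOTSUPP when the destination's domain is invalid and with EBUSY when the no-internal-process rule applies.

        /* Sketch only: the cgroup.procs path passed in is hypothetical. */
        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/types.h>
        #include <unistd.h>

        static int move_pid(const char *cgrp_procs, pid_t pid)
        {
                char buf[32];
                int fd, n, ret = 0;

                fd = open(cgrp_procs, O_WRONLY);
                if (fd < 0)
                        return -errno;

                n = snprintf(buf, sizeof(buf), "%d", pid);
                if (write(fd, buf, n) < 0)
                        ret = -errno;   /* -EBUSY: dst has subtree_control set,
                                         * -EOPNOTSUPP: dst domain is invalid */
                close(fd);
                return ret;
        }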
  
  /**
@@@ -2552,9 -2384,8 +2553,9 @@@ int cgroup_attach_task(struct cgroup *d
        struct task_struct *task;
        int ret;
  
 -      if (!cgroup_may_migrate_to(dst_cgrp))
 -              return -EBUSY;
 +      ret = cgroup_migrate_vet_dst(dst_cgrp);
 +      if (ret)
 +              return ret;
  
        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
        return ret;
  }
  
 -static int cgroup_procs_write_permission(struct task_struct *task,
 -                                       struct cgroup *dst_cgrp,
 -                                       struct kernfs_open_file *of)
 -{
 -      struct super_block *sb = of->file->f_path.dentry->d_sb;
 -      struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 -      struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
 -      struct cgroup *src_cgrp, *com_cgrp;
 -      struct inode *inode;
 -      int ret;
 -
 -      if (!cgroup_on_dfl(dst_cgrp)) {
 -              const struct cred *cred = current_cred();
 -              const struct cred *tcred = get_task_cred(task);
 -
 -              /*
 -               * even if we're attaching all tasks in the thread group,
 -               * we only need to check permissions on one of them.
 -               */
 -              if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
 -                  uid_eq(cred->euid, tcred->uid) ||
 -                  uid_eq(cred->euid, tcred->suid))
 -                      ret = 0;
 -              else
 -                      ret = -EACCES;
 -
 -              put_cred(tcred);
 -              return ret;
 -      }
 -
 -      /* find the source cgroup */
 -      spin_lock_irq(&css_set_lock);
 -      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 -      spin_unlock_irq(&css_set_lock);
 -
 -      /* and the common ancestor */
 -      com_cgrp = src_cgrp;
 -      while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
 -              com_cgrp = cgroup_parent(com_cgrp);
 -
 -      /* %current should be authorized to migrate to the common ancestor */
 -      inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
 -      if (!inode)
 -              return -ENOMEM;
 -
 -      ret = inode_permission(inode, MAY_WRITE);
 -      iput(inode);
 -      if (ret)
 -              return ret;
 -
 -      /*
 -       * If namespaces are delegation boundaries, %current must be able
 -       * to see both source and destination cgroups from its namespace.
 -       */
 -      if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
 -          (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
 -           !cgroup_is_descendant(dst_cgrp, root_cgrp)))
 -              return -ENOENT;
 -
 -      return 0;
 -}
 -
 -/*
 - * Find the task_struct of the task to attach by vpid and pass it along to the
 - * function to attach either it or all tasks in its threadgroup. Will lock
 - * cgroup_mutex and threadgroup.
 - */
 -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 -                           size_t nbytes, loff_t off, bool threadgroup)
 +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
 +      __acquires(&cgroup_threadgroup_rwsem)
  {
        struct task_struct *tsk;
 -      struct cgroup_subsys *ss;
 -      struct cgroup *cgrp;
        pid_t pid;
 -      int ssid, ret;
  
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 -              return -EINVAL;
 -
 -      cgrp = cgroup_kn_lock_live(of->kn, false);
 -      if (!cgrp)
 -              return -ENODEV;
 +              return ERR_PTR(-EINVAL);
  
        percpu_down_write(&cgroup_threadgroup_rwsem);
 +
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
 -                      ret = -ESRCH;
 -                      goto out_unlock_rcu;
 +                      tsk = ERR_PTR(-ESRCH);
 +                      goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
 -              ret = -EINVAL;
 -              goto out_unlock_rcu;
 +              tsk = ERR_PTR(-EINVAL);
 +              goto out_unlock_threadgroup;
        }
  
        get_task_struct(tsk);
 +      goto out_unlock_rcu;
 +
 +out_unlock_threadgroup:
 +      percpu_up_write(&cgroup_threadgroup_rwsem);
 +out_unlock_rcu:
        rcu_read_unlock();
 +      return tsk;
 +}
  
 -      ret = cgroup_procs_write_permission(tsk, cgrp, of);
 -      if (!ret)
 -              ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 +void cgroup_procs_write_finish(struct task_struct *task)
 +      __releases(&cgroup_threadgroup_rwsem)
 +{
 +      struct cgroup_subsys *ss;
 +      int ssid;
  
 -      put_task_struct(tsk);
 -      goto out_unlock_threadgroup;
 +      /* release reference from cgroup_procs_write_start() */
 +      put_task_struct(task);
  
 -out_unlock_rcu:
 -      rcu_read_unlock();
 -out_unlock_threadgroup:
        percpu_up_write(&cgroup_threadgroup_rwsem);
        for_each_subsys(ss, ssid)
                if (ss->post_attach)
                        ss->post_attach();
 -      cgroup_kn_unlock(of->kn);
 -      return ret ?: nbytes;
 -}
 -
 -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
 -                         loff_t off)
 -{
 -      return __cgroup_procs_write(of, buf, nbytes, off, true);
  }
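
  The new start/finish helpers are meant to bracket a single migration; a hypothetical caller, mirroring the cgroup.procs write path later in this patch, would pair them roughly as follows.

        /* Hypothetical caller: shows the intended start/finish pairing. */
        static ssize_t example_migrate(struct cgroup *dst_cgrp, char *buf,
                                       bool threadgroup)
        {
                struct task_struct *task;
                int ret;

                task = cgroup_procs_write_start(buf, threadgroup);
                if (IS_ERR(task))
                        return PTR_ERR(task);

                /* permission checks, then the actual migration */
                ret = cgroup_attach_task(dst_cgrp, task, threadgroup);

                /* drops the task reference and cgroup_threadgroup_rwsem */
                cgroup_procs_write_finish(task);
                return ret;
        }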
  
  static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@@ -2982,46 -2888,6 +2983,46 @@@ static void cgroup_finalize_control(str
        cgroup_apply_control_disable(cgrp);
  }
  
 +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
 +{
 +      u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
 +
 +      /* if nothing is getting enabled, nothing to worry about */
 +      if (!enable)
 +              return 0;
 +
 +      /* can @cgrp host any resources? */
 +      if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
 +              return -EOPNOTSUPP;
 +
 +      /* mixables don't care */
 +      if (cgroup_is_mixable(cgrp))
 +              return 0;
 +
 +      if (domain_enable) {
 +              /* can't enable domain controllers inside a thread subtree */
 +              if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
 +                      return -EOPNOTSUPP;
 +      } else {
 +              /*
 +               * Threaded controllers can handle internal competitions
 +               * and are always allowed inside a (prospective) thread
 +               * subtree.
 +               */
 +              if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
 +                      return 0;
 +      }
 +
 +      /*
 +       * Controllers can't be enabled for a cgroup with tasks to avoid
 +       * child cgroups competing against tasks.
 +       */
 +      if (cgroup_has_tasks(cgrp))
 +              return -EBUSY;
 +
 +      return 0;
 +}
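
  A hedged userspace sketch of how this vetting is observed through cgroup.subtree_control (the path and the choice of memory as a domain controller are assumptions): enabling a domain controller inside a thread subtree reports EOPNOTSUPP, and enabling controllers on a populated non-root domain cgroup reports EBUSY.

        /* Sketch: "+memory" assumes memory is a domain (non-threaded) controller. */
        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/sys/fs/cgroup/app/cgroup.subtree_control", O_WRONLY);

                if (fd < 0)
                        return 1;
                if (write(fd, "+memory", 7) < 0)
                        perror("subtree_control");      /* EOPNOTSUPP in a thread
                                                         * subtree, EBUSY if the
                                                         * cgroup already has tasks */
                close(fd);
                return 0;
        }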
 +
  /* change the enabled child controllers for a cgroup in the default hierarchy */
  static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                goto out_unlock;
        }
  
 -      /*
 -       * Except for the root, subtree_control must be zero for a cgroup
 -       * with tasks so that child cgroups don't compete against tasks.
 -       */
 -      if (enable && cgroup_parent(cgrp)) {
 -              struct cgrp_cset_link *link;
 -
 -              /*
 -               * Because namespaces pin csets too, @cgrp->cset_links
 -               * might not be empty even when @cgrp is empty.  Walk and
 -               * verify each cset.
 -               */
 -              spin_lock_irq(&css_set_lock);
 -
 -              ret = 0;
 -              list_for_each_entry(link, &cgrp->cset_links, cset_link) {
 -                      if (css_set_populated(link->cset)) {
 -                              ret = -EBUSY;
 -                              break;
 -                      }
 -              }
 -
 -              spin_unlock_irq(&css_set_lock);
 -
 -              if (ret)
 -                      goto out_unlock;
 -      }
 +      ret = cgroup_vet_subtree_control_enable(cgrp, enable);
 +      if (ret)
 +              goto out_unlock;
  
        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);
        cgrp->subtree_control &= ~disable;
  
        ret = cgroup_apply_control(cgrp);
 -
        cgroup_finalize_control(cgrp, ret);
 +      if (ret)
 +              goto out_unlock;
  
        kernfs_activate(cgrp->kn);
 -      ret = 0;
  out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
  }
  
 +/**
 + * cgroup_enable_threaded - make @cgrp threaded
 + * @cgrp: the target cgroup
 + *
 + * Called when "threaded" is written to the cgroup.type interface file and
 + * tries to make @cgrp threaded and join the parent's resource domain.
 + * This function is never called on the root cgroup as cgroup.type doesn't
 + * exist on it.
 + */
 +static int cgroup_enable_threaded(struct cgroup *cgrp)
 +{
 +      struct cgroup *parent = cgroup_parent(cgrp);
 +      struct cgroup *dom_cgrp = parent->dom_cgrp;
 +      int ret;
 +
 +      lockdep_assert_held(&cgroup_mutex);
 +
 +      /* noop if already threaded */
 +      if (cgroup_is_threaded(cgrp))
 +              return 0;
 +
 +      /* we're joining the parent's domain, ensure its validity */
 +      if (!cgroup_is_valid_domain(dom_cgrp) ||
 +          !cgroup_can_be_thread_root(dom_cgrp))
 +              return -EOPNOTSUPP;
 +
 +      /*
 +       * The following shouldn't cause actual migrations and should
 +       * always succeed.
 +       */
 +      cgroup_save_control(cgrp);
 +
 +      cgrp->dom_cgrp = dom_cgrp;
 +      ret = cgroup_apply_control(cgrp);
 +      if (!ret)
 +              parent->nr_threaded_children++;
 +      else
 +              cgrp->dom_cgrp = cgrp;
 +
 +      cgroup_finalize_control(cgrp, ret);
 +      return ret;
 +}
 +
 +static int cgroup_type_show(struct seq_file *seq, void *v)
 +{
 +      struct cgroup *cgrp = seq_css(seq)->cgroup;
 +
 +      if (cgroup_is_threaded(cgrp))
 +              seq_puts(seq, "threaded\n");
 +      else if (!cgroup_is_valid_domain(cgrp))
 +              seq_puts(seq, "domain invalid\n");
 +      else if (cgroup_is_thread_root(cgrp))
 +              seq_puts(seq, "domain threaded\n");
 +      else
 +              seq_puts(seq, "domain\n");
 +
 +      return 0;
 +}
 +
 +static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
 +                               size_t nbytes, loff_t off)
 +{
 +      struct cgroup *cgrp;
 +      int ret;
 +
 +      /* only switching to threaded mode is supported */
 +      if (strcmp(strstrip(buf), "threaded"))
 +              return -EINVAL;
 +
 +      cgrp = cgroup_kn_lock_live(of->kn, false);
 +      if (!cgrp)
 +              return -ENOENT;
 +
 +      /* threaded can only be enabled */
 +      ret = cgroup_enable_threaded(cgrp);
 +
 +      cgroup_kn_unlock(of->kn);
 +      return ret ?: nbytes;
 +}
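
  A minimal userspace sketch of driving the new cgroup.type file (the cgroup path is an assumption): only the literal "threaded" is accepted on write, and reads report one of "domain", "domain threaded", "domain invalid" or "threaded".

        /* Sketch: /sys/fs/cgroup/app/worker is a hypothetical v2 cgroup. */
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[32];
                ssize_t n;
                int fd;

                fd = open("/sys/fs/cgroup/app/worker/cgroup.type", O_RDWR);
                if (fd < 0)
                        return 1;

                /* switch the cgroup to threaded mode; this is a one-way change */
                if (write(fd, "threaded", 8) < 0)
                        perror("cgroup.type");

                lseek(fd, 0, SEEK_SET);
                n = read(fd, buf, sizeof(buf) - 1);     /* expect "threaded\n" */
                if (n > 0) {
                        buf[n] = '\0';
                        fputs(buf, stdout);
                }
                close(fd);
                return 0;
        }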
 +
 +static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
 +{
 +      struct cgroup *cgrp = seq_css(seq)->cgroup;
 +      int descendants = READ_ONCE(cgrp->max_descendants);
 +
 +      if (descendants == INT_MAX)
 +              seq_puts(seq, "max\n");
 +      else
 +              seq_printf(seq, "%d\n", descendants);
 +
 +      return 0;
 +}
 +
 +static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
 +                                         char *buf, size_t nbytes, loff_t off)
 +{
 +      struct cgroup *cgrp;
 +      int descendants;
 +      ssize_t ret;
 +
 +      buf = strstrip(buf);
 +      if (!strcmp(buf, "max")) {
 +              descendants = INT_MAX;
 +      } else {
 +              ret = kstrtoint(buf, 0, &descendants);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      if (descendants < 0)
 +              return -ERANGE;
 +
 +      cgrp = cgroup_kn_lock_live(of->kn, false);
 +      if (!cgrp)
 +              return -ENOENT;
 +
 +      cgrp->max_descendants = descendants;
 +
 +      cgroup_kn_unlock(of->kn);
 +
 +      return nbytes;
 +}
 +
 +static int cgroup_max_depth_show(struct seq_file *seq, void *v)
 +{
 +      struct cgroup *cgrp = seq_css(seq)->cgroup;
 +      int depth = READ_ONCE(cgrp->max_depth);
 +
 +      if (depth == INT_MAX)
 +              seq_puts(seq, "max\n");
 +      else
 +              seq_printf(seq, "%d\n", depth);
 +
 +      return 0;
 +}
 +
 +static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
 +                                    char *buf, size_t nbytes, loff_t off)
 +{
 +      struct cgroup *cgrp;
 +      ssize_t ret;
 +      int depth;
 +
 +      buf = strstrip(buf);
 +      if (!strcmp(buf, "max")) {
 +              depth = INT_MAX;
 +      } else {
 +              ret = kstrtoint(buf, 0, &depth);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      if (depth < 0)
 +              return -ERANGE;
 +
 +      cgrp = cgroup_kn_lock_live(of->kn, false);
 +      if (!cgrp)
 +              return -ENOENT;
 +
 +      cgrp->max_depth = depth;
 +
 +      cgroup_kn_unlock(of->kn);
 +
 +      return nbytes;
 +}
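
  Both limit files accept a non-negative integer or the literal "max" (stored as INT_MAX); negative values are rejected with ERANGE. A small sketch, with a hypothetical cgroup path:

        /* Sketch: cap a hypothetical cgroup at 100 descendants, unlimited depth. */
        #include <stdio.h>

        static int write_str(const char *path, const char *val)
        {
                FILE *f = fopen(path, "w");

                if (!f)
                        return -1;
                fputs(val, f);
                return fclose(f);       /* EOF here means the kernel rejected the value */
        }

        int main(void)
        {
                write_str("/sys/fs/cgroup/app/cgroup.max.descendants", "100");
                write_str("/sys/fs/cgroup/app/cgroup.max.depth", "max");
                return 0;
        }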
 +
  static int cgroup_events_show(struct seq_file *seq, void *v)
  {
        seq_printf(seq, "populated %d\n",
        return 0;
  }
  
 +static int cgroup_stat_show(struct seq_file *seq, void *v)
 +{
 +      struct cgroup *cgroup = seq_css(seq)->cgroup;
 +
 +      seq_printf(seq, "nr_descendants %d\n",
 +                 cgroup->nr_descendants);
 +      seq_printf(seq, "nr_dying_descendants %d\n",
 +                 cgroup->nr_dying_descendants);
 +
 +      return 0;
 +}
 +
  static int cgroup_file_open(struct kernfs_open_file *of)
  {
        struct cftype *cft = of->kn->priv;
@@@ -3519,6 -3231,7 +3520,6 @@@ restart
  
  static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
  {
 -      LIST_HEAD(pending);
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
@@@ -3943,58 -3656,6 +3944,58 @@@ bool css_has_online_children(struct cgr
        return ret;
  }
  
 +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
 +{
 +      struct list_head *l;
 +      struct cgrp_cset_link *link;
 +      struct css_set *cset;
 +
 +      lockdep_assert_held(&css_set_lock);
 +
 +      /* find the next threaded cset */
 +      if (it->tcset_pos) {
 +              l = it->tcset_pos->next;
 +
 +              if (l != it->tcset_head) {
 +                      it->tcset_pos = l;
 +                      return container_of(l, struct css_set,
 +                                          threaded_csets_node);
 +              }
 +
 +              it->tcset_pos = NULL;
 +      }
 +
 +      /* find the next cset */
 +      l = it->cset_pos;
 +      l = l->next;
 +      if (l == it->cset_head) {
 +              it->cset_pos = NULL;
 +              return NULL;
 +      }
 +
 +      if (it->ss) {
 +              cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
 +      } else {
 +              link = list_entry(l, struct cgrp_cset_link, cset_link);
 +              cset = link->cset;
 +      }
 +
 +      it->cset_pos = l;
 +
 +      /* initialize threaded css_set walking */
 +      if (it->flags & CSS_TASK_ITER_THREADED) {
 +              if (it->cur_dcset)
 +                      put_css_set_locked(it->cur_dcset);
 +              it->cur_dcset = cset;
 +              get_css_set(cset);
 +
 +              it->tcset_head = &cset->threaded_csets;
 +              it->tcset_pos = &cset->threaded_csets;
 +      }
 +
 +      return cset;
 +}
 +
  /**
   * css_task_iter_advance_css_set - advance a task iterator to the next css_set
   * @it: the iterator to advance
   */
  static void css_task_iter_advance_css_set(struct css_task_iter *it)
  {
 -      struct list_head *l = it->cset_pos;
 -      struct cgrp_cset_link *link;
        struct css_set *cset;
  
        lockdep_assert_held(&css_set_lock);
  
        /* Advance to the next non-empty css_set */
        do {
 -              l = l->next;
 -              if (l == it->cset_head) {
 -                      it->cset_pos = NULL;
 +              cset = css_task_iter_next_css_set(it);
 +              if (!cset) {
                        it->task_pos = NULL;
                        return;
                }
 -
 -              if (it->ss) {
 -                      cset = container_of(l, struct css_set,
 -                                          e_cset_node[it->ss->id]);
 -              } else {
 -                      link = list_entry(l, struct cgrp_cset_link, cset_link);
 -                      cset = link->cset;
 -              }
        } while (!css_set_populated(cset));
  
 -      it->cset_pos = l;
 -
        if (!list_empty(&cset->tasks))
                it->task_pos = cset->tasks.next;
        else
@@@ -4055,7 -3729,6 +4056,7 @@@ static void css_task_iter_advance(struc
        lockdep_assert_held(&css_set_lock);
        WARN_ON_ONCE(!l);
  
 +repeat:
        /*
         * Advance iterator to find next entry.  cset->tasks is consumed
         * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
                css_task_iter_advance_css_set(it);
        else
                it->task_pos = l;
 +
 +      /* if PROCS, skip over tasks which aren't group leaders */
 +      if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
 +          !thread_group_leader(list_entry(it->task_pos, struct task_struct,
 +                                          cg_list)))
 +              goto repeat;
  }
  
  /**
   * css_task_iter_start - initiate task iteration
   * @css: the css to walk tasks of
 + * @flags: CSS_TASK_ITER_* flags
   * @it: the task iterator to use
   *
   * Initiate iteration through the tasks of @css.  The caller can call
   * returns NULL.  On completion of iteration, css_task_iter_end() must be
   * called.
   */
 -void css_task_iter_start(struct cgroup_subsys_state *css,
 +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
  {
        /* no one should try to iterate before mounting cgroups */
        spin_lock_irq(&css_set_lock);
  
        it->ss = css->ss;
 +      it->flags = flags;
  
        if (it->ss)
                it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@@ -4158,9 -3823,6 +4159,9 @@@ void css_task_iter_end(struct css_task_
                spin_unlock_irq(&css_set_lock);
        }
  
 +      if (it->cur_dcset)
 +              put_css_set(it->cur_dcset);
 +
        if (it->cur_task)
                put_task_struct(it->cur_task);
  }
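
  With the added flags argument, a controller that only wants thread group leaders across a threaded subtree would start the iterator roughly like this (walk_procs() itself is a hypothetical caller, not part of this patch):

        /* Sketch: iterate processes (group leaders) of @css and its threaded csets. */
        static void walk_procs(struct cgroup_subsys_state *css)
        {
                struct css_task_iter it;
                struct task_struct *task;

                css_task_iter_start(css, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED,
                                    &it);
                while ((task = css_task_iter_next(&it)))
                        pr_debug("leader pid=%d\n", task_pid_nr(task));
                css_task_iter_end(&it);
        }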
@@@ -4177,12 -3839,16 +4178,12 @@@ static void *cgroup_procs_next(struct s
  {
        struct kernfs_open_file *of = s->private;
        struct css_task_iter *it = of->priv;
 -      struct task_struct *task;
  
 -      do {
 -              task = css_task_iter_next(it);
 -      } while (task && !thread_group_leader(task));
 -
 -      return task;
 +      return css_task_iter_next(it);
  }
  
 -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
 +static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
 +                                unsigned int iter_flags)
  {
        struct kernfs_open_file *of = s->private;
        struct cgroup *cgrp = seq_css(s)->cgroup;
                if (!it)
                        return ERR_PTR(-ENOMEM);
                of->priv = it;
 -              css_task_iter_start(&cgrp->self, it);
 +              css_task_iter_start(&cgrp->self, iter_flags, it);
        } else if (!(*pos)++) {
                css_task_iter_end(it);
 -              css_task_iter_start(&cgrp->self, it);
 +              css_task_iter_start(&cgrp->self, iter_flags, it);
        }
  
        return cgroup_procs_next(s, NULL, NULL);
  }
  
 +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
 +{
 +      struct cgroup *cgrp = seq_css(s)->cgroup;
 +
 +      /*
 +       * All processes of a threaded subtree belong to the domain cgroup
 +       * of the subtree.  Only threads can be distributed across the
 +       * subtree.  Reject reads on cgroup.procs in the subtree proper.
 +       * They're always empty anyway.
 +       */
 +      if (cgroup_is_threaded(cgrp))
 +              return ERR_PTR(-EOPNOTSUPP);
 +
 +      return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
 +                                          CSS_TASK_ITER_THREADED);
 +}
 +
  static int cgroup_procs_show(struct seq_file *s, void *v)
  {
 -      seq_printf(s, "%d\n", task_tgid_vnr(v));
 +      seq_printf(s, "%d\n", task_pid_vnr(v));
        return 0;
  }
  
 +static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 +                                       struct cgroup *dst_cgrp,
 +                                       struct super_block *sb)
 +{
 +      struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 +      struct cgroup *com_cgrp = src_cgrp;
 +      struct inode *inode;
 +      int ret;
 +
 +      lockdep_assert_held(&cgroup_mutex);
 +
 +      /* find the common ancestor */
 +      while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
 +              com_cgrp = cgroup_parent(com_cgrp);
 +
 +      /* %current should be authorized to migrate to the common ancestor */
 +      inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
 +      if (!inode)
 +              return -ENOMEM;
 +
 +      ret = inode_permission(inode, MAY_WRITE);
 +      iput(inode);
 +      if (ret)
 +              return ret;
 +
 +      /*
 +       * If namespaces are delegation boundaries, %current must be able
 +       * to see both source and destination cgroups from its namespace.
 +       */
 +      if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
 +          (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
 +           !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
 +              return -ENOENT;
 +
 +      return 0;
 +}
 +
 +static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 +                                char *buf, size_t nbytes, loff_t off)
 +{
 +      struct cgroup *src_cgrp, *dst_cgrp;
 +      struct task_struct *task;
 +      ssize_t ret;
 +
 +      dst_cgrp = cgroup_kn_lock_live(of->kn, false);
 +      if (!dst_cgrp)
 +              return -ENODEV;
 +
 +      task = cgroup_procs_write_start(buf, true);
 +      ret = PTR_ERR_OR_ZERO(task);
 +      if (ret)
 +              goto out_unlock;
 +
 +      /* find the source cgroup */
 +      spin_lock_irq(&css_set_lock);
 +      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 +      spin_unlock_irq(&css_set_lock);
 +
 +      ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
 +                                          of->file->f_path.dentry->d_sb);
 +      if (ret)
 +              goto out_finish;
 +
 +      ret = cgroup_attach_task(dst_cgrp, task, true);
 +
 +out_finish:
 +      cgroup_procs_write_finish(task);
 +out_unlock:
 +      cgroup_kn_unlock(of->kn);
 +
 +      return ret ?: nbytes;
 +}
 +
 +static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
 +{
 +      return __cgroup_procs_start(s, pos, 0);
 +}
 +
 +static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes, loff_t off)
 +{
 +      struct cgroup *src_cgrp, *dst_cgrp;
 +      struct task_struct *task;
 +      ssize_t ret;
 +
 +      buf = strstrip(buf);
 +
 +      dst_cgrp = cgroup_kn_lock_live(of->kn, false);
 +      if (!dst_cgrp)
 +              return -ENODEV;
 +
 +      task = cgroup_procs_write_start(buf, false);
 +      ret = PTR_ERR_OR_ZERO(task);
 +      if (ret)
 +              goto out_unlock;
 +
 +      /* find the source cgroup */
 +      spin_lock_irq(&css_set_lock);
 +      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 +      spin_unlock_irq(&css_set_lock);
 +
 +      /* thread migrations follow the cgroup.procs delegation rule */
 +      ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
 +                                          of->file->f_path.dentry->d_sb);
 +      if (ret)
 +              goto out_finish;
 +
 +      /* and must be contained in the same domain */
 +      ret = -EOPNOTSUPP;
 +      if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
 +              goto out_finish;
 +
 +      ret = cgroup_attach_task(dst_cgrp, task, false);
 +
 +out_finish:
 +      cgroup_procs_write_finish(task);
 +out_unlock:
 +      cgroup_kn_unlock(of->kn);
 +
 +      return ret ?: nbytes;
 +}
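
  A userspace sketch of the matching cgroup.threads write (paths are assumptions): a single TID is written, and the migration fails with EOPNOTSUPP when source and destination sit in different threaded domains.

        /* Sketch: migrate one thread (TID) into a hypothetical threaded cgroup. */
        #include <errno.h>
        #include <stdio.h>

        static int move_tid(const char *cgrp, int tid)
        {
                char path[256];
                FILE *f;
                int ret = 0;

                snprintf(path, sizeof(path), "%s/cgroup.threads", cgrp);
                f = fopen(path, "w");
                if (!f)
                        return -errno;
                fprintf(f, "%d", tid);
                if (fclose(f))          /* EOPNOTSUPP: crossing domain boundaries */
                        ret = -errno;
                return ret;
        }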
 +
  /* cgroup core interface files for the default hierarchy */
  static struct cftype cgroup_base_files[] = {
        {
 +              .name = "cgroup.type",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cgroup_type_show,
 +              .write = cgroup_type_write,
 +      },
 +      {
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
                .file_offset = offsetof(struct cgroup, procs_file),
                .write = cgroup_procs_write,
        },
        {
 +              .name = "cgroup.threads",
 +              .release = cgroup_procs_release,
 +              .seq_start = cgroup_threads_start,
 +              .seq_next = cgroup_procs_next,
 +              .seq_show = cgroup_procs_show,
 +              .write = cgroup_threads_write,
 +      },
 +      {
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
        },
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
 +      {
 +              .name = "cgroup.max.descendants",
 +              .seq_show = cgroup_max_descendants_show,
 +              .write = cgroup_max_descendants_write,
 +      },
 +      {
 +              .name = "cgroup.max.depth",
 +              .seq_show = cgroup_max_depth_show,
 +              .write = cgroup_max_depth_write,
 +      },
 +      {
 +              .name = "cgroup.stat",
 +              .seq_show = cgroup_stat_show,
 +      },
        { }     /* terminate */
  };
  
@@@ -4509,15 -4008,9 +4510,15 @@@ static void css_release_work_fn(struct 
                if (ss->css_released)
                        ss->css_released(css);
        } else {
 +              struct cgroup *tcgrp;
 +
                /* cgroup release path */
                trace_cgroup_release(cgrp);
  
 +              for (tcgrp = cgroup_parent(cgrp); tcgrp;
 +                   tcgrp = cgroup_parent(tcgrp))
 +                      tcgrp->nr_dying_descendants--;
 +
                cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
                cgrp->id = -1;
  
@@@ -4604,6 -4097,9 +4605,6 @@@ static void offline_css(struct cgroup_s
        if (!(css->flags & CSS_ONLINE))
                return;
  
 -      if (ss->css_reset)
 -              ss->css_reset(css);
 -
        if (ss->css_offline)
                ss->css_offline(css);
  
@@@ -4713,13 -4209,9 +4714,13 @@@ static struct cgroup *cgroup_create(str
        cgrp->root = root;
        cgrp->level = level;
  
 -      for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
 +      for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
  
 +              if (tcgrp != cgrp)
 +                      tcgrp->nr_descendants++;
 +      }
 +
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  
@@@ -4760,29 -4252,6 +4761,29 @@@ out_free_cgrp
        return ERR_PTR(ret);
  }
  
 +static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
 +{
 +      struct cgroup *cgroup;
 +      int ret = false;
 +      int level = 1;
 +
 +      lockdep_assert_held(&cgroup_mutex);
 +
 +      for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
 +              if (cgroup->nr_descendants >= cgroup->max_descendants)
 +                      goto fail;
 +
 +              if (level > cgroup->max_depth)
 +                      goto fail;
 +
 +              level++;
 +      }
 +
 +      ret = true;
 +fail:
 +      return ret;
 +}
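
  These limits are enforced at creation time, so from userspace a mkdir() past max_descendants or max_depth simply fails with EAGAIN (the directory name below is hypothetical):

        /* Sketch: creating a child past the configured limits yields EAGAIN. */
        #include <errno.h>
        #include <stdio.h>
        #include <sys/stat.h>

        int main(void)
        {
                if (mkdir("/sys/fs/cgroup/app/child", 0755) && errno == EAGAIN)
                        fprintf(stderr, "hierarchy limit reached\n");
                return 0;
        }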
 +
  int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  {
        struct cgroup *parent, *cgrp;
        if (!parent)
                return -ENODEV;
  
 +      if (!cgroup_check_hierarchy_limits(parent)) {
 +              ret = -EAGAIN;
 +              goto out_unlock;
 +      }
 +
        cgrp = cgroup_create(parent);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
@@@ -4953,7 -4417,6 +4954,7 @@@ static void kill_css(struct cgroup_subs
  static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
 +      struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;
         */
        kernfs_remove(cgrp->kn);
  
 -      cgroup1_check_for_release(cgroup_parent(cgrp));
 +      if (parent && cgroup_is_threaded(cgrp))
 +              parent->nr_threaded_children--;
 +
 +      for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 +              tcgrp->nr_descendants--;
 +              tcgrp->nr_dying_descendants++;
 +      }
 +
 +      cgroup1_check_for_release(parent);
  
        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);
@@@ -5201,17 -4656,11 +5202,17 @@@ int __init cgroup_init(void
  
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
  
 +              /* implicit controllers must be threaded too */
 +              WARN_ON(ss->implicit_on_dfl && !ss->threaded);
 +
                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
  
 +              if (ss->threaded)
 +                      cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
 +
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
  
                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);
 +
 +              mutex_lock(&cgroup_mutex);
 +              css_populate_dir(init_css_set.subsys[ssid]);
 +              mutex_unlock(&cgroup_mutex);
        }
  
        /* init_css_set.subsys[] has been updated, re-hash */
@@@ -5256,6 -4701,18 +5257,18 @@@ static int __init cgroup_wq_init(void
  }
  core_initcall(cgroup_wq_init);
  
+ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+                                       char *buf, size_t buflen)
+ {
+       struct kernfs_node *kn;
+       kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+       if (!kn)
+               return;
+       kernfs_path(kn, buf, buflen);
+       kernfs_put(kn);
+ }
  /*
   * proc_cgroup_show()
   *  - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --combined mm/page_io.c
  #include <linux/frontswap.h>
  #include <linux/blkdev.h>
  #include <linux/uio.h>
 +#include <linux/sched/task.h>
  #include <asm/pgtable.h>
  
  static struct bio *get_swap_bio(gfp_t gfp_flags,
                                struct page *page, bio_end_io_t end_io)
  {
 +      int i, nr = hpage_nr_pages(page);
        struct bio *bio;
  
 -      bio = bio_alloc(gfp_flags, 1);
 +      bio = bio_alloc(gfp_flags, nr);
        if (bio) {
-               bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+               struct block_device *bdev;
+               bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
+               bio_set_dev(bio, bdev);
                bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
                bio->bi_end_io = end_io;
  
 -              bio_add_page(bio, page, PAGE_SIZE, 0);
 -              BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
 +              for (i = 0; i < nr; i++)
 +                      bio_add_page(bio, page + i, PAGE_SIZE, 0);
 +              VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
        }
        return bio;
  }
@@@ -60,8 -60,7 +63,7 @@@ void end_swap_bio_write(struct bio *bio
                 */
                set_page_dirty(page);
                pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
-                        imajor(bio->bi_bdev->bd_inode),
-                        iminor(bio->bi_bdev->bd_inode),
+                        MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
                         (unsigned long long)bio->bi_iter.bi_sector);
                ClearPageReclaim(page);
        }
@@@ -126,8 -125,7 +128,7 @@@ static void end_swap_bio_read(struct bi
                SetPageError(page);
                ClearPageUptodate(page);
                pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
-                        imajor(bio->bi_bdev->bd_inode),
-                        iminor(bio->bi_bdev->bd_inode),
+                        MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
                         (unsigned long long)bio->bi_iter.bi_sector);
                goto out;
        }
@@@ -139,7 -137,6 +140,7 @@@ out
        WRITE_ONCE(bio->bi_private, NULL);
        bio_put(bio);
        wake_up_process(waiter);
 +      put_task_struct(waiter);
  }
  
  int generic_swapfile_activate(struct swap_info_struct *sis,
@@@ -264,15 -261,6 +265,15 @@@ static sector_t swap_page_sector(struc
        return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
  }
  
 +static inline void count_swpout_vm_event(struct page *page)
 +{
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +      if (unlikely(PageTransHuge(page)))
 +              count_vm_event(THP_SWPOUT);
 +#endif
 +      count_vm_events(PSWPOUT, hpage_nr_pages(page));
 +}
 +
  int __swap_writepage(struct page *page, struct writeback_control *wbc,
                bio_end_io_t end_write_func)
  {
  
        ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
        if (!ret) {
 -              count_vm_event(PSWPOUT);
 +              count_swpout_vm_event(page);
                return 0;
        }
  
                goto out;
        }
        bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 -      count_vm_event(PSWPOUT);
 +      count_swpout_vm_event(page);
        set_page_writeback(page);
        unlock_page(page);
        submit_bio(bio);
@@@ -351,7 -339,7 +352,7 @@@ int swap_readpage(struct page *page, bo
        int ret = 0;
        struct swap_info_struct *sis = page_swap_info(page);
        blk_qc_t qc;
-       struct block_device *bdev;
+       struct gendisk *disk;
  
        VM_BUG_ON_PAGE(!PageSwapCache(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
                ret = -ENOMEM;
                goto out;
        }
-       bdev = bio->bi_bdev;
+       disk = bio->bi_disk;
 +      /*
 +       * Keep this task valid during swap readpage because the oom killer may
 +       * attempt to access it in the page fault retry time check.
 +       */
 +      get_task_struct(current);
        bio->bi_private = current;
        bio_set_op_attrs(bio, REQ_OP_READ, 0);
        count_vm_event(PSWPIN);
                if (!READ_ONCE(bio->bi_private))
                        break;
  
-               if (!blk_mq_poll(bdev_get_queue(bdev), qc))
+               if (!blk_mq_poll(disk->queue, qc))
                        break;
        }
        __set_current_state(TASK_RUNNING);