Merge branch 'for-4.14/block' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
diff --combined MAINTAINERS

index 29c0c2b,1164f93..c620db0
--- 1/MAINTAINERS
--- 2/MAINTAINERS
+++ b/MAINTAINERS
@@@ -301,7 -301,6 +301,7 @@@ S: Supporte
   F:    drivers/acpi/
   F:    drivers/pnp/pnpacpi/
   F:    include/linux/acpi.h
+ +F:    include/linux/fwnode.h
   F:    include/acpi/
   F:    Documentation/acpi/
   F:    Documentation/ABI/testing/sysfs-bus-acpi
@@@ -311,14 -310,6 +311,14 @@@ F:       drivers/pci/*/*acpi
   F:    drivers/pci/*/*/*acpi*
   F:    tools/power/acpi/
   
+ +ACPI APEI
+ +M:    "Rafael J. Wysocki" <rjw@rjwysocki.net>
+ +M:    Len Brown <lenb@kernel.org>
+ +L:    linux-acpi@vger.kernel.org
+ +R:    Tony Luck <tony.luck@intel.com>
+ +R:    Borislav Petkov <bp@alien8.de>
+ +F:    drivers/acpi/apei/
+ +
   ACPI COMPONENT ARCHITECTURE (ACPICA)
   M:    Robert Moore <robert.moore@intel.com>
   M:    Lv Zheng <lv.zheng@intel.com>
@@@ -1162,7 -1153,6 +1162,7 @@@ L:      linux-arm-kernel@axis.co
   F:    arch/arm/mach-artpec
   F:    arch/arm/boot/dts/artpec6*
   F:    drivers/clk/axis
+ +F:    drivers/crypto/axis
   F:    drivers/pinctrl/pinctrl-artpec*
   F:    Documentation/devicetree/bindings/pinctrl/axis,artpec6-pinctrl.txt
   
@@@ -1171,7 -1161,7 +1171,7 @@@ M:      Brendan Higgins <brendanhiggins@goog
   R:    Benjamin Herrenschmidt <benh@kernel.crashing.org>
   R:    Joel Stanley <joel@jms.id.au>
   L:    linux-i2c@vger.kernel.org
- -L:    openbmc@lists.ozlabs.org
+ +L:    openbmc@lists.ozlabs.org (moderated for non-subscribers)
   S:    Maintained
   F:    drivers/irqchip/irq-aspeed-i2c-ic.c
   F:    drivers/i2c/busses/i2c-aspeed.c
@@@ -1292,15 -1282,10 +1292,15 @@@ S:   Maintaine
   
   ARM/CORTINA SYSTEMS GEMINI ARM ARCHITECTURE
   M:    Hans Ulli Kroll <ulli.kroll@googlemail.com>
+ +M:    Linus Walleij <linus.walleij@linaro.org>
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   T:    git git://github.com/ulli-kroll/linux.git
   S:    Maintained
+ +F:    Documentation/devicetree/bindings/arm/gemini.txt
+ +F:    Documentation/devicetree/bindings/pinctrl/cortina,gemini-pinctrl.txt
+ +F:    Documentation/devicetree/bindings/rtc/faraday,ftrtc010.txt
   F:    arch/arm/mach-gemini/
+ +F:    drivers/pinctrl/pinctrl-gemini.c
   F:    drivers/rtc/rtc-ftrtc010.c
   
   ARM/CSR SIRFPRIMA2 MACHINE SUPPORT
@@@ -1585,7 -1570,7 +1585,7 @@@ M:      Chunfeng Yun <chunfeng.yun@mediatek.
   L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
   L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
   S:    Maintained
- -F:    drivers/phy/phy-mt65xx-usb3.c
+ +F:    drivers/phy/mediatek/phy-mtk-tphy.c
   
   ARM/MICREL KS8695 ARCHITECTURE
   M:    Greg Ungerer <gerg@uclinux.org>
@@@ -2008,7 -1993,6 +2008,7 @@@ F:      arch/arm64/boot/dts/socionext
   F:    drivers/bus/uniphier-system-bus.c
   F:    drivers/clk/uniphier/
   F:    drivers/i2c/busses/i2c-uniphier*
+ +F:    drivers/irqchip/irq-uniphier-aidet.c
   F:    drivers/pinctrl/uniphier/
   F:    drivers/reset/reset-uniphier.c
   F:    drivers/tty/serial/8250/8250_uniphier.c
@@@ -2493,7 -2477,7 +2493,7 @@@ Q:      https://patchwork.open-mesh.org/proj
   S:    Maintained
   F:    Documentation/ABI/testing/sysfs-class-net-batman-adv
   F:    Documentation/ABI/testing/sysfs-class-net-mesh
- -F:    Documentation/networking/batman-adv.txt
+ +F:    Documentation/networking/batman-adv.rst
   F:    include/uapi/linux/batman_adv.h
   F:    net/batman-adv/
   
@@@ -4375,12 -4359,6 +4375,12 @@@ S:    Maintaine
   F:    drivers/gpu/drm/qxl/
   F:    include/uapi/drm/qxl_drm.h
   
+ +DRM DRIVER FOR PERVASIVE DISPLAYS REPAPER PANELS
+ +M:    Noralf Trønnes <noralf@tronnes.org>
+ +S:    Maintained
+ +F:    drivers/gpu/drm/tinydrm/repaper.c
+ +F:    Documentation/devicetree/bindings/display/repaper.txt
+ +
   DRM DRIVER FOR RAGE 128 VIDEO CARDS
   S:    Orphan / Obsolete
   F:    drivers/gpu/drm/r128/
@@@ -4396,12 -4374,6 +4396,12 @@@ S:    Orphan / Obsolet
   F:    drivers/gpu/drm/sis/
   F:    include/uapi/drm/sis_drm.h
   
+ +DRM DRIVER FOR SITRONIX ST7586 PANELS
+ +M:    David Lechner <david@lechnology.com>
+ +S:    Maintained
+ +F:    drivers/gpu/drm/tinydrm/st7586.c
+ +F:    Documentation/devicetree/bindings/display/st7586.txt
+ +
   DRM DRIVER FOR TDFX VIDEO CARDS
   S:    Orphan / Obsolete
   F:    drivers/gpu/drm/tdfx/
@@@ -4650,14 -4622,6 +4650,14 @@@ F:    drivers/gpu/drm/panel
   F:    include/drm/drm_panel.h
   F:    Documentation/devicetree/bindings/display/panel/
   
+ +DRM TINYDRM DRIVERS
+ +M:    Noralf Trønnes <noralf@tronnes.org>
+ +W:    https://github.com/notro/tinydrm/wiki/Development
+ +T:    git git://anongit.freedesktop.org/drm/drm-misc
+ +S:    Maintained
+ +F:    drivers/gpu/drm/tinydrm/
+ +F:    include/drm/tinydrm/
+ +
   DSBR100 USB FM RADIO DRIVER
   M:    Alexey Klimov <klimov.linux@gmail.com>
   L:    linux-media@vger.kernel.org
@@@ -5126,21 -5090,12 +5126,21 @@@ M:   Andrew Lunn <andrew@lunn.ch
   M:    Florian Fainelli <f.fainelli@gmail.com>
   L:    netdev@vger.kernel.org
   S:    Maintained
- -F:    include/linux/phy.h
- -F:    include/linux/phy_fixed.h
- -F:    drivers/net/phy/
+ +F:    Documentation/ABI/testing/sysfs-bus-mdio
+ +F:    Documentation/devicetree/bindings/net/mdio*
   F:    Documentation/networking/phy.txt
+ +F:    drivers/net/phy/
   F:    drivers/of/of_mdio.c
   F:    drivers/of/of_net.c
+ +F:    include/linux/*mdio*.h
+ +F:    include/linux/of_net.h
+ +F:    include/linux/phy.h
+ +F:    include/linux/phy_fixed.h
+ +F:    include/linux/platform_data/mdio-gpio.h
+ +F:    include/linux/platform_data/mdio-bcm-unimac.h
+ +F:    include/trace/events/mdio.h
+ +F:    include/uapi/linux/mdio.h
+ +F:    include/uapi/linux/mii.h
   
   EXT2 FILE SYSTEM
   M:    Jan Kara <jack@suse.com>
@@@ -5378,11 -5333,10 +5378,11 @@@ K:   fmc_d.*registe
   
   FPGA MANAGER FRAMEWORK
   M:    Alan Tull <atull@kernel.org>
- -R:    Moritz Fischer <moritz.fischer@ettus.com>
+ +R:    Moritz Fischer <mdf@kernel.org>
   L:    linux-fpga@vger.kernel.org
   S:    Maintained
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/atull/linux-fpga.git
+ +Q:    http://patchwork.kernel.org/project/linux-fpga/list/
   F:    Documentation/fpga/
   F:    Documentation/devicetree/bindings/fpga/
   F:    drivers/fpga/
@@@ -5872,7 -5826,7 +5872,7 @@@ F:      drivers/staging/greybus/spi.
   F:    drivers/staging/greybus/spilib.c
   F:    drivers/staging/greybus/spilib.h
   
- -GREYBUS LOOBACK/TIME PROTOCOLS DRIVERS
+ +GREYBUS LOOPBACK/TIME PROTOCOLS DRIVERS
   M:    Bryan O'Donoghue <pure.logic@nexus-software.ie>
   S:    Maintained
   F:    drivers/staging/greybus/loopback.c
@@@ -6185,14 -6139,6 +6185,14 @@@ S:    Maintaine
   F:    drivers/net/ethernet/hisilicon/
   F:    Documentation/devicetree/bindings/net/hisilicon*.txt
   
+ +HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
+ +M:    Yisen Zhuang <yisen.zhuang@huawei.com>
+ +M:    Salil Mehta <salil.mehta@huawei.com>
+ +L:    netdev@vger.kernel.org
+ +W:    http://www.hisilicon.com
+ +S:    Maintained
+ +F:    drivers/net/ethernet/hisilicon/hns3/
+ +
   HISILICON ROCE DRIVER
   M:    Lijun Ou <oulijun@huawei.com>
   M:    Wei Hu(Xavier) <xavier.huwei@huawei.com>
@@@ -6277,13 -6223,6 +6277,13 @@@ L:    linux-input@vger.kernel.or
   S:    Maintained
   F:    drivers/input/touchscreen/htcpen.c
   
+ +HUAWEI ETHERNET DRIVER
+ +M:    Aviad Krawczyk <aviad.krawczyk@huawei.com>
+ +L:    netdev@vger.kernel.org
+ +S:    Supported
+ +F:    Documentation/networking/hinic.txt
+ +F:    drivers/net/ethernet/huawei/hinic/
+ +
   HUGETLB FILESYSTEM
   M:    Nadia Yvette Chambers <nyc@holomorphy.com>
   S:    Maintained
@@@ -6310,9 -6249,7 +6310,9 @@@ M:      Haiyang Zhang <haiyangz@microsoft.co
   M:    Stephen Hemminger <sthemmin@microsoft.com>
   L:    devel@linuxdriverproject.org
   S:    Maintained
+ +F:    Documentation/networking/netvsc.txt
   F:    arch/x86/include/asm/mshyperv.h
+ +F:    arch/x86/include/asm/trace/hyperv.h
   F:    arch/x86/include/uapi/asm/hyperv.h
   F:    arch/x86/kernel/cpu/mshyperv.c
   F:    arch/x86/hyperv
@@@ -6324,9 -6261,7 +6324,9 @@@ F:      drivers/net/hyperv
   F:    drivers/scsi/storvsc_drv.c
   F:    drivers/uio/uio_hv_generic.c
   F:    drivers/video/fbdev/hyperv_fb.c
+ +F:    net/vmw_vsock/hyperv_transport.c
   F:    include/linux/hyperv.h
+ +F:    include/uapi/linux/hyperv.h
   F:    tools/hv/
   F:    Documentation/ABI/stable/sysfs-bus-vmbus
   
@@@ -6494,15 -6429,6 +6494,15 @@@ L:    netdev@vger.kernel.or
   S:    Supported
   F:    drivers/net/ethernet/ibm/ibmvnic.*
   
+ +IBM Power Virtual Accelerator Switchboard
+ +M:    Sukadev Bhattiprolu
+ +L:    linuxppc-dev@lists.ozlabs.org
+ +S:    Supported
+ +F:    arch/powerpc/platforms/powernv/vas*
+ +F:    arch/powerpc/platforms/powernv/copy-paste.h
+ +F:    arch/powerpc/include/asm/vas.h
+ +F:    arch/powerpc/include/uapi/asm/vas.h
+ +
   IBM Power Virtual Ethernet Device Driver
   M:    Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
   L:    netdev@vger.kernel.org
@@@ -6810,9 -6736,8 +6810,9 @@@ S:      Supporte
   F:    drivers/scsi/isci/
   
   INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets)
- -M:    Daniel Vetter <daniel.vetter@intel.com>
   M:    Jani Nikula <jani.nikula@linux.intel.com>
+ +M:    Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+ +M:    Rodrigo Vivi <rodrigo.vivi@intel.com>
   L:    intel-gfx@lists.freedesktop.org
   W:    https://01.org/linuxgraphics/
   B:    https://01.org/linuxgraphics/documentation/how-report-bugs
@@@ -7150,7 -7075,9 +7150,7 @@@ W:      http://irda.sourceforge.net
   S:    Maintained
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/sameo/irda-2.6.git
   F:    Documentation/networking/irda.txt
- -F:    drivers/net/irda/
- -F:    include/net/irda/
- -F:    net/irda/
+ +F:    drivers/staging/irda/
   
   IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY)
   M:    Marc Zyngier <marc.zyngier@arm.com>
@@@ -7175,6 -7102,7 +7175,6 @@@ M:      Marc Zyngier <marc.zyngier@arm.com
   L:    linux-kernel@vger.kernel.org
   S:    Maintained
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
- -T:    git git://git.infradead.org/users/jcooper/linux.git irqchip/core
   F:    Documentation/devicetree/bindings/interrupt-controller/
   F:    drivers/irqchip/
   
@@@ -7704,6 -7632,17 +7704,6 @@@ T:     git git://linuxtv.org/mkrufky/tuners
   S:    Maintained
   F:    drivers/media/dvb-frontends/lgdt3305.*
   
- -LGUEST
- -M:    Rusty Russell <rusty@rustcorp.com.au>
- -L:    lguest@lists.ozlabs.org
- -W:    http://lguest.ozlabs.org/
- -S:    Odd Fixes
- -F:    arch/x86/include/asm/lguest*.h
- -F:    arch/x86/lguest/
- -F:    drivers/lguest/
- -F:    include/linux/lguest*.h
- -F:    tools/lguest/
- -
   LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
   M:    Viresh Kumar <vireshk@kernel.org>
   L:    linux-ide@vger.kernel.org
@@@ -7839,7 -7778,6 +7839,7 @@@ F:      drivers/pci/hotplug/rpa
   F:    drivers/rtc/rtc-opal.c
   F:    drivers/scsi/ibmvscsi/
   F:    drivers/tty/hvc/hvc_opal.c
+ +F:    drivers/watchdog/wdrtas.c
   F:    tools/testing/selftests/powerpc
   N:    /pmac
   N:    powermac
@@@ -8478,9 -8416,7 +8478,9 @@@ F:      include/uapi/linux/uvcvideo.
   
   MEDIATEK ETHERNET DRIVER
   M:    Felix Fietkau <nbd@openwrt.org>
- -M:    John Crispin <blogic@openwrt.org>
+ +M:    John Crispin <john@phrozen.org>
+ +M:    Sean Wang <sean.wang@mediatek.com>
+ +M:    Nelson Chang <nelson.chang@mediatek.com>
   L:    netdev@vger.kernel.org
   S:    Maintained
   F:    drivers/net/ethernet/mediatek/
@@@ -8521,14 -8457,6 +8521,14 @@@ M:    Sean Wang <sean.wang@mediatek.com
   S:    Maintained
   F:    drivers/char/hw_random/mtk-rng.c
   
+ +MEDIATEK USB3 DRD IP DRIVER
+ +M:    Chunfeng Yun <chunfeng.yun@mediatek.com>
+ +L:    linux-usb@vger.kernel.org (moderated for non-subscribers)
+ +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+ +L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
+ +S:    Maintained
+ +F:    drivers/usb/mtu3/
+ +
   MEGACHIPS STDPXXXX-GE-B850V3-FW LVDS/DP++ BRIDGES
   M:    Peter Senna Tschudin <peter.senna@collabora.com>
   M:    Martin Donnelly <martin.donnelly@ge.com>
@@@ -8693,7 -8621,7 +8693,7 @@@ M:      Mathieu Desnoyers <mathieu.desnoyers
   M:    "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
   L:    linux-kernel@vger.kernel.org
   S:    Supported
- -F:    kernel/membarrier.c
+ +F:    kernel/sched/membarrier.c
   F:    include/uapi/linux/membarrier.h
   
   MEMORY MANAGEMENT
@@@ -8783,12 -8711,6 +8783,12 @@@ F:    drivers/dma/at_hdmac.
   F:    drivers/dma/at_hdmac_regs.h
   F:    include/linux/platform_data/dma-atmel.h
   
+ +MICROCHIP / ATMEL ECC DRIVER
+ +M:    Tudor Ambarus <tudor.ambarus@microchip.com>
+ +L:    linux-crypto@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/crypto/atmel-ecc.*
+ +
   MICROCHIP / ATMEL ISC DRIVER
   M:    Songjun Wu <songjun.wu@microchip.com>
   L:    linux-media@vger.kernel.org
@@@ -9536,7 -9458,6 +9536,7 @@@ M:      Srinivas Kandagatla <srinivas.kandag
   S:    Maintained
   F:    drivers/nvmem/
   F:    Documentation/devicetree/bindings/nvmem/
+ +F:    Documentation/ABI/stable/sysfs-bus-nvmem
   F:    include/linux/nvmem-consumer.h
   F:    include/linux/nvmem-provider.h
   
@@@ -10454,7 -10375,7 +10454,7 @@@ L:   linux-gpio@vger.kernel.or
   T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git
   S:    Maintained
   F:    Documentation/devicetree/bindings/pinctrl/
- -F:    Documentation/pinctrl.txt
+ +F:    Documentation/driver-api/pinctl.rst
   F:    drivers/pinctrl/
   F:    include/linux/pinctrl/
   
@@@ -11182,7 -11103,7 +11182,7 @@@ M:   Fenghua Yu <fenghua.yu@intel.com
   L:    linux-kernel@vger.kernel.org
   S:    Supported
   F:    arch/x86/kernel/cpu/intel_rdt*
- -F:    arch/x86/include/asm/intel_rdt*
+ +F:    arch/x86/include/asm/intel_rdt_sched.h
   F:    Documentation/x86/intel_rdt*
   
   READ-COPY UPDATE (RCU)
@@@ -12561,6 -12482,12 +12561,12 @@@ M: Ion Badulescu <ionut@badula.org
   S:    Odd Fixes
   F:    drivers/net/ethernet/adaptec/starfire*
   
+ STEC S1220 SKD DRIVER
+ M:    Bart Van Assche <bart.vanassche@wdc.com>
+ L:    linux-block@vger.kernel.org
+ S:    Maintained
+ F:    drivers/block/skd*[ch]
+ 
   STI CEC DRIVER
   M:    Benjamin Gaignard <benjamin.gaignard@linaro.org>
   S:    Maintained
@@@ -13064,11 -12991,6 +13070,11 @@@ M: Yehezkel Bernat <yehezkel.bernat@int
   S:    Maintained
   F:    drivers/thunderbolt/
   
+ +THUNDERX GPIO DRIVER
+ +M:    David Daney <david.daney@cavium.com>
+ +S:    Maintained
+ +F:    drivers/gpio/gpio-thunderx.c
+ +
   TI AM437X VPFE DRIVER
   M:    "Lad, Prabhakar" <prabhakar.csengg@gmail.com>
   L:    linux-media@vger.kernel.org
@@@ -14080,7 -14002,6 +14086,7 @@@ F:   drivers/block/virtio_blk.
   F:    include/linux/virtio*.h
   F:    include/uapi/linux/virtio_*.h
   F:    drivers/crypto/virtio/
+ +F:    mm/balloon_compaction.c
   
   VIRTIO CRYPTO DRIVER
   M:    Gonglei <arei.gonglei@huawei.com>
@@@ -14295,12 -14216,6 +14301,12 @@@ F: drivers/watchdog
   F:    include/linux/watchdog.h
   F:    include/uapi/linux/watchdog.h
   
+ +WHISKEYCOVE PMIC GPIO DRIVER
+ +M:    Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
+ +L:    linux-gpio@vger.kernel.org
+ +S:    Maintained
+ +F:    drivers/gpio/gpio-wcove.c
+ +
   WIIMOTE HID DRIVER
   M:    David Herrmann <dh.herrmann@googlemail.com>
   L:    linux-input@vger.kernel.org
diff --combined arch/powerpc/sysdev/axonram.c

index 58507c3,1e15dea..c60e84e
--- 1/arch/powerpc/sysdev/axonram.c
--- 2/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@@ -110,7 -110,7 +110,7 @@@ axon_ram_irq_handler(int irq, void *dev
   static blk_qc_t
   axon_ram_make_request(struct request_queue *queue, struct bio *bio)
   {
-       struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data;
+       struct axon_ram_bank *bank = bio->bi_disk->private_data;
         unsigned long phys_mem, phys_end;
         void *user_mem;
         struct bio_vec vec;
@@@ -188,12 -188,15 +188,12 @@@ static int axon_ram_probe(struct platfo
   
         axon_ram_bank_id++;
   
- -      dev_info(&device->dev, "Found memory controller on %s\n",
- -                      device->dev.of_node->full_name);
+ +      dev_info(&device->dev, "Found memory controller on %pOF\n",
+ +                      device->dev.of_node);
   
- -      bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL);
- -      if (bank == NULL) {
- -              dev_err(&device->dev, "Out of memory\n");
- -              rc = -ENOMEM;
- -              goto failed;
- -      }
+ +      bank = kzalloc(sizeof(*bank), GFP_KERNEL);
+ +      if (!bank)
+ +              return -ENOMEM;
   
         device->dev.platform_data = bank;
   
@@@ -289,22 -292,25 +289,22 @@@
         return 0;
   
   failed:
- -      if (bank != NULL) {
- -              if (bank->irq_id)
- -                      free_irq(bank->irq_id, device);
- -              if (bank->disk != NULL) {
- -                      if (bank->disk->major > 0)
- -                              unregister_blkdev(bank->disk->major,
- -                                              bank->disk->disk_name);
- -                      if (bank->disk->flags & GENHD_FL_UP)
- -                              del_gendisk(bank->disk);
- -                      put_disk(bank->disk);
- -              }
- -              kill_dax(bank->dax_dev);
- -              put_dax(bank->dax_dev);
- -              device->dev.platform_data = NULL;
- -              if (bank->io_addr != 0)
- -                      iounmap((void __iomem *) bank->io_addr);
- -              kfree(bank);
+ +      if (bank->irq_id)
+ +              free_irq(bank->irq_id, device);
+ +      if (bank->disk != NULL) {
+ +              if (bank->disk->major > 0)
+ +                      unregister_blkdev(bank->disk->major,
+ +                                      bank->disk->disk_name);
+ +              if (bank->disk->flags & GENHD_FL_UP)
+ +                      del_gendisk(bank->disk);
+ +              put_disk(bank->disk);
         }
- -
+ +      kill_dax(bank->dax_dev);
+ +      put_dax(bank->dax_dev);
+ +      device->dev.platform_data = NULL;
+ +      if (bank->io_addr != 0)
+ +              iounmap((void __iomem *) bank->io_addr);
+ +      kfree(bank);
         return rc;
   }
   
diff --combined block/bfq-iosched.h

index 859f0a8,fb28c25..cc4ea85
--- 1/block/bfq-iosched.h
--- 2/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@@ -71,29 -71,17 +71,29 @@@ struct bfq_service_tree 
    *
    * bfq_sched_data is the basic scheduler queue.  It supports three
    * ioprio_classes, and can be used either as a toplevel queue or as an
- - * intermediate queue on a hierarchical setup.  @next_in_service
- - * points to the active entity of the sched_data service trees that
- - * will be scheduled next. It is used to reduce the number of steps
- - * needed for each hierarchical-schedule update.
+ + * intermediate queue in a hierarchical setup.
    *
    * The supported ioprio_classes are the same as in CFQ, in descending
    * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
    * Requests from higher priority queues are served before all the
    * requests from lower priority queues; among requests of the same
    * queue requests are served according to B-WF2Q+.
- - * All the fields are protected by the queue lock of the containing bfqd.
+ + *
+ + * The schedule is implemented by the service trees, plus the field
+ + * @next_in_service, which points to the entity on the active trees
+ + * that will be served next, if 1) no changes in the schedule occur
+ + * before the current in-service entity is expired, 2) the in-service
+ + * queue becomes idle when it expires, and 3) if the entity pointed by
+ + * in_service_entity is not a queue, then the in-service child entity
+ + * of the entity pointed by in_service_entity becomes idle on
+ + * expiration. This peculiar definition allows for the following
+ + * optimization, not yet exploited: while a given entity is still in
+ + * service, we already know which is the best candidate for next
+ + * service among the other active entities in the same parent
+ + * entity. We can then quickly compare the timestamps of the
+ + * in-service entity with those of such best candidate.
+ + *
+ + * All fields are protected by the lock of the containing bfqd.
    */
   struct bfq_sched_data {
         /* entity in service */
@@@ -360,11 -348,11 +360,11 @@@ struct bfq_io_cq 
         uint64_t blkcg_serial_nr; /* the current blkcg serial */
   #endif
         /*
-        * Snapshot of the idle window before merging; taken to
-        * remember this value while the queue is merged, so as to be
-        * able to restore it in case of split.
+        * Snapshot of the has_short_ttime flag before merging; taken
+        * to remember its value while the queue is merged, so as to
+        * be able to restore it in case of split.
          */
-       bool saved_idle_window;
+       bool saved_has_short_ttime;
         /*
          * Same purpose as the previous two fields for the I/O bound
          * classification of a queue.
@@@ -638,7 -626,7 +638,7 @@@ enum bfqq_state_flags 
                                      * without idling the device
                                      */
         BFQQF_fifo_expire,      /* FIFO checked in this slice */
-       BFQQF_idle_window,      /* slice idling enabled */
+       BFQQF_has_short_ttime,  /* queue has a short think time */
         BFQQF_sync,             /* synchronous queue */
         BFQQF_IO_bound,         /*
                                  * bfqq has timed-out at least once
@@@ -667,7 -655,7 +667,7 @@@ BFQ_BFQQ_FNS(busy)
   BFQ_BFQQ_FNS(wait_request);
   BFQ_BFQQ_FNS(non_blocking_wait_rq);
   BFQ_BFQQ_FNS(fifo_expire);
- BFQ_BFQQ_FNS(idle_window);
+ BFQ_BFQQ_FNS(has_short_ttime);
   BFQ_BFQQ_FNS(sync);
   BFQ_BFQQ_FNS(IO_bound);
   BFQ_BFQQ_FNS(in_large_burst);
@@@ -929,13 -917,16 +929,16 @@@ void bfq_add_bfqq_busy(struct bfq_data 
   struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
   
   #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)        do {                    \
-       blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
-                       bfq_bfqq_sync((bfqq)) ? 'S' : 'A',              \
-                       bfqq_group(bfqq)->blkg_path, ##args);           \
+       blk_add_cgroup_trace_msg((bfqd)->queue,                         \
+                       bfqg_to_blkg(bfqq_group(bfqq))->blkcg,          \
+                       "bfq%d%c " fmt, (bfqq)->pid,                    \
+                       bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args);     \
   } while (0)
   
- #define bfq_log_bfqg(bfqd, bfqg, fmt, args...)        \
-       blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
+ #define bfq_log_bfqg(bfqd, bfqg, fmt, args...)        do {                    \
+       blk_add_cgroup_trace_msg((bfqd)->queue,                         \
+               bfqg_to_blkg(bfqg)->blkcg, fmt, ##args);                \
+ } while (0)
   
   #else /* CONFIG_BFQ_GROUP_IOSCHED */
   
diff --combined block/bio-integrity.c

index 9b1ea47,553d75e..5df3290
--- 1/block/bio-integrity.c
--- 2/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@@ -146,7 -146,7 +146,7 @@@ int bio_integrity_add_page(struct bio *
         iv = bip->bip_vec + bip->bip_vcnt;
   
         if (bip->bip_vcnt &&
-           bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+           bvec_gap_to_prev(bio->bi_disk->queue,
                              &bip->bip_vec[bip->bip_vcnt - 1], offset))
                 return 0;
   
@@@ -190,7 -190,7 +190,7 @@@ static inline unsigned int bio_integrit
   static blk_status_t bio_integrity_process(struct bio *bio,
                 struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
   {
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
         struct blk_integrity_iter iter;
         struct bvec_iter bviter;
         struct bio_vec bv;
@@@ -199,7 -199,7 +199,7 @@@
         void *prot_buf = page_address(bip->bip_vec->bv_page) +
                 bip->bip_vec->bv_offset;
   
-       iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
+       iter.disk_name = bio->bi_disk->disk_name;
         iter.interval = 1 << bi->interval_exp;
         iter.seed = proc_iter->bi_sector;
         iter.prot_buf = prot_buf;
@@@ -236,8 -236,8 +236,8 @@@
   bool bio_integrity_prep(struct bio *bio)
   {
         struct bio_integrity_payload *bip;
-       struct blk_integrity *bi;
-       struct request_queue *q;
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+       struct request_queue *q = bio->bi_disk->queue;
         void *buf;
         unsigned long start, end;
         unsigned int len, nr_pages;
@@@ -245,8 -245,9 +245,9 @@@
         unsigned int intervals;
         blk_status_t status;
   
-       bi = bdev_get_integrity(bio->bi_bdev);
-       q = bdev_get_queue(bio->bi_bdev);
+       if (!bi)
+               return true;
+ 
         if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
                 return true;
   
@@@ -257,9 -258,6 +258,6 @@@
         if (bio_integrity(bio))
                 return true;
   
-       if (bi == NULL)
-               return true;
- 
         if (bio_data_dir(bio) == READ) {
                 if (!bi->profile->verify_fn ||
                     !(bi->flags & BLK_INTEGRITY_VERIFY))
@@@ -354,7 -352,7 +352,7 @@@ static void bio_integrity_verify_fn(str
         struct bio_integrity_payload *bip =
                 container_of(work, struct bio_integrity_payload, bip_work);
         struct bio *bio = bip->bip_bio;
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
         struct bvec_iter iter = bio->bi_iter;
   
         /*
@@@ -387,11 -385,12 +385,11 @@@
    */
   bool __bio_integrity_endio(struct bio *bio)
   {
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ +      struct bio_integrity_payload *bip = bio_integrity(bio);
   
         if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
- -          bi->profile->verify_fn) {
- -              struct bio_integrity_payload *bip = bio_integrity(bio);
- -
+ +          (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) {
                 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
                 queue_work(kintegrityd_wq, &bip->bip_work);
                 return false;
@@@ -413,7 -412,7 +411,7 @@@
   void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
   {
         struct bio_integrity_payload *bip = bio_integrity(bio);
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
         unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
   
         bip->bip_iter.bi_sector += bytes_done >> 9;
@@@ -430,7 -429,7 +428,7 @@@ EXPORT_SYMBOL(bio_integrity_advance)
   void bio_integrity_trim(struct bio *bio)
   {
         struct bio_integrity_payload *bip = bio_integrity(bio);
-       struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+       struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
   
         bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
   }
diff --combined block/blk-mq-debugfs.c

index 4f927a5,e53b612..980e730
--- 1/block/blk-mq-debugfs.c
--- 2/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@@ -48,8 -48,6 +48,6 @@@ static int blk_flags_show(struct seq_fi
   static const char *const blk_queue_flag_name[] = {
         QUEUE_FLAG_NAME(QUEUED),
         QUEUE_FLAG_NAME(STOPPED),
-       QUEUE_FLAG_NAME(SYNCFULL),
-       QUEUE_FLAG_NAME(ASYNCFULL),
         QUEUE_FLAG_NAME(DYING),
         QUEUE_FLAG_NAME(BYPASS),
         QUEUE_FLAG_NAME(BIDI),
@@@ -75,8 -73,6 +73,8 @@@
         QUEUE_FLAG_NAME(STATS),
         QUEUE_FLAG_NAME(POLL_STATS),
         QUEUE_FLAG_NAME(REGISTERED),
+ +      QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
+ +      QUEUE_FLAG_NAME(QUIESCED),
   };
   #undef QUEUE_FLAG_NAME
   
@@@ -267,7 -263,6 +265,7 @@@ static const char *const cmd_flag_name[
         CMD_FLAG_NAME(RAHEAD),
         CMD_FLAG_NAME(BACKGROUND),
         CMD_FLAG_NAME(NOUNMAP),
+ +      CMD_FLAG_NAME(NOWAIT),
   };
   #undef CMD_FLAG_NAME
   
@@@ -744,7 -739,7 +742,7 @@@ static int blk_mq_debugfs_release(struc
                 return seq_release(inode, file);
   }
   
- const struct file_operations blk_mq_debugfs_fops = {
+ static const struct file_operations blk_mq_debugfs_fops = {
         .open           = blk_mq_debugfs_open,
         .read           = seq_read,
         .write          = blk_mq_debugfs_write,
diff --combined block/blk-mq.c

index 4603b11,f84d145..3f18cff
--- 1/block/blk-mq.c
--- 2/block/blk-mq.c
+++ b/block/blk-mq.c
@@@ -83,6 -83,41 +83,41 @@@ static void blk_mq_hctx_clear_pending(s
         sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
   }
   
+ struct mq_inflight {
+       struct hd_struct *part;
+       unsigned int *inflight;
+ };
+ 
+ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+                                 struct request *rq, void *priv,
+                                 bool reserved)
+ {
+       struct mq_inflight *mi = priv;
+ 
+       if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
+           !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+               /*
+                * index[0] counts the specific partition that was asked
+                * for. index[1] counts the ones that are active on the
+                * whole device, so increment that if mi->part is indeed
+                * a partition, and not a whole device.
+                */
+               if (rq->part == mi->part)
+                       mi->inflight[0]++;
+               if (mi->part->partno)
+                       mi->inflight[1]++;
+       }
+ }
+ 
+ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
+                     unsigned int inflight[2])
+ {
+       struct mq_inflight mi = { .part = part, .inflight = inflight, };
+ 
+       inflight[0] = inflight[1] = 0;
+       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+ }
+ 
   void blk_freeze_queue_start(struct request_queue *q)
   {
         int freeze_depth;
@@@ -301,12 -336,11 +336,12 @@@ static struct request *blk_mq_get_reque
         struct elevator_queue *e = q->elevator;
         struct request *rq;
         unsigned int tag;
+ +      struct blk_mq_ctx *local_ctx = NULL;
   
         blk_queue_enter_live(q);
         data->q = q;
         if (likely(!data->ctx))
- -              data->ctx = blk_mq_get_ctx(q);
+ +              data->ctx = local_ctx = blk_mq_get_ctx(q);
         if (likely(!data->hctx))
                 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
         if (op & REQ_NOWAIT)
@@@ -325,10 -359,6 +360,10 @@@
   
         tag = blk_mq_get_tag(data);
         if (tag == BLK_MQ_TAG_FAIL) {
+ +              if (local_ctx) {
+ +                      blk_mq_put_ctx(local_ctx);
+ +                      data->ctx = NULL;
+ +              }
                 blk_queue_exit(q);
                 return NULL;
         }
@@@ -360,13 -390,13 +395,13 @@@ struct request *blk_mq_alloc_request(st
                 return ERR_PTR(ret);
   
         rq = blk_mq_get_request(q, NULL, op, &alloc_data);
- -
- -      blk_mq_put_ctx(alloc_data.ctx);
         blk_queue_exit(q);
   
         if (!rq)
                 return ERR_PTR(-EWOULDBLOCK);
   
+ +      blk_mq_put_ctx(alloc_data.ctx);
+ +
         rq->__data_len = 0;
         rq->__sector = (sector_t) -1;
         rq->bio = rq->biotail = NULL;
@@@ -411,6 -441,7 +446,6 @@@ struct request *blk_mq_alloc_request_hc
         alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
   
         rq = blk_mq_get_request(q, NULL, op, &alloc_data);
- -
         blk_queue_exit(q);
   
         if (!rq)
@@@ -624,11 -655,10 +659,10 @@@ static void blk_mq_requeue_work(struct 
                 container_of(work, struct request_queue, requeue_work.work);
         LIST_HEAD(rq_list);
         struct request *rq, *next;
-       unsigned long flags;
   
-       spin_lock_irqsave(&q->requeue_lock, flags);
+       spin_lock_irq(&q->requeue_lock);
         list_splice_init(&q->requeue_list, &rq_list);
-       spin_unlock_irqrestore(&q->requeue_lock, flags);
+       spin_unlock_irq(&q->requeue_lock);
   
         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
                 if (!(rq->rq_flags & RQF_SOFTBARRIER))
@@@ -683,8 -713,8 +717,8 @@@ EXPORT_SYMBOL(blk_mq_kick_requeue_list)
   void blk_mq_delay_kick_requeue_list(struct request_queue *q,
                                     unsigned long msecs)
   {
- -      kblockd_schedule_delayed_work(&q->requeue_work,
- -                                    msecs_to_jiffies(msecs));
+ +      kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
+ +                                  msecs_to_jiffies(msecs));
   }
   EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
   
@@@ -1102,9 -1132,19 +1136,19 @@@ static void __blk_mq_run_hw_queue(struc
   {
         int srcu_idx;
   
+       /*
+        * We should be running this queue from one of the CPUs that
+        * are mapped to it.
+        */
         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
                 cpu_online(hctx->next_cpu));
   
+       /*
+        * We can't run the queue inline with ints disabled. Ensure that
+        * we catch bad users of this early.
+        */
+       WARN_ON_ONCE(in_interrupt());
+ 
         if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
                 rcu_read_lock();
                 blk_mq_sched_dispatch_requests(hctx);
@@@ -1218,7 -1258,7 +1262,7 @@@ EXPORT_SYMBOL(blk_mq_queue_stopped)
   /*
    * This function is often used for pausing .queue_rq() by driver when
    * there isn't enough resource or some conditions aren't satisfied, and
-  * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+  * BLK_STS_RESOURCE is usually returned.
    *
    * We do not guarantee that dispatch can be drained or blocked
    * after blk_mq_stop_hw_queue() returns. Please use
@@@ -1235,7 -1275,7 +1279,7 @@@ EXPORT_SYMBOL(blk_mq_stop_hw_queue)
   /*
    * This function is often used for pausing .queue_rq() by driver when
    * there isn't enough resource or some conditions aren't satisfied, and
-  * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+  * BLK_STS_RESOURCE is usually returned.
    *
    * We do not guarantee that dispatch can be drained or blocked
    * after blk_mq_stop_hw_queues() returns. Please use
diff --combined block/blk-throttle.c

index 80f5481,6a4c4c4..0fea76a
--- 1/block/blk-throttle.c
--- 2/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@@ -373,23 -373,13 +373,21 @@@ static unsigned int tg_iops_limit(struc
         if (likely(!blk_trace_note_message_enabled(__td->queue)))       \
                 break;                                                  \
         if ((__tg)) {                                                   \
-               char __pbuf[128];                                       \
-                                                                       \
-               blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));    \
-               blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
+               blk_add_cgroup_trace_msg(__td->queue,                   \
+                       tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
         } else {                                                        \
                 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);  \
         }                                                               \
   } while (0)
   
+ +static inline unsigned int throtl_bio_data_size(struct bio *bio)
+ +{
+ +      /* assume it's one sector */
+ +      if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+ +              return 512;
+ +      return bio->bi_iter.bi_size;
+ +}
+ +
   static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
   {
         INIT_LIST_HEAD(&qn->node);
@@@ -942,7 -932,6 +940,7 @@@ static bool tg_with_in_bps_limit(struc
         bool rw = bio_data_dir(bio);
         u64 bytes_allowed, extra_bytes, tmp;
         unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+ +      unsigned int bio_size = throtl_bio_data_size(bio);
   
         jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
   
@@@ -956,14 -945,14 +954,14 @@@
         do_div(tmp, HZ);
         bytes_allowed = tmp;
   
- -      if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
+ +      if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
                 if (wait)
                         *wait = 0;
                 return true;
         }
   
         /* Calc approx time to dispatch */
- -      extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
+ +      extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
         jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
   
         if (!jiffy_wait)
@@@ -1043,12 -1032,11 +1041,12 @@@ static bool tg_may_dispatch(struct thro
   static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
   {
         bool rw = bio_data_dir(bio);
+ +      unsigned int bio_size = throtl_bio_data_size(bio);
   
         /* Charge the bio to the group */
- -      tg->bytes_disp[rw] += bio->bi_iter.bi_size;
+ +      tg->bytes_disp[rw] += bio_size;
         tg->io_disp[rw]++;
- -      tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+ +      tg->last_bytes_disp[rw] += bio_size;
         tg->last_io_disp[rw]++;
   
         /*
@@@ -2114,14 -2102,9 +2112,9 @@@ static inline void throtl_update_latenc
   static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
   {
   #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       int ret;
- 
-       ret = bio_associate_current(bio);
-       if (ret == 0 || ret == -EBUSY)
+       if (bio->bi_css)
                 bio->bi_cg_private = tg;
         blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
- #else
-       bio_associate_current(bio);
   #endif
   }
   
diff --combined block/genhd.c

index 51c1d40,713b7d4..dd305c6
--- 1/block/genhd.c
--- 2/block/genhd.c
+++ b/block/genhd.c
@@@ -45,6 -45,52 +45,52 @@@ static void disk_add_events(struct gend
   static void disk_del_events(struct gendisk *disk);
   static void disk_release_events(struct gendisk *disk);
   
+ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+ {
+       if (q->mq_ops)
+               return;
+ 
+       atomic_inc(&part->in_flight[rw]);
+       if (part->partno)
+               atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+ }
+ 
+ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+ {
+       if (q->mq_ops)
+               return;
+ 
+       atomic_dec(&part->in_flight[rw]);
+       if (part->partno)
+               atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+ }
+ 
+ void part_in_flight(struct request_queue *q, struct hd_struct *part,
+                   unsigned int inflight[2])
+ {
+       if (q->mq_ops) {
+               blk_mq_in_flight(q, part, inflight);
+               return;
+       }
+ 
+       inflight[0] = atomic_read(&part->in_flight[0]) +
+                       atomic_read(&part->in_flight[1]);
+       if (part->partno) {
+               part = &part_to_disk(part)->part0;
+               inflight[1] = atomic_read(&part->in_flight[0]) +
+                               atomic_read(&part->in_flight[1]);
+       }
+ }
+ 
+ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
+ {
+       struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
+ 
+       if (unlikely(partno < 0 || partno >= ptbl->len))
+               return NULL;
+       return rcu_dereference(ptbl->part[partno]);
+ }
+ 
   /**
    * disk_get_part - get partition
    * @disk: disk to look partition from
@@@ -61,21 -107,12 +107,12 @@@
    */
   struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
   {
-       struct hd_struct *part = NULL;
-       struct disk_part_tbl *ptbl;
- 
-       if (unlikely(partno < 0))
-               return NULL;
+       struct hd_struct *part;
   
         rcu_read_lock();
- 
-       ptbl = rcu_dereference(disk->part_tbl);
-       if (likely(partno < ptbl->len)) {
-               part = rcu_dereference(ptbl->part[partno]);
-               if (part)
-                       get_device(part_to_dev(part));
-       }
- 
+       part = __disk_get_part(disk, partno);
+       if (part)
+               get_device(part_to_dev(part));
         rcu_read_unlock();
   
         return part;
@@@ -242,7 -279,6 +279,7 @@@ EXPORT_SYMBOL_GPL(disk_map_sector_rcu)
    * Can be deleted altogether. Later.
    *
    */
+ +#define BLKDEV_MAJOR_HASH_SIZE 255
   static struct blk_major_name {
         struct blk_major_name *next;
         int major;
@@@ -260,11 -296,12 +297,11 @@@ void blkdev_show(struct seq_file *seqf
   {
         struct blk_major_name *dp;
   
- -      if (offset < BLKDEV_MAJOR_HASH_SIZE) {
- -              mutex_lock(&block_class_lock);
- -              for (dp = major_names[offset]; dp; dp = dp->next)
+ +      mutex_lock(&block_class_lock);
+ +      for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
+ +              if (dp->major == offset)
                         seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
- -              mutex_unlock(&block_class_lock);
- -      }
+ +      mutex_unlock(&block_class_lock);
   }
   #endif /* CONFIG_PROC_FS */
   
@@@ -309,14 -346,6 +346,14 @@@ int register_blkdev(unsigned int major
                 ret = major;
         }
   
+ +      if (major >= BLKDEV_MAJOR_MAX) {
+ +              pr_err("register_blkdev: major requested (%d) is greater than the maximum (%d) for %s\n",
+ +                     major, BLKDEV_MAJOR_MAX, name);
+ +
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
         p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
         if (p == NULL) {
                 ret = -ENOMEM;
@@@ -1098,12 -1127,13 +1135,13 @@@ static const struct attribute_group *di
    * original ptbl is freed using RCU callback.
    *
    * LOCKING:
-  * Matching bd_mutx locked.
+  * Matching bd_mutex locked or the caller is the only user of @disk.
    */
   static void disk_replace_part_tbl(struct gendisk *disk,
                                   struct disk_part_tbl *new_ptbl)
   {
-       struct disk_part_tbl *old_ptbl = disk->part_tbl;
+       struct disk_part_tbl *old_ptbl =
+               rcu_dereference_protected(disk->part_tbl, 1);
   
         rcu_assign_pointer(disk->part_tbl, new_ptbl);
   
@@@ -1122,14 -1152,16 +1160,16 @@@
    * uses RCU to allow unlocked dereferencing for stats and other stuff.
    *
    * LOCKING:
-  * Matching bd_mutex locked, might sleep.
+  * Matching bd_mutex locked or the caller is the only user of @disk.
+  * Might sleep.
    *
    * RETURNS:
    * 0 on success, -errno on failure.
    */
   int disk_expand_part_tbl(struct gendisk *disk, int partno)
   {
-       struct disk_part_tbl *old_ptbl = disk->part_tbl;
+       struct disk_part_tbl *old_ptbl =
+               rcu_dereference_protected(disk->part_tbl, 1);
         struct disk_part_tbl *new_ptbl;
         int len = old_ptbl ? old_ptbl->len : 0;
         int i, target;
@@@ -1212,6 -1244,7 +1252,7 @@@ static int diskstats_show(struct seq_fi
         struct disk_part_iter piter;
         struct hd_struct *hd;
         char buf[BDEVNAME_SIZE];
+       unsigned int inflight[2];
         int cpu;
   
         /*
@@@ -1225,8 -1258,9 +1266,9 @@@
         disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
         while ((hd = disk_part_iter_next(&piter))) {
                 cpu = part_stat_lock();
-               part_round_stats(cpu, hd);
+               part_round_stats(gp->queue, cpu, hd);
                 part_stat_unlock();
+               part_in_flight(gp->queue, hd, inflight);
                 seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
                            "%u %lu %lu %lu %u %u %u %u\n",
                            MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
@@@ -1239,7 -1273,7 +1281,7 @@@
                            part_stat_read(hd, merges[WRITE]),
                            part_stat_read(hd, sectors[WRITE]),
                            jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
-                          part_in_flight(hd),
+                          inflight[0],
                            jiffies_to_msecs(part_stat_read(hd, io_ticks)),
                            jiffies_to_msecs(part_stat_read(hd, time_in_queue))
                         );
@@@ -1321,6 -1355,14 +1363,14 @@@ EXPORT_SYMBOL(alloc_disk)
   struct gendisk *alloc_disk_node(int minors, int node_id)
   {
         struct gendisk *disk;
+       struct disk_part_tbl *ptbl;
+ 
+       if (minors > DISK_MAX_PARTS) {
+               printk(KERN_ERR
+                       "block: can't allocated more than %d partitions\n",
+                       DISK_MAX_PARTS);
+               minors = DISK_MAX_PARTS;
+       }
   
         disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
         if (disk) {
@@@ -1334,7 -1376,8 +1384,8 @@@
                         kfree(disk);
                         return NULL;
                 }
-               disk->part_tbl->part[0] = &disk->part0;
+               ptbl = rcu_dereference_protected(disk->part_tbl, 1);
+               rcu_assign_pointer(ptbl->part[0], &disk->part0);
   
                 /*
                  * set_capacity() and get_capacity() currently don't use
diff --combined drivers/block/Kconfig

index 80aaf34,5dd62a8..104180e
--- 1/drivers/block/Kconfig
--- 2/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@@ -17,6 -17,7 +17,7 @@@ if BLK_DE
   
   config BLK_DEV_NULL_BLK
         tristate "Null test block driver"
+       depends on CONFIGFS_FS
   
   config BLK_DEV_FD
         tristate "Normal floppy disk support"
@@@ -470,7 -471,7 +471,7 @@@ config VIRTIO_BL
         depends on VIRTIO
         ---help---
           This is the virtual block driver for virtio.  It can be used with
- -          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+ +          QEMU based VMMs (like KVM or Xen).  Say Y or M.
   
   config VIRTIO_BLK_SCSI
         bool "SCSI passthrough request for the Virtio block driver"
diff --combined drivers/block/brd.c

index 5d9ed06,006e1cb..bbd0d18
--- 1/drivers/block/brd.c
--- 2/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@@ -294,14 -294,13 +294,13 @@@ out
   
   static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
   {
-       struct block_device *bdev = bio->bi_bdev;
-       struct brd_device *brd = bdev->bd_disk->private_data;
+       struct brd_device *brd = bio->bi_disk->private_data;
         struct bio_vec bvec;
         sector_t sector;
         struct bvec_iter iter;
   
         sector = bio->bi_iter.bi_sector;
-       if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
+       if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
                 goto io_error;
   
         bio_for_each_segment(bvec, bio, iter) {
@@@ -326,11 -325,7 +325,11 @@@ static int brd_rw_page(struct block_dev
                        struct page *page, bool is_write)
   {
         struct brd_device *brd = bdev->bd_disk->private_data;
- -      int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
+ +      int err;
+ +
+ +      if (PageTransHuge(page))
+ +              return -ENOTSUPP;
+ +      err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
         page_endio(page, is_write, err);
         return err;
   }
diff --combined drivers/block/loop.c

index f321b96,2fbd408..407cb17
--- 1/drivers/block/loop.c
--- 2/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@@ -221,7 -221,8 +221,7 @@@ static void __loop_update_dio(struct lo
   }
   
   static int
- -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit,
- -               loff_t logical_blocksize)
+ +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
   {
         loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
         sector_t x = (sector_t)size;
@@@ -233,6 -234,12 +233,6 @@@
                 lo->lo_offset = offset;
         if (lo->lo_sizelimit != sizelimit)
                 lo->lo_sizelimit = sizelimit;
- -      if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) {
- -              lo->lo_logical_blocksize = logical_blocksize;
- -              blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize);
- -              blk_queue_logical_block_size(lo->lo_queue,
- -                                           lo->lo_logical_blocksize);
- -      }
         set_capacity(lo->lo_disk, x);
         bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
         /* let user-space know about the new size */
@@@ -813,6 -820,7 +813,6 @@@ static void loop_config_discard(struct 
         struct file *file = lo->lo_backing_file;
         struct inode *inode = file->f_mapping->host;
         struct request_queue *q = lo->lo_queue;
- -      int lo_bits = 9;
   
         /*
          * We use punch hole to reclaim the free space used by the
@@@ -832,9 -840,11 +832,9 @@@
   
         q->limits.discard_granularity = inode->i_sb->s_blocksize;
         q->limits.discard_alignment = 0;
- -      if (lo->lo_flags & LO_FLAGS_BLOCKSIZE)
- -              lo_bits = blksize_bits(lo->lo_logical_blocksize);
   
- -      blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits);
- -      blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits);
+ +      blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+ +      blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
   }
   
@@@ -928,6 -938,7 +928,6 @@@ static int loop_set_fd(struct loop_devi
   
         lo->use_dio = false;
         lo->lo_blocksize = lo_blocksize;
- -      lo->lo_logical_blocksize = 512;
         lo->lo_device = bdev;
         lo->lo_flags = lo_flags;
         lo->lo_backing_file = file;
@@@ -1093,6 -1104,7 +1093,6 @@@ loop_set_status(struct loop_device *lo
         int err;
         struct loop_func_table *xfer;
         kuid_t uid = current_uid();
- -      int lo_flags = lo->lo_flags;
   
         if (lo->lo_encrypt_key_size &&
             !uid_eq(lo->lo_key_owner, uid) &&
@@@ -1125,9 -1137,26 +1125,9 @@@
         if (err)
                 goto exit;
   
- -      if (info->lo_flags & LO_FLAGS_BLOCKSIZE) {
- -              if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE))
- -                      lo->lo_logical_blocksize = 512;
- -              lo->lo_flags |= LO_FLAGS_BLOCKSIZE;
- -              if (LO_INFO_BLOCKSIZE(info) != 512 &&
- -                  LO_INFO_BLOCKSIZE(info) != 1024 &&
- -                  LO_INFO_BLOCKSIZE(info) != 2048 &&
- -                  LO_INFO_BLOCKSIZE(info) != 4096)
- -                      return -EINVAL;
- -              if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize)
- -                      return -EINVAL;
- -      }
- -
         if (lo->lo_offset != info->lo_offset ||
- -          lo->lo_sizelimit != info->lo_sizelimit ||
- -          lo->lo_flags != lo_flags ||
- -          ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) &&
- -           lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) {
- -              if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit,
- -                                   LO_INFO_BLOCKSIZE(info))) {
+ +          lo->lo_sizelimit != info->lo_sizelimit) {
+ +              if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
                         err = -EFBIG;
                         goto exit;
                 }
@@@ -1319,7 -1348,8 +1319,7 @@@ static int loop_set_capacity(struct loo
         if (unlikely(lo->lo_state != Lo_bound))
                 return -ENXIO;
   
- -      return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit,
- -                              lo->lo_logical_blocksize);
+ +      return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
   }
   
   static int loop_set_dio(struct loop_device *lo, unsigned long arg)
@@@ -1966,10 -1996,6 +1966,6 @@@ static int __init loop_init(void
         struct loop_device *lo;
         int err;
   
-       err = misc_register(&loop_misc);
-       if (err < 0)
-               return err;
- 
         part_shift = 0;
         if (max_part > 0) {
                 part_shift = fls(max_part);
@@@ -1987,12 -2013,12 +1983,12 @@@
   
         if ((1UL << part_shift) > DISK_MAX_PARTS) {
                 err = -EINVAL;
-               goto misc_out;
+               goto err_out;
         }
   
         if (max_loop > 1UL << (MINORBITS - part_shift)) {
                 err = -EINVAL;
-               goto misc_out;
+               goto err_out;
         }
   
         /*
@@@ -2011,6 -2037,11 +2007,11 @@@
                 range = 1UL << MINORBITS;
         }
   
+       err = misc_register(&loop_misc);
+       if (err < 0)
+               goto err_out;
+ 
+ 
         if (register_blkdev(LOOP_MAJOR, "loop")) {
                 err = -EIO;
                 goto misc_out;
@@@ -2030,6 -2061,7 +2031,7 @@@
   
   misc_out:
         misc_deregister(&loop_misc);
+ err_out:
         return err;
   }
   
diff --combined drivers/block/null_blk.c

index 81142ce,bd92286..8042c26
--- 1/drivers/block/null_blk.c
--- 2/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@@ -1,3 -1,7 +1,7 @@@
+ /*
+  * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
+  * Shaohua Li <shli@fb.com>
+  */
   #include <linux/module.h>
   
   #include <linux/moduleparam.h>
@@@ -9,27 -13,110 +13,110 @@@
   #include <linux/blk-mq.h>
   #include <linux/hrtimer.h>
   #include <linux/lightnvm.h>
+ #include <linux/configfs.h>
+ #include <linux/badblocks.h>
+ 
+ #define SECTOR_SHIFT          9
+ #define PAGE_SECTORS_SHIFT    (PAGE_SHIFT - SECTOR_SHIFT)
+ #define PAGE_SECTORS          (1 << PAGE_SECTORS_SHIFT)
+ #define SECTOR_SIZE           (1 << SECTOR_SHIFT)
+ #define SECTOR_MASK           (PAGE_SECTORS - 1)
+ 
+ #define FREE_BATCH            16
+ 
+ #define TICKS_PER_SEC         50ULL
+ #define TIMER_INTERVAL                (NSEC_PER_SEC / TICKS_PER_SEC)
+ 
+ static inline u64 mb_per_tick(int mbps)
+ {
+       return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
+ }
   
   struct nullb_cmd {
         struct list_head list;
         struct llist_node ll_list;
- -      struct call_single_data csd;
+ +      call_single_data_t csd;
         struct request *rq;
         struct bio *bio;
         unsigned int tag;
         struct nullb_queue *nq;
         struct hrtimer timer;
+       blk_status_t error;
   };
   
   struct nullb_queue {
         unsigned long *tag_map;
         wait_queue_head_t wait;
         unsigned int queue_depth;
+       struct nullb_device *dev;
   
         struct nullb_cmd *cmds;
   };
   
+ /*
+  * Status flags for nullb_device.
+  *
+  * CONFIGURED:        Device has been configured and turned on. Cannot reconfigure.
+  * UP:                Device is currently on and visible in userspace.
+  * THROTTLED: Device is being throttled.
+  * CACHE:     Device is using a write-back cache.
+  */
+ enum nullb_device_flags {
+       NULLB_DEV_FL_CONFIGURED = 0,
+       NULLB_DEV_FL_UP         = 1,
+       NULLB_DEV_FL_THROTTLED  = 2,
+       NULLB_DEV_FL_CACHE      = 3,
+ };
+ 
+ /*
+  * nullb_page is a page in memory for nullb devices.
+  *
+  * @page:     The page holding the data.
+  * @bitmap:   The bitmap represents which sector in the page has data.
+  *            Each bit represents one block size. For example, sector 8
+  *            will use the 7th bit
+  * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
+  * page is being flushed to storage. FREE means the cache page is freed and
+  * should be skipped from flushing to storage. Please see
+  * null_make_cache_space
+  */
+ struct nullb_page {
+       struct page *page;
+       unsigned long bitmap;
+ };
+ #define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
+ #define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)
+ 
+ struct nullb_device {
+       struct nullb *nullb;
+       struct config_item item;
+       struct radix_tree_root data; /* data stored in the disk */
+       struct radix_tree_root cache; /* disk cache data */
+       unsigned long flags; /* device flags */
+       unsigned int curr_cache;
+       struct badblocks badblocks;
+ 
+       unsigned long size; /* device size in MB */
+       unsigned long completion_nsec; /* time in ns to complete a request */
+       unsigned long cache_size; /* disk cache size in MB */
+       unsigned int submit_queues; /* number of submission queues */
+       unsigned int home_node; /* home node for the device */
+       unsigned int queue_mode; /* block interface */
+       unsigned int blocksize; /* block size */
+       unsigned int irqmode; /* IRQ completion handler */
+       unsigned int hw_queue_depth; /* queue depth */
+       unsigned int index; /* index of the disk, only valid with a disk */
+       unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+       bool use_lightnvm; /* register as a LightNVM device */
+       bool blocking; /* blocking blk-mq device */
+       bool use_per_node_hctx; /* use per-node allocation for hardware context */
+       bool power; /* power on/off the device */
+       bool memory_backed; /* if data is stored in memory */
+       bool discard; /* if support discard */
+ };
+ 
   struct nullb {
+       struct nullb_device *dev;
         struct list_head list;
         unsigned int index;
         struct request_queue *q;
@@@ -37,8 -124,10 +124,10 @@@
         struct nvm_dev *ndev;
         struct blk_mq_tag_set *tag_set;
         struct blk_mq_tag_set __tag_set;
-       struct hrtimer timer;
         unsigned int queue_depth;
+       atomic_long_t cur_bytes;
+       struct hrtimer bw_timer;
+       unsigned long cache_flush_pos;
         spinlock_t lock;
   
         struct nullb_queue *queues;
@@@ -49,7 -138,7 +138,7 @@@
   static LIST_HEAD(nullb_list);
   static struct mutex lock;
   static int null_major;
- static int nullb_indexes;
+ static DEFINE_IDA(nullb_indexes);
   static struct kmem_cache *ppa_cache;
   static struct blk_mq_tag_set tag_set;
   
@@@ -65,15 -154,15 +154,15 @@@ enum 
         NULL_Q_MQ               = 2,
   };
   
- static int submit_queues;
- module_param(submit_queues, int, S_IRUGO);
+ static int g_submit_queues = 1;
+ module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
   MODULE_PARM_DESC(submit_queues, "Number of submission queues");
   
- static int home_node = NUMA_NO_NODE;
- module_param(home_node, int, S_IRUGO);
+ static int g_home_node = NUMA_NO_NODE;
+ module_param_named(home_node, g_home_node, int, S_IRUGO);
   MODULE_PARM_DESC(home_node, "Home node for the device");
   
- static int queue_mode = NULL_Q_MQ;
+ static int g_queue_mode = NULL_Q_MQ;
   
   static int null_param_store_val(const char *str, int *val, int min, int max)
   {
@@@ -92,7 -181,7 +181,7 @@@
   
   static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
   {
-       return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+       return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
   }
   
   static const struct kernel_param_ops null_queue_mode_param_ops = {
@@@ -100,38 -189,38 +189,38 @@@
         .get    = param_get_int,
   };
   
- device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO);
+ device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO);
   MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
   
- static int gb = 250;
- module_param(gb, int, S_IRUGO);
+ static int g_gb = 250;
+ module_param_named(gb, g_gb, int, S_IRUGO);
   MODULE_PARM_DESC(gb, "Size in GB");
   
- static int bs = 512;
- module_param(bs, int, S_IRUGO);
+ static int g_bs = 512;
+ module_param_named(bs, g_bs, int, S_IRUGO);
   MODULE_PARM_DESC(bs, "Block size (in bytes)");
   
   static int nr_devices = 1;
   module_param(nr_devices, int, S_IRUGO);
   MODULE_PARM_DESC(nr_devices, "Number of devices to register");
   
- static bool use_lightnvm;
- module_param(use_lightnvm, bool, S_IRUGO);
+ static bool g_use_lightnvm;
+ module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
   MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
   
- static bool blocking;
- module_param(blocking, bool, S_IRUGO);
+ static bool g_blocking;
+ module_param_named(blocking, g_blocking, bool, S_IRUGO);
   MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
   
   static bool shared_tags;
   module_param(shared_tags, bool, S_IRUGO);
   MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
   
- static int irqmode = NULL_IRQ_SOFTIRQ;
+ static int g_irqmode = NULL_IRQ_SOFTIRQ;
   
   static int null_set_irqmode(const char *str, const struct kernel_param *kp)
   {
-       return null_param_store_val(str, &irqmode, NULL_IRQ_NONE,
+       return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
                                         NULL_IRQ_TIMER);
   }
   
@@@ -140,21 -229,358 +229,358 @@@ static const struct kernel_param_ops nu
         .get    = param_get_int,
   };
   
- device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
+ device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO);
   MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
   
- static unsigned long completion_nsec = 10000;
- module_param(completion_nsec, ulong, S_IRUGO);
+ static unsigned long g_completion_nsec = 10000;
+ module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO);
   MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
   
- static int hw_queue_depth = 64;
- module_param(hw_queue_depth, int, S_IRUGO);
+ static int g_hw_queue_depth = 64;
+ module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO);
   MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
   
- static bool use_per_node_hctx = false;
- module_param(use_per_node_hctx, bool, S_IRUGO);
+ static bool g_use_per_node_hctx;
+ module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO);
   MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
   
+ /*
+  * Forward declarations: the configfs callbacks below call these helpers
+  * before their definitions appear.
+  */
+ static struct nullb_device *null_alloc_dev(void);
+ static void null_free_dev(struct nullb_device *dev);
+ static void null_del_dev(struct nullb *nullb);
+ static int null_add_dev(struct nullb_device *dev);
+ static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
+ 
+ /* Map a configfs item back to its enclosing nullb_device (NULL stays NULL). */
+ static inline struct nullb_device *to_nullb_device(struct config_item *item)
+ {
+       if (!item)
+               return NULL;
+ 
+       return container_of(item, struct nullb_device, item);
+ }
+ 
+ /* Format an unsigned int attribute into @page as "<value>\n". */
+ static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
+ {
+       ssize_t len = snprintf(page, PAGE_SIZE, "%u\n", val);
+ 
+       return len;
+ }
+ 
+ /* Format an unsigned long attribute into @page as "<value>\n". */
+ static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
+       char *page)
+ {
+       ssize_t len = snprintf(page, PAGE_SIZE, "%lu\n", val);
+ 
+       return len;
+ }
+ 
+ /* Format a bool attribute into @page with "%u", followed by a newline. */
+ static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
+ {
+       ssize_t len = snprintf(page, PAGE_SIZE, "%u\n", val);
+ 
+       return len;
+ }
+ 
+ /*
+  * Parse @page with kstrtouint() (base 0) and store the result in *@val.
+  * Returns @count on success, otherwise the kstrtouint() result; *@val is
+  * left untouched when parsing fails.
+  */
+ static ssize_t nullb_device_uint_attr_store(unsigned int *val,
+       const char *page, size_t count)
+ {
+       unsigned int parsed;
+       int err = kstrtouint(page, 0, &parsed);
+ 
+       if (err)
+               return err;
+ 
+       *val = parsed;
+       return count;
+ }
+ 
+ /*
+  * Parse @page with kstrtoul() (base 0) and store the result in *@val.
+  * Returns @count on success, otherwise the kstrtoul() result; *@val is
+  * left untouched when parsing fails.
+  */
+ static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
+       const char *page, size_t count)
+ {
+       unsigned long parsed;
+       int err = kstrtoul(page, 0, &parsed);
+ 
+       if (err)
+               return err;
+ 
+       *val = parsed;
+       return count;
+ }
+ 
+ /*
+  * Parse @page with kstrtobool() and store the result in *@val.
+  * Returns @count on success, otherwise the kstrtobool() result; *@val is
+  * left untouched when parsing fails.
+  */
+ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
+       size_t count)
+ {
+       bool parsed;
+       int err = kstrtobool(page, &parsed);
+ 
+       if (err)
+               return err;
+ 
+       *val = parsed;
+       return count;
+ }
+ 
+ /*
+  * NULLB_DEVICE_ATTR(NAME, TYPE) - define the configfs show/store pair for the
+  * nullb_device field NAME and declare it through CONFIGFS_ATTR().
+  *
+  * show formats the field via nullb_device_<TYPE>_attr_show().  store rejects
+  * the write with -EBUSY once NULLB_DEV_FL_CONFIGURED is set in the device
+  * flags; otherwise it parses into the field via
+  * nullb_device_<TYPE>_attr_store().
+  *
+  * This macro should only be used with TYPE = {uint, ulong, bool}.
+  */
+ #define NULLB_DEVICE_ATTR(NAME, TYPE)                                         \
+ static ssize_t                                                                        \
+ nullb_device_##NAME##_show(struct config_item *item, char *page)              \
+ {                                                                             \
+       return nullb_device_##TYPE##_attr_show(                                 \
+                               to_nullb_device(item)->NAME, page);             \
+ }                                                                             \
+ static ssize_t                                                                        \
+ nullb_device_##NAME##_store(struct config_item *item, const char *page,               \
+                           size_t count)                                       \
+ {                                                                             \
+       if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags))   \
+               return -EBUSY;                                                  \
+       return nullb_device_##TYPE##_attr_store(                                \
+                       &to_nullb_device(item)->NAME, page, count);             \
+ }                                                                             \
+ CONFIGFS_ATTR(nullb_device_, NAME);
+ 
+ /*
+  * Per-device attributes generated by NULLB_DEVICE_ATTR.  Each NAME is read
+  * from and written to the nullb_device field of the same name, using the
+  * show/store helper that matches TYPE.
+  */
+ NULLB_DEVICE_ATTR(size, ulong);
+ NULLB_DEVICE_ATTR(completion_nsec, ulong);
+ NULLB_DEVICE_ATTR(submit_queues, uint);
+ NULLB_DEVICE_ATTR(home_node, uint);
+ NULLB_DEVICE_ATTR(queue_mode, uint);
+ NULLB_DEVICE_ATTR(blocksize, uint);
+ NULLB_DEVICE_ATTR(irqmode, uint);
+ NULLB_DEVICE_ATTR(hw_queue_depth, uint);
+ NULLB_DEVICE_ATTR(index, uint);
+ NULLB_DEVICE_ATTR(use_lightnvm, bool);
+ NULLB_DEVICE_ATTR(blocking, bool);
+ NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
+ NULLB_DEVICE_ATTR(memory_backed, bool);
+ NULLB_DEVICE_ATTR(discard, bool);
+ NULLB_DEVICE_ATTR(mbps, uint);
+ NULLB_DEVICE_ATTR(cache_size, ulong);
+ 
+ /* configfs show callback for "power": prints the device's power flag. */
+ static ssize_t nullb_device_power_show(struct config_item *item, char *page)
+ {
+       struct nullb_device *dev = to_nullb_device(item);
+ 
+       return nullb_device_bool_attr_show(dev->power, page);
+ }
+ 
+ /*
+  * configfs store callback for "power".
+  *
+  * Parses a bool from @page.  On an off->on transition the device is claimed
+  * via NULLB_DEV_FL_UP, created with null_add_dev() and then marked
+  * NULLB_DEV_FL_CONFIGURED.  On an on->off transition the device is deleted
+  * with null_del_dev() under the "lock" mutex and NULLB_DEV_FL_UP is cleared.
+  *
+  * Returns @count on success (including when nothing changes), the parse
+  * error from nullb_device_bool_attr_store(), or -ENOMEM when null_add_dev()
+  * fails.
+  */
+ static ssize_t nullb_device_power_store(struct config_item *item,
+                                    const char *page, size_t count)
+ {
+       struct nullb_device *dev = to_nullb_device(item);
+       bool newp = false;
+       ssize_t ret;
+ 
+       ret = nullb_device_bool_attr_store(&newp, page, count);
+       if (ret < 0)
+               return ret;
+ 
+       if (!dev->power && newp) {
+               /* UP already set: treat the write as a successful no-op. */
+               if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
+                       return count;
+               /*
+                * NOTE(review): any non-zero null_add_dev() result is reported
+                * as -ENOMEM; confirm whether the real code should be returned.
+                */
+               if (null_add_dev(dev)) {
+                       clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+                       return -ENOMEM;
+               }
+ 
+               /* From here on the NULLB_DEVICE_ATTR stores return -EBUSY. */
+               set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+               dev->power = newp;
+       } else if (dev->power && !newp) {
+               mutex_lock(&lock);
+               dev->power = newp;
+               null_del_dev(dev->nullb);
+               mutex_unlock(&lock);
+               clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+       }
+ 
+       return count;
+ }
+ 
+ CONFIGFS_ATTR(nullb_device_, power);
+ 
+ /* configfs show callback for "badblocks": delegates to badblocks_show(). */
+ static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
+ {
+       return badblocks_show(&to_nullb_device(item)->badblocks, page, 0);
+ }
+ 
+ /*
+  * configfs store callback for "badblocks".
+  *
+  * Accepts "+<start>-<end>" to mark a range bad or "-<start>-<end>" to clear
+  * it; surrounding whitespace is stripped and both numbers are parsed with
+  * kstrtoull() (base 0).  The range is inclusive (end - start + 1 sectors).
+  *
+  * Returns @count on success, -ENOMEM if the input cannot be duplicated,
+  * -EINVAL for a malformed string or start > end, or the error returned by
+  * kstrtoull() / badblocks_set() / badblocks_clear().
+  */
+ static ssize_t nullb_device_badblocks_store(struct config_item *item,
+                                    const char *page, size_t count)
+ {
+       struct nullb_device *t_dev = to_nullb_device(item);
+       char *orig, *buf, *tmp;
+       u64 start, end;
+       int ret;
+ 
+       /* Work on a private copy: the parsing below writes into the string. */
+       orig = kstrndup(page, count, GFP_KERNEL);
+       if (!orig)
+               return -ENOMEM;
+ 
+       buf = strstrip(orig);
+ 
+       ret = -EINVAL;
+       if (buf[0] != '+' && buf[0] != '-')
+               goto out;
+       /* Search after the leading sign so a '-' prefix is not the separator. */
+       tmp = strchr(&buf[1], '-');
+       if (!tmp)
+               goto out;
+       /* Split "<start>-<end>" into two NUL-terminated numbers. */
+       *tmp = '\0';
+       ret = kstrtoull(buf + 1, 0, &start);
+       if (ret)
+               goto out;
+       ret = kstrtoull(tmp + 1, 0, &end);
+       if (ret)
+               goto out;
+       ret = -EINVAL;
+       if (start > end)
+               goto out;
+       /* enable badblocks: switch shift from -1 to 0 if it is still -1 */
+       cmpxchg(&t_dev->badblocks.shift, -1, 0);
+       /*
+        * NOTE(review): end - start + 1 is computed in u64; confirm how the
+        * badblocks API treats very large range lengths.
+        */
+       if (buf[0] == '+')
+               ret = badblocks_set(&t_dev->badblocks, start,
+                       end - start + 1, 1);
+       else
+               ret = badblocks_clear(&t_dev->badblocks, start,
+                       end - start + 1);
+       if (ret == 0)
+               ret = count;
+ out:
+       kfree(orig);
+       return ret;
+ }
+ CONFIGFS_ATTR(nullb_device_, badblocks);
+ 
+ /*
+  * NULL-terminated list of the per-device configfs attributes, referenced by
+  * nullb_device_type.ct_attrs.
+  */
+ static struct configfs_attribute *nullb_device_attrs[] = {
+       &nullb_device_attr_size,
+       &nullb_device_attr_completion_nsec,
+       &nullb_device_attr_submit_queues,
+       &nullb_device_attr_home_node,
+       &nullb_device_attr_queue_mode,
+       &nullb_device_attr_blocksize,
+       &nullb_device_attr_irqmode,
+       &nullb_device_attr_hw_queue_depth,
+       &nullb_device_attr_index,
+       &nullb_device_attr_use_lightnvm,
+       &nullb_device_attr_blocking,
+       &nullb_device_attr_use_per_node_hctx,
+       &nullb_device_attr_power,
+       &nullb_device_attr_memory_backed,
+       &nullb_device_attr_discard,
+       &nullb_device_attr_mbps,
+       &nullb_device_attr_cache_size,
+       &nullb_device_attr_badblocks,
+       NULL,
+ };
+ 
+ /*
+  * configfs ->release callback: tear down the badblocks state, free the pages
+  * held in the data radix tree, then free the device itself.
+  */
+ static void nullb_device_release(struct config_item *item)
+ {
+       struct nullb_device *nullb_dev = to_nullb_device(item);
+ 
+       badblocks_exit(&nullb_dev->badblocks);
+       null_free_device_storage(nullb_dev, false);
+       null_free_dev(nullb_dev);
+ }
+ 
+ /* Item operations for a nullb device item: only ->release is provided. */
+ static struct configfs_item_operations nullb_device_ops = {
+       .release        = nullb_device_release,
+ };
+ 
+ /* configfs item type for one nullb device: its operations and attributes. */
+ static struct config_item_type nullb_device_type = {
+       .ct_item_ops    = &nullb_device_ops,
+       .ct_attrs       = nullb_device_attrs,
+       .ct_owner       = THIS_MODULE,
+ };
+ 
+ /*
+  * configfs ->make_item callback: allocate a nullb_device, initialise its
+  * embedded config_item with @name and nullb_device_type, and return that
+  * item.  Returns ERR_PTR(-ENOMEM) when the allocation fails.
+  */
+ static struct
+ config_item *nullb_group_make_item(struct config_group *group, const char *name)
+ {
+       struct nullb_device *new_dev = null_alloc_dev();
+ 
+       if (!new_dev)
+               return ERR_PTR(-ENOMEM);
+ 
+       config_item_init_type_name(&new_dev->item, name, &nullb_device_type);
+       return &new_dev->item;
+ }
+ 
+ /*
+  * configfs ->drop_item callback.  If NULLB_DEV_FL_UP was still set it is
+  * cleared, the device is marked powered off and deleted with null_del_dev()
+  * under the "lock" mutex.  The item reference is then dropped with
+  * config_item_put().
+  */
+ static void
+ nullb_group_drop_item(struct config_group *group, struct config_item *item)
+ {
+       struct nullb_device *dev = to_nullb_device(item);
+ 
+       /* The atomic test-and-clear lets only one caller perform the delete. */
+       if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
+               mutex_lock(&lock);
+               dev->power = false;
+               null_del_dev(dev->nullb);
+               mutex_unlock(&lock);
+       }
+ 
+       config_item_put(item);
+ }
+ 
+ /* configfs show callback for "features": prints a fixed feature list. */
+ static ssize_t memb_group_features_show(struct config_item *item, char *page)
+ {
+       return snprintf(page, PAGE_SIZE, "%s",
+               "memory_backed,discard,bandwidth,cache,badblocks\n");
+ }
+ 
+ CONFIGFS_ATTR_RO(memb_group_, features);
+ 
+ /* NULL-terminated attribute list of the group: the read-only "features". */
+ static struct configfs_attribute *nullb_group_attrs[] = {
+       &memb_group_attr_features,
+       NULL,
+ };
+ 
+ /* Group operations: create and drop nullb device items. */
+ static struct configfs_group_operations nullb_group_ops = {
+       .make_item      = nullb_group_make_item,
+       .drop_item      = nullb_group_drop_item,
+ };
+ 
+ /* configfs item type for the group: its operations and attributes. */
+ static struct config_item_type nullb_group_type = {
+       .ct_group_ops   = &nullb_group_ops,
+       .ct_attrs       = nullb_group_attrs,
+       .ct_owner       = THIS_MODULE,
+ };
+ 
+ /* configfs subsystem whose root group is named "nullb". */
+ static struct configfs_subsystem nullb_subsys = {
+       .su_group = {
+               .cg_item = {
+                       .ci_namebuf = "nullb",
+                       .ci_type = &nullb_group_type,
+               },
+       },
+ };
+ 
+ /* Non-zero when NULLB_DEV_FL_CACHE is set in the device flags. */
+ static inline int null_cache_active(struct nullb *nullb)
+ {
+       struct nullb_device *dev = nullb->dev;
+ 
+       return test_bit(NULLB_DEV_FL_CACHE, &dev->flags);
+ }
+ 
+ /*
+  * Allocate a nullb_device and seed its settings from the g_* defaults.
+  *
+  * The data and cache radix trees and the badblocks state are initialised
+  * here.  Returns NULL if the allocation or badblocks_init() fails.
+  */
+ static struct nullb_device *null_alloc_dev(void)
+ {
+       struct nullb_device *dev;
+ 
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return NULL;
+       INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
+       INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
+       if (badblocks_init(&dev->badblocks, 0)) {
+               kfree(dev);
+               return NULL;
+       }
+ 
+       /*
+        * g_gb is an int: widen it before multiplying so a large "gb" value
+        * cannot overflow int arithmetic on its way into dev->size.
+        */
+       dev->size = (unsigned long)g_gb * 1024;
+       dev->completion_nsec = g_completion_nsec;
+       dev->submit_queues = g_submit_queues;
+       dev->home_node = g_home_node;
+       dev->queue_mode = g_queue_mode;
+       dev->blocksize = g_bs;
+       dev->irqmode = g_irqmode;
+       dev->hw_queue_depth = g_hw_queue_depth;
+       dev->use_lightnvm = g_use_lightnvm;
+       dev->blocking = g_blocking;
+       dev->use_per_node_hctx = g_use_per_node_hctx;
+       return dev;
+ }
+ 
+ /* Free a nullb_device with kfree(); nothing else is released here. */
+ static void null_free_dev(struct nullb_device *dev)
+ {
+       kfree(dev);
+ }
+ 
   static void put_tag(struct nullb_queue *nq, unsigned int tag)
   {
         clear_bit_unlock(tag, nq->tag_map);
@@@ -193,7 -619,7 +619,7 @@@ static struct nullb_cmd *__alloc_cmd(st
                 cmd = &nq->cmds[tag];
                 cmd->tag = tag;
                 cmd->nq = nq;
-               if (irqmode == NULL_IRQ_TIMER) {
+               if (nq->dev->irqmode == NULL_IRQ_TIMER) {
                         hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
                                      HRTIMER_MODE_REL);
                         cmd->timer.function = null_cmd_timer_expired;
@@@ -229,19 -655,21 +655,21 @@@ static struct nullb_cmd *alloc_cmd(stru
   static void end_cmd(struct nullb_cmd *cmd)
   {
         struct request_queue *q = NULL;
+       int queue_mode = cmd->nq->dev->queue_mode;
   
         if (cmd->rq)
                 q = cmd->rq->q;
   
         switch (queue_mode)  {
         case NULL_Q_MQ:
-               blk_mq_end_request(cmd->rq, BLK_STS_OK);
+               blk_mq_end_request(cmd->rq, cmd->error);
                 return;
         case NULL_Q_RQ:
                 INIT_LIST_HEAD(&cmd->rq->queuelist);
-               blk_end_request_all(cmd->rq, BLK_STS_OK);
+               blk_end_request_all(cmd->rq, cmd->error);
                 break;
         case NULL_Q_BIO:
+               cmd->bio->bi_status = cmd->error;
                 bio_endio(cmd->bio);
                 break;
         }
@@@ -267,25 -695,582 +695,582 @@@ static enum hrtimer_restart null_cmd_ti
   
   static void null_cmd_end_timer(struct nullb_cmd *cmd)
   {
-       ktime_t kt = completion_nsec;
+       ktime_t kt = cmd->nq->dev->completion_nsec;
   
         hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
   }
   
   static void null_softirq_done_fn(struct request *rq)
   {
-       if (queue_mode == NULL_Q_MQ)
+       struct nullb *nullb = rq->q->queuedata;
+ 
+       if (nullb->dev->queue_mode == NULL_Q_MQ)
                 end_cmd(blk_mq_rq_to_pdu(rq));
         else
                 end_cmd(rq->special);
   }
   
- static inline void null_handle_cmd(struct nullb_cmd *cmd)
+ /*
+  * Allocate a nullb_page descriptor and its backing page (order 0) using
+  * @gfp_flags.  The bitmap starts out empty.  Returns NULL if either
+  * allocation fails.
+  */
+ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
+ {
+       struct nullb_page *np = kmalloc(sizeof(struct nullb_page), gfp_flags);
+ 
+       if (!np)
+               return NULL;
+ 
+       np->page = alloc_pages(gfp_flags, 0);
+       if (!np->page) {
+               kfree(np);
+               return NULL;
+       }
+ 
+       np->bitmap = 0;
+       return np;
+ }
+ 
+ /*
+  * Mark @t_page as NULLB_PAGE_FREE and release it, unless NULLB_PAGE_LOCK is
+  * set, in which case only the FREE mark is left behind.
+  */
+ static void null_free_page(struct nullb_page *t_page)
+ {
+       __set_bit(NULLB_PAGE_FREE, &t_page->bitmap);
+ 
+       if (!test_bit(NULLB_PAGE_LOCK, &t_page->bitmap)) {
+               __free_page(t_page->page);
+               kfree(t_page);
+       }
+ }
+ 
+ /*
+  * Clear the bit for @sector in the page that covers it (in the cache tree
+  * when @is_cache, else the data tree).  When the page's bitmap becomes
+  * empty the page is removed from the tree and freed; for the cache tree
+  * curr_cache is reduced by PAGE_SIZE.
+  */
+ static void null_free_sector(struct nullb *nullb, sector_t sector,
+       bool is_cache)
+ {
+       struct radix_tree_root *root;
+       struct nullb_page *t_page, *removed;
+       unsigned int sector_bit = (sector & SECTOR_MASK);
+       u64 idx = sector >> PAGE_SECTORS_SHIFT;
+ 
+       if (is_cache)
+               root = &nullb->dev->cache;
+       else
+               root = &nullb->dev->data;
+ 
+       t_page = radix_tree_lookup(root, idx);
+       if (!t_page)
+               return;
+ 
+       __clear_bit(sector_bit, &t_page->bitmap);
+       if (t_page->bitmap)
+               return;
+ 
+       removed = radix_tree_delete_item(root, idx, t_page);
+       WARN_ON(removed != t_page);
+       null_free_page(removed);
+       if (is_cache)
+               nullb->dev->curr_cache -= PAGE_SIZE;
+ }
+ 
+ /*
+  * Insert @t_page at @idx in the cache tree (@is_cache) or the data tree.
+  *
+  * On success the page is returned and, for the cache tree, curr_cache grows
+  * by PAGE_SIZE.  If the insert fails, @t_page is freed and whatever is
+  * found at @idx is returned instead.
+  */
+ static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
+       struct nullb_page *t_page, bool is_cache)
+ {
+       struct radix_tree_root *root;
+ 
+       if (is_cache)
+               root = &nullb->dev->cache;
+       else
+               root = &nullb->dev->data;
+ 
+       if (!radix_tree_insert(root, idx, t_page)) {
+               if (is_cache)
+                       nullb->dev->curr_cache += PAGE_SIZE;
+               return t_page;
+       }
+ 
+       /* Insert failed: drop our page and fall back to the stored entry. */
+       null_free_page(t_page);
+       t_page = radix_tree_lookup(root, idx);
+       WARN_ON(!t_page || t_page->page->index != idx);
+ 
+       return t_page;
+ }
+ 
+ /*
+  * Remove and free every page stored in the device's cache tree (@is_cache)
+  * or data tree, looking entries up in batches of FREE_BATCH.  For the cache
+  * tree curr_cache is reset to 0 afterwards.
+  */
+ static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
+ {
+       unsigned long pos = 0;
+       int nr_pages;
+       struct nullb_page *ret, *t_pages[FREE_BATCH];
+       struct radix_tree_root *root;
+ 
+       root = is_cache ? &dev->cache : &dev->data;
+ 
+       do {
+               int i;
+ 
+               nr_pages = radix_tree_gang_lookup(root,
+                               (void **)t_pages, pos, FREE_BATCH);
+ 
+               for (i = 0; i < nr_pages; i++) {
+                       /* page->index holds the tree index of this entry */
+                       pos = t_pages[i]->page->index;
+                       ret = radix_tree_delete_item(root, pos, t_pages[i]);
+                       WARN_ON(ret != t_pages[i]);
+                       null_free_page(ret);
+               }
+ 
+               /* Resume the next lookup just past the last index handled. */
+               pos++;
+       } while (nr_pages == FREE_BATCH);
+ 
+       if (is_cache)
+               dev->curr_cache = 0;
+ }
+ 
+ /*
+  * Look up the page covering @sector in the cache tree (@is_cache) or the
+  * data tree.  For reads (@for_write false) the page is only returned if the
+  * bit for @sector is set in its bitmap; for writes any existing page is
+  * returned.  Returns NULL otherwise.
+  */
+ static struct nullb_page *__null_lookup_page(struct nullb *nullb,
+       sector_t sector, bool for_write, bool is_cache)
+ {
+       struct radix_tree_root *root;
+       struct nullb_page *t_page;
+       unsigned int sector_bit = (sector & SECTOR_MASK);
+       u64 idx = sector >> PAGE_SECTORS_SHIFT;
+ 
+       if (is_cache)
+               root = &nullb->dev->cache;
+       else
+               root = &nullb->dev->data;
+ 
+       t_page = radix_tree_lookup(root, idx);
+       WARN_ON(t_page && t_page->page->index != idx);
+ 
+       if (!t_page)
+               return NULL;
+       if (for_write || test_bit(sector_bit, &t_page->bitmap))
+               return t_page;
+ 
+       return NULL;
+ }
+ 
+ /*
+  * Look up the page for @sector, trying the cache tree first unless
+  * @ignore_cache is set, then falling back to the data tree.
+  */
+ static struct nullb_page *null_lookup_page(struct nullb *nullb,
+       sector_t sector, bool for_write, bool ignore_cache)
+ {
+       if (!ignore_cache) {
+               struct nullb_page *cached;
+ 
+               cached = __null_lookup_page(nullb, sector, for_write, true);
+               if (cached)
+                       return cached;
+       }
+ 
+       return __null_lookup_page(nullb, sector, for_write, false);
+ }
+ 
+ /*
+  * Return the page covering @sector, allocating and inserting one if none
+  * exists yet.  With @ignore_cache the data tree is used, otherwise the cache
+  * tree is the insert target.
+  *
+  * nullb->lock is released around the allocations and re-taken before
+  * returning, so the function is entered and left with the lock held.  May
+  * return NULL when allocation fails and no page is found afterwards.
+  */
+ static struct nullb_page *null_insert_page(struct nullb *nullb,
+       sector_t sector, bool ignore_cache)
+ {
+       u64 idx;
+       struct nullb_page *t_page;
+ 
+       t_page = null_lookup_page(nullb, sector, true, ignore_cache);
+       if (t_page)
+               return t_page;
+ 
+       /* Drop the spinlock: the GFP_NOIO allocations below are done unlocked. */
+       spin_unlock_irq(&nullb->lock);
+ 
+       t_page = null_alloc_page(GFP_NOIO);
+       if (!t_page)
+               goto out_lock;
+ 
+       if (radix_tree_preload(GFP_NOIO))
+               goto out_freepage;
+ 
+       spin_lock_irq(&nullb->lock);
+       idx = sector >> PAGE_SECTORS_SHIFT;
+       t_page->page->index = idx;
+       /*
+        * If the slot was filled while the lock was dropped, the helper frees
+        * our page and hands back the entry already stored there.
+        */
+       t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
+       radix_tree_preload_end();
+ 
+       return t_page;
+ out_freepage:
+       null_free_page(t_page);
+ out_lock:
+       /* Allocation failed: retake the lock and retry the lookup once. */
+       spin_lock_irq(&nullb->lock);
+       return null_lookup_page(nullb, sector, true, ignore_cache);
+ }
+ 
+ /*
+  * Write one cache page back into the data tree and drop it from the cache.
+  *
+  * @c_page has NULLB_PAGE_LOCK set by the caller; the bit is cleared here.
+  * Returns 0 on success (including when the cache page had been freed in the
+  * meantime) or -ENOMEM when no data page could be obtained.
+  */
+ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
+ {
+       int i;
+       unsigned int offset;
+       u64 idx;
+       struct nullb_page *t_page, *ret;
+       void *dst, *src;
+ 
+       idx = c_page->page->index;
+ 
+       /* ignore_cache == true: find or create the page in the data tree. */
+       t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
+ 
+       __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap);
+       /*
+        * null_free_page() only marks a locked page as FREE; if that happened,
+        * finish the free now and discard a data page that holds no sectors.
+        */
+       if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) {
+               null_free_page(c_page);
+               if (t_page && t_page->bitmap == 0) {
+                       ret = radix_tree_delete_item(&nullb->dev->data,
+                               idx, t_page);
+                       null_free_page(t_page);
+               }
+               return 0;
+       }
+ 
+       if (!t_page)
+               return -ENOMEM;
+ 
+       src = kmap_atomic(c_page->page);
+       dst = kmap_atomic(t_page->page);
+ 
+       /* Copy each block whose bit is set in the cache page's bitmap. */
+       for (i = 0; i < PAGE_SECTORS;
+                       i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
+               if (test_bit(i, &c_page->bitmap)) {
+                       offset = (i << SECTOR_SHIFT);
+                       memcpy(dst + offset, src + offset,
+                               nullb->dev->blocksize);
+                       __set_bit(i, &t_page->bitmap);
+               }
+       }
+ 
+       kunmap_atomic(dst);
+       kunmap_atomic(src);
+ 
+       ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
+       null_free_page(ret);
+       nullb->dev->curr_cache -= PAGE_SIZE;
+ 
+       return 0;
+ }
+ 
+ /*
+  * Flush cache pages to the data tree until at least @n bytes have been
+  * flushed, or until the cache already has room for @n more bytes (or is
+  * empty).  cache_size is taken in MiB (it is scaled by 1024 * 1024 below).
+  *
+  * Returns 0 on success or the first error from null_flush_cache_page().
+  */
+ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
   {
+       int i, err, nr_pages;
+       struct nullb_page *c_pages[FREE_BATCH];
+       unsigned long flushed = 0, one_round;
+ 
+ again:
+       if ((nullb->dev->cache_size * 1024 * 1024) >
+            nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
+               return 0;
+ 
+       nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
+                       (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
+       /*
+        * null_flush_cache_page() could drop the lock before it is done with
+        * the c_pages. To avoid a race, mark the pages locked so that they
+        * cannot be freed meanwhile.
+        */
+       for (i = 0; i < nr_pages; i++) {
+               nullb->cache_flush_pos = c_pages[i]->page->index;
+               /*
+                * We found the page which is being flushed to disk by other
+                * threads
+                */
+               if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap))
+                       c_pages[i] = NULL;
+               else
+                       __set_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap);
+       }
+ 
+       one_round = 0;
+       for (i = 0; i < nr_pages; i++) {
+               if (c_pages[i] == NULL)
+                       continue;
+               err = null_flush_cache_page(nullb, c_pages[i]);
+               if (err)
+                       return err;
+               one_round++;
+       }
+       flushed += one_round << PAGE_SHIFT;
+ 
+       if (n > flushed) {
+               /* Nothing found from the current position: wrap to the start. */
+               if (nr_pages == 0)
+                       nullb->cache_flush_pos = 0;
+               if (one_round == 0) {
+                       /* give other threads a chance */
+                       spin_unlock_irq(&nullb->lock);
+                       spin_lock_irq(&nullb->lock);
+               }
+               goto again;
+       }
+       return 0;
+ }
+ 
+ /*
+  * Copy @n bytes from @source (starting at byte offset @off) into the device
+  * store beginning at @sector, at most one device block per iteration.
+  *
+  * Writes go to the cache tree when the cache is active and @is_fua is
+  * false, otherwise straight to the data tree.  Returns 0 on success or
+  * -ENOSPC when no backing page could be obtained.
+  */
+ static int copy_to_nullb(struct nullb *nullb, struct page *source,
+       unsigned int off, sector_t sector, size_t n, bool is_fua)
+ {
+       size_t temp, count = 0;
+       unsigned int offset;
+       struct nullb_page *t_page;
+       void *dst, *src;
+ 
+       while (count < n) {
+               temp = min_t(size_t, nullb->dev->blocksize, n - count);
+ 
+               /*
+                * NOTE(review): the return value of null_make_cache_space() is
+                * ignored here - confirm that this is intentional.
+                */
+               if (null_cache_active(nullb) && !is_fua)
+                       null_make_cache_space(nullb, PAGE_SIZE);
+ 
+               /* Byte offset of this sector within its backing page. */
+               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+               t_page = null_insert_page(nullb, sector,
+                       !null_cache_active(nullb) || is_fua);
+               if (!t_page)
+                       return -ENOSPC;
+ 
+               src = kmap_atomic(source);
+               dst = kmap_atomic(t_page->page);
+               memcpy(dst + offset, src + off + count, temp);
+               kunmap_atomic(dst);
+               kunmap_atomic(src);
+ 
+               __set_bit(sector & SECTOR_MASK, &t_page->bitmap);
+ 
+               /* A FUA write went to the data tree: drop any cached copy. */
+               if (is_fua)
+                       null_free_sector(nullb, sector, true);
+ 
+               count += temp;
+               sector += temp >> SECTOR_SHIFT;
+       }
+       return 0;
+ }
+ 
+ /*
+  * Copy @n bytes from the device store, beginning at @sector, into @dest at
+  * byte offset @off, at most one device block per iteration.  Blocks with no
+  * stored page (or whose sector bit is not set) read back as zeroes.
+  * Always returns 0.
+  */
+ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+       unsigned int off, sector_t sector, size_t n)
+ {
+       size_t temp, count = 0;
+       unsigned int offset;
+       struct nullb_page *t_page;
+       void *dst, *src;
+ 
+       while (count < n) {
+               temp = min_t(size_t, nullb->dev->blocksize, n - count);
+ 
+               /* Byte offset of this sector within its backing page. */
+               offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+               t_page = null_lookup_page(nullb, sector, false,
+                       !null_cache_active(nullb));
+ 
+               dst = kmap_atomic(dest);
+               if (!t_page) {
+                       memset(dst + off + count, 0, temp);
+                       goto next;
+               }
+               src = kmap_atomic(t_page->page);
+               memcpy(dst + off + count, src + offset, temp);
+               kunmap_atomic(src);
+ next:
+               kunmap_atomic(dst);
+ 
+               count += temp;
+               sector += temp >> SECTOR_SHIFT;
+       }
+       return 0;
+ }
+ 
+ /*
+  * Discard @n bytes starting at @sector: under nullb->lock, free each block's
+  * sector from the data tree and, when the cache is active, from the cache
+  * tree as well.
+  */
+ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
+ {
+       size_t chunk;
+ 
+       spin_lock_irq(&nullb->lock);
+       for (; n > 0; n -= chunk) {
+               chunk = min_t(size_t, n, nullb->dev->blocksize);
+ 
+               null_free_sector(nullb, sector, false);
+               if (null_cache_active(nullb))
+                       null_free_sector(nullb, sector, true);
+ 
+               sector += chunk >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+ }
+ 
+ /*
+  * Flush the whole cache to the data tree.  A no-op returning 0 when the
+  * cache is not active; otherwise null_make_cache_space() is repeated under
+  * nullb->lock until it fails or curr_cache reaches 0.  Returns the last
+  * null_make_cache_space() result.
+  */
+ static int null_handle_flush(struct nullb *nullb)
+ {
+       int err;
+ 
+       if (!null_cache_active(nullb))
+               return 0;
+ 
+       spin_lock_irq(&nullb->lock);
+       do {
+               err = null_make_cache_space(nullb,
+                       nullb->dev->cache_size * 1024 * 1024);
+       } while (!err && nullb->dev->curr_cache != 0);
+ 
+       WARN_ON(!radix_tree_empty(&nullb->dev->cache));
+       spin_unlock_irq(&nullb->lock);
+ 
+       return err;
+ }
+ 
+ /*
+  * Move @len bytes between @page (at byte offset @off) and the device store
+  * at @sector.  Writes flush the dcache for @page before copying in; reads
+  * flush it after copying out.  Returns the copy helper's result.
+  */
+ static int null_transfer(struct nullb *nullb, struct page *page,
+       unsigned int len, unsigned int off, bool is_write, sector_t sector,
+       bool is_fua)
+ {
+       int err;
+ 
+       if (is_write) {
+               flush_dcache_page(page);
+               return copy_to_nullb(nullb, page, off, sector, len, is_fua);
+       }
+ 
+       err = copy_from_nullb(nullb, page, off, sector, len);
+       flush_dcache_page(page);
+ 
+       return err;
+ }
+ 
+ /*
+  * Service one request against the memory-backed store.
+  *
+  * Discards are handed to null_handle_discard(); everything else is walked
+  * segment by segment under nullb->lock and passed to null_transfer().
+  * Returns 0 on success or the first null_transfer() error.
+  */
+ static int null_handle_rq(struct nullb_cmd *cmd)
+ {
+       struct request *rq = cmd->rq;
+       struct nullb *nullb = cmd->nq->dev->nullb;
+       int err;
+       unsigned int len;
+       sector_t sector;
+       struct req_iterator iter;
+       struct bio_vec bvec;
+ 
+       sector = blk_rq_pos(rq);
+ 
+       if (req_op(rq) == REQ_OP_DISCARD) {
+               null_handle_discard(nullb, sector, blk_rq_bytes(rq));
+               return 0;
+       }
+ 
+       spin_lock_irq(&nullb->lock);
+       rq_for_each_segment(bvec, rq, iter) {
+               len = bvec.bv_len;
+               /*
+                * REQ_FUA is a flag bit in rq->cmd_flags; req_op() masks the
+                * flag bits off, so FUA has to be tested on cmd_flags itself.
+                */
+               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+                                    op_is_write(req_op(rq)), sector,
+                                    rq->cmd_flags & REQ_FUA);
+               if (err) {
+                       spin_unlock_irq(&nullb->lock);
+                       return err;
+               }
+               sector += len >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+ 
+       return 0;
+ }
+ 
+ /*
+  * Service one bio against the memory-backed store.
+  *
+  * Discards are handed to null_handle_discard(); everything else is walked
+  * segment by segment under nullb->lock and passed to null_transfer().
+  * Returns 0 on success or the first null_transfer() error.
+  */
+ static int null_handle_bio(struct nullb_cmd *cmd)
+ {
+       struct bio *bio = cmd->bio;
+       struct nullb *nullb = cmd->nq->dev->nullb;
+       int err;
+       unsigned int len;
+       sector_t sector;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
+ 
+       sector = bio->bi_iter.bi_sector;
+ 
+       if (bio_op(bio) == REQ_OP_DISCARD) {
+               null_handle_discard(nullb, sector,
+                       bio_sectors(bio) << SECTOR_SHIFT);
+               return 0;
+       }
+ 
+       spin_lock_irq(&nullb->lock);
+       bio_for_each_segment(bvec, bio, iter) {
+               len = bvec.bv_len;
+               /*
+                * REQ_FUA is a flag bit in bio->bi_opf; bio_op() masks the
+                * flag bits off, so FUA has to be tested on bi_opf itself.
+                */
+               err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+                                    op_is_write(bio_op(bio)), sector,
+                                    bio->bi_opf & REQ_FUA);
+               if (err) {
+                       spin_unlock_irq(&nullb->lock);
+                       return err;
+               }
+               sector += len >> SECTOR_SHIFT;
+       }
+       spin_unlock_irq(&nullb->lock);
+       return 0;
+ }
+ 
+ /*
+  * Stop the device's request queue: blk_mq_stop_hw_queues() in NULL_Q_MQ
+  * mode, otherwise blk_stop_queue() with q->queue_lock held.
+  */
+ static void null_stop_queue(struct nullb *nullb)
+ {
+       struct request_queue *q = nullb->q;
+ 
+       if (nullb->dev->queue_mode == NULL_Q_MQ) {
+               blk_mq_stop_hw_queues(q);
+               return;
+       }
+ 
+       spin_lock_irq(q->queue_lock);
+       blk_stop_queue(q);
+       spin_unlock_irq(q->queue_lock);
+ }
+ 
+ /*
+  * Restart the device's request queue asynchronously:
+  * blk_mq_start_stopped_hw_queues(q, true) in NULL_Q_MQ mode, otherwise
+  * blk_start_queue_async() with q->queue_lock held (irqsave variant).
+  */
+ static void null_restart_queue_async(struct nullb *nullb)
+ {
+       struct request_queue *q = nullb->q;
+       unsigned long irq_flags;
+ 
+       if (nullb->dev->queue_mode == NULL_Q_MQ) {
+               blk_mq_start_stopped_hw_queues(q, true);
+               return;
+       }
+ 
+       spin_lock_irqsave(q->queue_lock, irq_flags);
+       blk_start_queue_async(q);
+       spin_unlock_irqrestore(q->queue_lock, irq_flags);
+ }
+ 
+ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
+ {
+       struct nullb_device *dev = cmd->nq->dev;
+       struct nullb *nullb = dev->nullb;
+       int err = 0;
+ 
+       if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
+               struct request *rq = cmd->rq;
+ 
+               if (!hrtimer_active(&nullb->bw_timer))
+                       hrtimer_restart(&nullb->bw_timer);
+ 
+               if (atomic_long_sub_return(blk_rq_bytes(rq),
+                               &nullb->cur_bytes) < 0) {
+                       null_stop_queue(nullb);
+                       /* race with timer */
+                       if (atomic_long_read(&nullb->cur_bytes) > 0)
+                               null_restart_queue_async(nullb);
+                       if (dev->queue_mode == NULL_Q_RQ) {
+                               struct request_queue *q = nullb->q;
+ 
+                               spin_lock_irq(q->queue_lock);
+                               rq->rq_flags |= RQF_DONTPREP;
+                               blk_requeue_request(q, rq);
+                               spin_unlock_irq(q->queue_lock);
+                               return BLK_STS_OK;
+                       } else
+                               /* requeue request */
+                               return BLK_STS_RESOURCE;
+               }
+       }
+ 
+       if (nullb->dev->badblocks.shift != -1) {
+               int bad_sectors;
+               sector_t sector, size, first_bad;
+               bool is_flush = true;
+ 
+               if (dev->queue_mode == NULL_Q_BIO &&
+                               bio_op(cmd->bio) != REQ_OP_FLUSH) {
+                       is_flush = false;
+                       sector = cmd->bio->bi_iter.bi_sector;
+                       size = bio_sectors(cmd->bio);
+               }
+               if (dev->queue_mode != NULL_Q_BIO &&
+                               req_op(cmd->rq) != REQ_OP_FLUSH) {
+                       is_flush = false;
+                       sector = blk_rq_pos(cmd->rq);
+                       size = blk_rq_sectors(cmd->rq);
+               }
+               if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
+                               size, &first_bad, &bad_sectors)) {
+                       cmd->error = BLK_STS_IOERR;
+                       goto out;
+               }
+       }
+ 
+       if (dev->memory_backed) {
+               if (dev->queue_mode == NULL_Q_BIO) {
+                       if (bio_op(cmd->bio) == REQ_OP_FLUSH)
+                               err = null_handle_flush(nullb);
+                       else
+                               err = null_handle_bio(cmd);
+               } else {
+                       if (req_op(cmd->rq) == REQ_OP_FLUSH)
+                               err = null_handle_flush(nullb);
+                       else
+                               err = null_handle_rq(cmd);
+               }
+       }
+       cmd->error = errno_to_blk_status(err);
+ out:
         /* Complete IO by inline, softirq or timer */
-       switch (irqmode) {
+       switch (dev->irqmode) {
         case NULL_IRQ_SOFTIRQ:
-               switch (queue_mode)  {
+               switch (dev->queue_mode)  {
                 case NULL_Q_MQ:
                         blk_mq_complete_request(cmd->rq);
                         break;
@@@ -307,6 -1292,34 +1292,34 @@@
                 null_cmd_end_timer(cmd);
                 break;
         }
+       return BLK_STS_OK;
+ }
+ 
+ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
+ {
+       struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
+       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+       unsigned int mbps = nullb->dev->mbps;
+ 
+       if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
+               return HRTIMER_NORESTART;
+ 
+       atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
+       null_restart_queue_async(nullb);
+ 
+       hrtimer_forward_now(&nullb->bw_timer, timer_interval);
+ 
+       return HRTIMER_RESTART;
+ }
+ 
+ static void nullb_setup_bwtimer(struct nullb *nullb)
+ {
+       ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+ 
+       hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       nullb->bw_timer.function = nullb_bwtimer_fn;
+       atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
+       hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
   }
   
   static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
@@@ -366,20 -1379,20 +1379,20 @@@ static blk_status_t null_queue_rq(struc
                          const struct blk_mq_queue_data *bd)
   {
         struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+       struct nullb_queue *nq = hctx->driver_data;
   
         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
   
-       if (irqmode == NULL_IRQ_TIMER) {
+       if (nq->dev->irqmode == NULL_IRQ_TIMER) {
                 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                 cmd->timer.function = null_cmd_timer_expired;
         }
         cmd->rq = bd->rq;
-       cmd->nq = hctx->driver_data;
+       cmd->nq = nq;
   
         blk_mq_start_request(bd->rq);
   
-       null_handle_cmd(cmd);
-       return BLK_STS_OK;
+       return null_handle_cmd(cmd);
   }
   
   static const struct blk_mq_ops null_mq_ops = {
@@@ -438,7 -1451,8 +1451,8 @@@ static int null_lnvm_submit_io(struct n
   
   static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
   {
-       sector_t size = gb * 1024 * 1024 * 1024ULL;
+       struct nullb *nullb = dev->q->queuedata;
+       sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
         sector_t blksize;
         struct nvm_id_group *grp;
   
@@@ -460,7 -1474,7 +1474,7 @@@
         id->ppaf.ch_offset = 56;
         id->ppaf.ch_len = 8;
   
-       sector_div(size, bs); /* convert size to pages */
+       sector_div(size, nullb->dev->blocksize); /* convert size to pages */
         size >>= 8; /* concert size to pgs pr blk */
         grp = &id->grp;
         grp->mtype = 0;
@@@ -474,8 -1488,8 +1488,8 @@@
         grp->num_blk = blksize;
         grp->num_pln = 1;
   
-       grp->fpg_sz = bs;
-       grp->csecs = bs;
+       grp->fpg_sz = nullb->dev->blocksize;
+       grp->csecs = nullb->dev->blocksize;
         grp->trdt = 25000;
         grp->trdm = 25000;
         grp->tprt = 500000;
@@@ -483,7 -1497,7 +1497,7 @@@
         grp->tbet = 1500000;
         grp->tbem = 1500000;
         grp->mpos = 0x010101; /* single plane rwe */
-       grp->cpar = hw_queue_depth;
+       grp->cpar = nullb->dev->hw_queue_depth;
   
         return 0;
   }
@@@ -568,19 -1582,44 +1582,44 @@@ static void null_nvm_unregister(struct 
   
   static void null_del_dev(struct nullb *nullb)
   {
+       struct nullb_device *dev = nullb->dev;
+ 
+       ida_simple_remove(&nullb_indexes, nullb->index);
+ 
         list_del_init(&nullb->list);
   
-       if (use_lightnvm)
+       if (dev->use_lightnvm)
                 null_nvm_unregister(nullb);
         else
                 del_gendisk(nullb->disk);
+ 
+       if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
+               hrtimer_cancel(&nullb->bw_timer);
+               atomic_long_set(&nullb->cur_bytes, LONG_MAX);
+               null_restart_queue_async(nullb);
+       }
+ 
         blk_cleanup_queue(nullb->q);
-       if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+       if (dev->queue_mode == NULL_Q_MQ &&
+           nullb->tag_set == &nullb->__tag_set)
                 blk_mq_free_tag_set(nullb->tag_set);
-       if (!use_lightnvm)
+       if (!dev->use_lightnvm)
                 put_disk(nullb->disk);
         cleanup_queues(nullb);
+       if (null_cache_active(nullb))
+               null_free_device_storage(nullb->dev, true);
         kfree(nullb);
+       dev->nullb = NULL;
+ }
+ 
+ static void null_config_discard(struct nullb *nullb)
+ {
+       if (nullb->dev->discard == false)
+               return;
+       nullb->q->limits.discard_granularity = nullb->dev->blocksize;
+       nullb->q->limits.discard_alignment = nullb->dev->blocksize;
+       blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q);
   }
   
   static int null_open(struct block_device *bdev, fmode_t mode)
@@@ -605,6 -1644,7 +1644,7 @@@ static void null_init_queue(struct null
   
         init_waitqueue_head(&nq->wait);
         nq->queue_depth = nullb->queue_depth;
+       nq->dev = nullb->dev;
   }
   
   static void null_init_queues(struct nullb *nullb)
@@@ -652,13 -1692,13 +1692,13 @@@ static int setup_commands(struct nullb_
   
   static int setup_queues(struct nullb *nullb)
   {
-       nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
-                                                               GFP_KERNEL);
+       nullb->queues = kzalloc(nullb->dev->submit_queues *
+               sizeof(struct nullb_queue), GFP_KERNEL);
         if (!nullb->queues)
                 return -ENOMEM;
   
         nullb->nr_queues = 0;
-       nullb->queue_depth = hw_queue_depth;
+       nullb->queue_depth = nullb->dev->hw_queue_depth;
   
         return 0;
   }
@@@ -668,7 -1708,7 +1708,7 @@@ static int init_driver_queues(struct nu
         struct nullb_queue *nq;
         int i, ret = 0;
   
-       for (i = 0; i < submit_queues; i++) {
+       for (i = 0; i < nullb->dev->submit_queues; i++) {
                 nq = &nullb->queues[i];
   
                 null_init_queue(nullb, nq);
@@@ -686,10 -1726,10 +1726,10 @@@ static int null_gendisk_register(struc
         struct gendisk *disk;
         sector_t size;
   
-       disk = nullb->disk = alloc_disk_node(1, home_node);
+       disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
         if (!disk)
                 return -ENOMEM;
-       size = gb * 1024 * 1024 * 1024ULL;
+       size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
         set_capacity(disk, size >> 9);
   
         disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
@@@ -704,49 -1744,86 +1744,86 @@@
         return 0;
   }
   
- static int null_init_tag_set(struct blk_mq_tag_set *set)
+ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
   {
         set->ops = &null_mq_ops;
-       set->nr_hw_queues = submit_queues;
-       set->queue_depth = hw_queue_depth;
-       set->numa_node = home_node;
+       set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
+                                               g_submit_queues;
+       set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
+                                               g_hw_queue_depth;
+       set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
         set->cmd_size   = sizeof(struct nullb_cmd);
         set->flags = BLK_MQ_F_SHOULD_MERGE;
         set->driver_data = NULL;
   
-       if (blocking)
+       if ((nullb && nullb->dev->blocking) || g_blocking)
                 set->flags |= BLK_MQ_F_BLOCKING;
   
         return blk_mq_alloc_tag_set(set);
   }
   
- static int null_add_dev(void)
+ static void null_validate_conf(struct nullb_device *dev)
+ {
+       dev->blocksize = round_down(dev->blocksize, 512);
+       dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
+       if (dev->use_lightnvm && dev->blocksize != 4096)
+               dev->blocksize = 4096;
+ 
+       if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
+               dev->queue_mode = NULL_Q_MQ;
+ 
+       if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
+               if (dev->submit_queues != nr_online_nodes)
+                       dev->submit_queues = nr_online_nodes;
+       } else if (dev->submit_queues > nr_cpu_ids)
+               dev->submit_queues = nr_cpu_ids;
+       else if (dev->submit_queues == 0)
+               dev->submit_queues = 1;
+ 
+       dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
+       dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+ 
+       /* Do memory allocation, so set blocking */
+       if (dev->memory_backed)
+               dev->blocking = true;
+       else /* cache is meaningless */
+               dev->cache_size = 0;
+       dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
+                                               dev->cache_size);
+       dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
+       /* can not stop a queue */
+       if (dev->queue_mode == NULL_Q_BIO)
+               dev->mbps = 0;
+ }
+ 
+ static int null_add_dev(struct nullb_device *dev)
   {
         struct nullb *nullb;
         int rv;
   
-       nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
+       null_validate_conf(dev);
+ 
+       nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
         if (!nullb) {
                 rv = -ENOMEM;
                 goto out;
         }
+       nullb->dev = dev;
+       dev->nullb = nullb;
   
         spin_lock_init(&nullb->lock);
   
-       if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
-               submit_queues = nr_online_nodes;
- 
         rv = setup_queues(nullb);
         if (rv)
                 goto out_free_nullb;
   
-       if (queue_mode == NULL_Q_MQ) {
+       if (dev->queue_mode == NULL_Q_MQ) {
                 if (shared_tags) {
                         nullb->tag_set = &tag_set;
                         rv = 0;
                 } else {
                         nullb->tag_set = &nullb->__tag_set;
-                       rv = null_init_tag_set(nullb->tag_set);
+                       rv = null_init_tag_set(nullb, nullb->tag_set);
                 }
   
                 if (rv)
@@@ -758,8 -1835,8 +1835,8 @@@
                         goto out_cleanup_tags;
                 }
                 null_init_queues(nullb);
-       } else if (queue_mode == NULL_Q_BIO) {
-               nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+       } else if (dev->queue_mode == NULL_Q_BIO) {
+               nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
                 if (!nullb->q) {
                         rv = -ENOMEM;
                         goto out_cleanup_queues;
@@@ -769,7 -1846,8 +1846,8 @@@
                 if (rv)
                         goto out_cleanup_blk_queue;
         } else {
-               nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+               nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock,
+                                               dev->home_node);
                 if (!nullb->q) {
                         rv = -ENOMEM;
                         goto out_cleanup_queues;
@@@ -781,20 -1859,34 +1859,34 @@@
                         goto out_cleanup_blk_queue;
         }
   
+       if (dev->mbps) {
+               set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
+               nullb_setup_bwtimer(nullb);
+       }
+ 
+       if (dev->cache_size > 0) {
+               set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+               blk_queue_write_cache(nullb->q, true, true);
+               blk_queue_flush_queueable(nullb->q, true);
+       }
+ 
         nullb->q->queuedata = nullb;
         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
         queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
   
         mutex_lock(&lock);
-       nullb->index = nullb_indexes++;
+       nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+       dev->index = nullb->index;
         mutex_unlock(&lock);
   
-       blk_queue_logical_block_size(nullb->q, bs);
-       blk_queue_physical_block_size(nullb->q, bs);
+       blk_queue_logical_block_size(nullb->q, dev->blocksize);
+       blk_queue_physical_block_size(nullb->q, dev->blocksize);
+ 
+       null_config_discard(nullb);
   
         sprintf(nullb->disk_name, "nullb%d", nullb->index);
   
-       if (use_lightnvm)
+       if (dev->use_lightnvm)
                 rv = null_nvm_register(nullb);
         else
                 rv = null_gendisk_register(nullb);
@@@ -810,7 -1902,7 +1902,7 @@@
   out_cleanup_blk_queue:
         blk_cleanup_queue(nullb->q);
   out_cleanup_tags:
-       if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+       if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
                 blk_mq_free_tag_set(nullb->tag_set);
   out_cleanup_queues:
         cleanup_queues(nullb);
@@@ -825,51 -1917,63 +1917,63 @@@ static int __init null_init(void
         int ret = 0;
         unsigned int i;
         struct nullb *nullb;
+       struct nullb_device *dev;
+ 
+       /* check for nullb_page.bitmap */
+       if (sizeof(unsigned long) * 8 - 2 < (PAGE_SIZE >> SECTOR_SHIFT))
+               return -EINVAL;
   
-       if (bs > PAGE_SIZE) {
+       if (g_bs > PAGE_SIZE) {
                 pr_warn("null_blk: invalid block size\n");
                 pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
-               bs = PAGE_SIZE;
+               g_bs = PAGE_SIZE;
         }
   
-       if (use_lightnvm && bs != 4096) {
+       if (g_use_lightnvm && g_bs != 4096) {
                 pr_warn("null_blk: LightNVM only supports 4k block size\n");
                 pr_warn("null_blk: defaults block size to 4k\n");
-               bs = 4096;
+               g_bs = 4096;
         }
   
-       if (use_lightnvm && queue_mode != NULL_Q_MQ) {
+       if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
                 pr_warn("null_blk: LightNVM only supported for blk-mq\n");
                 pr_warn("null_blk: defaults queue mode to blk-mq\n");
-               queue_mode = NULL_Q_MQ;
+               g_queue_mode = NULL_Q_MQ;
         }
   
-       if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
-               if (submit_queues < nr_online_nodes) {
-                       pr_warn("null_blk: submit_queues param is set to %u.",
+       if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+               if (g_submit_queues != nr_online_nodes) {
+                       pr_warn("null_blk: submit_queues param is set to %u.\n",
                                                         nr_online_nodes);
-                       submit_queues = nr_online_nodes;
+                       g_submit_queues = nr_online_nodes;
                 }
-       } else if (submit_queues > nr_cpu_ids)
-               submit_queues = nr_cpu_ids;
-       else if (!submit_queues)
-               submit_queues = 1;
+       } else if (g_submit_queues > nr_cpu_ids)
+               g_submit_queues = nr_cpu_ids;
+       else if (g_submit_queues <= 0)
+               g_submit_queues = 1;
   
-       if (queue_mode == NULL_Q_MQ && shared_tags) {
-               ret = null_init_tag_set(&tag_set);
+       if (g_queue_mode == NULL_Q_MQ && shared_tags) {
+               ret = null_init_tag_set(NULL, &tag_set);
                 if (ret)
                         return ret;
         }
   
+       config_group_init(&nullb_subsys.su_group);
+       mutex_init(&nullb_subsys.su_mutex);
+ 
+       ret = configfs_register_subsystem(&nullb_subsys);
+       if (ret)
+               goto err_tagset;
+ 
         mutex_init(&lock);
   
         null_major = register_blkdev(0, "nullb");
         if (null_major < 0) {
                 ret = null_major;
-               goto err_tagset;
+               goto err_conf;
         }
   
-       if (use_lightnvm) {
+       if (g_use_lightnvm) {
                 ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
                                                                 0, 0, NULL);
                 if (!ppa_cache) {
@@@ -880,9 -1984,14 +1984,14 @@@
         }
   
         for (i = 0; i < nr_devices; i++) {
-               ret = null_add_dev();
-               if (ret)
+               dev = null_alloc_dev();
+               if (!dev)
+                       goto err_dev;
+               ret = null_add_dev(dev);
+               if (ret) {
+                       null_free_dev(dev);
                         goto err_dev;
+               }
         }
   
         pr_info("null: module loaded\n");
@@@ -891,13 -2000,17 +2000,17 @@@
   err_dev:
         while (!list_empty(&nullb_list)) {
                 nullb = list_entry(nullb_list.next, struct nullb, list);
+               dev = nullb->dev;
                 null_del_dev(nullb);
+               null_free_dev(dev);
         }
         kmem_cache_destroy(ppa_cache);
   err_ppa:
         unregister_blkdev(null_major, "nullb");
+ err_conf:
+       configfs_unregister_subsystem(&nullb_subsys);
   err_tagset:
-       if (queue_mode == NULL_Q_MQ && shared_tags)
+       if (g_queue_mode == NULL_Q_MQ && shared_tags)
                 blk_mq_free_tag_set(&tag_set);
         return ret;
   }
@@@ -906,16 -2019,22 +2019,22 @@@ static void __exit null_exit(void
   {
         struct nullb *nullb;
   
+       configfs_unregister_subsystem(&nullb_subsys);
+ 
         unregister_blkdev(null_major, "nullb");
   
         mutex_lock(&lock);
         while (!list_empty(&nullb_list)) {
+               struct nullb_device *dev;
+ 
                 nullb = list_entry(nullb_list.next, struct nullb, list);
+               dev = nullb->dev;
                 null_del_dev(nullb);
+               null_free_dev(dev);
         }
         mutex_unlock(&lock);
   
-       if (queue_mode == NULL_Q_MQ && shared_tags)
+       if (g_queue_mode == NULL_Q_MQ && shared_tags)
                 blk_mq_free_tag_set(&tag_set);
   
         kmem_cache_destroy(ppa_cache);
@@@ -924,5 -2043,5 +2043,5 @@@
   module_init(null_init);
   module_exit(null_exit);
   
- MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
+ MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
   MODULE_LICENSE("GPL");
diff --combined drivers/block/virtio_blk.c

index d3d5523,0ba1eb9..34e17ee
--- 1/drivers/block/virtio_blk.c
--- 2/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@@ -265,7 -265,7 +265,7 @@@ static blk_status_t virtio_queue_rq(str
         }
   
         spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
-       if (req_op(req) == REQ_OP_SCSI_IN || req_op(req) == REQ_OP_SCSI_OUT)
+       if (blk_rq_is_scsi(req))
                 err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num);
         else
                 err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
@@@ -381,7 -381,6 +381,7 @@@ static void virtblk_config_changed_work
         struct request_queue *q = vblk->disk->queue;
         char cap_str_2[10], cap_str_10[10];
         char *envp[] = { "RESIZE=1", NULL };
+ +      unsigned long long nblocks;
         u64 capacity;
   
         /* Host must always specify the capacity. */
@@@ -394,19 -393,16 +394,19 @@@
                 capacity = (sector_t)-1;
         }
   
- -      string_get_size(capacity, queue_logical_block_size(q),
+ +      nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
+ +
+ +      string_get_size(nblocks, queue_logical_block_size(q),
                         STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
- -      string_get_size(capacity, queue_logical_block_size(q),
+ +      string_get_size(nblocks, queue_logical_block_size(q),
                         STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
   
         dev_notice(&vdev->dev,
- -                "new size: %llu %d-byte logical blocks (%s/%s)\n",
- -                (unsigned long long)capacity,
- -                queue_logical_block_size(q),
- -                cap_str_10, cap_str_2);
+ +                 "new size: %llu %d-byte logical blocks (%s/%s)\n",
+ +                 nblocks,
+ +                 queue_logical_block_size(q),
+ +                 cap_str_10,
+ +                 cap_str_2);
   
         set_capacity(vblk->disk, capacity);
         revalidate_disk(vblk->disk);
diff --combined drivers/block/xen-blkback/xenbus.c

index 2adb859,88eaea6..21c1be1
--- 1/drivers/block/xen-blkback/xenbus.c
--- 2/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@@ -244,7 -244,6 +244,7 @@@ static int xen_blkif_disconnect(struct 
   {
         struct pending_req *req, *n;
         unsigned int j, r;
+ +      bool busy = false;
   
         for (r = 0; r < blkif->nr_rings; r++) {
                 struct xen_blkif_ring *ring = &blkif->rings[r];
@@@ -262,10 -261,8 +262,10 @@@
                  * don't have any discard_io or other_io requests. So, checking
                  * for inflight IO is enough.
                  */
- -              if (atomic_read(&ring->inflight) > 0)
- -                      return -EBUSY;
+ +              if (atomic_read(&ring->inflight) > 0) {
+ +                      busy = true;
+ +                      continue;
+ +              }
   
                 if (ring->irq) {
                         unbind_from_irqhandler(ring->irq, ring);
@@@ -303,9 -300,6 +303,9 @@@
                 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
                 ring->active = false;
         }
+ +      if (busy)
+ +              return -EBUSY;
+ +
         blkif->nr_ring_pages = 0;
         /*
          * blkif->rings was allocated in connect_ring, so we should free it in
@@@ -816,7 -810,8 +816,8 @@@ static void frontend_changed(struct xen
                 xenbus_switch_state(dev, XenbusStateClosed);
                 if (xenbus_dev_is_online(dev))
                         break;
-               /* fall through if not online */
+               /* fall through */
+               /* if not online */
         case XenbusStateUnknown:
                 /* implies xen_blkif_disconnect() via xen_blkbk_remove() */
                 device_unregister(&dev->dev);
diff --combined drivers/block/xen-blkfront.c

index 2468c28,270019e..891265a
--- 1/drivers/block/xen-blkfront.c
--- 2/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@@ -2075,9 -2075,9 +2075,9 @@@ static int blkfront_resume(struct xenbu
                         /*
                          * Get the bios in the request so we can re-queue them.
                          */
- -                      if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
- -                          req_op(shadow[i].request) == REQ_OP_DISCARD ||
- -                          req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
+ +                      if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
+ +                          req_op(shadow[j].request) == REQ_OP_DISCARD ||
+ +                          req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
                             shadow[j].request->cmd_flags & REQ_FUA) {
                                 /*
                                  * Flush operations don't contain bios, so
@@@ -2456,7 -2456,7 +2456,7 @@@ static void blkback_changed(struct xenb
         case XenbusStateClosed:
                 if (dev->state == XenbusStateClosed)
                         break;
-               /* Missed the backend's Closing state -- fallthrough */
+               /* fall through */
         case XenbusStateClosing:
                 if (info)
                         blkfront_closing(info);
diff --combined drivers/block/zram/zram_drv.c

index 4a0438c,1c3383b..4063f3f
--- 1/drivers/block/zram/zram_drv.c
--- 2/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@@ -270,349 -270,6 +270,349 @@@ static ssize_t mem_used_max_store(struc
         return len;
   }
   
-       bio->bi_bdev = zram->bdev;
+ +#ifdef CONFIG_ZRAM_WRITEBACK
+ +static bool zram_wb_enabled(struct zram *zram)
+ +{
+ +      return zram->backing_dev;
+ +}
+ +
+ +static void reset_bdev(struct zram *zram)
+ +{
+ +      struct block_device *bdev;
+ +
+ +      if (!zram_wb_enabled(zram))
+ +              return;
+ +
+ +      bdev = zram->bdev;
+ +      if (zram->old_block_size)
+ +              set_blocksize(bdev, zram->old_block_size);
+ +      blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ +      /* hope filp_close flush all of IO */
+ +      filp_close(zram->backing_dev, NULL);
+ +      zram->backing_dev = NULL;
+ +      zram->old_block_size = 0;
+ +      zram->bdev = NULL;
+ +
+ +      kvfree(zram->bitmap);
+ +      zram->bitmap = NULL;
+ +}
+ +
+ +static ssize_t backing_dev_show(struct device *dev,
+ +              struct device_attribute *attr, char *buf)
+ +{
+ +      struct zram *zram = dev_to_zram(dev);
+ +      struct file *file = zram->backing_dev;
+ +      char *p;
+ +      ssize_t ret;
+ +
+ +      down_read(&zram->init_lock);
+ +      if (!zram_wb_enabled(zram)) {
+ +              memcpy(buf, "none\n", 5);
+ +              up_read(&zram->init_lock);
+ +              return 5;
+ +      }
+ +
+ +      p = file_path(file, buf, PAGE_SIZE - 1);
+ +      if (IS_ERR(p)) {
+ +              ret = PTR_ERR(p);
+ +              goto out;
+ +      }
+ +
+ +      ret = strlen(p);
+ +      memmove(buf, p, ret);
+ +      buf[ret++] = '\n';
+ +out:
+ +      up_read(&zram->init_lock);
+ +      return ret;
+ +}
+ +
+ +static ssize_t backing_dev_store(struct device *dev,
+ +              struct device_attribute *attr, const char *buf, size_t len)
+ +{
+ +      char *file_name;
+ +      struct file *backing_dev = NULL;
+ +      struct inode *inode;
+ +      struct address_space *mapping;
+ +      unsigned int bitmap_sz, old_block_size = 0;
+ +      unsigned long nr_pages, *bitmap = NULL;
+ +      struct block_device *bdev = NULL;
+ +      int err;
+ +      struct zram *zram = dev_to_zram(dev);
+ +
+ +      file_name = kmalloc(PATH_MAX, GFP_KERNEL);
+ +      if (!file_name)
+ +              return -ENOMEM;
+ +
+ +      down_write(&zram->init_lock);
+ +      if (init_done(zram)) {
+ +              pr_info("Can't setup backing device for initialized device\n");
+ +              err = -EBUSY;
+ +              goto out;
+ +      }
+ +
+ +      strlcpy(file_name, buf, len);
+ +
+ +      backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
+ +      if (IS_ERR(backing_dev)) {
+ +              err = PTR_ERR(backing_dev);
+ +              backing_dev = NULL;
+ +              goto out;
+ +      }
+ +
+ +      mapping = backing_dev->f_mapping;
+ +      inode = mapping->host;
+ +
+ +      /* Support only block device in this moment */
+ +      if (!S_ISBLK(inode->i_mode)) {
+ +              err = -ENOTBLK;
+ +              goto out;
+ +      }
+ +
+ +      bdev = bdgrab(I_BDEV(inode));
+ +      err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+ +      if (err < 0)
+ +              goto out;
+ +
+ +      nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+ +      bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
+ +      bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
+ +      if (!bitmap) {
+ +              err = -ENOMEM;
+ +              goto out;
+ +      }
+ +
+ +      old_block_size = block_size(bdev);
+ +      err = set_blocksize(bdev, PAGE_SIZE);
+ +      if (err)
+ +              goto out;
+ +
+ +      reset_bdev(zram);
+ +      spin_lock_init(&zram->bitmap_lock);
+ +
+ +      zram->old_block_size = old_block_size;
+ +      zram->bdev = bdev;
+ +      zram->backing_dev = backing_dev;
+ +      zram->bitmap = bitmap;
+ +      zram->nr_pages = nr_pages;
+ +      up_write(&zram->init_lock);
+ +
+ +      pr_info("setup backing device %s\n", file_name);
+ +      kfree(file_name);
+ +
+ +      return len;
+ +out:
+ +      if (bitmap)
+ +              kvfree(bitmap);
+ +
+ +      if (bdev)
+ +              blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ +
+ +      if (backing_dev)
+ +              filp_close(backing_dev, NULL);
+ +
+ +      up_write(&zram->init_lock);
+ +
+ +      kfree(file_name);
+ +
+ +      return err;
+ +}
+ +
+ +static unsigned long get_entry_bdev(struct zram *zram)
+ +{
+ +      unsigned long entry;
+ +
+ +      spin_lock(&zram->bitmap_lock);
+ +      /* skip 0 bit to confuse zram.handle = 0 */
+ +      entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
+ +      if (entry == zram->nr_pages) {
+ +              spin_unlock(&zram->bitmap_lock);
+ +              return 0;
+ +      }
+ +
+ +      set_bit(entry, zram->bitmap);
+ +      spin_unlock(&zram->bitmap_lock);
+ +
+ +      return entry;
+ +}
+ +
+ +static void put_entry_bdev(struct zram *zram, unsigned long entry)
+ +{
+ +      int was_set;
+ +
+ +      spin_lock(&zram->bitmap_lock);
+ +      was_set = test_and_clear_bit(entry, zram->bitmap);
+ +      spin_unlock(&zram->bitmap_lock);
+ +      WARN_ON_ONCE(!was_set);
+ +}
+ +
+ +void zram_page_end_io(struct bio *bio)
+ +{
+ +      struct page *page = bio->bi_io_vec[0].bv_page;
+ +
+ +      page_endio(page, op_is_write(bio_op(bio)),
+ +                      blk_status_to_errno(bio->bi_status));
+ +      bio_put(bio);
+ +}
+ +
+ +/*
+ + * Returns 1 if the submission is successful.
+ + */
+ +static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
+ +                      unsigned long entry, struct bio *parent)
+ +{
+ +      struct bio *bio;
+ +
+ +      bio = bio_alloc(GFP_ATOMIC, 1);
+ +      if (!bio)
+ +              return -ENOMEM;
+ +
+ +      bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
-       bio->bi_bdev = zram->bdev;
++      bio_set_dev(bio, zram->bdev);
+ +      if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
+ +              bio_put(bio);
+ +              return -EIO;
+ +      }
+ +
+ +      if (!parent) {
+ +              bio->bi_opf = REQ_OP_READ;
+ +              bio->bi_end_io = zram_page_end_io;
+ +      } else {
+ +              bio->bi_opf = parent->bi_opf;
+ +              bio_chain(bio, parent);
+ +      }
+ +
+ +      submit_bio(bio);
+ +      return 1;
+ +}
+ +
+ +struct zram_work {
+ +      struct work_struct work;
+ +      struct zram *zram;
+ +      unsigned long entry;
+ +      struct bio *bio;
+ +};
+ +
+ +#if PAGE_SIZE != 4096
+ +static void zram_sync_read(struct work_struct *work)
+ +{
+ +      struct bio_vec bvec;
+ +      struct zram_work *zw = container_of(work, struct zram_work, work);
+ +      struct zram *zram = zw->zram;
+ +      unsigned long entry = zw->entry;
+ +      struct bio *bio = zw->bio;
+ +
+ +      read_from_bdev_async(zram, &bvec, entry, bio);
+ +}
+ +
+ +/*
+ + * Block layer want one ->make_request_fn to be active at a time
+ + * so if we use chained IO with parent IO in same context,
+ + * it's a deadlock. To avoid, it, it uses worker thread context.
+ + */
+ +static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+ +                              unsigned long entry, struct bio *bio)
+ +{
+ +      struct zram_work work;
+ +
+ +      work.zram = zram;
+ +      work.entry = entry;
+ +      work.bio = bio;
+ +
+ +      INIT_WORK_ONSTACK(&work.work, zram_sync_read);
+ +      queue_work(system_unbound_wq, &work.work);
+ +      flush_work(&work.work);
+ +      destroy_work_on_stack(&work.work);
+ +
+ +      return 1;
+ +}
+ +#else
+ +static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+ +                              unsigned long entry, struct bio *bio)
+ +{
+ +      WARN_ON(1);
+ +      return -EIO;
+ +}
+ +#endif
+ +
+ +static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+ +                      unsigned long entry, struct bio *parent, bool sync)
+ +{
+ +      if (sync)
+ +              return read_from_bdev_sync(zram, bvec, entry, parent);
+ +      else
+ +              return read_from_bdev_async(zram, bvec, entry, parent);
+ +}
+ +
+ +static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+ +                                      u32 index, struct bio *parent,
+ +                                      unsigned long *pentry)
+ +{
+ +      struct bio *bio;
+ +      unsigned long entry;
+ +
+ +      bio = bio_alloc(GFP_ATOMIC, 1);
+ +      if (!bio)
+ +              return -ENOMEM;
+ +
+ +      entry = get_entry_bdev(zram);
+ +      if (!entry) {
+ +              bio_put(bio);
+ +              return -ENOSPC;
+ +      }
+ +
+ +      bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
++      bio_set_dev(bio, zram->bdev);
+ +      if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ +                                      bvec->bv_offset)) {
+ +              bio_put(bio);
+ +              put_entry_bdev(zram, entry);
+ +              return -EIO;
+ +      }
+ +
+ +      if (!parent) {
+ +              bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
+ +              bio->bi_end_io = zram_page_end_io;
+ +      } else {
+ +              bio->bi_opf = parent->bi_opf;
+ +              bio_chain(bio, parent);
+ +      }
+ +
+ +      submit_bio(bio);
+ +      *pentry = entry;
+ +
+ +      return 0;
+ +}
+ +
+ +static void zram_wb_clear(struct zram *zram, u32 index)
+ +{
+ +      unsigned long entry;
+ +
+ +      zram_clear_flag(zram, index, ZRAM_WB);
+ +      entry = zram_get_element(zram, index);
+ +      zram_set_element(zram, index, 0);
+ +      put_entry_bdev(zram, entry);
+ +}
+ +
+ +#else
+ +static bool zram_wb_enabled(struct zram *zram) { return false; }
+ +static inline void reset_bdev(struct zram *zram) {};
+ +static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+ +                                      u32 index, struct bio *parent,
+ +                                      unsigned long *pentry)
+ +
+ +{
+ +      return -EIO;
+ +}
+ +
+ +static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+ +                      unsigned long entry, struct bio *parent, bool sync)
+ +{
+ +      return -EIO;
+ +}
+ +static void zram_wb_clear(struct zram *zram, u32 index) {}
+ +#endif
+ +
+ +
   /*
    * We switched to per-cpu streams and this attr is not needed anymore.
    * However, we will keep it around for some time, because:
@@@ -651,7 -308,7 +651,7 @@@ static ssize_t comp_algorithm_store(str
                 struct device_attribute *attr, const char *buf, size_t len)
   {
         struct zram *zram = dev_to_zram(dev);
- -      char compressor[CRYPTO_MAX_ALG_NAME];
+ +      char compressor[ARRAY_SIZE(zram->compressor)];
         size_t sz;
   
         strlcpy(compressor, buf, sizeof(compressor));
@@@ -670,7 -327,7 +670,7 @@@
                 return -EBUSY;
         }
   
- -      strlcpy(zram->compressor, compressor, sizeof(compressor));
+ +      strcpy(zram->compressor, compressor);
         up_write(&zram->init_lock);
         return len;
   }
@@@ -796,6 -453,30 +796,6 @@@ static bool zram_same_page_read(struct 
         return false;
   }
   
- -static bool zram_same_page_write(struct zram *zram, u32 index,
- -                                      struct page *page)
- -{
- -      unsigned long element;
- -      void *mem = kmap_atomic(page);
- -
- -      if (page_same_filled(mem, &element)) {
- -              kunmap_atomic(mem);
- -              /* Free memory associated with this sector now. */
- -              zram_slot_lock(zram, index);
- -              zram_free_page(zram, index);
- -              zram_set_flag(zram, index, ZRAM_SAME);
- -              zram_set_element(zram, index, element);
- -              zram_slot_unlock(zram, index);
- -
- -              atomic64_inc(&zram->stats.same_pages);
- -              atomic64_inc(&zram->stats.pages_stored);
- -              return true;
- -      }
- -      kunmap_atomic(mem);
- -
- -      return false;
- -}
- -
   static void zram_meta_free(struct zram *zram, u64 disksize)
   {
         size_t num_pages = disksize >> PAGE_SHIFT;
@@@ -834,13 -515,7 +834,13 @@@ static bool zram_meta_alloc(struct zra
    */
   static void zram_free_page(struct zram *zram, size_t index)
   {
- -      unsigned long handle = zram_get_handle(zram, index);
+ +      unsigned long handle;
+ +
+ +      if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
+ +              zram_wb_clear(zram, index);
+ +              atomic64_dec(&zram->stats.pages_stored);
+ +              return;
+ +      }
   
         /*
          * No memory is allocated for same element filled pages.
@@@ -854,7 -529,6 +854,7 @@@
                 return;
         }
   
+ +      handle = zram_get_handle(zram, index);
         if (!handle)
                 return;
   
@@@ -868,31 -542,13 +868,31 @@@
         zram_set_obj_size(zram, index, 0);
   }
   
- -static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
+ +static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
+ +                              struct bio *bio, bool partial_io)
   {
         int ret;
         unsigned long handle;
         unsigned int size;
         void *src, *dst;
   
+ +      if (zram_wb_enabled(zram)) {
+ +              zram_slot_lock(zram, index);
+ +              if (zram_test_flag(zram, index, ZRAM_WB)) {
+ +                      struct bio_vec bvec;
+ +
+ +                      zram_slot_unlock(zram, index);
+ +
+ +                      bvec.bv_page = page;
+ +                      bvec.bv_len = PAGE_SIZE;
+ +                      bvec.bv_offset = 0;
+ +                      return read_from_bdev(zram, &bvec,
+ +                                      zram_get_element(zram, index),
+ +                                      bio, partial_io);
+ +              }
+ +              zram_slot_unlock(zram, index);
+ +      }
+ +
         if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
                 return 0;
   
@@@ -925,7 -581,7 +925,7 @@@
   }
   
   static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
- -                              u32 index, int offset)
+ +                              u32 index, int offset, struct bio *bio)
   {
         int ret;
         struct page *page;
@@@ -938,7 -594,7 +938,7 @@@
                         return -ENOMEM;
         }
   
- -      ret = zram_decompress_page(zram, page, index);
+ +      ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
         if (unlikely(ret))
                 goto out;
   
@@@ -957,57 -613,30 +957,57 @@@ out
         return ret;
   }
   
- -static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
- -                      struct page *page,
- -                      unsigned long *out_handle, unsigned int *out_comp_len)
+ +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+ +                              u32 index, struct bio *bio)
   {
- -      int ret;
- -      unsigned int comp_len;
- -      void *src;
+ +      int ret = 0;
         unsigned long alloced_pages;
         unsigned long handle = 0;
+ +      unsigned int comp_len = 0;
+ +      void *src, *dst, *mem;
+ +      struct zcomp_strm *zstrm;
+ +      struct page *page = bvec->bv_page;
+ +      unsigned long element = 0;
+ +      enum zram_pageflags flags = 0;
+ +      bool allow_wb = true;
+ +
+ +      mem = kmap_atomic(page);
+ +      if (page_same_filled(mem, &element)) {
+ +              kunmap_atomic(mem);
+ +              /* Free memory associated with this sector now. */
+ +              flags = ZRAM_SAME;
+ +              atomic64_inc(&zram->stats.same_pages);
+ +              goto out;
+ +      }
+ +      kunmap_atomic(mem);
   
   compress_again:
+ +      zstrm = zcomp_stream_get(zram->comp);
         src = kmap_atomic(page);
- -      ret = zcomp_compress(*zstrm, src, &comp_len);
+ +      ret = zcomp_compress(zstrm, src, &comp_len);
         kunmap_atomic(src);
   
         if (unlikely(ret)) {
+ +              zcomp_stream_put(zram->comp);
                 pr_err("Compression failed! err=%d\n", ret);
- -              if (handle)
- -                      zs_free(zram->mem_pool, handle);
+ +              zs_free(zram->mem_pool, handle);
                 return ret;
         }
   
- -      if (unlikely(comp_len > max_zpage_size))
+ +      if (unlikely(comp_len > max_zpage_size)) {
+ +              if (zram_wb_enabled(zram) && allow_wb) {
+ +                      zcomp_stream_put(zram->comp);
+ +                      ret = write_to_bdev(zram, bvec, index, bio, &element);
+ +                      if (!ret) {
+ +                              flags = ZRAM_WB;
+ +                              ret = 1;
+ +                              goto out;
+ +                      }
+ +                      allow_wb = false;
+ +                      goto compress_again;
+ +              }
                 comp_len = PAGE_SIZE;
+ +      }
   
         /*
          * handle allocation has 2 paths:
@@@ -1034,6 -663,7 +1034,6 @@@
                 handle = zs_malloc(zram->mem_pool, comp_len,
                                 GFP_NOIO | __GFP_HIGHMEM |
                                 __GFP_MOVABLE);
- -              *zstrm = zcomp_stream_get(zram->comp);
                 if (handle)
                         goto compress_again;
                 return -ENOMEM;
@@@ -1043,11 -673,34 +1043,11 @@@
         update_used_max(zram, alloced_pages);
   
         if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+ +              zcomp_stream_put(zram->comp);
                 zs_free(zram->mem_pool, handle);
                 return -ENOMEM;
         }
   
- -      *out_handle = handle;
- -      *out_comp_len = comp_len;
- -      return 0;
- -}
- -
- -static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
- -{
- -      int ret;
- -      unsigned long handle;
- -      unsigned int comp_len;
- -      void *src, *dst;
- -      struct zcomp_strm *zstrm;
- -      struct page *page = bvec->bv_page;
- -
- -      if (zram_same_page_write(zram, index, page))
- -              return 0;
- -
- -      zstrm = zcomp_stream_get(zram->comp);
- -      ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
- -      if (ret) {
- -              zcomp_stream_put(zram->comp);
- -              return ret;
- -      }
- -
         dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
   
         src = zstrm->buffer;
@@@ -1059,31 -712,25 +1059,31 @@@
   
         zcomp_stream_put(zram->comp);
         zs_unmap_object(zram->mem_pool, handle);
- -
+ +      atomic64_add(comp_len, &zram->stats.compr_data_size);
+ +out:
         /*
          * Free memory associated with this sector
          * before overwriting unused sectors.
          */
         zram_slot_lock(zram, index);
         zram_free_page(zram, index);
- -      zram_set_handle(zram, index, handle);
- -      zram_set_obj_size(zram, index, comp_len);
+ +
+ +      if (flags) {
+ +              zram_set_flag(zram, index, flags);
+ +              zram_set_element(zram, index, element);
+ +      }  else {
+ +              zram_set_handle(zram, index, handle);
+ +              zram_set_obj_size(zram, index, comp_len);
+ +      }
         zram_slot_unlock(zram, index);
   
         /* Update stats */
- -      atomic64_add(comp_len, &zram->stats.compr_data_size);
         atomic64_inc(&zram->stats.pages_stored);
- -      return 0;
+ +      return ret;
   }
   
   static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
- -                              u32 index, int offset)
+ +                              u32 index, int offset, struct bio *bio)
   {
         int ret;
         struct page *page = NULL;
@@@ -1101,7 -748,7 +1101,7 @@@
                 if (!page)
                         return -ENOMEM;
   
- -              ret = zram_decompress_page(zram, page, index);
+ +              ret = __zram_bvec_read(zram, page, index, bio, true);
                 if (ret)
                         goto out;
   
@@@ -1116,7 -763,7 +1116,7 @@@
                 vec.bv_offset = 0;
         }
   
- -      ret = __zram_bvec_write(zram, &vec, index);
+ +      ret = __zram_bvec_write(zram, &vec, index, bio);
   out:
         if (is_partial_io(bvec))
                 __free_page(page);
@@@ -1161,33 -808,29 +1161,34 @@@ static void zram_bio_discard(struct zra
         }
   }
   
+ +/*
+ + * Returns errno if it has some problem. Otherwise return 0 or 1.
+ + * Returns 0 if IO request was done synchronously
+ + * Returns 1 if IO request was successfully submitted.
+ + */
   static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- -                      int offset, bool is_write)
+ +                      int offset, bool is_write, struct bio *bio)
   {
         unsigned long start_time = jiffies;
         int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
+       struct request_queue *q = zram->disk->queue;
         int ret;
   
-       generic_start_io_acct(rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+       generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
                         &zram->disk->part0);
   
         if (!is_write) {
                 atomic64_inc(&zram->stats.num_reads);
- -              ret = zram_bvec_read(zram, bvec, index, offset);
+ +              ret = zram_bvec_read(zram, bvec, index, offset, bio);
                 flush_dcache_page(bvec->bv_page);
         } else {
                 atomic64_inc(&zram->stats.num_writes);
- -              ret = zram_bvec_write(zram, bvec, index, offset);
+ +              ret = zram_bvec_write(zram, bvec, index, offset, bio);
         }
   
-       generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
+       generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
   
- -      if (unlikely(ret)) {
+ +      if (unlikely(ret < 0)) {
                 if (!is_write)
                         atomic64_inc(&zram->stats.failed_reads);
                 else
@@@ -1226,7 -869,7 +1227,7 @@@ static void __zram_make_request(struct 
                         bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
                                                         unwritten);
                         if (zram_bvec_rw(zram, &bv, index, offset,
- -                                      op_is_write(bio_op(bio))) < 0)
+ +                                      op_is_write(bio_op(bio)), bio) < 0)
                                 goto out;
   
                         bv.bv_offset += bv.bv_len;
@@@ -1280,18 -923,16 +1281,18 @@@ static void zram_slot_free_notify(struc
   static int zram_rw_page(struct block_device *bdev, sector_t sector,
                        struct page *page, bool is_write)
   {
- -      int offset, err = -EIO;
+ +      int offset, ret;
         u32 index;
         struct zram *zram;
         struct bio_vec bv;
   
+ +      if (PageTransHuge(page))
+ +              return -ENOTSUPP;
         zram = bdev->bd_disk->private_data;
   
         if (!valid_io_request(zram, sector, PAGE_SIZE)) {
                 atomic64_inc(&zram->stats.invalid_io);
- -              err = -EINVAL;
+ +              ret = -EINVAL;
                 goto out;
         }
   
@@@ -1302,7 -943,7 +1303,7 @@@
         bv.bv_len = PAGE_SIZE;
         bv.bv_offset = 0;
   
- -      err = zram_bvec_rw(zram, &bv, index, offset, is_write);
+ +      ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
   out:
         /*
          * If I/O fails, just return error(ie, non-zero) without
@@@ -1312,20 -953,9 +1313,20 @@@
          * bio->bi_end_io does things to handle the error
          * (e.g., SetPageError, set_page_dirty and extra works).
          */
- -      if (err == 0)
+ +      if (unlikely(ret < 0))
+ +              return ret;
+ +
+ +      switch (ret) {
+ +      case 0:
                 page_endio(page, is_write, 0);
- -      return err;
+ +              break;
+ +      case 1:
+ +              ret = 0;
+ +              break;
+ +      default:
+ +              WARN_ON(1);
+ +      }
+ +      return ret;
   }
   
   static void zram_reset_device(struct zram *zram)
@@@ -1354,7 -984,6 +1355,7 @@@
         zram_meta_free(zram, disksize);
         memset(&zram->stats, 0, sizeof(zram->stats));
         zcomp_destroy(comp);
+ +      reset_bdev(zram);
   }
   
   static ssize_t disksize_store(struct device *dev,
@@@ -1480,9 -1109,6 +1481,9 @@@ static DEVICE_ATTR_WO(mem_limit)
   static DEVICE_ATTR_WO(mem_used_max);
   static DEVICE_ATTR_RW(max_comp_streams);
   static DEVICE_ATTR_RW(comp_algorithm);
+ +#ifdef CONFIG_ZRAM_WRITEBACK
+ +static DEVICE_ATTR_RW(backing_dev);
+ +#endif
   
   static struct attribute *zram_disk_attrs[] = {
         &dev_attr_disksize.attr,
@@@ -1493,9 -1119,6 +1494,9 @@@
         &dev_attr_mem_used_max.attr,
         &dev_attr_max_comp_streams.attr,
         &dev_attr_comp_algorithm.attr,
+ +#ifdef CONFIG_ZRAM_WRITEBACK
+ +      &dev_attr_backing_dev.attr,
+ +#endif
         &dev_attr_io_stat.attr,
         &dev_attr_mm_stat.attr,
         &dev_attr_debug_stat.attr,
diff --combined drivers/md/dm-crypt.c

index fa17e54,ca99147..54aef8e
--- 1/drivers/md/dm-crypt.c
--- 2/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@@ -758,8 -758,9 +758,8 @@@ static int crypt_iv_tcw_whitening(struc
         int i, r;
   
         /* xor whitening with sector number */
- -      memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
- -      crypto_xor(buf, (u8 *)&sector, 8);
- -      crypto_xor(&buf[8], (u8 *)&sector, 8);
+ +      crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
+ +      crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
   
         /* calculate crc32 for every 32bit part and xor it */
         desc->tfm = tcw->crc32_tfm;
@@@ -804,10 -805,10 +804,10 @@@ static int crypt_iv_tcw_gen(struct cryp
         }
   
         /* Calculate IV */
- -      memcpy(iv, tcw->iv_seed, cc->iv_size);
- -      crypto_xor(iv, (u8 *)&sector, 8);
+ +      crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)&sector, 8);
         if (cc->iv_size > 8)
- -              crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+ +              crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
+ +                             cc->iv_size - 8);
   
         return r;
   }
@@@ -932,9 -933,6 +932,6 @@@ static int dm_crypt_integrity_io_alloc(
         bip->bip_iter.bi_size = tag_len;
         bip->bip_iter.bi_sector = io->cc->start + io->sector;
   
-       /* We own the metadata, do not let bio_free to release it */
-       bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
- 
         ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
                                      tag_len, offset_in_page(io->integrity_metadata));
         if (unlikely(ret != tag_len))
@@@ -1546,7 -1544,7 +1543,7 @@@ static void clone_init(struct dm_crypt_
   
         clone->bi_private = io;
         clone->bi_end_io  = crypt_endio;
-       clone->bi_bdev    = cc->dev->bdev;
+       bio_set_dev(clone, cc->dev->bdev);
         clone->bi_opf     = io->base_bio->bi_opf;
   }
   
@@@ -2795,7 -2793,7 +2792,7 @@@ static int crypt_map(struct dm_target *
          */
         if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
             bio_op(bio) == REQ_OP_DISCARD)) {
-               bio->bi_bdev = cc->dev->bdev;
+               bio_set_dev(bio, cc->dev->bdev);
                 if (bio_sectors(bio))
                         bio->bi_iter.bi_sector = cc->start +
                                 dm_target_offset(ti, bio->bi_iter.bi_sector);
diff --combined drivers/md/dm-mpath.c

index d24e4b0,573046b..96aedaa
--- 1/drivers/md/dm-mpath.c
--- 2/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@@ -504,6 -504,7 +504,6 @@@ static int multipath_clone_and_map(stru
                 if (queue_dying) {
                         atomic_inc(&m->pg_init_in_progress);
                         activate_or_offline_path(pgpath);
- -                      return DM_MAPIO_REQUEUE;
                 }
                 return DM_MAPIO_DELAY_REQUEUE;
         }
@@@ -565,7 -566,7 +565,7 @@@ static int __multipath_map_bio(struct m
         mpio->nr_bytes = nr_bytes;
   
         bio->bi_status = 0;
-       bio->bi_bdev = pgpath->path.dev->bdev;
+       bio_set_dev(bio, pgpath->path.dev->bdev);
         bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
   
         if (pgpath->pg->ps.type->start_io)
@@@ -1457,6 -1458,7 +1457,6 @@@ static int noretry_error(blk_status_t e
         case BLK_STS_TARGET:
         case BLK_STS_NEXUS:
         case BLK_STS_MEDIUM:
- -      case BLK_STS_RESOURCE:
                 return 1;
         }
   
diff --combined drivers/md/dm.c

index d669fdd,b28b9ce..04ae795
--- 1/drivers/md/dm.c
--- 2/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@@ -27,6 -27,16 +27,6 @@@
   
   #define DM_MSG_PREFIX "core"
   
- -#ifdef CONFIG_PRINTK
- -/*
- - * ratelimit state to be used in DMXXX_LIMIT().
- - */
- -DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
- -                     DEFAULT_RATELIMIT_INTERVAL,
- -                     DEFAULT_RATELIMIT_BURST);
- -EXPORT_SYMBOL(dm_ratelimit_state);
- -#endif
- -
   /*
    * Cookies are numeric values sent with CHANGE and REMOVE
    * uevents while resuming, removing or renaming the device.
@@@ -510,7 -520,7 +510,7 @@@ static void start_io_acct(struct dm_io 
         io->start_time = jiffies;
   
         cpu = part_stat_lock();
-       part_round_stats(cpu, &dm_disk(md)->part0);
+       part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
         part_stat_unlock();
         atomic_set(&dm_disk(md)->part0.in_flight[rw],
                 atomic_inc_return(&md->pending[rw]));
@@@ -529,7 -539,7 +529,7 @@@ static void end_io_acct(struct dm_io *i
         int pending;
         int rw = bio_data_dir(bio);
   
-       generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
+       generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
   
         if (unlikely(dm_stats_used(&md->stats)))
                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@@ -841,10 -851,10 +841,10 @@@ static void clone_endio(struct bio *bio
   
         if (unlikely(error == BLK_STS_TARGET)) {
                 if (bio_op(bio) == REQ_OP_WRITE_SAME &&
-                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+                   !bio->bi_disk->queue->limits.max_write_same_sectors)
                         disable_write_same(md);
                 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
-                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+                   !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
                         disable_write_zeroes(md);
         }
   
@@@ -1205,8 -1215,8 +1205,8 @@@ static void __map_bio(struct dm_target_
                 break;
         case DM_MAPIO_REMAPPED:
                 /* the bio has been remapped so dispatch it */
-               trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
-                                     tio->io->bio->bi_bdev->bd_dev, sector);
+               trace_block_bio_remap(clone->bi_disk->queue, clone,
+                                     bio_dev(tio->io->bio), sector);
                 generic_make_request(clone);
                 break;
         case DM_MAPIO_KILL:
@@@ -1513,7 -1523,7 +1513,7 @@@ static void __split_and_process_bio(str
         }
   
         /* drop the extra reference count */
- -      dec_pending(ci.io, error);
+ +      dec_pending(ci.io, errno_to_blk_status(error));
   }
   /*-----------------------------------------------------------------
    * CRUD END
@@@ -1532,7 -1542,7 +1532,7 @@@ static blk_qc_t dm_make_request(struct 
   
         map = dm_get_live_table(md, &srcu_idx);
   
-       generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
+       generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
   
         /* if we're suspended, we have to queue this io for later */
         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
@@@ -1786,7 -1796,7 +1786,7 @@@ static struct mapped_device *alloc_dev(
                 goto bad;
   
         bio_init(&md->flush_bio, NULL, 0);
-       md->flush_bio.bi_bdev = md->bdev;
+       bio_set_dev(&md->flush_bio, md->bdev);
         md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
   
         dm_stats_init(&md->stats);
diff --combined drivers/md/md.c

index b01e458,0afdc1b..078c6f3
--- 1/drivers/md/md.c
--- 2/drivers/md/md.c
+++ b/drivers/md/md.c
@@@ -422,7 -422,7 +422,7 @@@ static void submit_flushes(struct work_
                         bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
                         bi->bi_end_io = md_end_flush;
                         bi->bi_private = rdev;
-                       bi->bi_bdev = rdev->bdev;
+                       bio_set_dev(bi, rdev->bdev);
                         bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
                         atomic_inc(&mddev->flush_pending);
                         submit_bio(bi);
@@@ -772,7 -772,7 +772,7 @@@ void md_super_write(struct mddev *mddev
   
         atomic_inc(&rdev->nr_pending);
   
-       bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
+       bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
         bio->bi_iter.bi_sector = sector;
         bio_add_page(bio, page, size, 0);
         bio->bi_private = rdev;
@@@ -803,8 -803,10 +803,10 @@@ int sync_page_io(struct md_rdev *rdev, 
         struct bio *bio = md_bio_alloc_sync(rdev->mddev);
         int ret;
   
-       bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
-               rdev->meta_bdev : rdev->bdev;
+       if (metadata_op && rdev->meta_bdev)
+               bio_set_dev(bio, rdev->meta_bdev);
+       else
+               bio_set_dev(bio, rdev->bdev);
         bio_set_op_attrs(bio, op, op_flags);
         if (metadata_op)
                 bio->bi_iter.bi_sector = sector + rdev->sb_start;
@@@ -7996,7 -7998,7 +7998,7 @@@ bool md_write_start(struct mddev *mddev
         if (mddev->safemode == 1)
                 mddev->safemode = 0;
         /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
- -      if (mddev->in_sync || !mddev->sync_checkers) {
+ +      if (mddev->in_sync || mddev->sync_checkers) {
                 spin_lock(&mddev->lock);
                 if (mddev->in_sync) {
                         mddev->in_sync = 0;
@@@ -8656,9 -8658,6 +8658,9 @@@ void md_check_recovery(struct mddev *md
         if (mddev_trylock(mddev)) {
                 int spares = 0;
   
+ +              if (!mddev->external && mddev->safemode == 1)
+ +                      mddev->safemode = 0;
+ +
                 if (mddev->ro) {
                         struct md_rdev *rdev;
                         if (!mddev->external && mddev->in_sync)
diff --combined drivers/md/raid5-cache.c

index 2dcbafa,f253a9c..5d7da1c
--- 1/drivers/md/raid5-cache.c
--- 2/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@@ -236,10 -236,9 +236,10 @@@ struct r5l_io_unit 
         bool need_split_bio;
         struct bio *split_bio;
   
- -      unsigned int has_flush:1;      /* include flush request */
- -      unsigned int has_fua:1;        /* include fua request */
- -      unsigned int has_null_flush:1; /* include empty flush request */
+ +      unsigned int has_flush:1;               /* include flush request */
+ +      unsigned int has_fua:1;                 /* include fua request */
+ +      unsigned int has_null_flush:1;          /* include null flush request */
+ +      unsigned int has_flush_payload:1;       /* include flush payload  */
         /*
          * io isn't sent yet, flush/fua request can only be submitted till it's
          * the first IO in running_ios list
@@@ -572,8 -571,6 +572,8 @@@ static void r5l_log_endio(struct bio *b
         struct r5l_io_unit *io_deferred;
         struct r5l_log *log = io->log;
         unsigned long flags;
+ +      bool has_null_flush;
+ +      bool has_flush_payload;
   
         if (bio->bi_status)
                 md_error(log->rdev->mddev, log->rdev);
@@@ -583,16 -580,6 +583,16 @@@
   
         spin_lock_irqsave(&log->io_list_lock, flags);
         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+ +
+ +      /*
+ +       * if the io doesn't have null_flush or flush payload,
+ +       * it is not safe to access it after releasing io_list_lock.
+ +       * Therefore, it is necessary to check the condition with
+ +       * the lock held.
+ +       */
+ +      has_null_flush = io->has_null_flush;
+ +      has_flush_payload = io->has_flush_payload;
+ +
         if (log->need_cache_flush && !list_empty(&io->stripe_list))
                 r5l_move_to_end_ios(log);
         else
@@@ -613,23 -600,19 +613,23 @@@
         if (log->need_cache_flush)
                 md_wakeup_thread(log->rdev->mddev->thread);
   
- -      if (io->has_null_flush) {
+ +      /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
+ +      if (has_null_flush) {
                 struct bio *bi;
   
                 WARN_ON(bio_list_empty(&io->flush_barriers));
                 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
                         bio_endio(bi);
- -                      atomic_dec(&io->pending_stripe);
+ +                      if (atomic_dec_and_test(&io->pending_stripe)) {
+ +                              __r5l_stripe_write_finished(io);
+ +                              return;
+ +                      }
                 }
         }
- -
- -      /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
- -      if (atomic_read(&io->pending_stripe) == 0)
- -              __r5l_stripe_write_finished(io);
+ +      /* decrease pending_stripe for flush payload */
+ +      if (has_flush_payload)
+ +              if (atomic_dec_and_test(&io->pending_stripe))
+ +                      __r5l_stripe_write_finished(io);
   }
   
   static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
@@@ -745,7 -728,7 +745,7 @@@ static struct bio *r5l_bio_alloc(struc
         struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
   
         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-       bio->bi_bdev = log->rdev->bdev;
+       bio_set_dev(bio, log->rdev->bdev);
         bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
   
         return bio;
@@@ -898,11 -881,6 +898,11 @@@ static void r5l_append_flush_payload(st
         payload->size = cpu_to_le32(sizeof(__le64));
         payload->flush_stripes[0] = cpu_to_le64(sect);
         io->meta_offset += meta_size;
+ +      /* multiple flush payloads count as one pending_stripe */
+ +      if (!io->has_flush_payload) {
+ +              io->has_flush_payload = 1;
+ +              atomic_inc(&io->pending_stripe);
+ +      }
         mutex_unlock(&log->io_mutex);
   }
   
@@@ -1313,7 -1291,7 +1313,7 @@@ void r5l_flush_stripe_to_raid(struct r5
         if (!do_flush)
                 return;
         bio_reset(&log->flush_bio);
-       log->flush_bio.bi_bdev = log->rdev->bdev;
+       bio_set_dev(&log->flush_bio, log->rdev->bdev);
         log->flush_bio.bi_end_io = r5l_log_flush_endio;
         log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
         submit_bio(&log->flush_bio);
@@@ -1691,7 -1669,7 +1691,7 @@@ static int r5l_recovery_fetch_ra_pool(s
                                       sector_t offset)
   {
         bio_reset(ctx->ra_bio);
-       ctx->ra_bio->bi_bdev = log->rdev->bdev;
+       bio_set_dev(ctx->ra_bio, log->rdev->bdev);
         bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
         ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
   
@@@ -2562,32 -2540,23 +2562,32 @@@ static ssize_t r5c_journal_mode_show(st
    */
   int r5c_journal_mode_set(struct mddev *mddev, int mode)
   {
- -      struct r5conf *conf = mddev->private;
- -      struct r5l_log *log = conf->log;
- -
- -      if (!log)
- -              return -ENODEV;
+ +      struct r5conf *conf;
+ +      int err;
   
         if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
             mode > R5C_JOURNAL_MODE_WRITE_BACK)
                 return -EINVAL;
   
+ +      err = mddev_lock(mddev);
+ +      if (err)
+ +              return err;
+ +      conf = mddev->private;
+ +      if (!conf || !conf->log) {
+ +              mddev_unlock(mddev);
+ +              return -ENODEV;
+ +      }
+ +
         if (raid5_calc_degraded(conf) > 0 &&
- -          mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ +          mode == R5C_JOURNAL_MODE_WRITE_BACK) {
+ +              mddev_unlock(mddev);
                 return -EINVAL;
+ +      }
   
         mddev_suspend(mddev);
         conf->log->r5c_journal_mode = mode;
         mddev_resume(mddev);
+ +      mddev_unlock(mddev);
   
         pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
                  mdname(mddev), mode, r5c_journal_mode_str[mode]);
diff --combined drivers/nvme/host/core.c

index 37046ac,f03452d..c596dd3
--- 1/drivers/nvme/host/core.c
--- 2/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@@ -336,7 -336,7 +336,7 @@@ static int nvme_get_stream_params(struc
   
         c.directive.opcode = nvme_admin_directive_recv;
         c.directive.nsid = cpu_to_le32(nsid);
- -      c.directive.numd = cpu_to_le32(sizeof(*s));
+ +      c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
         c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
         c.directive.dtype = NVME_DIR_STREAMS;
   
@@@ -613,11 -613,7 +613,7 @@@ int __nvme_submit_user_cmd(struct reque
   
                 if (!disk)
                         goto submit;
-               bio->bi_bdev = bdget_disk(disk, 0);
-               if (!bio->bi_bdev) {
-                       ret = -ENODEV;
-                       goto out_unmap;
-               }
+               bio->bi_disk = disk;
   
                 if (meta_buffer && meta_len) {
                         struct bio_integrity_payload *bip;
@@@ -668,11 -664,8 +664,8 @@@
    out_free_meta:
         kfree(meta);
    out_unmap:
-       if (bio) {
-               if (disk && bio->bi_bdev)
-                       bdput(bio->bi_bdev);
+       if (bio)
                 blk_rq_unmap_user(bio);
-       }
    out:
         blk_mq_free_request(req);
         return ret;
@@@ -1509,7 -1502,7 +1502,7 @@@ static void nvme_set_queue_limits(struc
         blk_queue_write_cache(q, vwc, vwc);
   }
   
- -static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+ +static int nvme_configure_apst(struct nvme_ctrl *ctrl)
   {
         /*
          * APST (Autonomous Power State Transition) lets us program a
@@@ -1538,16 -1531,16 +1531,16 @@@
          * then don't do anything.
          */
         if (!ctrl->apsta)
- -              return;
+ +              return 0;
   
         if (ctrl->npss > 31) {
                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
- -              return;
+ +              return 0;
         }
   
         table = kzalloc(sizeof(*table), GFP_KERNEL);
         if (!table)
- -              return;
+ +              return 0;
   
         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
                 /* Turn off APST. */
@@@ -1629,7 -1622,6 +1622,7 @@@
                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
   
         kfree(table);
+ +      return ret;
   }
   
   static void nvme_set_latency_tolerance(struct device *dev, s32 val)
@@@ -1836,16 -1828,13 +1829,16 @@@ int nvme_init_identify(struct nvme_ctr
                  * In fabrics we need to verify the cntlid matches the
                  * admin connect
                  */
- -              if (ctrl->cntlid != le16_to_cpu(id->cntlid))
+ +              if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
                         ret = -EINVAL;
+ +                      goto out_free;
+ +              }
   
                 if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
                         dev_err(ctrl->device,
                                 "keep-alive support is mandatory for fabrics\n");
                         ret = -EINVAL;
+ +                      goto out_free;
                 }
         } else {
                 ctrl->cntlid = le16_to_cpu(id->cntlid);
@@@ -1860,20 -1849,11 +1853,20 @@@
         else if (!ctrl->apst_enabled && prev_apst_enabled)
                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
   
- -      nvme_configure_apst(ctrl);
- -      nvme_configure_directives(ctrl);
+ +      ret = nvme_configure_apst(ctrl);
+ +      if (ret < 0)
+ +              return ret;
+ +
+ +      ret = nvme_configure_directives(ctrl);
+ +      if (ret < 0)
+ +              return ret;
   
         ctrl->identified = true;
   
+ +      return 0;
+ +
+ +out_free:
+ +      kfree(id);
         return ret;
   }
   EXPORT_SYMBOL_GPL(nvme_init_identify);
@@@ -2017,11 -1997,9 +2010,11 @@@ static ssize_t wwid_show(struct device 
         if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
                 return sprintf(buf, "eui.%8phN\n", ns->eui);
   
- -      while (ctrl->serial[serial_len - 1] == ' ')
+ +      while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
+ +                                ctrl->serial[serial_len - 1] == '\0'))
                 serial_len--;
- -      while (ctrl->model[model_len - 1] == ' ')
+ +      while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
+ +                               ctrl->model[model_len - 1] == '\0'))
                 model_len--;
   
         return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
diff --combined drivers/nvme/host/rdma.c

index a7f7d0a,9ff0eb3..bf42d31
--- 1/drivers/nvme/host/rdma.c
--- 2/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@@ -19,7 -19,6 +19,7 @@@
   #include <linux/string.h>
   #include <linux/atomic.h>
   #include <linux/blk-mq.h>
+ +#include <linux/blk-mq-rdma.h>
   #include <linux/types.h>
   #include <linux/list.h>
   #include <linux/mutex.h>
@@@ -464,10 -463,14 +464,10 @@@ static int nvme_rdma_create_queue_ib(st
         ibdev = queue->device->dev;
   
         /*
- -       * The admin queue is barely used once the controller is live, so don't
- -       * bother to spread it out.
+ +       * Spread I/O queues completion vectors according their queue index.
+ +       * Admin queues can always go on completion vector 0.
          */
- -      if (idx == 0)
- -              comp_vector = 0;
- -      else
- -              comp_vector = idx % ibdev->num_comp_vectors;
- -
+ +      comp_vector = idx == 0 ? idx : idx - 1;
   
         /* +1 for ib_stop_cq */
         queue->ib_cq = ib_alloc_cq(ibdev, queue,
@@@ -608,20 -611,10 +608,20 @@@ out_free_queues
   static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
   {
         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ +      struct ib_device *ibdev = ctrl->device->dev;
         unsigned int nr_io_queues;
         int i, ret;
   
         nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
+ +
+ +      /*
+ +       * we map queues according to the device irq vectors for
+ +       * optimal locality so we don't need more queues than
+ +       * completion vectors.
+ +       */
+ +      nr_io_queues = min_t(unsigned int, nr_io_queues,
+ +                              ibdev->num_comp_vectors);
+ +
         ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
         if (ret)
                 return ret;
@@@ -711,14 -704,16 +711,16 @@@ static void nvme_rdma_reconnect_ctrl_wo
         if (ctrl->ctrl.queue_count > 1) {
                 nvme_rdma_free_io_queues(ctrl);
   
-               ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+               ret = blk_mq_reinit_tagset(&ctrl->tag_set,
+                                          nvme_rdma_reinit_request);
                 if (ret)
                         goto requeue;
         }
   
         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
   
-       ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
+       ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set,
+                                  nvme_rdma_reinit_request);
         if (ret)
                 goto requeue;
   
@@@ -927,11 -922,7 +929,11 @@@ static int nvme_rdma_map_sg_fr(struct n
         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
         int nr;
   
- -      nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
+ +      /*
+ +       * Align the MR to a 4K page size to match the ctrl page size and
+ +       * the block virtual boundary.
+ +       */
+ +      nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
         if (nr < count) {
                 if (nr < 0)
                         return nr;
@@@ -1509,23 -1500,14 +1511,22 @@@ static void nvme_rdma_complete_rq(struc
         nvme_complete_rq(rq);
   }
   
+ +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
+ +{
+ +      struct nvme_rdma_ctrl *ctrl = set->driver_data;
+ +
+ +      return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+ +}
+ +
   static const struct blk_mq_ops nvme_rdma_mq_ops = {
         .queue_rq       = nvme_rdma_queue_rq,
         .complete       = nvme_rdma_complete_rq,
         .init_request   = nvme_rdma_init_request,
         .exit_request   = nvme_rdma_exit_request,
-       .reinit_request = nvme_rdma_reinit_request,
         .init_hctx      = nvme_rdma_init_hctx,
         .poll           = nvme_rdma_poll,
         .timeout        = nvme_rdma_timeout,
+ +      .map_queues     = nvme_rdma_map_queues,
   };
   
   static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@@ -1533,7 -1515,6 +1534,6 @@@
         .complete       = nvme_rdma_complete_rq,
         .init_request   = nvme_rdma_init_request,
         .exit_request   = nvme_rdma_exit_request,
-       .reinit_request = nvme_rdma_reinit_request,
         .init_hctx      = nvme_rdma_init_admin_hctx,
         .timeout        = nvme_rdma_timeout,
   };
@@@ -1602,7 -1583,7 +1602,7 @@@ static int nvme_rdma_configure_admin_qu
                 goto out_cleanup_queue;
   
         ctrl->ctrl.max_hw_sectors =
- -              (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
+ +              (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
   
         error = nvme_init_identify(&ctrl->ctrl);
         if (error)
@@@ -1731,7 -1712,8 +1731,8 @@@ static void nvme_rdma_reset_ctrl_work(s
         }
   
         if (ctrl->ctrl.queue_count > 1) {
-               ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+               ret = blk_mq_reinit_tagset(&ctrl->tag_set,
+                                          nvme_rdma_reinit_request);
                 if (ret)
                         goto del_dead_ctrl;
   
@@@ -1965,6 -1947,10 +1966,6 @@@ static struct nvmf_transport_ops nvme_r
         .create_ctrl    = nvme_rdma_create_ctrl,
   };
   
- -static void nvme_rdma_add_one(struct ib_device *ib_device)
- -{
- -}
- -
   static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
   {
         struct nvme_rdma_ctrl *ctrl;
@@@ -1986,6 -1972,7 +1987,6 @@@
   
   static struct ib_client nvme_rdma_ib_client = {
         .name   = "nvme_rdma",
- -      .add = nvme_rdma_add_one,
         .remove = nvme_rdma_remove_one
   };
   
diff --combined fs/btrfs/disk-io.c

index f45b61f,0640c27..4f428a4
--- 1/fs/btrfs/disk-io.c
--- 2/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -3499,7 -3499,7 +3499,7 @@@ static void write_dev_flush(struct btrf
   
         bio_reset(bio);
         bio->bi_end_io = btrfs_end_empty_barrier;
-       bio->bi_bdev = device->bdev;
+       bio_set_dev(bio, device->bdev);
         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
         init_completion(&device->flush_wait);
         bio->bi_private = &device->flush_wait;
@@@ -3516,7 -3516,7 +3516,7 @@@ static blk_status_t wait_dev_flush(stru
         struct bio *bio = device->flush_bio;
   
         if (!device->flush_bio_sent)
- -              return 0;
+ +              return BLK_STS_OK;
   
         device->flush_bio_sent = 0;
         wait_for_completion_io(&device->flush_wait);
@@@ -3563,7 -3563,7 +3563,7 @@@ static int barrier_all_devices(struct b
                         continue;
   
                 write_dev_flush(dev);
- -              dev->last_flush_error = 0;
+ +              dev->last_flush_error = BLK_STS_OK;
         }
   
         /* wait for all the barriers */
diff --combined fs/btrfs/raid56.c

index 2cf6ba4,d268cb6..24a6222
--- 1/fs/btrfs/raid56.c
--- 2/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@@ -905,7 -905,7 +905,7 @@@ static void raid_write_end_io(struct bi
         if (!atomic_dec_and_test(&rbio->stripes_pending))
                 return;
   
- -      err = 0;
+ +      err = BLK_STS_OK;
   
         /* OK, we have read all the stripes we need to. */
         max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
@@@ -1090,7 -1090,8 +1090,8 @@@ static int rbio_add_io_page(struct btrf
                  */
                 if (last_end == disk_start && stripe->dev->bdev &&
                     !last->bi_status &&
-                   last->bi_bdev == stripe->dev->bdev) {
+                   last->bi_disk == stripe->dev->bdev->bd_disk &&
+                   last->bi_partno == stripe->dev->bdev->bd_partno) {
                         ret = bio_add_page(last, page, PAGE_SIZE, 0);
                         if (ret == PAGE_SIZE)
                                 return 0;
@@@ -1100,7 -1101,7 +1101,7 @@@
         /* put a new bio on the list */
         bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
         bio->bi_iter.bi_size = 0;
-       bio->bi_bdev = stripe->dev->bdev;
+       bio_set_dev(bio, stripe->dev->bdev);
         bio->bi_iter.bi_sector = disk_start >> 9;
   
         bio_add_page(bio, page, PAGE_SIZE, 0);
@@@ -1324,7 -1325,7 +1325,7 @@@ write_data
         return;
   
   cleanup:
- -      rbio_orig_end_io(rbio, -EIO);
+ +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
   }
   
   /*
@@@ -1347,7 -1348,8 +1348,8 @@@ static int find_bio_stripe(struct btrfs
                 stripe_start = stripe->physical;
                 if (physical >= stripe_start &&
                     physical < stripe_start + rbio->stripe_len &&
-                   bio->bi_bdev == stripe->dev->bdev) {
+                   bio->bi_disk == stripe->dev->bdev->bd_disk &&
+                   bio->bi_partno == stripe->dev->bdev->bd_partno) {
                         return i;
                 }
         }
@@@ -1475,7 -1477,7 +1477,7 @@@ static void raid_rmw_end_io(struct bio 
   
   cleanup:
   
- -      rbio_orig_end_io(rbio, -EIO);
+ +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
   }
   
   static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
@@@ -1579,7 -1581,7 +1581,7 @@@ static int raid56_rmw_stripe(struct btr
         return 0;
   
   cleanup:
- -      rbio_orig_end_io(rbio, -EIO);
+ +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
         return -EIO;
   
   finish:
@@@ -1795,12 -1797,12 +1797,12 @@@ static void __raid_recover_end_io(struc
         void **pointers;
         int faila = -1, failb = -1;
         struct page *page;
- -      int err;
+ +      blk_status_t err;
         int i;
   
         pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
         if (!pointers) {
- -              err = -ENOMEM;
+ +              err = BLK_STS_RESOURCE;
                 goto cleanup_io;
         }
   
@@@ -1856,7 -1858,7 +1858,7 @@@
                                          * a bad data or Q stripe.
                                          * TODO, we should redo the xor here.
                                          */
- -                                      err = -EIO;
+ +                                      err = BLK_STS_IOERR;
                                         goto cleanup;
                                 }
                                 /*
@@@ -1882,7 -1884,7 +1884,7 @@@
                         if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
                                 if (rbio->bbio->raid_map[faila] ==
                                     RAID5_P_STRIPE) {
- -                                      err = -EIO;
+ +                                      err = BLK_STS_IOERR;
                                         goto cleanup;
                                 }
                                 /*
@@@ -1954,13 -1956,13 +1956,13 @@@ pstripe
                 }
         }
   
- -      err = 0;
+ +      err = BLK_STS_OK;
   cleanup:
         kfree(pointers);
   
   cleanup_io:
         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- -              if (err == 0)
+ +              if (err == BLK_STS_OK)
                         cache_rbio_pages(rbio);
                 else
                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@@ -1968,7 -1970,7 +1970,7 @@@
                 rbio_orig_end_io(rbio, err);
         } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
                 rbio_orig_end_io(rbio, err);
- -      } else if (err == 0) {
+ +      } else if (err == BLK_STS_OK) {
                 rbio->faila = -1;
                 rbio->failb = -1;
   
@@@ -2005,7 -2007,7 +2007,7 @@@ static void raid_recover_end_io(struct 
                 return;
   
         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
- -              rbio_orig_end_io(rbio, -EIO);
+ +              rbio_orig_end_io(rbio, BLK_STS_IOERR);
         else
                 __raid_recover_end_io(rbio);
   }
@@@ -2104,7 -2106,7 +2106,7 @@@ out
   cleanup:
         if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
             rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
- -              rbio_orig_end_io(rbio, -EIO);
+ +              rbio_orig_end_io(rbio, BLK_STS_IOERR);
         return -EIO;
   }
   
@@@ -2431,7 -2433,7 +2433,7 @@@ submit_write
         nr_data = bio_list_size(&bio_list);
         if (!nr_data) {
                 /* Every parity is right */
- -              rbio_orig_end_io(rbio, 0);
+ +              rbio_orig_end_io(rbio, BLK_STS_OK);
                 return;
         }
   
@@@ -2451,7 -2453,7 +2453,7 @@@
         return;
   
   cleanup:
- -      rbio_orig_end_io(rbio, -EIO);
+ +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
   }
   
   static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@@ -2519,7 -2521,7 +2521,7 @@@ static void validate_rbio_for_parity_sc
         return;
   
   cleanup:
- -      rbio_orig_end_io(rbio, -EIO);
+ +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
   }
   
   /*
@@@ -2633,7 -2635,7 +2635,7 @@@ static void raid56_parity_scrub_stripe(
         return;
   
   cleanup:
- -      rbio_orig_end_io(rbio, -EIO);
+ +      rbio_orig_end_io(rbio, BLK_STS_IOERR);
         return;
   
   finish:
diff --combined fs/btrfs/volumes.c

index bd679bc,f9f0f47..002aa31
--- 1/fs/btrfs/volumes.c
--- 2/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@@ -6188,7 -6188,7 +6188,7 @@@ static void submit_stripe_bio(struct bt
                 rcu_read_unlock();
         }
   #endif
-       bio->bi_bdev = dev->bdev;
+       bio_set_dev(bio, dev->bdev);
   
         btrfs_bio_counter_inc_noblocked(fs_info);
   
@@@ -6212,8 -6212,8 +6212,8 @@@ static void bbio_error(struct btrfs_bi
         }
   }
   
- -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- -                int mirror_num, int async_submit)
+ +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ +                         int mirror_num, int async_submit)
   {
         struct btrfs_device *dev;
         struct bio *first_bio = bio;
@@@ -6233,7 -6233,7 +6233,7 @@@
                                 &map_length, &bbio, mirror_num, 1);
         if (ret) {
                 btrfs_bio_counter_dec(fs_info);
- -              return ret;
+ +              return errno_to_blk_status(ret);
         }
   
         total_devs = bbio->num_stripes;
@@@ -6256,7 -6256,7 +6256,7 @@@
                 }
   
                 btrfs_bio_counter_dec(fs_info);
- -              return ret;
+ +              return errno_to_blk_status(ret);
         }
   
         if (map_length < length) {
@@@ -6283,7 -6283,7 +6283,7 @@@
                                   dev_nr, async_submit);
         }
         btrfs_bio_counter_dec(fs_info);
- -      return 0;
+ +      return BLK_STS_OK;
   }
   
   struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
diff --combined fs/buffer.c

index 50da0e1,50e51a6..170df85
--- 1/fs/buffer.c
--- 2/fs/buffer.c
+++ b/fs/buffer.c
@@@ -1627,17 -1627,20 +1627,17 @@@ void clean_bdev_aliases(struct block_de
         struct pagevec pvec;
         pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
         pgoff_t end;
- -      int i;
+ +      int i, count;
         struct buffer_head *bh;
         struct buffer_head *head;
   
         end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
         pagevec_init(&pvec, 0);
- -      while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
- -                      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
- -              for (i = 0; i < pagevec_count(&pvec); i++) {
+ +      while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
+ +              count = pagevec_count(&pvec);
+ +              for (i = 0; i < count; i++) {
                         struct page *page = pvec.pages[i];
   
- -                      index = page->index;
- -                      if (index > end)
- -                              break;
                         if (!page_has_buffers(page))
                                 continue;
                         /*
@@@ -1667,9 -1670,7 +1667,9 @@@ unlock_page
                 }
                 pagevec_release(&pvec);
                 cond_resched();
- -              index++;
+ +              /* End of range already reached? */
+ +              if (index > end || !index)
+ +                      break;
         }
   }
   EXPORT_SYMBOL(clean_bdev_aliases);
@@@ -3056,7 -3057,7 +3056,7 @@@ void guard_bio_eod(int op, struct bio *
         struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
         unsigned truncated_bytes;
   
-       maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+       maxsector = get_capacity(bio->bi_disk);
         if (!maxsector)
                 return;
   
@@@ -3115,7 -3116,7 +3115,7 @@@ static int submit_bh_wbc(int op, int op
         }
   
         bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
+       bio_set_dev(bio, bh->b_bdev);
         bio->bi_write_hint = write_hint;
   
         bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@@ -3548,10 -3549,10 +3548,10 @@@ page_cache_seek_hole_data(struct inode 
         pagevec_init(&pvec, 0);
   
         do {
- -              unsigned want, nr_pages, i;
+ +              unsigned nr_pages, i;
   
- -              want = min_t(unsigned, end - index, PAGEVEC_SIZE);
- -              nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ +              nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ +                                              end - 1);
                 if (nr_pages == 0)
                         break;
   
@@@ -3572,6 -3573,10 +3572,6 @@@
                             lastoff < page_offset(page))
                                 goto check_range;
   
- -                      /* Searching done if the page index is out of range. */
- -                      if (page->index >= end)
- -                              goto not_found;
- -
                         lock_page(page);
                         if (likely(page->mapping == inode->i_mapping) &&
                             page_has_buffers(page)) {
@@@ -3584,6 -3589,12 +3584,6 @@@
                         unlock_page(page);
                         lastoff = page_offset(page) + PAGE_SIZE;
                 }
- -
- -              /* Searching done if fewer pages returned than wanted. */
- -              if (nr_pages < want)
- -                      break;
- -
- -              index = pvec.pages[i - 1]->index + 1;
                 pagevec_release(&pvec);
         } while (index < end);
   
diff --combined fs/gfs2/lops.c

index 7dabbe7,720c19a..c8ff7b7
--- 1/fs/gfs2/lops.c
--- 2/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@@ -207,11 -207,8 +207,11 @@@ static void gfs2_end_log_write(struct b
         struct page *page;
         int i;
   
- -      if (bio->bi_status)
- -              fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
+ +      if (bio->bi_status) {
+ +              fs_err(sdp, "Error %d writing to journal, jid=%u\n",
+ +                     bio->bi_status, sdp->sd_jdesc->jd_jid);
+ +              wake_up(&sdp->sd_logd_waitq);
+ +      }
   
         bio_for_each_segment_all(bvec, bio, i) {
                 page = bvec->bv_page;
@@@ -268,7 -265,7 +268,7 @@@ static struct bio *gfs2_log_alloc_bio(s
   
         bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
         bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
-       bio->bi_bdev = sb->s_bdev;
+       bio_set_dev(bio, sb->s_bdev);
         bio->bi_end_io = gfs2_end_log_write;
         bio->bi_private = sdp;
   
diff --combined fs/gfs2/meta_io.c

index 61ef6c9,39433a1..52de103
--- 1/fs/gfs2/meta_io.c
--- 2/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@@ -221,7 -221,7 +221,7 @@@ static void gfs2_submit_bhs(int op, in
   
                 bio = bio_alloc(GFP_NOIO, num);
                 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-               bio->bi_bdev = bh->b_bdev;
+               bio_set_dev(bio, bh->b_bdev);
                 while (num > 0) {
                         bh = *bhs;
                         if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
@@@ -419,9 -419,8 +419,9 @@@ int gfs2_meta_indirect_buffer(struct gf
         if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
                 brelse(bh);
                 ret = -EIO;
+ +      } else {
+ +              *bhp = bh;
         }
- -      *bhp = bh;
         return ret;
   }
   
@@@ -453,7 -452,7 +453,7 @@@ struct buffer_head *gfs2_meta_ra(struc
         if (buffer_uptodate(first_bh))
                 goto out;
         if (!buffer_locked(first_bh))
- -              ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
+ +              ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
   
         dblock++;
         extlen--;
@@@ -462,9 -461,7 +462,9 @@@
                 bh = gfs2_getbuf(gl, dblock, CREATE);
   
                 if (!buffer_uptodate(bh) && !buffer_locked(bh))
- -                      ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh);
+ +                      ll_rw_block(REQ_OP_READ,
+ +                                  REQ_RAHEAD | REQ_META | REQ_PRIO,
+ +                                  1, &bh);
                 brelse(bh);
                 dblock++;
                 extlen--;
diff --combined fs/gfs2/ops_fstype.c

index c0a4b37,8155e16..8459358
--- 1/fs/gfs2/ops_fstype.c
--- 2/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@@ -242,7 -242,7 +242,7 @@@ static int gfs2_read_super(struct gfs2_
   
         bio = bio_alloc(GFP_NOFS, 1);
         bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
-       bio->bi_bdev = sb->s_bdev;
+       bio_set_dev(bio, sb->s_bdev);
         bio_add_page(bio, page, PAGE_SIZE, 0);
   
         bio->bi_end_io = end_bio_io_page;
@@@ -1113,7 -1113,7 +1113,7 @@@ static int fill_super(struct super_bloc
                 return error;
         }
   
- -      snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+ +      snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
   
         error = gfs2_sys_fs_add(sdp);
         /*
@@@ -1159,10 -1159,10 +1159,10 @@@
         }
   
         if (sdp->sd_args.ar_spectator)
- -              snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+ +              snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s",
                          sdp->sd_table_name);
         else
- -              snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+ +              snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u",
                          sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
   
         error = init_inodes(sdp, DO);
@@@ -1388,6 -1388,7 +1388,6 @@@ static void gfs2_kill_sb(struct super_b
         sdp->sd_root_dir = NULL;
         sdp->sd_master_dir = NULL;
         shrink_dcache_sb(sb);
- -      gfs2_delete_debugfs_file(sdp);
         free_percpu(sdp->sd_lkstats);
         kill_block_super(sb);
   }
diff --combined fs/iomap.c

index 8554a8d,77be885..269b24a
--- 1/fs/iomap.c
--- 2/fs/iomap.c
+++ b/fs/iomap.c
@@@ -278,7 -278,7 +278,7 @@@ iomap_dirty_actor(struct inode *inode, 
                 unsigned long bytes;    /* Bytes to write to page */
   
                 offset = (pos & (PAGE_SIZE - 1));
- -              bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+ +              bytes = min_t(loff_t, PAGE_SIZE - offset, length);
   
                 rpage = __iomap_read_page(inode, pos);
                 if (IS_ERR(rpage))
@@@ -373,7 -373,7 +373,7 @@@ iomap_zero_range_actor(struct inode *in
                 unsigned offset, bytes;
   
                 offset = pos & (PAGE_SIZE - 1); /* Within page */
- -              bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+ +              bytes = min_t(loff_t, PAGE_SIZE - offset, count);
   
                 if (IS_DAX(inode))
                         status = iomap_dax_zero(pos, offset, bytes, iomap);
@@@ -477,10 -477,10 +477,10 @@@ int iomap_page_mkwrite(struct vm_fault 
   
         set_page_dirty(page);
         wait_for_stable_page(page);
- -      return 0;
+ +      return VM_FAULT_LOCKED;
   out_unlock:
         unlock_page(page);
- -      return ret;
+ +      return block_page_mkwrite_return(ret);
   }
   EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
   
@@@ -805,7 -805,7 +805,7 @@@ iomap_dio_zero(struct iomap_dio *dio, s
         struct bio *bio;
   
         bio = bio_alloc(GFP_KERNEL, 1);
-       bio->bi_bdev = iomap->bdev;
+       bio_set_dev(bio, iomap->bdev);
         bio->bi_iter.bi_sector =
                 iomap->blkno + ((pos - iomap->offset) >> 9);
         bio->bi_private = dio;
@@@ -884,7 -884,7 +884,7 @@@ iomap_dio_actor(struct inode *inode, lo
                         return 0;
   
                 bio = bio_alloc(GFP_KERNEL, nr_pages);
-               bio->bi_bdev = iomap->bdev;
+               bio_set_dev(bio, iomap->bdev);
                 bio->bi_iter.bi_sector =
                         iomap->blkno + ((pos - iomap->offset) >> 9);
                 bio->bi_write_hint = dio->iocb->ki_hint;
diff --combined fs/kernfs/file.c

index e6c8954,7441925..9698e51
--- 1/fs/kernfs/file.c
--- 2/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@@ -616,7 -616,7 +616,7 @@@ static void kernfs_put_open_node(struc
   
   static int kernfs_fop_open(struct inode *inode, struct file *file)
   {
-       struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+       struct kernfs_node *kn = inode->i_private;
         struct kernfs_root *root = kernfs_root(kn);
         const struct kernfs_ops *ops;
         struct kernfs_open_file *of;
@@@ -768,7 -768,7 +768,7 @@@ static void kernfs_release_file(struct 
   
   static int kernfs_fop_release(struct inode *inode, struct file *filp)
   {
-       struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+       struct kernfs_node *kn = inode->i_private;
         struct kernfs_open_file *of = kernfs_of(filp);
   
         if (kn->flags & KERNFS_HAS_RELEASE) {
@@@ -835,7 -835,7 +835,7 @@@ void kernfs_drain_open_files(struct ker
   static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
   {
         struct kernfs_open_file *of = kernfs_of(filp);
-       struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+       struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
         struct kernfs_open_node *on = kn->attr.open;
   
         if (!kernfs_get_active(kn))
@@@ -895,7 -895,7 +895,7 @@@ repeat
                  * have the matching @file available.  Look up the inodes
                  * and generate the events manually.
                  */
-               inode = ilookup(info->sb, kn->ino);
+               inode = ilookup(info->sb, kn->id.ino);
                 if (!inode)
                         continue;
   
@@@ -903,7 -903,7 +903,7 @@@
                 if (parent) {
                         struct inode *p_inode;
   
-                       p_inode = ilookup(info->sb, parent->ino);
+                       p_inode = ilookup(info->sb, parent->id.ino);
                         if (p_inode) {
                                 fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
                                          inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
@@@ -997,7 -997,7 +997,7 @@@ struct kernfs_node *__kernfs_create_fil
   
   #ifdef CONFIG_DEBUG_LOCK_ALLOC
         if (key) {
- -              lockdep_init_map(&kn->dep_map, "s_active", key, 0);
+ +              lockdep_init_map(&kn->dep_map, "kn->count", key, 0);
                 kn->flags |= KERNFS_LOCKDEP;
         }
   #endif
diff --combined fs/ocfs2/cluster/heartbeat.c

index 56ac07c,6aea157..d020604
--- 1/fs/ocfs2/cluster/heartbeat.c
--- 2/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@@ -505,7 -505,8 +505,7 @@@ static inline void o2hb_bio_wait_dec(st
         }
   }
   
- -static void o2hb_wait_on_io(struct o2hb_region *reg,
- -                          struct o2hb_bio_wait_ctxt *wc)
+ +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
   {
         o2hb_bio_wait_dec(wc, 1);
         wait_for_completion(&wc->wc_io_complete);
@@@ -553,7 -554,7 +553,7 @@@ static struct bio *o2hb_setup_one_bio(s
   
         /* Must put everything in 512 byte sectors for the bio... */
         bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
-       bio->bi_bdev = reg->hr_bdev;
+       bio_set_dev(bio, reg->hr_bdev);
         bio->bi_private = wc;
         bio->bi_end_io = o2hb_bio_end_io;
         bio_set_op_attrs(bio, op, op_flags);
@@@ -607,7 -608,7 +607,7 @@@ static int o2hb_read_slots(struct o2hb_
         status = 0;
   
   bail_and_wait:
- -      o2hb_wait_on_io(reg, &wc);
+ +      o2hb_wait_on_io(&wc);
         if (wc.wc_error && !status)
                 status = wc.wc_error;
   
@@@ -1161,7 -1162,7 +1161,7 @@@ static int o2hb_do_disk_heartbeat(struc
          * before we can go to steady state.  This ensures that
          * people we find in our steady state have seen us.
          */
- -      o2hb_wait_on_io(reg, &write_wc);
+ +      o2hb_wait_on_io(&write_wc);
         if (write_wc.wc_error) {
                 /* Do not re-arm the write timeout on I/O error - we
                  * can't be sure that the new block ever made it to
@@@ -1274,7 -1275,7 +1274,7 @@@ static int o2hb_thread(void *data
                 o2hb_prepare_block(reg, 0);
                 ret = o2hb_issue_node_write(reg, &write_wc);
                 if (ret == 0)
- -                      o2hb_wait_on_io(reg, &write_wc);
+ +                      o2hb_wait_on_io(&write_wc);
                 else
                         mlog_errno(ret);
         }
@@@ -2575,6 -2576,22 +2575,6 @@@ void o2hb_unregister_callback(const cha
   }
   EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
   
- -int o2hb_check_node_heartbeating(u8 node_num)
- -{
- -      unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- -
- -      o2hb_fill_node_map(testing_map, sizeof(testing_map));
- -      if (!test_bit(node_num, testing_map)) {
- -              mlog(ML_HEARTBEAT,
- -                   "node (%u) does not have heartbeating enabled.\n",
- -                   node_num);
- -              return 0;
- -      }
- -
- -      return 1;
- -}
- -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
- -
   int o2hb_check_node_heartbeating_no_sem(u8 node_num)
   {
         unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@@ -2609,6 -2626,23 +2609,6 @@@ int o2hb_check_node_heartbeating_from_c
   }
   EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
   
- -/* Makes sure our local node is configured with a node number, and is
- - * heartbeating. */
- -int o2hb_check_local_node_heartbeating(void)
- -{
- -      u8 node_num;
- -
- -      /* if this node was set then we have networking */
- -      node_num = o2nm_this_node();
- -      if (node_num == O2NM_MAX_NODES) {
- -              mlog(ML_HEARTBEAT, "this node has not been configured.\n");
- -              return 0;
- -      }
- -
- -      return o2hb_check_node_heartbeating(node_num);
- -}
- -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
- -
   /*
    * this is just a hack until we get the plumbing which flips file systems
    * read only and drops the hb ref instead of killing the node dead.
diff --combined fs/xfs/xfs_aops.c

index f9efd67,c8ca03a..fffae13
--- 1/fs/xfs/xfs_aops.c
--- 2/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@@ -85,11 -85,11 +85,11 @@@ xfs_find_bdev_for_inode
    * associated buffer_heads, paying attention to the start and end offsets that
    * we need to process on the page.
    *
- - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
- - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
- - * the page at all, as we may be racing with memory reclaim and it can free both
- - * the bufferhead chain and the page as it will see the page as clean and
- - * unused.
+ + * Note that we open code the action in end_buffer_async_write here so that we
+ + * only have to iterate over the buffers attached to the page once.  This is not
+ + * only more efficient, but also ensures that we only call end_page_writeback
+ + * at the end of the iteration, and thus avoids the pitfall of having the page
+ + * and buffers potentially freed after every call to end_buffer_async_write.
    */
   static void
   xfs_finish_page_writeback(
@@@ -97,44 -97,29 +97,44 @@@
         struct bio_vec          *bvec,
         int                     error)
   {
- -      unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
- -      struct buffer_head      *head, *bh, *next;
+ +      struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
+ +      bool                    busy = false;
         unsigned int            off = 0;
- -      unsigned int            bsize;
+ +      unsigned long           flags;
   
         ASSERT(bvec->bv_offset < PAGE_SIZE);
         ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
- -      ASSERT(end < PAGE_SIZE);
+ +      ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
         ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
   
- -      bh = head = page_buffers(bvec->bv_page);
- -
- -      bsize = bh->b_size;
+ +      local_irq_save(flags);
+ +      bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
         do {
- -              if (off > end)
- -                      break;
- -              next = bh->b_this_page;
- -              if (off < bvec->bv_offset)
- -                      goto next_bh;
- -              bh->b_end_io(bh, !error);
- -next_bh:
- -              off += bsize;
- -      } while ((bh = next) != head);
+ +              if (off >= bvec->bv_offset &&
+ +                  off < bvec->bv_offset + bvec->bv_len) {
+ +                      ASSERT(buffer_async_write(bh));
+ +                      ASSERT(bh->b_end_io == NULL);
+ +
+ +                      if (error) {
+ +                              mark_buffer_write_io_error(bh);
+ +                              clear_buffer_uptodate(bh);
+ +                              SetPageError(bvec->bv_page);
+ +                      } else {
+ +                              set_buffer_uptodate(bh);
+ +                      }
+ +                      clear_buffer_async_write(bh);
+ +                      unlock_buffer(bh);
+ +              } else if (buffer_async_write(bh)) {
+ +                      ASSERT(buffer_locked(bh));
+ +                      busy = true;
+ +              }
+ +              off += bh->b_size;
+ +      } while ((bh = bh->b_this_page) != head);
+ +      bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ +      local_irq_restore(flags);
+ +
+ +      if (!busy)
+ +              end_page_writeback(bvec->bv_page);
   }
   
   /*
@@@ -148,10 -133,8 +148,10 @@@ xfs_destroy_ioend
         int                     error)
   {
         struct inode            *inode = ioend->io_inode;
- -      struct bio              *last = ioend->io_bio;
- -      struct bio              *bio, *next;
+ +      struct bio              *bio = &ioend->io_inline_bio;
+ +      struct bio              *last = ioend->io_bio, *next;
+ +      u64                     start = bio->bi_iter.bi_sector;
+ +      bool                    quiet = bio_flagged(bio, BIO_QUIET);
   
         for (bio = &ioend->io_inline_bio; bio; bio = next) {
                 struct bio_vec  *bvec;
@@@ -172,11 -155,6 +172,11 @@@
   
                 bio_put(bio);
         }
+ +
+ +      if (unlikely(error && !quiet)) {
+ +              xfs_err_ratelimited(XFS_I(inode)->i_mount,
+ +                      "writeback error on sector %llu", start);
+ +      }
   }
   
   /*
@@@ -445,8 -423,7 +445,8 @@@ xfs_start_buffer_writeback
         ASSERT(!buffer_delay(bh));
         ASSERT(!buffer_unwritten(bh));
   
- -      mark_buffer_async_write(bh);
+ +      bh->b_end_io = NULL;
+ +      set_buffer_async_write(bh);
         set_buffer_uptodate(bh);
         clear_buffer_dirty(bh);
   }
@@@ -540,7 -517,7 +540,7 @@@ xfs_init_bio_from_bh
         struct buffer_head      *bh)
   {
         bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
+       bio_set_dev(bio, bh->b_bdev);
   }
   
   static struct xfs_ioend *
diff --combined include/linux/bio.h

index 1f0720d,a8fe793..275c91c
--- 1/include/linux/bio.h
--- 2/include/linux/bio.h
+++ b/include/linux/bio.h
@@@ -38,15 -38,7 +38,15 @@@
   #define BIO_BUG_ON
   #endif
   
+ +#ifdef CONFIG_THP_SWAP
+ +#if HPAGE_PMD_NR > 256
+ +#define BIO_MAX_PAGES         HPAGE_PMD_NR
+ +#else
   #define BIO_MAX_PAGES         256
+ +#endif
+ +#else
+ +#define BIO_MAX_PAGES         256
+ +#endif
   
   #define bio_prio(bio)                 (bio)->bi_ioprio
   #define bio_set_prio(bio, prio)               ((bio)->bi_ioprio = prio)
@@@ -471,10 -463,11 +471,11 @@@ extern struct bio *bio_copy_kern(struc
   extern void bio_set_pages_dirty(struct bio *bio);
   extern void bio_check_pages_dirty(struct bio *bio);
   
- void generic_start_io_acct(int rw, unsigned long sectors,
-                          struct hd_struct *part);
- void generic_end_io_acct(int rw, struct hd_struct *part,
-                        unsigned long start_time);
+ void generic_start_io_acct(struct request_queue *q, int rw,
+                               unsigned long sectors, struct hd_struct *part);
+ void generic_end_io_acct(struct request_queue *q, int rw,
+                               struct hd_struct *part,
+                               unsigned long start_time);
   
   #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
   # error       "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
@@@ -501,6 -494,24 +502,24 @@@ extern struct bio_vec *bvec_alloc(gfp_t
   extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
   extern unsigned int bvec_nr_vecs(unsigned short idx);
   
+ #define bio_set_dev(bio, bdev)                        \
+ do {                                          \
+       (bio)->bi_disk = (bdev)->bd_disk;       \
+       (bio)->bi_partno = (bdev)->bd_partno;   \
+ } while (0)
+ 
+ #define bio_copy_dev(dst, src)                        \
+ do {                                          \
+       (dst)->bi_disk = (src)->bi_disk;        \
+       (dst)->bi_partno = (src)->bi_partno;    \
+ } while (0)
+ 
+ #define bio_dev(bio) \
+       disk_devt((bio)->bi_disk)
+ 
+ #define bio_devname(bio, buf) \
+       __bdevname(bio_dev(bio), (buf))
+ 
   #ifdef CONFIG_BLK_CGROUP
   int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
   int bio_associate_current(struct bio *bio);
diff --combined include/linux/blkdev.h

index 4b99b13,f45f157..460294b
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -134,7 -134,7 +134,7 @@@ typedef __u32 __bitwise req_flags_t
   struct request {
         struct list_head queuelist;
         union {
- -              struct call_single_data csd;
+ +              call_single_data_t csd;
                 u64 fifo_time;
         };
   
@@@ -568,6 -568,7 +568,6 @@@ struct request_queue 
   
   #if defined(CONFIG_BLK_DEV_BSG)
         bsg_job_fn              *bsg_job_fn;
- -      int                     bsg_job_size;
         struct bsg_class_device bsg_dev;
   #endif
   
@@@ -600,38 -601,36 +600,36 @@@
         u64                     write_hints[BLK_MAX_WRITE_HINTS];
   };
   
- #define QUEUE_FLAG_QUEUED     1       /* uses generic tag queueing */
- #define QUEUE_FLAG_STOPPED    2       /* queue is stopped */
- #define       QUEUE_FLAG_SYNCFULL     3       /* read queue has been filled */
- #define QUEUE_FLAG_ASYNCFULL  4       /* write queue has been filled */
- #define QUEUE_FLAG_DYING      5       /* queue being torn down */
- #define QUEUE_FLAG_BYPASS     6       /* act as dumb FIFO queue */
- #define QUEUE_FLAG_BIDI               7       /* queue supports bidi requests */
- #define QUEUE_FLAG_NOMERGES     8     /* disable merge attempts */
- #define QUEUE_FLAG_SAME_COMP  9       /* complete on same CPU-group */
- #define QUEUE_FLAG_FAIL_IO     10     /* fake timeout */
- #define QUEUE_FLAG_STACKABLE   11     /* supports request stacking */
- #define QUEUE_FLAG_NONROT      12     /* non-rotational device (SSD) */
+ #define QUEUE_FLAG_QUEUED     0       /* uses generic tag queueing */
+ #define QUEUE_FLAG_STOPPED    1       /* queue is stopped */
+ #define QUEUE_FLAG_DYING      2       /* queue being torn down */
+ #define QUEUE_FLAG_BYPASS     3       /* act as dumb FIFO queue */
+ #define QUEUE_FLAG_BIDI               4       /* queue supports bidi requests */
+ #define QUEUE_FLAG_NOMERGES     5     /* disable merge attempts */
+ #define QUEUE_FLAG_SAME_COMP  6       /* complete on same CPU-group */
+ #define QUEUE_FLAG_FAIL_IO    7       /* fake timeout */
+ #define QUEUE_FLAG_STACKABLE  8       /* supports request stacking */
+ #define QUEUE_FLAG_NONROT     9       /* non-rotational device (SSD) */
   #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
- #define QUEUE_FLAG_IO_STAT     13     /* do IO stats */
- #define QUEUE_FLAG_DISCARD     14     /* supports DISCARD */
- #define QUEUE_FLAG_NOXMERGES   15     /* No extended merges */
- #define QUEUE_FLAG_ADD_RANDOM  16     /* Contributes to random pool */
- #define QUEUE_FLAG_SECERASE    17     /* supports secure erase */
- #define QUEUE_FLAG_SAME_FORCE  18     /* force complete on same CPU */
- #define QUEUE_FLAG_DEAD        19     /* queue tear-down finished */
- #define QUEUE_FLAG_INIT_DONE   20     /* queue is initialized */
- #define QUEUE_FLAG_NO_SG_MERGE 21     /* don't attempt to merge SG segments*/
- #define QUEUE_FLAG_POLL              22       /* IO polling enabled if set */
- #define QUEUE_FLAG_WC        23       /* Write back caching */
- #define QUEUE_FLAG_FUA               24       /* device supports FUA writes */
- #define QUEUE_FLAG_FLUSH_NQ    25     /* flush not queueuable */
- #define QUEUE_FLAG_DAX         26     /* device supports DAX */
- #define QUEUE_FLAG_STATS       27     /* track rq completion times */
- #define QUEUE_FLAG_POLL_STATS  28     /* collecting stats for hybrid polling */
- #define QUEUE_FLAG_REGISTERED  29     /* queue has been registered to a disk */
- #define QUEUE_FLAG_SCSI_PASSTHROUGH 30        /* queue supports SCSI commands */
- #define QUEUE_FLAG_QUIESCED    31     /* queue has been quiesced */
+ #define QUEUE_FLAG_IO_STAT     10     /* do IO stats */
+ #define QUEUE_FLAG_DISCARD     11     /* supports DISCARD */
+ #define QUEUE_FLAG_NOXMERGES   12     /* No extended merges */
+ #define QUEUE_FLAG_ADD_RANDOM  13     /* Contributes to random pool */
+ #define QUEUE_FLAG_SECERASE    14     /* supports secure erase */
+ #define QUEUE_FLAG_SAME_FORCE  15     /* force complete on same CPU */
+ #define QUEUE_FLAG_DEAD        16     /* queue tear-down finished */
+ #define QUEUE_FLAG_INIT_DONE   17     /* queue is initialized */
+ #define QUEUE_FLAG_NO_SG_MERGE 18     /* don't attempt to merge SG segments*/
+ #define QUEUE_FLAG_POLL              19       /* IO polling enabled if set */
+ #define QUEUE_FLAG_WC        20       /* Write back caching */
+ #define QUEUE_FLAG_FUA               21       /* device supports FUA writes */
+ #define QUEUE_FLAG_FLUSH_NQ    22     /* flush not queueable */
+ #define QUEUE_FLAG_DAX         23     /* device supports DAX */
+ #define QUEUE_FLAG_STATS       24     /* track rq completion times */
+ #define QUEUE_FLAG_POLL_STATS  25     /* collecting stats for hybrid polling */
+ #define QUEUE_FLAG_REGISTERED  26     /* queue has been registered to a disk */
+ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27        /* queue supports SCSI commands */
+ #define QUEUE_FLAG_QUIESCED    28     /* queue has been quiesced */
   
   #define QUEUE_FLAG_DEFAULT    ((1 << QUEUE_FLAG_IO_STAT) |            \
                                  (1 << QUEUE_FLAG_STACKABLE)    |       \
diff --combined include/linux/cgroup.h

index 085056e,6144fe9..d023ac5
--- 1/include/linux/cgroup.h
--- 2/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -36,28 -36,18 +36,28 @@@
   #define CGROUP_WEIGHT_DFL             100
   #define CGROUP_WEIGHT_MAX             10000
   
+ +/* walk only threadgroup leaders */
+ +#define CSS_TASK_ITER_PROCS           (1U << 0)
+ +/* walk all threaded css_sets in the domain */
+ +#define CSS_TASK_ITER_THREADED                (1U << 1)
+ +
   /* a css_task_iter should be treated as an opaque object */
   struct css_task_iter {
         struct cgroup_subsys            *ss;
+ +      unsigned int                    flags;
   
         struct list_head                *cset_pos;
         struct list_head                *cset_head;
   
+ +      struct list_head                *tcset_pos;
+ +      struct list_head                *tcset_head;
+ +
         struct list_head                *task_pos;
         struct list_head                *tasks_head;
         struct list_head                *mg_tasks_head;
   
         struct css_set                  *cur_cset;
+ +      struct css_set                  *cur_dcset;
         struct task_struct              *cur_task;
         struct list_head                iters_node;     /* css_set->task_iters */
   };
@@@ -139,7 -129,7 +139,7 @@@ struct task_struct *cgroup_taskset_firs
   struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
   
- -void css_task_iter_start(struct cgroup_subsys_state *css,
+ +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                          struct css_task_iter *it);
   struct task_struct *css_task_iter_next(struct css_task_iter *it);
   void css_task_iter_end(struct css_task_iter *it);
@@@ -398,16 -388,6 +398,16 @@@ static inline void css_put_many(struct 
                 percpu_ref_put_many(&css->refcnt, n);
   }
   
+ +static inline void cgroup_get(struct cgroup *cgrp)
+ +{
+ +      css_get(&cgrp->self);
+ +}
+ +
+ +static inline bool cgroup_tryget(struct cgroup *cgrp)
+ +{
+ +      return css_tryget(&cgrp->self);
+ +}
+ +
   static inline void cgroup_put(struct cgroup *cgrp)
   {
         css_put(&cgrp->self);
@@@ -520,20 -500,6 +520,20 @@@ static inline struct cgroup *task_cgrou
         return task_css(task, subsys_id)->cgroup;
   }
   
+ +static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
+ +{
+ +      return task_css_set(task)->dfl_cgrp;
+ +}
+ +
+ +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+ +{
+ +      struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ +
+ +      if (parent_css)
+ +              return container_of(parent_css, struct cgroup, self);
+ +      return NULL;
+ +}
+ +
   /**
    * cgroup_is_descendant - test ancestry
    * @cgrp: the cgroup to be tested
@@@ -571,14 -537,13 +571,14 @@@ static inline bool task_under_cgroup_hi
   /* no synchronization, the result can only be used as a hint */
   static inline bool cgroup_is_populated(struct cgroup *cgrp)
   {
- -      return cgrp->populated_cnt;
+ +      return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
+ +              cgrp->nr_populated_threaded_children;
   }
   
   /* returns ino associated with a cgroup */
   static inline ino_t cgroup_ino(struct cgroup *cgrp)
   {
-       return cgrp->kn->ino;
+       return cgrp->kn->id.ino;
   }
   
   /* cft/css accessors for cftype->write() operation */
@@@ -644,6 -609,13 +644,13 @@@ static inline void cgroup_kthread_ready
         current->no_cgroup_migration = 0;
   }
   
+ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
+ {
+       return &cgrp->kn->id;
+ }
+ 
+ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+                                       char *buf, size_t buflen);
   #else /* !CONFIG_CGROUPS */
   
   struct cgroup_subsys_state;
@@@ -666,12 -638,19 +673,19 @@@ static inline int cgroup_init_early(voi
   static inline int cgroup_init(void) { return 0; }
   static inline void cgroup_init_kthreadd(void) {}
   static inline void cgroup_kthread_ready(void) {}
+ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
+ {
+       return NULL;
+ }
   
   static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                                struct cgroup *ancestor)
   {
         return true;
   }
+ 
+ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+       char *buf, size_t buflen) {}
   #endif /* !CONFIG_CGROUPS */
   
   /*
diff --combined include/linux/fs.h

index c57002a,706dd3a..7d6079d
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -72,8 -72,6 +72,8 @@@ extern int leases_enable, lease_break_t
   extern int sysctl_protected_symlinks;
   extern int sysctl_protected_hardlinks;
   
+ +typedef __kernel_rwf_t rwf_t;
+ +
   struct buffer_head;
   typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
@@@ -429,6 -427,7 +429,7 @@@ struct block_device 
   #endif
         struct block_device *   bd_contains;
         unsigned                bd_block_size;
+       u8                      bd_partno;
         struct hd_struct *      bd_part;
         /* number of times partitions within this device have been opened. */
         unsigned                bd_part_count;
@@@ -909,9 -908,9 +910,9 @@@ static inline struct file *get_file(str
   /* Page cache limit. The filesystems should put that into their s_maxbytes 
      limits, otherwise bad things can happen in VM. */ 
   #if BITS_PER_LONG==32
- -#define MAX_LFS_FILESIZE      (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1)
+ +#define MAX_LFS_FILESIZE      ((loff_t)ULONG_MAX << PAGE_SHIFT)
   #elif BITS_PER_LONG==64
- -#define MAX_LFS_FILESIZE      ((loff_t)0x7fffffffffffffffLL)
+ +#define MAX_LFS_FILESIZE      ((loff_t)LLONG_MAX)
   #endif
   
   #define FL_POSIX      1
@@@ -1002,6 -1001,7 +1003,6 @@@ struct file_lock 
         unsigned char fl_type;
         unsigned int fl_pid;
         int fl_link_cpu;                /* what cpu's list is this on? */
- -      struct pid *fl_nspid;
         wait_queue_head_t fl_wait;
         struct file *fl_file;
         loff_t fl_start;
@@@ -1269,6 -1269,8 +1270,6 @@@ extern void f_delown(struct file *filp)
   extern pid_t f_getown(struct file *filp);
   extern int send_sigurg(struct fown_struct *fown);
   
- -struct mm_struct;
- -
   /*
    *    Umount options
    */
@@@ -1757,9 -1759,9 +1758,9 @@@ extern ssize_t __vfs_write(struct file 
   extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
   extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
   extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- -              unsigned long, loff_t *, int);
+ +              unsigned long, loff_t *, rwf_t);
   extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
- -              unsigned long, loff_t *, int);
+ +              unsigned long, loff_t *, rwf_t);
   extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                    loff_t, size_t, unsigned int);
   extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
@@@ -2470,13 -2472,9 +2471,13 @@@ static inline void bd_unlink_disk_holde
   #endif
   
   /* fs/char_dev.c */
- -#define CHRDEV_MAJOR_HASH_SIZE        255
+ +#define CHRDEV_MAJOR_MAX 512
   /* Marks the bottom of the first segment of free char majors */
   #define CHRDEV_MAJOR_DYN_END 234
+ +/* Marks the top and bottom of the second segment of free char majors */
+ +#define CHRDEV_MAJOR_DYN_EXT_START 511
+ +#define CHRDEV_MAJOR_DYN_EXT_END 384
+ +
   extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
   extern int register_chrdev_region(dev_t, unsigned, const char *);
   extern int __register_chrdev(unsigned int major, unsigned int baseminor,
@@@ -2503,14 -2501,14 +2504,14 @@@ static inline void unregister_chrdev(un
   #define BDEVT_SIZE    10      /* Largest string for MAJ:MIN for blkdev */
   
   #ifdef CONFIG_BLOCK
- -#define BLKDEV_MAJOR_HASH_SIZE        255
+ +#define BLKDEV_MAJOR_MAX      512
   extern const char *__bdevname(dev_t, char *buffer);
   extern const char *bdevname(struct block_device *bdev, char *buffer);
   extern struct block_device *lookup_bdev(const char *);
   extern void blkdev_show(struct seq_file *,off_t);
   
   #else
- -#define BLKDEV_MAJOR_HASH_SIZE        0
+ +#define BLKDEV_MAJOR_MAX      0
   #endif
   
   extern void init_special_inode(struct inode *, umode_t, dev_t);
@@@ -2542,19 -2540,12 +2543,19 @@@ extern int invalidate_inode_pages2_rang
   extern int write_inode_now(struct inode *, int);
   extern int filemap_fdatawrite(struct address_space *);
   extern int filemap_flush(struct address_space *);
- -extern int filemap_fdatawait(struct address_space *);
   extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
   extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                    loff_t lend);
+ +
+ +static inline int filemap_fdatawait(struct address_space *mapping)
+ +{
+ +      return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+ +}
+ +
   extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
                                   loff_t lend);
+ +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ +                                              loff_t lend);
   extern int filemap_write_and_wait(struct address_space *mapping);
   extern int filemap_write_and_wait_range(struct address_space *mapping,
                                         loff_t lstart, loff_t lend);
@@@ -2563,19 -2554,12 +2564,19 @@@ extern int __filemap_fdatawrite_range(s
   extern int filemap_fdatawrite_range(struct address_space *mapping,
                                 loff_t start, loff_t end);
   extern int filemap_check_errors(struct address_space *mapping);
- -
   extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+ +
+ +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ +                                              loff_t lend);
   extern int __must_check file_check_and_advance_wb_err(struct file *file);
   extern int __must_check file_write_and_wait_range(struct file *file,
                                                 loff_t start, loff_t end);
   
+ +static inline int file_write_and_wait(struct file *file)
+ +{
+ +      return file_write_and_wait_range(file, 0, LLONG_MAX);
+ +}
+ +
   /**
    * filemap_set_wb_err - set a writeback error on an address_space
    * @mapping: mapping in which to set writeback error
@@@ -2589,6 -2573,8 +2590,6 @@@
    * When a writeback error occurs, most filesystems will want to call
    * filemap_set_wb_err to record the error in the mapping so that it will be
    * automatically reported whenever fsync is called on the file.
- - *
- - * FIXME: mention FS_* flag here?
    */
   static inline void filemap_set_wb_err(struct address_space *mapping, int err)
   {
@@@ -2846,7 -2832,6 +2847,7 @@@ static inline void lockdep_annotate_ino
   #endif
   extern void unlock_new_inode(struct inode *);
   extern unsigned int get_next_ino(void);
+ +extern void evict_inodes(struct super_block *sb);
   
   extern void __iget(struct inode * inode);
   extern void iget_failed(struct inode *);
@@@ -2890,9 -2875,9 +2891,9 @@@ extern ssize_t generic_file_direct_writ
   extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
   
   ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- -              int flags);
+ +              rwf_t flags);
   ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- -              int flags);
+ +              rwf_t flags);
   
   /* fs/block_dev.c */
   extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
@@@ -3159,7 -3144,7 +3160,7 @@@ static inline int iocb_flags(struct fil
         return res;
   }
   
- -static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+ +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
   {
         if (unlikely(flags & ~RWF_SUPPORTED))
                 return -EOPNOTSUPP;
diff --combined kernel/cgroup/cgroup.c

index 4f2196a,2aba1c5..d6551cd
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -162,9 -162,6 +162,9 @@@ static u16 cgrp_dfl_inhibit_ss_mask
   /* some controllers are implicitly enabled on the default hierarchy */
   static u16 cgrp_dfl_implicit_ss_mask;
   
+ +/* some controllers can be threaded on the default hierarchy */
+ +static u16 cgrp_dfl_threaded_ss_mask;
+ +
   /* The list of hierarchy roots */
   LIST_HEAD(cgroup_roots);
   static int cgroup_root_count;
@@@ -319,87 -316,13 +319,87 @@@ static void cgroup_idr_remove(struct id
         spin_unlock_bh(&cgroup_idr_lock);
   }
   
- -static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+ +static bool cgroup_has_tasks(struct cgroup *cgrp)
   {
- -      struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ +      return cgrp->nr_populated_csets;
+ +}
   
- -      if (parent_css)
- -              return container_of(parent_css, struct cgroup, self);
- -      return NULL;
+ +bool cgroup_is_threaded(struct cgroup *cgrp)
+ +{
+ +      return cgrp->dom_cgrp != cgrp;
+ +}
+ +
+ +/* can @cgrp host both domain and threaded children? */
+ +static bool cgroup_is_mixable(struct cgroup *cgrp)
+ +{
+ +      /*
+ +       * Root isn't under domain level resource control exempting it from
+ +       * the no-internal-process constraint, so it can serve as a thread
+ +       * root and a parent of resource domains at the same time.
+ +       */
+ +      return !cgroup_parent(cgrp);
+ +}
+ +
+ +/* can @cgrp become a thread root? should always be true for a thread root */
+ +static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+ +{
+ +      /* mixables don't care */
+ +      if (cgroup_is_mixable(cgrp))
+ +              return true;
+ +
+ +      /* domain roots can't be nested under threaded */
+ +      if (cgroup_is_threaded(cgrp))
+ +              return false;
+ +
+ +      /* can only have either domain or threaded children */
+ +      if (cgrp->nr_populated_domain_children)
+ +              return false;
+ +
+ +      /* and no domain controllers can be enabled */
+ +      if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ +              return false;
+ +
+ +      return true;
+ +}
+ +
+ +/* is @cgrp root of a threaded subtree? */
+ +bool cgroup_is_thread_root(struct cgroup *cgrp)
+ +{
+ +      /* thread root should be a domain */
+ +      if (cgroup_is_threaded(cgrp))
+ +              return false;
+ +
+ +      /* a domain w/ threaded children is a thread root */
+ +      if (cgrp->nr_threaded_children)
+ +              return true;
+ +
+ +      /*
+ +       * A domain which has tasks and explicit threaded controllers
+ +       * enabled is a thread root.
+ +       */
+ +      if (cgroup_has_tasks(cgrp) &&
+ +          (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ +              return true;
+ +
+ +      return false;
+ +}
+ +
+ +/* a domain which isn't connected to the root w/o breakage can't be used */
+ +static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+ +{
+ +      /* the cgroup itself can be a thread root */
+ +      if (cgroup_is_threaded(cgrp))
+ +              return false;
+ +
+ +      /* but the ancestors can't be unless mixable */
+ +      while ((cgrp = cgroup_parent(cgrp))) {
+ +              if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ +                      return false;
+ +              if (cgroup_is_threaded(cgrp))
+ +                      return false;
+ +      }
+ +
+ +      return true;
   }
   
   /* subsystems visibly enabled on a cgroup */
@@@ -408,14 -331,8 +408,14 @@@ static u16 cgroup_control(struct cgrou
         struct cgroup *parent = cgroup_parent(cgrp);
         u16 root_ss_mask = cgrp->root->subsys_mask;
   
- -      if (parent)
- -              return parent->subtree_control;
+ +      if (parent) {
+ +              u16 ss_mask = parent->subtree_control;
+ +
+ +              /* threaded cgroups can only have threaded controllers */
+ +              if (cgroup_is_threaded(cgrp))
+ +                      ss_mask &= cgrp_dfl_threaded_ss_mask;
+ +              return ss_mask;
+ +      }
   
         if (cgroup_on_dfl(cgrp))
                 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@@ -428,14 -345,8 +428,14 @@@ static u16 cgroup_ss_mask(struct cgrou
   {
         struct cgroup *parent = cgroup_parent(cgrp);
   
- -      if (parent)
- -              return parent->subtree_ss_mask;
+ +      if (parent) {
+ +              u16 ss_mask = parent->subtree_ss_mask;
+ +
+ +              /* threaded cgroups can only have threaded controllers */
+ +              if (cgroup_is_threaded(cgrp))
+ +                      ss_mask &= cgrp_dfl_threaded_ss_mask;
+ +              return ss_mask;
+ +      }
   
         return cgrp->root->subsys_mask;
   }
@@@ -525,12 -436,22 +525,12 @@@ out_unlock
         return css;
   }
   
- -static void __maybe_unused cgroup_get(struct cgroup *cgrp)
- -{
- -      css_get(&cgrp->self);
- -}
- -
   static void cgroup_get_live(struct cgroup *cgrp)
   {
         WARN_ON_ONCE(cgroup_is_dead(cgrp));
         css_get(&cgrp->self);
   }
   
- -static bool cgroup_tryget(struct cgroup *cgrp)
- -{
- -      return css_tryget(&cgrp->self);
- -}
- -
   struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
   {
         struct cgroup *cgrp = of->kn->parent->priv;
@@@ -639,11 -560,9 +639,11 @@@ EXPORT_SYMBOL_GPL(of_css)
    */
   struct css_set init_css_set = {
         .refcount               = REFCOUNT_INIT(1),
+ +      .dom_cset               = &init_css_set,
         .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
         .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
         .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
+ +      .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
         .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
         .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
         .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
@@@ -651,11 -570,6 +651,11 @@@
   
   static int css_set_count      = 1;    /* 1 for init_css_set */
   
+ +static bool css_set_threaded(struct css_set *cset)
+ +{
+ +      return cset->dom_cset != cset;
+ +}
+ +
   /**
    * css_set_populated - does a css_set contain any tasks?
    * @cset: target css_set
@@@ -673,48 -587,39 +673,48 @@@ static bool css_set_populated(struct cs
   }
   
   /**
- - * cgroup_update_populated - updated populated count of a cgroup
+ + * cgroup_update_populated - update the populated count of a cgroup
    * @cgrp: the target cgroup
    * @populated: inc or dec populated count
    *
    * One of the css_sets associated with @cgrp is either getting its first
- - * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
- - * count is propagated towards root so that a given cgroup's populated_cnt
- - * is zero iff the cgroup and all its descendants don't contain any tasks.
+ + * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
+ + * count is propagated towards root so that a given cgroup's
+ + * nr_populated_children is zero iff none of its descendants contain any
+ + * tasks.
    *
- - * @cgrp's interface file "cgroup.populated" is zero if
- - * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
- - * changes from or to zero, userland is notified that the content of the
- - * interface file has changed.  This can be used to detect when @cgrp and
- - * its descendants become populated or empty.
+ + * @cgrp's interface file "cgroup.populated" is zero if both
+ + * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ + * 1 otherwise.  When the sum changes from or to zero, userland is notified
+ + * that the content of the interface file has changed.  This can be used to
+ + * detect when @cgrp and its descendants become populated or empty.
    */
   static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
   {
+ +      struct cgroup *child = NULL;
+ +      int adj = populated ? 1 : -1;
+ +
         lockdep_assert_held(&css_set_lock);
   
         do {
- -              bool trigger;
+ +              bool was_populated = cgroup_is_populated(cgrp);
   
- -              if (populated)
- -                      trigger = !cgrp->populated_cnt++;
- -              else
- -                      trigger = !--cgrp->populated_cnt;
+ +              if (!child) {
+ +                      cgrp->nr_populated_csets += adj;
+ +              } else {
+ +                      if (cgroup_is_threaded(child))
+ +                              cgrp->nr_populated_threaded_children += adj;
+ +                      else
+ +                              cgrp->nr_populated_domain_children += adj;
+ +              }
   
- -              if (!trigger)
+ +              if (was_populated == cgroup_is_populated(cgrp))
                         break;
   
                 cgroup1_check_for_release(cgrp);
                 cgroup_file_notify(&cgrp->events_file);
   
+ +              child = cgrp;
                 cgrp = cgroup_parent(cgrp);
         } while (cgrp);
   }
@@@ -725,7 -630,7 +725,7 @@@
    * @populated: whether @cset is populated or depopulated
    *
    * @cset is either getting the first task or losing the last.  Update the
- - * ->populated_cnt of all associated cgroups accordingly.
+ + * populated counters of all associated cgroups accordingly.
    */
   static void css_set_update_populated(struct css_set *cset, bool populated)
   {
@@@ -748,7 -653,7 +748,7 @@@
    * css_set, @from_cset can be NULL.  If @task is being disassociated
    * instead of moved, @to_cset can be NULL.
    *
- - * This function automatically handles populated_cnt updates and
+ + * This function automatically handles populated counter updates and
    * css_task_iter adjustments but the caller is responsible for managing
    * @from_cset and @to_cset's reference counts.
    */
@@@ -832,8 -737,6 +832,8 @@@ void put_css_set_locked(struct css_set 
         if (!refcount_dec_and_test(&cset->refcount))
                 return;
   
+ +      WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+ +
         /* This css_set is dead. unlink it and release cgroup and css refs */
         for_each_subsys(ss, ssid) {
                 list_del(&cset->e_cset_node[ssid]);
@@@ -850,11 -753,6 +850,11 @@@
                 kfree(link);
         }
   
+ +      if (css_set_threaded(cset)) {
+ +              list_del(&cset->threaded_csets_node);
+ +              put_css_set_locked(cset->dom_cset);
+ +      }
+ +
         kfree_rcu(cset, rcu_head);
   }
   
@@@ -873,7 -771,6 +873,7 @@@ static bool compare_css_sets(struct css
                              struct cgroup *new_cgrp,
                              struct cgroup_subsys_state *template[])
   {
+ +      struct cgroup *new_dfl_cgrp;
         struct list_head *l1, *l2;
   
         /*
@@@ -884,16 -781,6 +884,16 @@@
         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                 return false;
   
+ +
+ +      /* @cset's domain should match the default cgroup's */
+ +      if (cgroup_on_dfl(new_cgrp))
+ +              new_dfl_cgrp = new_cgrp;
+ +      else
+ +              new_dfl_cgrp = old_cset->dfl_cgrp;
+ +
+ +      if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ +              return false;
+ +
         /*
          * Compare cgroup pointers in order to distinguish between
          * different cgroups in hierarchies.  As different cgroups may
@@@ -1101,11 -988,9 +1101,11 @@@ static struct css_set *find_css_set(str
         }
   
         refcount_set(&cset->refcount, 1);
+ +      cset->dom_cset = cset;
         INIT_LIST_HEAD(&cset->tasks);
         INIT_LIST_HEAD(&cset->mg_tasks);
         INIT_LIST_HEAD(&cset->task_iters);
+ +      INIT_LIST_HEAD(&cset->threaded_csets);
         INIT_HLIST_NODE(&cset->hlist);
         INIT_LIST_HEAD(&cset->cgrp_links);
         INIT_LIST_HEAD(&cset->mg_preload_node);
@@@ -1143,28 -1028,6 +1143,28 @@@
   
         spin_unlock_irq(&css_set_lock);
   
+ +      /*
+ +       * If @cset should be threaded, look up the matching dom_cset and
+ +       * link them up.  We first fully initialize @cset then look for the
+ +       * dom_cset.  It's simpler this way and safe as @cset is guaranteed
+ +       * to stay empty until we return.
+ +       */
+ +      if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ +              struct css_set *dcset;
+ +
+ +              dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ +              if (!dcset) {
+ +                      put_css_set(cset);
+ +                      return NULL;
+ +              }
+ +
+ +              spin_lock_irq(&css_set_lock);
+ +              cset->dom_cset = dcset;
+ +              list_add_tail(&cset->threaded_csets_node,
+ +                            &dcset->threaded_csets);
+ +              spin_unlock_irq(&css_set_lock);
+ +      }
+ +
         return cset;
   }
   
@@@ -1292,8 -1155,6 +1292,8 @@@ static struct cgroup *cset_cgroup_from_
   
         if (cset == &init_css_set) {
                 res = &root->cgrp;
+ +      } else if (root == &cgrp_dfl_root) {
+ +              res = cset->dfl_cgrp;
         } else {
                 struct cgrp_cset_link *link;
   
@@@ -1809,9 -1670,6 +1809,9 @@@ static void init_cgroup_housekeeping(st
         mutex_init(&cgrp->pidlist_mutex);
         cgrp->self.cgroup = cgrp;
         cgrp->self.flags |= CSS_ONLINE;
+ +      cgrp->dom_cgrp = cgrp;
+ +      cgrp->max_descendants = INT_MAX;
+ +      cgrp->max_depth = INT_MAX;
   
         for_each_subsys(ss, ssid)
                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@@ -1879,7 -1737,8 +1879,8 @@@ int cgroup_setup_root(struct cgroup_roo
                 &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
   
         root->kf_root = kernfs_create_root(kf_sops,
-                                          KERNFS_ROOT_CREATE_DEACTIVATED,
+                                          KERNFS_ROOT_CREATE_DEACTIVATED |
+                                          KERNFS_ROOT_SUPPORT_EXPORTOP,
                                            root_cgrp);
         if (IS_ERR(root->kf_root)) {
                 ret = PTR_ERR(root->kf_root);
@@@ -2148,8 -2007,6 +2149,8 @@@ static void cgroup_migrate_add_task(str
         if (!cset->mg_src_cgrp)
                 return;
   
+ +      mgctx->tset.nr_tasks++;
+ +
         list_move_tail(&task->cg_list, &cset->mg_tasks);
         if (list_empty(&cset->mg_node))
                 list_add_tail(&cset->mg_node,
@@@ -2238,19 -2095,21 +2239,19 @@@ static int cgroup_migrate_execute(struc
         struct css_set *cset, *tmp_cset;
         int ssid, failed_ssid, ret;
   
- -      /* methods shouldn't be called if no task is actually migrating */
- -      if (list_empty(&tset->src_csets))
- -              return 0;
- -
         /* check that we can legitimately attach to the cgroup */
- -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- -              if (ss->can_attach) {
- -                      tset->ssid = ssid;
- -                      ret = ss->can_attach(tset);
- -                      if (ret) {
- -                              failed_ssid = ssid;
- -                              goto out_cancel_attach;
+ +      if (tset->nr_tasks) {
+ +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ +                      if (ss->can_attach) {
+ +                              tset->ssid = ssid;
+ +                              ret = ss->can_attach(tset);
+ +                              if (ret) {
+ +                                      failed_ssid = ssid;
+ +                                      goto out_cancel_attach;
+ +                              }
                         }
- -              }
- -      } while_each_subsys_mask();
+ +              } while_each_subsys_mask();
+ +      }
   
         /*
          * Now that we're guaranteed success, proceed to move all tasks to
@@@ -2279,29 -2138,25 +2280,29 @@@
          */
         tset->csets = &tset->dst_csets;
   
- -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- -              if (ss->attach) {
- -                      tset->ssid = ssid;
- -                      ss->attach(tset);
- -              }
- -      } while_each_subsys_mask();
+ +      if (tset->nr_tasks) {
+ +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ +                      if (ss->attach) {
+ +                              tset->ssid = ssid;
+ +                              ss->attach(tset);
+ +                      }
+ +              } while_each_subsys_mask();
+ +      }
   
         ret = 0;
         goto out_release_tset;
   
   out_cancel_attach:
- -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- -              if (ssid == failed_ssid)
- -                      break;
- -              if (ss->cancel_attach) {
- -                      tset->ssid = ssid;
- -                      ss->cancel_attach(tset);
- -              }
- -      } while_each_subsys_mask();
+ +      if (tset->nr_tasks) {
+ +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ +                      if (ssid == failed_ssid)
+ +                              break;
+ +                      if (ss->cancel_attach) {
+ +                              tset->ssid = ssid;
+ +                              ss->cancel_attach(tset);
+ +                      }
+ +              } while_each_subsys_mask();
+ +      }
   out_release_tset:
         spin_lock_irq(&css_set_lock);
         list_splice_init(&tset->dst_csets, &tset->src_csets);
@@@ -2314,40 -2169,17 +2315,40 @@@
   }
   
   /**
- - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ + * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
    * @dst_cgrp: destination cgroup to test
    *
- - * On the default hierarchy, except for the root, subtree_control must be
- - * zero for migration destination cgroups with tasks so that child cgroups
- - * don't compete against tasks.
+ + * On the default hierarchy, except for the mixable, (possible) thread root
+ + * and threaded cgroups, subtree_control must be zero for migration
+ + * destination cgroups with tasks so that child cgroups don't compete
+ + * against tasks.
    */
- -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+ +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
   {
- -      return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- -              !dst_cgrp->subtree_control;
+ +      /* v1 doesn't have any restriction */
+ +      if (!cgroup_on_dfl(dst_cgrp))
+ +              return 0;
+ +
+ +      /* verify @dst_cgrp can host resources */
+ +      if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ +              return -EOPNOTSUPP;
+ +
+ +      /* mixables don't care */
+ +      if (cgroup_is_mixable(dst_cgrp))
+ +              return 0;
+ +
+ +      /*
+ +       * If @dst_cgrp is already or can become a thread root or is
+ +       * threaded, it doesn't matter.
+ +       */
+ +      if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ +              return 0;
+ +
+ +      /* apply no-internal-process constraint */
+ +      if (dst_cgrp->subtree_control)
+ +              return -EBUSY;
+ +
+ +      return 0;
   }
   
   /**
@@@ -2552,9 -2384,8 +2553,9 @@@ int cgroup_attach_task(struct cgroup *d
         struct task_struct *task;
         int ret;
   
- -      if (!cgroup_may_migrate_to(dst_cgrp))
- -              return -EBUSY;
+ +      ret = cgroup_migrate_vet_dst(dst_cgrp);
+ +      if (ret)
+ +              return ret;
   
         /* look up all src csets */
         spin_lock_irq(&css_set_lock);
@@@ -2581,23 -2412,96 +2582,23 @@@
         return ret;
   }
   
- -static int cgroup_procs_write_permission(struct task_struct *task,
- -                                       struct cgroup *dst_cgrp,
- -                                       struct kernfs_open_file *of)
- -{
- -      struct super_block *sb = of->file->f_path.dentry->d_sb;
- -      struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- -      struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
- -      struct cgroup *src_cgrp, *com_cgrp;
- -      struct inode *inode;
- -      int ret;
- -
- -      if (!cgroup_on_dfl(dst_cgrp)) {
- -              const struct cred *cred = current_cred();
- -              const struct cred *tcred = get_task_cred(task);
- -
- -              /*
- -               * even if we're attaching all tasks in the thread group,
- -               * we only need to check permissions on one of them.
- -               */
- -              if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
- -                  uid_eq(cred->euid, tcred->uid) ||
- -                  uid_eq(cred->euid, tcred->suid))
- -                      ret = 0;
- -              else
- -                      ret = -EACCES;
- -
- -              put_cred(tcred);
- -              return ret;
- -      }
- -
- -      /* find the source cgroup */
- -      spin_lock_irq(&css_set_lock);
- -      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- -      spin_unlock_irq(&css_set_lock);
- -
- -      /* and the common ancestor */
- -      com_cgrp = src_cgrp;
- -      while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
- -              com_cgrp = cgroup_parent(com_cgrp);
- -
- -      /* %current should be authorized to migrate to the common ancestor */
- -      inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- -      if (!inode)
- -              return -ENOMEM;
- -
- -      ret = inode_permission(inode, MAY_WRITE);
- -      iput(inode);
- -      if (ret)
- -              return ret;
- -
- -      /*
- -       * If namespaces are delegation boundaries, %current must be able
- -       * to see both source and destination cgroups from its namespace.
- -       */
- -      if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
- -          (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
- -           !cgroup_is_descendant(dst_cgrp, root_cgrp)))
- -              return -ENOENT;
- -
- -      return 0;
- -}
- -
- -/*
- - * Find the task_struct of the task to attach by vpid and pass it along to the
- - * function to attach either it or all tasks in its threadgroup. Will lock
- - * cgroup_mutex and threadgroup.
- - */
- -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- -                           size_t nbytes, loff_t off, bool threadgroup)
+ +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ +      __acquires(&cgroup_threadgroup_rwsem)
   {
         struct task_struct *tsk;
- -      struct cgroup_subsys *ss;
- -      struct cgroup *cgrp;
         pid_t pid;
- -      int ssid, ret;
   
         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- -              return -EINVAL;
- -
- -      cgrp = cgroup_kn_lock_live(of->kn, false);
- -      if (!cgrp)
- -              return -ENODEV;
+ +              return ERR_PTR(-EINVAL);
   
         percpu_down_write(&cgroup_threadgroup_rwsem);
+ +
         rcu_read_lock();
         if (pid) {
                 tsk = find_task_by_vpid(pid);
                 if (!tsk) {
- -                      ret = -ESRCH;
- -                      goto out_unlock_rcu;
+ +                      tsk = ERR_PTR(-ESRCH);
+ +                      goto out_unlock_threadgroup;
                 }
         } else {
                 tsk = current;
@@@ -2613,33 -2517,35 +2614,33 @@@
          * cgroup with no rt_runtime allocated.  Just say no.
          */
         if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- -              ret = -EINVAL;
- -              goto out_unlock_rcu;
+ +              tsk = ERR_PTR(-EINVAL);
+ +              goto out_unlock_threadgroup;
         }
   
         get_task_struct(tsk);
+ +      goto out_unlock_rcu;
+ +
+ +out_unlock_threadgroup:
+ +      percpu_up_write(&cgroup_threadgroup_rwsem);
+ +out_unlock_rcu:
         rcu_read_unlock();
+ +      return tsk;
+ +}
   
- -      ret = cgroup_procs_write_permission(tsk, cgrp, of);
- -      if (!ret)
- -              ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+ +void cgroup_procs_write_finish(struct task_struct *task)
+ +      __releases(&cgroup_threadgroup_rwsem)
+ +{
+ +      struct cgroup_subsys *ss;
+ +      int ssid;
   
- -      put_task_struct(tsk);
- -      goto out_unlock_threadgroup;
+ +      /* release reference from cgroup_procs_write_start() */
+ +      put_task_struct(task);
   
- -out_unlock_rcu:
- -      rcu_read_unlock();
- -out_unlock_threadgroup:
         percpu_up_write(&cgroup_threadgroup_rwsem);
         for_each_subsys(ss, ssid)
                 if (ss->post_attach)
                         ss->post_attach();
- -      cgroup_kn_unlock(of->kn);
- -      return ret ?: nbytes;
- -}
- -
- -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- -                         loff_t off)
- -{
- -      return __cgroup_procs_write(of, buf, nbytes, off, true);
   }
   
   static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@@ -2982,46 -2888,6 +2983,46 @@@ static void cgroup_finalize_control(str
         cgroup_apply_control_disable(cgrp);
   }
   
+ +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+ +{
+ +      u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+ +
+ +      /* if nothing is getting enabled, nothing to worry about */
+ +      if (!enable)
+ +              return 0;
+ +
+ +      /* can @cgrp host any resources? */
+ +      if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ +              return -EOPNOTSUPP;
+ +
+ +      /* mixables don't care */
+ +      if (cgroup_is_mixable(cgrp))
+ +              return 0;
+ +
+ +      if (domain_enable) {
+ +              /* can't enable domain controllers inside a thread subtree */
+ +              if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ +                      return -EOPNOTSUPP;
+ +      } else {
+ +              /*
+ +               * Threaded controllers can handle internal competitions
+ +               * and are always allowed inside a (prospective) thread
+ +               * subtree.
+ +               */
+ +              if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ +                      return 0;
+ +      }
+ +
+ +      /*
+ +       * Controllers can't be enabled for a cgroup with tasks to avoid
+ +       * child cgroups competing against tasks.
+ +       */
+ +      if (cgroup_has_tasks(cgrp))
+ +              return -EBUSY;
+ +
+ +      return 0;
+ +}
+ +
   /* change the enabled child controllers for a cgroup in the default hierarchy */
   static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                             char *buf, size_t nbytes,
@@@ -3097,9 -2963,33 +3098,9 @@@
                 goto out_unlock;
         }
   
- -      /*
- -       * Except for the root, subtree_control must be zero for a cgroup
- -       * with tasks so that child cgroups don't compete against tasks.
- -       */
- -      if (enable && cgroup_parent(cgrp)) {
- -              struct cgrp_cset_link *link;
- -
- -              /*
- -               * Because namespaces pin csets too, @cgrp->cset_links
- -               * might not be empty even when @cgrp is empty.  Walk and
- -               * verify each cset.
- -               */
- -              spin_lock_irq(&css_set_lock);
- -
- -              ret = 0;
- -              list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- -                      if (css_set_populated(link->cset)) {
- -                              ret = -EBUSY;
- -                              break;
- -                      }
- -              }
- -
- -              spin_unlock_irq(&css_set_lock);
- -
- -              if (ret)
- -                      goto out_unlock;
- -      }
+ +      ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ +      if (ret)
+ +              goto out_unlock;
   
         /* save and update control masks and prepare csses */
         cgroup_save_control(cgrp);
@@@ -3108,182 -2998,16 +3109,182 @@@
         cgrp->subtree_control &= ~disable;
   
         ret = cgroup_apply_control(cgrp);
- -
         cgroup_finalize_control(cgrp, ret);
+ +      if (ret)
+ +              goto out_unlock;
   
         kernfs_activate(cgrp->kn);
- -      ret = 0;
   out_unlock:
         cgroup_kn_unlock(of->kn);
         return ret ?: nbytes;
   }
   
+ +/**
+ + * cgroup_enable_threaded - make @cgrp threaded
+ + * @cgrp: the target cgroup
+ + *
+ + * Called when "threaded" is written to the cgroup.type interface file and
+ + * tries to make @cgrp threaded and join the parent's resource domain.
+ + * This function is never called on the root cgroup as cgroup.type doesn't
+ + * exist on it.
+ + */
+ +static int cgroup_enable_threaded(struct cgroup *cgrp)
+ +{
+ +      struct cgroup *parent = cgroup_parent(cgrp);
+ +      struct cgroup *dom_cgrp = parent->dom_cgrp;
+ +      int ret;
+ +
+ +      lockdep_assert_held(&cgroup_mutex);
+ +
+ +      /* noop if already threaded */
+ +      if (cgroup_is_threaded(cgrp))
+ +              return 0;
+ +
+ +      /* we're joining the parent's domain, ensure its validity */
+ +      if (!cgroup_is_valid_domain(dom_cgrp) ||
+ +          !cgroup_can_be_thread_root(dom_cgrp))
+ +              return -EOPNOTSUPP;
+ +
+ +      /*
+ +       * The following shouldn't cause actual migrations and should
+ +       * always succeed.
+ +       */
+ +      cgroup_save_control(cgrp);
+ +
+ +      cgrp->dom_cgrp = dom_cgrp;
+ +      ret = cgroup_apply_control(cgrp);
+ +      if (!ret)
+ +              parent->nr_threaded_children++;
+ +      else
+ +              cgrp->dom_cgrp = cgrp;
+ +
+ +      cgroup_finalize_control(cgrp, ret);
+ +      return ret;
+ +}
+ +
+ +static int cgroup_type_show(struct seq_file *seq, void *v)
+ +{
+ +      struct cgroup *cgrp = seq_css(seq)->cgroup;
+ +
+ +      if (cgroup_is_threaded(cgrp))
+ +              seq_puts(seq, "threaded\n");
+ +      else if (!cgroup_is_valid_domain(cgrp))
+ +              seq_puts(seq, "domain invalid\n");
+ +      else if (cgroup_is_thread_root(cgrp))
+ +              seq_puts(seq, "domain threaded\n");
+ +      else
+ +              seq_puts(seq, "domain\n");
+ +
+ +      return 0;
+ +}
+ +
+ +static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ +                               size_t nbytes, loff_t off)
+ +{
+ +      struct cgroup *cgrp;
+ +      int ret;
+ +
+ +      /* only switching to threaded mode is supported */
+ +      if (strcmp(strstrip(buf), "threaded"))
+ +              return -EINVAL;
+ +
+ +      cgrp = cgroup_kn_lock_live(of->kn, false);
+ +      if (!cgrp)
+ +              return -ENOENT;
+ +
+ +      /* threaded can only be enabled */
+ +      ret = cgroup_enable_threaded(cgrp);
+ +
+ +      cgroup_kn_unlock(of->kn);
+ +      return ret ?: nbytes;
+ +}
+ +
+ +static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+ +{
+ +      struct cgroup *cgrp = seq_css(seq)->cgroup;
+ +      int descendants = READ_ONCE(cgrp->max_descendants);
+ +
+ +      if (descendants == INT_MAX)
+ +              seq_puts(seq, "max\n");
+ +      else
+ +              seq_printf(seq, "%d\n", descendants);
+ +
+ +      return 0;
+ +}
+ +
+ +static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ +                                         char *buf, size_t nbytes, loff_t off)
+ +{
+ +      struct cgroup *cgrp;
+ +      int descendants;
+ +      ssize_t ret;
+ +
+ +      buf = strstrip(buf);
+ +      if (!strcmp(buf, "max")) {
+ +              descendants = INT_MAX;
+ +      } else {
+ +              ret = kstrtoint(buf, 0, &descendants);
+ +              if (ret)
+ +                      return ret;
+ +      }
+ +
+ +      if (descendants < 0)
+ +              return -ERANGE;
+ +
+ +      cgrp = cgroup_kn_lock_live(of->kn, false);
+ +      if (!cgrp)
+ +              return -ENOENT;
+ +
+ +      cgrp->max_descendants = descendants;
+ +
+ +      cgroup_kn_unlock(of->kn);
+ +
+ +      return nbytes;
+ +}
+ +
+ +static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+ +{
+ +      struct cgroup *cgrp = seq_css(seq)->cgroup;
+ +      int depth = READ_ONCE(cgrp->max_depth);
+ +
+ +      if (depth == INT_MAX)
+ +              seq_puts(seq, "max\n");
+ +      else
+ +              seq_printf(seq, "%d\n", depth);
+ +
+ +      return 0;
+ +}
+ +
+ +static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ +                                    char *buf, size_t nbytes, loff_t off)
+ +{
+ +      struct cgroup *cgrp;
+ +      ssize_t ret;
+ +      int depth;
+ +
+ +      buf = strstrip(buf);
+ +      if (!strcmp(buf, "max")) {
+ +              depth = INT_MAX;
+ +      } else {
+ +              ret = kstrtoint(buf, 0, &depth);
+ +              if (ret)
+ +                      return ret;
+ +      }
+ +
+ +      if (depth < 0)
+ +              return -ERANGE;
+ +
+ +      cgrp = cgroup_kn_lock_live(of->kn, false);
+ +      if (!cgrp)
+ +              return -ENOENT;
+ +
+ +      cgrp->max_depth = depth;
+ +
+ +      cgroup_kn_unlock(of->kn);
+ +
+ +      return nbytes;
+ +}
+ +
   static int cgroup_events_show(struct seq_file *seq, void *v)
   {
         seq_printf(seq, "populated %d\n",
@@@ -3291,18 -3015,6 +3292,18 @@@
         return 0;
   }
   
+ +static int cgroup_stat_show(struct seq_file *seq, void *v)
+ +{
+ +      struct cgroup *cgroup = seq_css(seq)->cgroup;
+ +
+ +      seq_printf(seq, "nr_descendants %d\n",
+ +                 cgroup->nr_descendants);
+ +      seq_printf(seq, "nr_dying_descendants %d\n",
+ +                 cgroup->nr_dying_descendants);
+ +
+ +      return 0;
+ +}
+ +
   static int cgroup_file_open(struct kernfs_open_file *of)
   {
         struct cftype *cft = of->kn->priv;
@@@ -3519,6 -3231,7 +3520,6 @@@ restart
   
   static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
   {
- -      LIST_HEAD(pending);
         struct cgroup_subsys *ss = cfts[0].ss;
         struct cgroup *root = &ss->root->cgrp;
         struct cgroup_subsys_state *css;
@@@ -3943,58 -3656,6 +3944,58 @@@ bool css_has_online_children(struct cgr
         return ret;
   }
   
+ +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+ +{
+ +      struct list_head *l;
+ +      struct cgrp_cset_link *link;
+ +      struct css_set *cset;
+ +
+ +      lockdep_assert_held(&css_set_lock);
+ +
+ +      /* find the next threaded cset */
+ +      if (it->tcset_pos) {
+ +              l = it->tcset_pos->next;
+ +
+ +              if (l != it->tcset_head) {
+ +                      it->tcset_pos = l;
+ +                      return container_of(l, struct css_set,
+ +                                          threaded_csets_node);
+ +              }
+ +
+ +              it->tcset_pos = NULL;
+ +      }
+ +
+ +      /* find the next cset */
+ +      l = it->cset_pos;
+ +      l = l->next;
+ +      if (l == it->cset_head) {
+ +              it->cset_pos = NULL;
+ +              return NULL;
+ +      }
+ +
+ +      if (it->ss) {
+ +              cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ +      } else {
+ +              link = list_entry(l, struct cgrp_cset_link, cset_link);
+ +              cset = link->cset;
+ +      }
+ +
+ +      it->cset_pos = l;
+ +
+ +      /* initialize threaded css_set walking */
+ +      if (it->flags & CSS_TASK_ITER_THREADED) {
+ +              if (it->cur_dcset)
+ +                      put_css_set_locked(it->cur_dcset);
+ +              it->cur_dcset = cset;
+ +              get_css_set(cset);
+ +
+ +              it->tcset_head = &cset->threaded_csets;
+ +              it->tcset_pos = &cset->threaded_csets;
+ +      }
+ +
+ +      return cset;
+ +}
+ +
   /**
    * css_task_iter_advance_css_set - advance a task iterator to the next css_set
    * @it: the iterator to advance
@@@ -4003,19 -3664,32 +4004,19 @@@
    */
   static void css_task_iter_advance_css_set(struct css_task_iter *it)
   {
- -      struct list_head *l = it->cset_pos;
- -      struct cgrp_cset_link *link;
         struct css_set *cset;
   
         lockdep_assert_held(&css_set_lock);
   
         /* Advance to the next non-empty css_set */
         do {
- -              l = l->next;
- -              if (l == it->cset_head) {
- -                      it->cset_pos = NULL;
+ +              cset = css_task_iter_next_css_set(it);
+ +              if (!cset) {
                         it->task_pos = NULL;
                         return;
                 }
- -
- -              if (it->ss) {
- -                      cset = container_of(l, struct css_set,
- -                                          e_cset_node[it->ss->id]);
- -              } else {
- -                      link = list_entry(l, struct cgrp_cset_link, cset_link);
- -                      cset = link->cset;
- -              }
         } while (!css_set_populated(cset));
   
- -      it->cset_pos = l;
- -
         if (!list_empty(&cset->tasks))
                 it->task_pos = cset->tasks.next;
         else
@@@ -4055,7 -3729,6 +4056,7 @@@ static void css_task_iter_advance(struc
         lockdep_assert_held(&css_set_lock);
         WARN_ON_ONCE(!l);
   
+ +repeat:
         /*
          * Advance iterator to find next entry.  cset->tasks is consumed
          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
@@@ -4070,18 -3743,11 +4071,18 @@@
                 css_task_iter_advance_css_set(it);
         else
                 it->task_pos = l;
+ +
+ +      /* if PROCS, skip over tasks which aren't group leaders */
+ +      if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ +          !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ +                                          cg_list)))
+ +              goto repeat;
   }
   
   /**
    * css_task_iter_start - initiate task iteration
    * @css: the css to walk tasks of
+ + * @flags: CSS_TASK_ITER_* flags
    * @it: the task iterator to use
    *
    * Initiate iteration through the tasks of @css.  The caller can call
@@@ -4089,7 -3755,7 +4090,7 @@@
    * returns NULL.  On completion of iteration, css_task_iter_end() must be
    * called.
    */
- -void css_task_iter_start(struct cgroup_subsys_state *css,
+ +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                          struct css_task_iter *it)
   {
         /* no one should try to iterate before mounting cgroups */
@@@ -4100,7 -3766,6 +4101,7 @@@
         spin_lock_irq(&css_set_lock);
   
         it->ss = css->ss;
+ +      it->flags = flags;
   
         if (it->ss)
                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@@ -4158,9 -3823,6 +4159,9 @@@ void css_task_iter_end(struct css_task_
                 spin_unlock_irq(&css_set_lock);
         }
   
+ +      if (it->cur_dcset)
+ +              put_css_set(it->cur_dcset);
+ +
         if (it->cur_task)
                 put_task_struct(it->cur_task);
   }
@@@ -4177,12 -3839,16 +4178,12 @@@ static void *cgroup_procs_next(struct s
   {
         struct kernfs_open_file *of = s->private;
         struct css_task_iter *it = of->priv;
- -      struct task_struct *task;
   
- -      do {
- -              task = css_task_iter_next(it);
- -      } while (task && !thread_group_leader(task));
- -
- -      return task;
+ +      return css_task_iter_next(it);
   }
   
- -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+ +static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ +                                unsigned int iter_flags)
   {
         struct kernfs_open_file *of = s->private;
         struct cgroup *cgrp = seq_css(s)->cgroup;
@@@ -4200,169 -3866,24 +4201,169 @@@
                 if (!it)
                         return ERR_PTR(-ENOMEM);
                 of->priv = it;
- -              css_task_iter_start(&cgrp->self, it);
+ +              css_task_iter_start(&cgrp->self, iter_flags, it);
         } else if (!(*pos)++) {
                 css_task_iter_end(it);
- -              css_task_iter_start(&cgrp->self, it);
+ +              css_task_iter_start(&cgrp->self, iter_flags, it);
         }
   
         return cgroup_procs_next(s, NULL, NULL);
   }
   
+ +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+ +{
+ +      struct cgroup *cgrp = seq_css(s)->cgroup;
+ +
+ +      /*
+ +       * All processes of a threaded subtree belong to the domain cgroup
+ +       * of the subtree.  Only threads can be distributed across the
+ +       * subtree.  Reject reads on cgroup.procs in the subtree proper.
+ +       * They're always empty anyway.
+ +       */
+ +      if (cgroup_is_threaded(cgrp))
+ +              return ERR_PTR(-EOPNOTSUPP);
+ +
+ +      return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ +                                          CSS_TASK_ITER_THREADED);
+ +}
+ +
   static int cgroup_procs_show(struct seq_file *s, void *v)
   {
- -      seq_printf(s, "%d\n", task_tgid_vnr(v));
+ +      seq_printf(s, "%d\n", task_pid_vnr(v));
         return 0;
   }
   
+ +static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ +                                       struct cgroup *dst_cgrp,
+ +                                       struct super_block *sb)
+ +{
+ +      struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ +      struct cgroup *com_cgrp = src_cgrp;
+ +      struct inode *inode;
+ +      int ret;
+ +
+ +      lockdep_assert_held(&cgroup_mutex);
+ +
+ +      /* find the common ancestor */
+ +      while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ +              com_cgrp = cgroup_parent(com_cgrp);
+ +
+ +      /* %current should be authorized to migrate to the common ancestor */
+ +      inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ +      if (!inode)
+ +              return -ENOMEM;
+ +
+ +      ret = inode_permission(inode, MAY_WRITE);
+ +      iput(inode);
+ +      if (ret)
+ +              return ret;
+ +
+ +      /*
+ +       * If namespaces are delegation boundaries, %current must be able
+ +       * to see both source and destination cgroups from its namespace.
+ +       */
+ +      if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ +          (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ +           !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ +              return -ENOENT;
+ +
+ +      return 0;
+ +}
+ +
+ +static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ +                                char *buf, size_t nbytes, loff_t off)
+ +{
+ +      struct cgroup *src_cgrp, *dst_cgrp;
+ +      struct task_struct *task;
+ +      ssize_t ret;
+ +
+ +      dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ +      if (!dst_cgrp)
+ +              return -ENODEV;
+ +
+ +      task = cgroup_procs_write_start(buf, true);
+ +      ret = PTR_ERR_OR_ZERO(task);
+ +      if (ret)
+ +              goto out_unlock;
+ +
+ +      /* find the source cgroup */
+ +      spin_lock_irq(&css_set_lock);
+ +      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ +      spin_unlock_irq(&css_set_lock);
+ +
+ +      ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ +                                          of->file->f_path.dentry->d_sb);
+ +      if (ret)
+ +              goto out_finish;
+ +
+ +      ret = cgroup_attach_task(dst_cgrp, task, true);
+ +
+ +out_finish:
+ +      cgroup_procs_write_finish(task);
+ +out_unlock:
+ +      cgroup_kn_unlock(of->kn);
+ +
+ +      return ret ?: nbytes;
+ +}
+ +
+ +static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+ +{
+ +      return __cgroup_procs_start(s, pos, 0);
+ +}
+ +
+ +static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ +                                  char *buf, size_t nbytes, loff_t off)
+ +{
+ +      struct cgroup *src_cgrp, *dst_cgrp;
+ +      struct task_struct *task;
+ +      ssize_t ret;
+ +
+ +      buf = strstrip(buf);
+ +
+ +      dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ +      if (!dst_cgrp)
+ +              return -ENODEV;
+ +
+ +      task = cgroup_procs_write_start(buf, false);
+ +      ret = PTR_ERR_OR_ZERO(task);
+ +      if (ret)
+ +              goto out_unlock;
+ +
+ +      /* find the source cgroup */
+ +      spin_lock_irq(&css_set_lock);
+ +      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ +      spin_unlock_irq(&css_set_lock);
+ +
+ +      /* thread migrations follow the cgroup.procs delegation rule */
+ +      ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ +                                          of->file->f_path.dentry->d_sb);
+ +      if (ret)
+ +              goto out_finish;
+ +
+ +      /* and must be contained in the same domain */
+ +      ret = -EOPNOTSUPP;
+ +      if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ +              goto out_finish;
+ +
+ +      ret = cgroup_attach_task(dst_cgrp, task, false);
+ +
+ +out_finish:
+ +      cgroup_procs_write_finish(task);
+ +out_unlock:
+ +      cgroup_kn_unlock(of->kn);
+ +
+ +      return ret ?: nbytes;
+ +}
+ +
   /* cgroup core interface files for the default hierarchy */
   static struct cftype cgroup_base_files[] = {
         {
+ +              .name = "cgroup.type",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .seq_show = cgroup_type_show,
+ +              .write = cgroup_type_write,
+ +      },
+ +      {
                 .name = "cgroup.procs",
                 .flags = CFTYPE_NS_DELEGATABLE,
                 .file_offset = offsetof(struct cgroup, procs_file),
@@@ -4373,14 -3894,6 +4374,14 @@@
                 .write = cgroup_procs_write,
         },
         {
+ +              .name = "cgroup.threads",
+ +              .release = cgroup_procs_release,
+ +              .seq_start = cgroup_threads_start,
+ +              .seq_next = cgroup_procs_next,
+ +              .seq_show = cgroup_procs_show,
+ +              .write = cgroup_threads_write,
+ +      },
+ +      {
                 .name = "cgroup.controllers",
                 .seq_show = cgroup_controllers_show,
         },
@@@ -4396,20 -3909,6 +4397,20 @@@
                 .file_offset = offsetof(struct cgroup, events_file),
                 .seq_show = cgroup_events_show,
         },
+ +      {
+ +              .name = "cgroup.max.descendants",
+ +              .seq_show = cgroup_max_descendants_show,
+ +              .write = cgroup_max_descendants_write,
+ +      },
+ +      {
+ +              .name = "cgroup.max.depth",
+ +              .seq_show = cgroup_max_depth_show,
+ +              .write = cgroup_max_depth_write,
+ +      },
+ +      {
+ +              .name = "cgroup.stat",
+ +              .seq_show = cgroup_stat_show,
+ +      },
         { }     /* terminate */
   };
   
@@@ -4509,15 -4008,9 +4510,15 @@@ static void css_release_work_fn(struct 
                 if (ss->css_released)
                         ss->css_released(css);
         } else {
+ +              struct cgroup *tcgrp;
+ +
                 /* cgroup release path */
                 trace_cgroup_release(cgrp);
   
+ +              for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ +                   tcgrp = cgroup_parent(tcgrp))
+ +                      tcgrp->nr_dying_descendants--;
+ +
                 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
                 cgrp->id = -1;
   
@@@ -4604,6 -4097,9 +4605,6 @@@ static void offline_css(struct cgroup_s
         if (!(css->flags & CSS_ONLINE))
                 return;
   
- -      if (ss->css_reset)
- -              ss->css_reset(css);
- -
         if (ss->css_offline)
                 ss->css_offline(css);
   
@@@ -4713,13 -4209,9 +4714,13 @@@ static struct cgroup *cgroup_create(str
         cgrp->root = root;
         cgrp->level = level;
   
- -      for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ +      for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
   
+ +              if (tcgrp != cgrp)
+ +                      tcgrp->nr_descendants++;
+ +      }
+ +
         if (notify_on_release(parent))
                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
   
@@@ -4760,29 -4252,6 +4761,29 @@@ out_free_cgrp
         return ERR_PTR(ret);
   }
   
+ +static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+ +{
+ +      struct cgroup *cgroup;
+ +      int ret = false;
+ +      int level = 1;
+ +
+ +      lockdep_assert_held(&cgroup_mutex);
+ +
+ +      for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ +              if (cgroup->nr_descendants >= cgroup->max_descendants)
+ +                      goto fail;
+ +
+ +              if (level > cgroup->max_depth)
+ +                      goto fail;
+ +
+ +              level++;
+ +      }
+ +
+ +      ret = true;
+ +fail:
+ +      return ret;
+ +}
+ +
   int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
   {
         struct cgroup *parent, *cgrp;
@@@ -4797,11 -4266,6 +4798,11 @@@
         if (!parent)
                 return -ENODEV;
   
+ +      if (!cgroup_check_hierarchy_limits(parent)) {
+ +              ret = -EAGAIN;
+ +              goto out_unlock;
+ +      }
+ +
         cgrp = cgroup_create(parent);
         if (IS_ERR(cgrp)) {
                 ret = PTR_ERR(cgrp);
@@@ -4953,7 -4417,6 +4954,7 @@@ static void kill_css(struct cgroup_subs
   static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
   {
+ +      struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
         struct cgroup_subsys_state *css;
         struct cgrp_cset_link *link;
         int ssid;
@@@ -4998,15 -4461,7 +4999,15 @@@
          */
         kernfs_remove(cgrp->kn);
   
- -      cgroup1_check_for_release(cgroup_parent(cgrp));
+ +      if (parent && cgroup_is_threaded(cgrp))
+ +              parent->nr_threaded_children--;
+ +
+ +      for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ +              tcgrp->nr_descendants--;
+ +              tcgrp->nr_dying_descendants++;
+ +      }
+ +
+ +      cgroup1_check_for_release(parent);
   
         /* put the base reference */
         percpu_ref_kill(&cgrp->self.refcnt);
@@@ -5201,17 -4656,11 +5202,17 @@@ int __init cgroup_init(void
   
                 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
   
+ +              /* implicit controllers must be threaded too */
+ +              WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+ +
                 if (ss->implicit_on_dfl)
                         cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                 else if (!ss->dfl_cftypes)
                         cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
   
+ +              if (ss->threaded)
+ +                      cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+ +
                 if (ss->dfl_cftypes == ss->legacy_cftypes) {
                         WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                 } else {
@@@ -5221,10 -4670,6 +5222,10 @@@
   
                 if (ss->bind)
                         ss->bind(init_css_set.subsys[ssid]);
+ +
+ +              mutex_lock(&cgroup_mutex);
+ +              css_populate_dir(init_css_set.subsys[ssid]);
+ +              mutex_unlock(&cgroup_mutex);
         }
   
         /* init_css_set.subsys[] has been updated, re-hash */
@@@ -5256,6 -4701,18 +5257,18 @@@ static int __init cgroup_wq_init(void
   }
   core_initcall(cgroup_wq_init);
   
+ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+                                       char *buf, size_t buflen)
+ {
+       struct kernfs_node *kn;
+ 
+       kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+       if (!kn)
+               return;
+       kernfs_path(kn, buf, buflen);
+       kernfs_put(kn);
+ }
+ 
   /*
    * proc_cgroup_show()
    *  - Print task's cgroup paths into seq_file, one line for each hierarchy
diff --combined mm/page_io.c

index 20139b9,9cf1bc7..21502d3
--- 1/mm/page_io.c
--- 2/mm/page_io.c
+++ b/mm/page_io.c
@@@ -22,24 -22,24 +22,27 @@@
   #include <linux/frontswap.h>
   #include <linux/blkdev.h>
   #include <linux/uio.h>
+ +#include <linux/sched/task.h>
   #include <asm/pgtable.h>
   
   static struct bio *get_swap_bio(gfp_t gfp_flags,
                                 struct page *page, bio_end_io_t end_io)
   {
+ +      int i, nr = hpage_nr_pages(page);
         struct bio *bio;
   
- -      bio = bio_alloc(gfp_flags, 1);
+ +      bio = bio_alloc(gfp_flags, nr);
         if (bio) {
-               bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+               struct block_device *bdev;
+ 
+               bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
+               bio_set_dev(bio, bdev);
                 bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
                 bio->bi_end_io = end_io;
   
- -              bio_add_page(bio, page, PAGE_SIZE, 0);
- -              BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
+ +              for (i = 0; i < nr; i++)
+ +                      bio_add_page(bio, page + i, PAGE_SIZE, 0);
+ +              VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
         }
         return bio;
   }
@@@ -60,8 -60,7 +63,7 @@@ void end_swap_bio_write(struct bio *bio
                  */
                 set_page_dirty(page);
                 pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
-                        imajor(bio->bi_bdev->bd_inode),
-                        iminor(bio->bi_bdev->bd_inode),
+                        MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
                          (unsigned long long)bio->bi_iter.bi_sector);
                 ClearPageReclaim(page);
         }
@@@ -126,8 -125,7 +128,7 @@@ static void end_swap_bio_read(struct bi
                 SetPageError(page);
                 ClearPageUptodate(page);
                 pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
-                        imajor(bio->bi_bdev->bd_inode),
-                        iminor(bio->bi_bdev->bd_inode),
+                        MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
                          (unsigned long long)bio->bi_iter.bi_sector);
                 goto out;
         }
@@@ -139,7 -137,6 +140,7 @@@ out
         WRITE_ONCE(bio->bi_private, NULL);
         bio_put(bio);
         wake_up_process(waiter);
+ +      put_task_struct(waiter);
   }
   
   int generic_swapfile_activate(struct swap_info_struct *sis,
@@@ -264,15 -261,6 +265,15 @@@ static sector_t swap_page_sector(struc
         return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
   }
   
+ +static inline void count_swpout_vm_event(struct page *page)
+ +{
+ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ +      if (unlikely(PageTransHuge(page)))
+ +              count_vm_event(THP_SWPOUT);
+ +#endif
+ +      count_vm_events(PSWPOUT, hpage_nr_pages(page));
+ +}
+ +
   int __swap_writepage(struct page *page, struct writeback_control *wbc,
                 bio_end_io_t end_write_func)
   {
@@@ -324,7 -312,7 +325,7 @@@
   
         ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
         if (!ret) {
- -              count_vm_event(PSWPOUT);
+ +              count_swpout_vm_event(page);
                 return 0;
         }
   
@@@ -337,7 -325,7 +338,7 @@@
                 goto out;
         }
         bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- -      count_vm_event(PSWPOUT);
+ +      count_swpout_vm_event(page);
         set_page_writeback(page);
         unlock_page(page);
         submit_bio(bio);
@@@ -351,7 -339,7 +352,7 @@@ int swap_readpage(struct page *page, bo
         int ret = 0;
         struct swap_info_struct *sis = page_swap_info(page);
         blk_qc_t qc;
-       struct block_device *bdev;
+       struct gendisk *disk;
   
         VM_BUG_ON_PAGE(!PageSwapCache(page), page);
         VM_BUG_ON_PAGE(!PageLocked(page), page);
@@@ -390,12 -378,7 +391,12 @@@
                 ret = -ENOMEM;
                 goto out;
         }
-       bdev = bio->bi_bdev;
+       disk = bio->bi_disk;
+ +      /*
+ +       * Keep this task valid during swap readpage because the oom killer may
+ +       * attempt to access it in the page fault retry time check.
+ +       */
+ +      get_task_struct(current);
         bio->bi_private = current;
         bio_set_op_attrs(bio, REQ_OP_READ, 0);
         count_vm_event(PSWPIN);
@@@ -406,7 -389,7 +407,7 @@@
                 if (!READ_ONCE(bio->bi_private))
                         break;
   
-               if (!blk_mq_poll(bdev_get_queue(bdev), qc))
+               if (!blk_mq_poll(disk->queue, qc))
                         break;
         }
         __set_current_state(TASK_RUNNING);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 7 Sep 2017 18:59:42 +0000 (11:59 -0700)
		1	2
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/sysdev/axonram.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/bfq-iosched.h	patch \|	diff1 \|	diff2 \|	blob \| history
block/bio-integrity.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-mq-debugfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-mq.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-throttle.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/genhd.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/brd.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/loop.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/null_blk.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/virtio_blk.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/xen-blkback/xenbus.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/xen-blkfront.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/zram/zram_drv.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm-crypt.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm-mpath.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/md.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/raid5-cache.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/host/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/host/rdma.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/disk-io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/raid56.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/volumes.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/buffer.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/gfs2/lops.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/gfs2/meta_io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/gfs2/ops_fstype.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/iomap.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/kernfs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/cluster/heartbeat.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_aops.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/bio.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cgroup.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/page_io.c	patch \|	diff1 \|	diff2 \|	blob \| history