F: drivers/acpi/
F: drivers/pnp/pnpacpi/
F: include/linux/acpi.h
+F: include/linux/fwnode.h
F: include/acpi/
F: Documentation/acpi/
F: Documentation/ABI/testing/sysfs-bus-acpi
F: drivers/pci/*/*/*acpi*
F: tools/power/acpi/
+ACPI APEI
+M: "Rafael J. Wysocki" <rjw@rjwysocki.net>
+M: Len Brown <lenb@kernel.org>
+L: linux-acpi@vger.kernel.org
+R: Tony Luck <tony.luck@intel.com>
+R: Borislav Petkov <bp@alien8.de>
+F: drivers/acpi/apei/
+
ACPI COMPONENT ARCHITECTURE (ACPICA)
M: Robert Moore <robert.moore@intel.com>
M: Lv Zheng <lv.zheng@intel.com>
F: arch/arm/mach-artpec
F: arch/arm/boot/dts/artpec6*
F: drivers/clk/axis
+F: drivers/crypto/axis
F: drivers/pinctrl/pinctrl-artpec*
F: Documentation/devicetree/bindings/pinctrl/axis,artpec6-pinctrl.txt
R: Benjamin Herrenschmidt <benh@kernel.crashing.org>
R: Joel Stanley <joel@jms.id.au>
L: linux-i2c@vger.kernel.org
-L: openbmc@lists.ozlabs.org
+L: openbmc@lists.ozlabs.org (moderated for non-subscribers)
S: Maintained
F: drivers/irqchip/irq-aspeed-i2c-ic.c
F: drivers/i2c/busses/i2c-aspeed.c
ARM/CORTINA SYSTEMS GEMINI ARM ARCHITECTURE
M: Hans Ulli Kroll <ulli.kroll@googlemail.com>
+M: Linus Walleij <linus.walleij@linaro.org>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
T: git git://github.com/ulli-kroll/linux.git
S: Maintained
+F: Documentation/devicetree/bindings/arm/gemini.txt
+F: Documentation/devicetree/bindings/pinctrl/cortina,gemini-pinctrl.txt
+F: Documentation/devicetree/bindings/rtc/faraday,ftrtc010.txt
F: arch/arm/mach-gemini/
+F: drivers/pinctrl/pinctrl-gemini.c
F: drivers/rtc/rtc-ftrtc010.c
ARM/CSR SIRFPRIMA2 MACHINE SUPPORT
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
L: linux-mediatek@lists.infradead.org (moderated for non-subscribers)
S: Maintained
-F: drivers/phy/phy-mt65xx-usb3.c
+F: drivers/phy/mediatek/phy-mtk-tphy.c
ARM/MICREL KS8695 ARCHITECTURE
M: Greg Ungerer <gerg@uclinux.org>
F: drivers/bus/uniphier-system-bus.c
F: drivers/clk/uniphier/
F: drivers/i2c/busses/i2c-uniphier*
+F: drivers/irqchip/irq-uniphier-aidet.c
F: drivers/pinctrl/uniphier/
F: drivers/reset/reset-uniphier.c
F: drivers/tty/serial/8250/8250_uniphier.c
S: Maintained
F: Documentation/ABI/testing/sysfs-class-net-batman-adv
F: Documentation/ABI/testing/sysfs-class-net-mesh
-F: Documentation/networking/batman-adv.txt
+F: Documentation/networking/batman-adv.rst
F: include/uapi/linux/batman_adv.h
F: net/batman-adv/
F: drivers/gpu/drm/qxl/
F: include/uapi/drm/qxl_drm.h
+DRM DRIVER FOR PERVASIVE DISPLAYS REPAPER PANELS
+M: Noralf Trønnes <noralf@tronnes.org>
+S: Maintained
+F: drivers/gpu/drm/tinydrm/repaper.c
+F: Documentation/devicetree/bindings/display/repaper.txt
+
DRM DRIVER FOR RAGE 128 VIDEO CARDS
S: Orphan / Obsolete
F: drivers/gpu/drm/r128/
F: drivers/gpu/drm/sis/
F: include/uapi/drm/sis_drm.h
+DRM DRIVER FOR SITRONIX ST7586 PANELS
+M: David Lechner <david@lechnology.com>
+S: Maintained
+F: drivers/gpu/drm/tinydrm/st7586.c
+F: Documentation/devicetree/bindings/display/st7586.txt
+
DRM DRIVER FOR TDFX VIDEO CARDS
S: Orphan / Obsolete
F: drivers/gpu/drm/tdfx/
F: include/drm/drm_panel.h
F: Documentation/devicetree/bindings/display/panel/
+DRM TINYDRM DRIVERS
+M: Noralf Trønnes <noralf@tronnes.org>
+W: https://github.com/notro/tinydrm/wiki/Development
+T: git git://anongit.freedesktop.org/drm/drm-misc
+S: Maintained
+F: drivers/gpu/drm/tinydrm/
+F: include/drm/tinydrm/
+
DSBR100 USB FM RADIO DRIVER
M: Alexey Klimov <klimov.linux@gmail.com>
L: linux-media@vger.kernel.org
M: Florian Fainelli <f.fainelli@gmail.com>
L: netdev@vger.kernel.org
S: Maintained
-F: include/linux/phy.h
-F: include/linux/phy_fixed.h
-F: drivers/net/phy/
+F: Documentation/ABI/testing/sysfs-bus-mdio
+F: Documentation/devicetree/bindings/net/mdio*
F: Documentation/networking/phy.txt
+F: drivers/net/phy/
F: drivers/of/of_mdio.c
F: drivers/of/of_net.c
+F: include/linux/*mdio*.h
+F: include/linux/of_net.h
+F: include/linux/phy.h
+F: include/linux/phy_fixed.h
+F: include/linux/platform_data/mdio-gpio.h
+F: include/linux/platform_data/mdio-bcm-unimac.h
+F: include/trace/events/mdio.h
+F: include/uapi/linux/mdio.h
+F: include/uapi/linux/mii.h
EXT2 FILE SYSTEM
M: Jan Kara <jack@suse.com>
FPGA MANAGER FRAMEWORK
M: Alan Tull <atull@kernel.org>
-R: Moritz Fischer <moritz.fischer@ettus.com>
+R: Moritz Fischer <mdf@kernel.org>
L: linux-fpga@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/atull/linux-fpga.git
+Q: http://patchwork.kernel.org/project/linux-fpga/list/
F: Documentation/fpga/
F: Documentation/devicetree/bindings/fpga/
F: drivers/fpga/
F: drivers/staging/greybus/spilib.c
F: drivers/staging/greybus/spilib.h
-GREYBUS LOOBACK/TIME PROTOCOLS DRIVERS
+GREYBUS LOOPBACK/TIME PROTOCOLS DRIVERS
M: Bryan O'Donoghue <pure.logic@nexus-software.ie>
S: Maintained
F: drivers/staging/greybus/loopback.c
F: drivers/net/ethernet/hisilicon/
F: Documentation/devicetree/bindings/net/hisilicon*.txt
+HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
+M: Yisen Zhuang <yisen.zhuang@huawei.com>
+M: Salil Mehta <salil.mehta@huawei.com>
+L: netdev@vger.kernel.org
+W: http://www.hisilicon.com
+S: Maintained
+F: drivers/net/ethernet/hisilicon/hns3/
+
HISILICON ROCE DRIVER
M: Lijun Ou <oulijun@huawei.com>
M: Wei Hu(Xavier) <xavier.huwei@huawei.com>
S: Maintained
F: drivers/input/touchscreen/htcpen.c
+HUAWEI ETHERNET DRIVER
+M: Aviad Krawczyk <aviad.krawczyk@huawei.com>
+L: netdev@vger.kernel.org
+S: Supported
+F: Documentation/networking/hinic.txt
+F: drivers/net/ethernet/huawei/hinic/
+
HUGETLB FILESYSTEM
M: Nadia Yvette Chambers <nyc@holomorphy.com>
S: Maintained
M: Stephen Hemminger <sthemmin@microsoft.com>
L: devel@linuxdriverproject.org
S: Maintained
+F: Documentation/networking/netvsc.txt
F: arch/x86/include/asm/mshyperv.h
+F: arch/x86/include/asm/trace/hyperv.h
F: arch/x86/include/uapi/asm/hyperv.h
F: arch/x86/kernel/cpu/mshyperv.c
F: arch/x86/hyperv
F: drivers/scsi/storvsc_drv.c
F: drivers/uio/uio_hv_generic.c
F: drivers/video/fbdev/hyperv_fb.c
+F: net/vmw_vsock/hyperv_transport.c
F: include/linux/hyperv.h
+F: include/uapi/linux/hyperv.h
F: tools/hv/
F: Documentation/ABI/stable/sysfs-bus-vmbus
S: Supported
F: drivers/net/ethernet/ibm/ibmvnic.*
+IBM Power Virtual Accelerator Switchboard
+M: Sukadev Bhattiprolu
+L: linuxppc-dev@lists.ozlabs.org
+S: Supported
+F: arch/powerpc/platforms/powernv/vas*
+F: arch/powerpc/platforms/powernv/copy-paste.h
+F: arch/powerpc/include/asm/vas.h
+F: arch/powerpc/include/uapi/asm/vas.h
+
IBM Power Virtual Ethernet Device Driver
M: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
L: netdev@vger.kernel.org
F: drivers/scsi/isci/
INTEL DRM DRIVERS (excluding Poulsbo, Moorestown and derivative chipsets)
-M: Daniel Vetter <daniel.vetter@intel.com>
M: Jani Nikula <jani.nikula@linux.intel.com>
+M: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+M: Rodrigo Vivi <rodrigo.vivi@intel.com>
L: intel-gfx@lists.freedesktop.org
W: https://01.org/linuxgraphics/
B: https://01.org/linuxgraphics/documentation/how-report-bugs
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sameo/irda-2.6.git
F: Documentation/networking/irda.txt
-F: drivers/net/irda/
-F: include/net/irda/
-F: net/irda/
+F: drivers/staging/irda/
IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY)
M: Marc Zyngier <marc.zyngier@arm.com>
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
-T: git git://git.infradead.org/users/jcooper/linux.git irqchip/core
F: Documentation/devicetree/bindings/interrupt-controller/
F: drivers/irqchip/
S: Maintained
F: drivers/media/dvb-frontends/lgdt3305.*
-LGUEST
-M: Rusty Russell <rusty@rustcorp.com.au>
-L: lguest@lists.ozlabs.org
-W: http://lguest.ozlabs.org/
-S: Odd Fixes
-F: arch/x86/include/asm/lguest*.h
-F: arch/x86/lguest/
-F: drivers/lguest/
-F: include/linux/lguest*.h
-F: tools/lguest/
-
LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
M: Viresh Kumar <vireshk@kernel.org>
L: linux-ide@vger.kernel.org
F: drivers/rtc/rtc-opal.c
F: drivers/scsi/ibmvscsi/
F: drivers/tty/hvc/hvc_opal.c
+F: drivers/watchdog/wdrtas.c
F: tools/testing/selftests/powerpc
N: /pmac
N: powermac
MEDIATEK ETHERNET DRIVER
M: Felix Fietkau <nbd@openwrt.org>
-M: John Crispin <blogic@openwrt.org>
+M: John Crispin <john@phrozen.org>
+M: Sean Wang <sean.wang@mediatek.com>
+M: Nelson Chang <nelson.chang@mediatek.com>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/mediatek/
S: Maintained
F: drivers/char/hw_random/mtk-rng.c
+MEDIATEK USB3 DRD IP DRIVER
+M: Chunfeng Yun <chunfeng.yun@mediatek.com>
+L: linux-usb@vger.kernel.org (moderated for non-subscribers)
+L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+L: linux-mediatek@lists.infradead.org (moderated for non-subscribers)
+S: Maintained
+F: drivers/usb/mtu3/
+
MEGACHIPS STDPXXXX-GE-B850V3-FW LVDS/DP++ BRIDGES
M: Peter Senna Tschudin <peter.senna@collabora.com>
M: Martin Donnelly <martin.donnelly@ge.com>
M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
L: linux-kernel@vger.kernel.org
S: Supported
-F: kernel/membarrier.c
+F: kernel/sched/membarrier.c
F: include/uapi/linux/membarrier.h
MEMORY MANAGEMENT
F: drivers/dma/at_hdmac_regs.h
F: include/linux/platform_data/dma-atmel.h
+MICROCHIP / ATMEL ECC DRIVER
+M: Tudor Ambarus <tudor.ambarus@microchip.com>
+L: linux-crypto@vger.kernel.org
+S: Maintained
+F: drivers/crypto/atmel-ecc.*
+
MICROCHIP / ATMEL ISC DRIVER
M: Songjun Wu <songjun.wu@microchip.com>
L: linux-media@vger.kernel.org
S: Maintained
F: drivers/nvmem/
F: Documentation/devicetree/bindings/nvmem/
+F: Documentation/ABI/stable/sysfs-bus-nvmem
F: include/linux/nvmem-consumer.h
F: include/linux/nvmem-provider.h
T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git
S: Maintained
F: Documentation/devicetree/bindings/pinctrl/
-F: Documentation/pinctrl.txt
+F: Documentation/driver-api/pinctl.rst
F: drivers/pinctrl/
F: include/linux/pinctrl/
L: linux-kernel@vger.kernel.org
S: Supported
F: arch/x86/kernel/cpu/intel_rdt*
-F: arch/x86/include/asm/intel_rdt*
+F: arch/x86/include/asm/intel_rdt_sched.h
F: Documentation/x86/intel_rdt*
READ-COPY UPDATE (RCU)
S: Odd Fixes
F: drivers/net/ethernet/adaptec/starfire*
+ STEC S1220 SKD DRIVER
+ M: Bart Van Assche <bart.vanassche@wdc.com>
+ L: linux-block@vger.kernel.org
+ S: Maintained
+ F: drivers/block/skd*[ch]
+
STI CEC DRIVER
M: Benjamin Gaignard <benjamin.gaignard@linaro.org>
S: Maintained
S: Maintained
F: drivers/thunderbolt/
+THUNDERX GPIO DRIVER
+M: David Daney <david.daney@cavium.com>
+S: Maintained
+F: drivers/gpio/gpio-thunderx.c
+
TI AM437X VPFE DRIVER
M: "Lad, Prabhakar" <prabhakar.csengg@gmail.com>
L: linux-media@vger.kernel.org
F: include/linux/virtio*.h
F: include/uapi/linux/virtio_*.h
F: drivers/crypto/virtio/
+F: mm/balloon_compaction.c
VIRTIO CRYPTO DRIVER
M: Gonglei <arei.gonglei@huawei.com>
F: include/linux/watchdog.h
F: include/uapi/linux/watchdog.h
+WHISKEYCOVE PMIC GPIO DRIVER
+M: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
+L: linux-gpio@vger.kernel.org
+S: Maintained
+F: drivers/gpio/gpio-wcove.c
+
WIIMOTE HID DRIVER
M: David Herrmann <dh.herrmann@googlemail.com>
L: linux-input@vger.kernel.org
static blk_qc_t
axon_ram_make_request(struct request_queue *queue, struct bio *bio)
{
- struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data;
+ struct axon_ram_bank *bank = bio->bi_disk->private_data;
unsigned long phys_mem, phys_end;
void *user_mem;
struct bio_vec vec;
axon_ram_bank_id++;
- dev_info(&device->dev, "Found memory controller on %s\n",
- device->dev.of_node->full_name);
+ dev_info(&device->dev, "Found memory controller on %pOF\n",
+ device->dev.of_node);
- bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL);
- if (bank == NULL) {
- dev_err(&device->dev, "Out of memory\n");
- rc = -ENOMEM;
- goto failed;
- }
+ bank = kzalloc(sizeof(*bank), GFP_KERNEL);
+ if (!bank)
+ return -ENOMEM;
device->dev.platform_data = bank;
return 0;
failed:
- if (bank != NULL) {
- if (bank->irq_id)
- free_irq(bank->irq_id, device);
- if (bank->disk != NULL) {
- if (bank->disk->major > 0)
- unregister_blkdev(bank->disk->major,
- bank->disk->disk_name);
- if (bank->disk->flags & GENHD_FL_UP)
- del_gendisk(bank->disk);
- put_disk(bank->disk);
- }
- kill_dax(bank->dax_dev);
- put_dax(bank->dax_dev);
- device->dev.platform_data = NULL;
- if (bank->io_addr != 0)
- iounmap((void __iomem *) bank->io_addr);
- kfree(bank);
+ if (bank->irq_id)
+ free_irq(bank->irq_id, device);
+ if (bank->disk != NULL) {
+ if (bank->disk->major > 0)
+ unregister_blkdev(bank->disk->major,
+ bank->disk->disk_name);
+ if (bank->disk->flags & GENHD_FL_UP)
+ del_gendisk(bank->disk);
+ put_disk(bank->disk);
}
-
+ kill_dax(bank->dax_dev);
+ put_dax(bank->dax_dev);
+ device->dev.platform_data = NULL;
+ if (bank->io_addr != 0)
+ iounmap((void __iomem *) bank->io_addr);
+ kfree(bank);
return rc;
}
*
* bfq_sched_data is the basic scheduler queue. It supports three
* ioprio_classes, and can be used either as a toplevel queue or as an
- * intermediate queue on a hierarchical setup. @next_in_service
- * points to the active entity of the sched_data service trees that
- * will be scheduled next. It is used to reduce the number of steps
- * needed for each hierarchical-schedule update.
+ * intermediate queue in a hierarchical setup.
*
* The supported ioprio_classes are the same as in CFQ, in descending
* priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
* Requests from higher priority queues are served before all the
* requests from lower priority queues; among requests of the same
* queue requests are served according to B-WF2Q+.
- * All the fields are protected by the queue lock of the containing bfqd.
+ *
+ * The schedule is implemented by the service trees, plus the field
+ * @next_in_service, which points to the entity on the active trees
+ * that will be served next, if 1) no changes in the schedule occurs
+ * before the current in-service entity is expired, 2) the in-service
+ * queue becomes idle when it expires, and 3) if the entity pointed by
+ * in_service_entity is not a queue, then the in-service child entity
+ * of the entity pointed by in_service_entity becomes idle on
+ * expiration. This peculiar definition allows for the following
+ * optimization, not yet exploited: while a given entity is still in
+ * service, we already know which is the best candidate for next
+ * service among the other active entitities in the same parent
+ * entity. We can then quickly compare the timestamps of the
+ * in-service entity with those of such best candidate.
+ *
+ * All fields are protected by the lock of the containing bfqd.
*/
struct bfq_sched_data {
/* entity in service */
uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
/*
- * Snapshot of the idle window before merging; taken to
- * remember this value while the queue is merged, so as to be
- * able to restore it in case of split.
+ * Snapshot of the has_short_time flag before merging; taken
+ * to remember its value while the queue is merged, so as to
+ * be able to restore it in case of split.
*/
- bool saved_idle_window;
+ bool saved_has_short_ttime;
/*
* Same purpose as the previous two fields for the I/O bound
* classification of a queue.
* without idling the device
*/
BFQQF_fifo_expire, /* FIFO checked in this slice */
- BFQQF_idle_window, /* slice idling enabled */
+ BFQQF_has_short_ttime, /* queue has a short think time */
BFQQF_sync, /* synchronous queue */
BFQQF_IO_bound, /*
* bfqq has timed-out at least once
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
- BFQ_BFQQ_FNS(idle_window);
+ BFQ_BFQQ_FNS(has_short_ttime);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- bfqq_group(bfqq)->blkg_path, ##args); \
+ blk_add_cgroup_trace_msg((bfqd)->queue, \
+ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
+ "bfq%d%c " fmt, (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
} while (0)
- #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) \
- blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
+ #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
+ blk_add_cgroup_trace_msg((bfqd)->queue, \
+ bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \
+ } while (0)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
- bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
+ bvec_gap_to_prev(bio->bi_disk->queue,
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;
static blk_status_t bio_integrity_process(struct bio *bio,
struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
void *prot_buf = page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset;
- iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ iter.disk_name = bio->bi_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = proc_iter->bi_sector;
iter.prot_buf = prot_buf;
bool bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
- struct blk_integrity *bi;
- struct request_queue *q;
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct request_queue *q = bio->bi_disk->queue;
void *buf;
unsigned long start, end;
unsigned int len, nr_pages;
unsigned int intervals;
blk_status_t status;
- bi = bdev_get_integrity(bio->bi_bdev);
- q = bdev_get_queue(bio->bi_bdev);
+ if (!bi)
+ return true;
+
if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
return true;
if (bio_integrity(bio))
return true;
- if (bi == NULL)
- return true;
-
if (bio_data_dir(bio) == READ) {
if (!bi->profile->verify_fn ||
!(bi->flags & BLK_INTEGRITY_VERIFY))
struct bio_integrity_payload *bip =
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
struct bvec_iter iter = bio->bi_iter;
/*
*/
bool __bio_integrity_endio(struct bio *bio)
{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct bio_integrity_payload *bip = bio_integrity(bio);
if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
- bi->profile->verify_fn) {
- struct bio_integrity_payload *bip = bio_integrity(bio);
-
+ (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) {
INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bip->bip_work);
return false;
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
bip->bip_iter.bi_sector += bytes_done >> 9;
void bio_integrity_trim(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
}
static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(QUEUED),
QUEUE_FLAG_NAME(STOPPED),
- QUEUE_FLAG_NAME(SYNCFULL),
- QUEUE_FLAG_NAME(ASYNCFULL),
QUEUE_FLAG_NAME(DYING),
QUEUE_FLAG_NAME(BYPASS),
QUEUE_FLAG_NAME(BIDI),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
QUEUE_FLAG_NAME(REGISTERED),
+ QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
+ QUEUE_FLAG_NAME(QUIESCED),
};
#undef QUEUE_FLAG_NAME
CMD_FLAG_NAME(RAHEAD),
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOUNMAP),
+ CMD_FLAG_NAME(NOWAIT),
};
#undef CMD_FLAG_NAME
return seq_release(inode, file);
}
- const struct file_operations blk_mq_debugfs_fops = {
+ static const struct file_operations blk_mq_debugfs_fops = {
.open = blk_mq_debugfs_open,
.read = seq_read,
.write = blk_mq_debugfs_write,
sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}
+ struct mq_inflight {
+ struct hd_struct *part;
+ unsigned int *inflight;
+ };
+
+ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, void *priv,
+ bool reserved)
+ {
+ struct mq_inflight *mi = priv;
+
+ if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
+ !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+ /*
+ * index[0] counts the specific partition that was asked
+ * for. index[1] counts the ones that are active on the
+ * whole device, so increment that if mi->part is indeed
+ * a partition, and not a whole device.
+ */
+ if (rq->part == mi->part)
+ mi->inflight[0]++;
+ if (mi->part->partno)
+ mi->inflight[1]++;
+ }
+ }
+
+ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
+ unsigned int inflight[2])
+ {
+ struct mq_inflight mi = { .part = part, .inflight = inflight, };
+
+ inflight[0] = inflight[1] = 0;
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+ }
+
void blk_freeze_queue_start(struct request_queue *q)
{
int freeze_depth;
struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
+ struct blk_mq_ctx *local_ctx = NULL;
blk_queue_enter_live(q);
data->q = q;
if (likely(!data->ctx))
- data->ctx = blk_mq_get_ctx(q);
+ data->ctx = local_ctx = blk_mq_get_ctx(q);
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
if (op & REQ_NOWAIT)
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
+ if (local_ctx) {
+ blk_mq_put_ctx(local_ctx);
+ data->ctx = NULL;
+ }
blk_queue_exit(q);
return NULL;
}
return ERR_PTR(ret);
rq = blk_mq_get_request(q, NULL, op, &alloc_data);
-
- blk_mq_put_ctx(alloc_data.ctx);
blk_queue_exit(q);
if (!rq)
return ERR_PTR(-EWOULDBLOCK);
+ blk_mq_put_ctx(alloc_data.ctx);
+
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
rq = blk_mq_get_request(q, NULL, op, &alloc_data);
-
blk_queue_exit(q);
if (!rq)
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
struct request *rq, *next;
- unsigned long flags;
- spin_lock_irqsave(&q->requeue_lock, flags);
+ spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
- spin_unlock_irqrestore(&q->requeue_lock, flags);
+ spin_unlock_irq(&q->requeue_lock);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->rq_flags & RQF_SOFTBARRIER))
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
unsigned long msecs)
{
- kblockd_schedule_delayed_work(&q->requeue_work,
- msecs_to_jiffies(msecs));
+ kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
+ msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
{
int srcu_idx;
+ /*
+ * We should be running this queue from one of the CPUs that
+ * are mapped to it.
+ */
WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
cpu_online(hctx->next_cpu));
+ /*
+ * We can't run the queue inline with ints disabled. Ensure that
+ * we catch bad users of this early.
+ */
+ WARN_ON_ONCE(in_interrupt());
+
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
rcu_read_lock();
blk_mq_sched_dispatch_requests(hctx);
/*
* This function is often used for pausing .queue_rq() by driver when
* there isn't enough resource or some conditions aren't satisfied, and
- * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+ * BLK_STS_RESOURCE is usually returned.
*
* We do not guarantee that dispatch can be drained or blocked
* after blk_mq_stop_hw_queue() returns. Please use
/*
* This function is often used for pausing .queue_rq() by driver when
* there isn't enough resource or some conditions aren't satisfied, and
- * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+ * BLK_STS_RESOURCE is usually returned.
*
* We do not guarantee that dispatch can be drained or blocked
* after blk_mq_stop_hw_queues() returns. Please use
if (likely(!blk_trace_note_message_enabled(__td->queue))) \
break; \
if ((__tg)) { \
- char __pbuf[128]; \
- \
- blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
+ blk_add_cgroup_trace_msg(__td->queue, \
+ tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
} else { \
blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
} \
} while (0)
+static inline unsigned int throtl_bio_data_size(struct bio *bio)
+{
+ /* assume it's one sector */
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+ return 512;
+ return bio->bi_iter.bi_size;
+}
+
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+ unsigned int bio_size = throtl_bio_data_size(bio);
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
do_div(tmp, HZ);
bytes_allowed = tmp;
- if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
+ if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
if (wait)
*wait = 0;
return true;
}
/* Calc approx time to dispatch */
- extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
+ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
if (!jiffy_wait)
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
+ unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
- tg->bytes_disp[rw] += bio->bi_iter.bi_size;
+ tg->bytes_disp[rw] += bio_size;
tg->io_disp[rw]++;
- tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+ tg->last_bytes_disp[rw] += bio_size;
tg->last_io_disp[rw]++;
/*
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
{
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- int ret;
-
- ret = bio_associate_current(bio);
- if (ret == 0 || ret == -EBUSY)
+ if (bio->bi_css)
bio->bi_cg_private = tg;
blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
- #else
- bio_associate_current(bio);
#endif
}
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
+ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+ {
+ if (q->mq_ops)
+ return;
+
+ atomic_inc(&part->in_flight[rw]);
+ if (part->partno)
+ atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+ }
+
+ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+ {
+ if (q->mq_ops)
+ return;
+
+ atomic_dec(&part->in_flight[rw]);
+ if (part->partno)
+ atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+ }
+
+ void part_in_flight(struct request_queue *q, struct hd_struct *part,
+ unsigned int inflight[2])
+ {
+ if (q->mq_ops) {
+ blk_mq_in_flight(q, part, inflight);
+ return;
+ }
+
+ inflight[0] = atomic_read(&part->in_flight[0]) +
+ atomic_read(&part->in_flight[1]);
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ inflight[1] = atomic_read(&part->in_flight[0]) +
+ atomic_read(&part->in_flight[1]);
+ }
+ }
+
+ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
+ {
+ struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
+
+ if (unlikely(partno < 0 || partno >= ptbl->len))
+ return NULL;
+ return rcu_dereference(ptbl->part[partno]);
+ }
+
/**
* disk_get_part - get partition
* @disk: disk to look partition from
*/
struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
{
- struct hd_struct *part = NULL;
- struct disk_part_tbl *ptbl;
-
- if (unlikely(partno < 0))
- return NULL;
+ struct hd_struct *part;
rcu_read_lock();
-
- ptbl = rcu_dereference(disk->part_tbl);
- if (likely(partno < ptbl->len)) {
- part = rcu_dereference(ptbl->part[partno]);
- if (part)
- get_device(part_to_dev(part));
- }
-
+ part = __disk_get_part(disk, partno);
+ if (part)
+ get_device(part_to_dev(part));
rcu_read_unlock();
return part;
* Can be deleted altogether. Later.
*
*/
+#define BLKDEV_MAJOR_HASH_SIZE 255
static struct blk_major_name {
struct blk_major_name *next;
int major;
{
struct blk_major_name *dp;
- if (offset < BLKDEV_MAJOR_HASH_SIZE) {
- mutex_lock(&block_class_lock);
- for (dp = major_names[offset]; dp; dp = dp->next)
+ mutex_lock(&block_class_lock);
+ for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
+ if (dp->major == offset)
seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
- mutex_unlock(&block_class_lock);
- }
+ mutex_unlock(&block_class_lock);
}
#endif /* CONFIG_PROC_FS */
ret = major;
}
+ if (major >= BLKDEV_MAJOR_MAX) {
+ pr_err("register_blkdev: major requested (%d) is greater than the maximum (%d) for %s\n",
+ major, BLKDEV_MAJOR_MAX, name);
+
+ ret = -EINVAL;
+ goto out;
+ }
+
p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
if (p == NULL) {
ret = -ENOMEM;
* original ptbl is freed using RCU callback.
*
* LOCKING:
- * Matching bd_mutx locked.
+ * Matching bd_mutex locked or the caller is the only user of @disk.
*/
static void disk_replace_part_tbl(struct gendisk *disk,
struct disk_part_tbl *new_ptbl)
{
- struct disk_part_tbl *old_ptbl = disk->part_tbl;
+ struct disk_part_tbl *old_ptbl =
+ rcu_dereference_protected(disk->part_tbl, 1);
rcu_assign_pointer(disk->part_tbl, new_ptbl);
* uses RCU to allow unlocked dereferencing for stats and other stuff.
*
* LOCKING:
- * Matching bd_mutex locked, might sleep.
+ * Matching bd_mutex locked or the caller is the only user of @disk.
+ * Might sleep.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int disk_expand_part_tbl(struct gendisk *disk, int partno)
{
- struct disk_part_tbl *old_ptbl = disk->part_tbl;
+ struct disk_part_tbl *old_ptbl =
+ rcu_dereference_protected(disk->part_tbl, 1);
struct disk_part_tbl *new_ptbl;
int len = old_ptbl ? old_ptbl->len : 0;
int i, target;
struct disk_part_iter piter;
struct hd_struct *hd;
char buf[BDEVNAME_SIZE];
+ unsigned int inflight[2];
int cpu;
/*
disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
while ((hd = disk_part_iter_next(&piter))) {
cpu = part_stat_lock();
- part_round_stats(cpu, hd);
+ part_round_stats(gp->queue, cpu, hd);
part_stat_unlock();
+ part_in_flight(gp->queue, hd, inflight);
seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
"%u %lu %lu %lu %u %u %u %u\n",
MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
part_stat_read(hd, merges[WRITE]),
part_stat_read(hd, sectors[WRITE]),
jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
- part_in_flight(hd),
+ inflight[0],
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
jiffies_to_msecs(part_stat_read(hd, time_in_queue))
);
struct gendisk *alloc_disk_node(int minors, int node_id)
{
struct gendisk *disk;
+ struct disk_part_tbl *ptbl;
+
+ if (minors > DISK_MAX_PARTS) {
+ printk(KERN_ERR
+ "block: can't allocated more than %d partitions\n",
+ DISK_MAX_PARTS);
+ minors = DISK_MAX_PARTS;
+ }
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (disk) {
kfree(disk);
return NULL;
}
- disk->part_tbl->part[0] = &disk->part0;
+ ptbl = rcu_dereference_protected(disk->part_tbl, 1);
+ rcu_assign_pointer(ptbl->part[0], &disk->part0);
/*
* set_capacity() and get_capacity() currently don't use
config BLK_DEV_NULL_BLK
tristate "Null test block driver"
+ depends on CONFIGFS_FS
config BLK_DEV_FD
tristate "Normal floppy disk support"
depends on VIRTIO
---help---
This is the virtual block driver for virtio. It can be used with
- lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+ QEMU based VMMs (like KVM or Xen). Say Y or M.
config VIRTIO_BLK_SCSI
bool "SCSI passthrough request for the Virtio block driver"
static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
{
- struct block_device *bdev = bio->bi_bdev;
- struct brd_device *brd = bdev->bd_disk->private_data;
+ struct brd_device *brd = bio->bi_disk->private_data;
struct bio_vec bvec;
sector_t sector;
struct bvec_iter iter;
sector = bio->bi_iter.bi_sector;
- if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
+ if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
goto io_error;
bio_for_each_segment(bvec, bio, iter) {
struct page *page, bool is_write)
{
struct brd_device *brd = bdev->bd_disk->private_data;
- int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
+ int err;
+
+ if (PageTransHuge(page))
+ return -ENOTSUPP;
+ err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
page_endio(page, is_write, err);
return err;
}
}
static int
-figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit,
- loff_t logical_blocksize)
+figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
{
loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
sector_t x = (sector_t)size;
lo->lo_offset = offset;
if (lo->lo_sizelimit != sizelimit)
lo->lo_sizelimit = sizelimit;
- if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) {
- lo->lo_logical_blocksize = logical_blocksize;
- blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize);
- blk_queue_logical_block_size(lo->lo_queue,
- lo->lo_logical_blocksize);
- }
set_capacity(lo->lo_disk, x);
bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
/* let user-space know about the new size */
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
struct request_queue *q = lo->lo_queue;
- int lo_bits = 9;
/*
* We use punch hole to reclaim the free space used by the
q->limits.discard_granularity = inode->i_sb->s_blocksize;
q->limits.discard_alignment = 0;
- if (lo->lo_flags & LO_FLAGS_BLOCKSIZE)
- lo_bits = blksize_bits(lo->lo_logical_blocksize);
- blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits);
- blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits);
+ blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+ blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
lo->use_dio = false;
lo->lo_blocksize = lo_blocksize;
- lo->lo_logical_blocksize = 512;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
lo->lo_backing_file = file;
int err;
struct loop_func_table *xfer;
kuid_t uid = current_uid();
- int lo_flags = lo->lo_flags;
if (lo->lo_encrypt_key_size &&
!uid_eq(lo->lo_key_owner, uid) &&
if (err)
goto exit;
- if (info->lo_flags & LO_FLAGS_BLOCKSIZE) {
- if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE))
- lo->lo_logical_blocksize = 512;
- lo->lo_flags |= LO_FLAGS_BLOCKSIZE;
- if (LO_INFO_BLOCKSIZE(info) != 512 &&
- LO_INFO_BLOCKSIZE(info) != 1024 &&
- LO_INFO_BLOCKSIZE(info) != 2048 &&
- LO_INFO_BLOCKSIZE(info) != 4096)
- return -EINVAL;
- if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize)
- return -EINVAL;
- }
-
if (lo->lo_offset != info->lo_offset ||
- lo->lo_sizelimit != info->lo_sizelimit ||
- lo->lo_flags != lo_flags ||
- ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) &&
- lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) {
- if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit,
- LO_INFO_BLOCKSIZE(info))) {
+ lo->lo_sizelimit != info->lo_sizelimit) {
+ if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
err = -EFBIG;
goto exit;
}
if (unlikely(lo->lo_state != Lo_bound))
return -ENXIO;
- return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit,
- lo->lo_logical_blocksize);
+ return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
}
static int loop_set_dio(struct loop_device *lo, unsigned long arg)
struct loop_device *lo;
int err;
- err = misc_register(&loop_misc);
- if (err < 0)
- return err;
-
part_shift = 0;
if (max_part > 0) {
part_shift = fls(max_part);
if ((1UL << part_shift) > DISK_MAX_PARTS) {
err = -EINVAL;
- goto misc_out;
+ goto err_out;
}
if (max_loop > 1UL << (MINORBITS - part_shift)) {
err = -EINVAL;
- goto misc_out;
+ goto err_out;
}
/*
range = 1UL << MINORBITS;
}
+ err = misc_register(&loop_misc);
+ if (err < 0)
+ goto err_out;
+
+
if (register_blkdev(LOOP_MAJOR, "loop")) {
err = -EIO;
goto misc_out;
misc_out:
misc_deregister(&loop_misc);
+ err_out:
return err;
}
+ /*
+ * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
+ * Shaohua Li <shli@fb.com>
+ */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
#include <linux/lightnvm.h>
+ #include <linux/configfs.h>
+ #include <linux/badblocks.h>
+
+ #define SECTOR_SHIFT 9
+ #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+ #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
+ #define SECTOR_SIZE (1 << SECTOR_SHIFT)
+ #define SECTOR_MASK (PAGE_SECTORS - 1)
+
+ #define FREE_BATCH 16
+
+ #define TICKS_PER_SEC 50ULL
+ #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
+
+ static inline u64 mb_per_tick(int mbps)
+ {
+ return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
+ }
struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
- struct call_single_data csd;
+ call_single_data_t csd;
struct request *rq;
struct bio *bio;
unsigned int tag;
struct nullb_queue *nq;
struct hrtimer timer;
+ blk_status_t error;
};
struct nullb_queue {
unsigned long *tag_map;
wait_queue_head_t wait;
unsigned int queue_depth;
+ struct nullb_device *dev;
struct nullb_cmd *cmds;
};
+ /*
+ * Status flags for nullb_device.
+ *
+ * CONFIGURED: Device has been configured and turned on. Cannot reconfigure.
+ * UP: Device is currently on and visible in userspace.
+ * THROTTLED: Device is being throttled.
+ * CACHE: Device is using a write-back cache.
+ */
+ enum nullb_device_flags {
+ NULLB_DEV_FL_CONFIGURED = 0,
+ NULLB_DEV_FL_UP = 1,
+ NULLB_DEV_FL_THROTTLED = 2,
+ NULLB_DEV_FL_CACHE = 3,
+ };
+
+ /*
+ * nullb_page is a page in memory for nullb devices.
+ *
+ * @page: The page holding the data.
+ * @bitmap: The bitmap represents which sector in the page has data.
+ * Each bit represents one block size. For example, sector 8
+ * will use the 7th bit
+ * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
+ * page is being flushing to storage. FREE means the cache page is freed and
+ * should be skipped from flushing to storage. Please see
+ * null_make_cache_space
+ */
+ struct nullb_page {
+ struct page *page;
+ unsigned long bitmap;
+ };
+ #define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
+ #define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)
+
+ struct nullb_device {
+ struct nullb *nullb;
+ struct config_item item;
+ struct radix_tree_root data; /* data stored in the disk */
+ struct radix_tree_root cache; /* disk cache data */
+ unsigned long flags; /* device flags */
+ unsigned int curr_cache;
+ struct badblocks badblocks;
+
+ unsigned long size; /* device size in MB */
+ unsigned long completion_nsec; /* time in ns to complete a request */
+ unsigned long cache_size; /* disk cache size in MB */
+ unsigned int submit_queues; /* number of submission queues */
+ unsigned int home_node; /* home node for the device */
+ unsigned int queue_mode; /* block interface */
+ unsigned int blocksize; /* block size */
+ unsigned int irqmode; /* IRQ completion handler */
+ unsigned int hw_queue_depth; /* queue depth */
+ unsigned int index; /* index of the disk, only valid with a disk */
+ unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+ bool use_lightnvm; /* register as a LightNVM device */
+ bool blocking; /* blocking blk-mq device */
+ bool use_per_node_hctx; /* use per-node allocation for hardware context */
+ bool power; /* power on/off the device */
+ bool memory_backed; /* if data is stored in memory */
+ bool discard; /* if support discard */
+ };
+
struct nullb {
+ struct nullb_device *dev;
struct list_head list;
unsigned int index;
struct request_queue *q;
struct nvm_dev *ndev;
struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set;
- struct hrtimer timer;
unsigned int queue_depth;
+ atomic_long_t cur_bytes;
+ struct hrtimer bw_timer;
+ unsigned long cache_flush_pos;
spinlock_t lock;
struct nullb_queue *queues;
static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
- static int nullb_indexes;
+ static DEFINE_IDA(nullb_indexes);
static struct kmem_cache *ppa_cache;
static struct blk_mq_tag_set tag_set;
NULL_Q_MQ = 2,
};
- static int submit_queues;
- module_param(submit_queues, int, S_IRUGO);
+ static int g_submit_queues = 1;
+ module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");
- static int home_node = NUMA_NO_NODE;
- module_param(home_node, int, S_IRUGO);
+ static int g_home_node = NUMA_NO_NODE;
+ module_param_named(home_node, g_home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device");
- static int queue_mode = NULL_Q_MQ;
+ static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max)
{
static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
{
- return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+ return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
}
static const struct kernel_param_ops null_queue_mode_param_ops = {
.get = param_get_int,
};
- device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO);
+ device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
- static int gb = 250;
- module_param(gb, int, S_IRUGO);
+ static int g_gb = 250;
+ module_param_named(gb, g_gb, int, S_IRUGO);
MODULE_PARM_DESC(gb, "Size in GB");
- static int bs = 512;
- module_param(bs, int, S_IRUGO);
+ static int g_bs = 512;
+ module_param_named(bs, g_bs, int, S_IRUGO);
MODULE_PARM_DESC(bs, "Block size (in bytes)");
static int nr_devices = 1;
module_param(nr_devices, int, S_IRUGO);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
- static bool use_lightnvm;
- module_param(use_lightnvm, bool, S_IRUGO);
+ static bool g_use_lightnvm;
+ module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
- static bool blocking;
- module_param(blocking, bool, S_IRUGO);
+ static bool g_blocking;
+ module_param_named(blocking, g_blocking, bool, S_IRUGO);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
static bool shared_tags;
module_param(shared_tags, bool, S_IRUGO);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
- static int irqmode = NULL_IRQ_SOFTIRQ;
+ static int g_irqmode = NULL_IRQ_SOFTIRQ;
static int null_set_irqmode(const char *str, const struct kernel_param *kp)
{
- return null_param_store_val(str, &irqmode, NULL_IRQ_NONE,
+ return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
NULL_IRQ_TIMER);
}
.get = param_get_int,
};
- device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
+ device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
- static unsigned long completion_nsec = 10000;
- module_param(completion_nsec, ulong, S_IRUGO);
+ static unsigned long g_completion_nsec = 10000;
+ module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
- static int hw_queue_depth = 64;
- module_param(hw_queue_depth, int, S_IRUGO);
+ static int g_hw_queue_depth = 64;
+ module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
- static bool use_per_node_hctx = false;
- module_param(use_per_node_hctx, bool, S_IRUGO);
+ static bool g_use_per_node_hctx;
+ module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+ static struct nullb_device *null_alloc_dev(void);
+ static void null_free_dev(struct nullb_device *dev);
+ static void null_del_dev(struct nullb *nullb);
+ static int null_add_dev(struct nullb_device *dev);
+ static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
+
+ static inline struct nullb_device *to_nullb_device(struct config_item *item)
+ {
+ return item ? container_of(item, struct nullb_device, item) : NULL;
+ }
+
+ static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
+ {
+ return snprintf(page, PAGE_SIZE, "%u\n", val);
+ }
+
+ static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
+ char *page)
+ {
+ return snprintf(page, PAGE_SIZE, "%lu\n", val);
+ }
+
+ static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
+ {
+ return snprintf(page, PAGE_SIZE, "%u\n", val);
+ }
+
+ static ssize_t nullb_device_uint_attr_store(unsigned int *val,
+ const char *page, size_t count)
+ {
+ unsigned int tmp;
+ int result;
+
+ result = kstrtouint(page, 0, &tmp);
+ if (result)
+ return result;
+
+ *val = tmp;
+ return count;
+ }
+
+ static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
+ const char *page, size_t count)
+ {
+ int result;
+ unsigned long tmp;
+
+ result = kstrtoul(page, 0, &tmp);
+ if (result)
+ return result;
+
+ *val = tmp;
+ return count;
+ }
+
+ static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
+ size_t count)
+ {
+ bool tmp;
+ int result;
+
+ result = kstrtobool(page, &tmp);
+ if (result)
+ return result;
+
+ *val = tmp;
+ return count;
+ }
+
+ /* The following macro should only be used with TYPE = {uint, ulong, bool}. */
+ #define NULLB_DEVICE_ATTR(NAME, TYPE) \
+ static ssize_t \
+ nullb_device_##NAME##_show(struct config_item *item, char *page) \
+ { \
+ return nullb_device_##TYPE##_attr_show( \
+ to_nullb_device(item)->NAME, page); \
+ } \
+ static ssize_t \
+ nullb_device_##NAME##_store(struct config_item *item, const char *page, \
+ size_t count) \
+ { \
+ if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \
+ return -EBUSY; \
+ return nullb_device_##TYPE##_attr_store( \
+ &to_nullb_device(item)->NAME, page, count); \
+ } \
+ CONFIGFS_ATTR(nullb_device_, NAME);
+
+ NULLB_DEVICE_ATTR(size, ulong);
+ NULLB_DEVICE_ATTR(completion_nsec, ulong);
+ NULLB_DEVICE_ATTR(submit_queues, uint);
+ NULLB_DEVICE_ATTR(home_node, uint);
+ NULLB_DEVICE_ATTR(queue_mode, uint);
+ NULLB_DEVICE_ATTR(blocksize, uint);
+ NULLB_DEVICE_ATTR(irqmode, uint);
+ NULLB_DEVICE_ATTR(hw_queue_depth, uint);
+ NULLB_DEVICE_ATTR(index, uint);
+ NULLB_DEVICE_ATTR(use_lightnvm, bool);
+ NULLB_DEVICE_ATTR(blocking, bool);
+ NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
+ NULLB_DEVICE_ATTR(memory_backed, bool);
+ NULLB_DEVICE_ATTR(discard, bool);
+ NULLB_DEVICE_ATTR(mbps, uint);
+ NULLB_DEVICE_ATTR(cache_size, ulong);
+
+ static ssize_t nullb_device_power_show(struct config_item *item, char *page)
+ {
+ return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
+ }
+
+ static ssize_t nullb_device_power_store(struct config_item *item,
+ const char *page, size_t count)
+ {
+ struct nullb_device *dev = to_nullb_device(item);
+ bool newp = false;
+ ssize_t ret;
+
+ ret = nullb_device_bool_attr_store(&newp, page, count);
+ if (ret < 0)
+ return ret;
+
+ if (!dev->power && newp) {
+ if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
+ return count;
+ if (null_add_dev(dev)) {
+ clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+ return -ENOMEM;
+ }
+
+ set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+ dev->power = newp;
+ } else if (dev->power && !newp) {
+ mutex_lock(&lock);
+ dev->power = newp;
+ null_del_dev(dev->nullb);
+ mutex_unlock(&lock);
+ clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+ }
+
+ return count;
+ }
+
+ CONFIGFS_ATTR(nullb_device_, power);
+
+ static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
+ {
+ struct nullb_device *t_dev = to_nullb_device(item);
+
+ return badblocks_show(&t_dev->badblocks, page, 0);
+ }
+
+ static ssize_t nullb_device_badblocks_store(struct config_item *item,
+ const char *page, size_t count)
+ {
+ struct nullb_device *t_dev = to_nullb_device(item);
+ char *orig, *buf, *tmp;
+ u64 start, end;
+ int ret;
+
+ orig = kstrndup(page, count, GFP_KERNEL);
+ if (!orig)
+ return -ENOMEM;
+
+ buf = strstrip(orig);
+
+ ret = -EINVAL;
+ if (buf[0] != '+' && buf[0] != '-')
+ goto out;
+ tmp = strchr(&buf[1], '-');
+ if (!tmp)
+ goto out;
+ *tmp = '\0';
+ ret = kstrtoull(buf + 1, 0, &start);
+ if (ret)
+ goto out;
+ ret = kstrtoull(tmp + 1, 0, &end);
+ if (ret)
+ goto out;
+ ret = -EINVAL;
+ if (start > end)
+ goto out;
+ /* enable badblocks */
+ cmpxchg(&t_dev->badblocks.shift, -1, 0);
+ if (buf[0] == '+')
+ ret = badblocks_set(&t_dev->badblocks, start,
+ end - start + 1, 1);
+ else
+ ret = badblocks_clear(&t_dev->badblocks, start,
+ end - start + 1);
+ if (ret == 0)
+ ret = count;
+ out:
+ kfree(orig);
+ return ret;
+ }
+ CONFIGFS_ATTR(nullb_device_, badblocks);
+
+ static struct configfs_attribute *nullb_device_attrs[] = {
+ &nullb_device_attr_size,
+ &nullb_device_attr_completion_nsec,
+ &nullb_device_attr_submit_queues,
+ &nullb_device_attr_home_node,
+ &nullb_device_attr_queue_mode,
+ &nullb_device_attr_blocksize,
+ &nullb_device_attr_irqmode,
+ &nullb_device_attr_hw_queue_depth,
+ &nullb_device_attr_index,
+ &nullb_device_attr_use_lightnvm,
+ &nullb_device_attr_blocking,
+ &nullb_device_attr_use_per_node_hctx,
+ &nullb_device_attr_power,
+ &nullb_device_attr_memory_backed,
+ &nullb_device_attr_discard,
+ &nullb_device_attr_mbps,
+ &nullb_device_attr_cache_size,
+ &nullb_device_attr_badblocks,
+ NULL,
+ };
+
+ static void nullb_device_release(struct config_item *item)
+ {
+ struct nullb_device *dev = to_nullb_device(item);
+
+ badblocks_exit(&dev->badblocks);
+ null_free_device_storage(dev, false);
+ null_free_dev(dev);
+ }
+
+ static struct configfs_item_operations nullb_device_ops = {
+ .release = nullb_device_release,
+ };
+
+ static struct config_item_type nullb_device_type = {
+ .ct_item_ops = &nullb_device_ops,
+ .ct_attrs = nullb_device_attrs,
+ .ct_owner = THIS_MODULE,
+ };
+
+ static struct
+ config_item *nullb_group_make_item(struct config_group *group, const char *name)
+ {
+ struct nullb_device *dev;
+
+ dev = null_alloc_dev();
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&dev->item, name, &nullb_device_type);
+
+ return &dev->item;
+ }
+
+ static void
+ nullb_group_drop_item(struct config_group *group, struct config_item *item)
+ {
+ struct nullb_device *dev = to_nullb_device(item);
+
+ if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
+ mutex_lock(&lock);
+ dev->power = false;
+ null_del_dev(dev->nullb);
+ mutex_unlock(&lock);
+ }
+
+ config_item_put(item);
+ }
+
+ static ssize_t memb_group_features_show(struct config_item *item, char *page)
+ {
+ return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
+ }
+
+ CONFIGFS_ATTR_RO(memb_group_, features);
+
+ static struct configfs_attribute *nullb_group_attrs[] = {
+ &memb_group_attr_features,
+ NULL,
+ };
+
+ static struct configfs_group_operations nullb_group_ops = {
+ .make_item = nullb_group_make_item,
+ .drop_item = nullb_group_drop_item,
+ };
+
+ static struct config_item_type nullb_group_type = {
+ .ct_group_ops = &nullb_group_ops,
+ .ct_attrs = nullb_group_attrs,
+ .ct_owner = THIS_MODULE,
+ };
+
+ static struct configfs_subsystem nullb_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "nullb",
+ .ci_type = &nullb_group_type,
+ },
+ },
+ };
+
+ static inline int null_cache_active(struct nullb *nullb)
+ {
+ return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+ }
+
+ static struct nullb_device *null_alloc_dev(void)
+ {
+ struct nullb_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return NULL;
+ INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
+ INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
+ if (badblocks_init(&dev->badblocks, 0)) {
+ kfree(dev);
+ return NULL;
+ }
+
+ dev->size = g_gb * 1024;
+ dev->completion_nsec = g_completion_nsec;
+ dev->submit_queues = g_submit_queues;
+ dev->home_node = g_home_node;
+ dev->queue_mode = g_queue_mode;
+ dev->blocksize = g_bs;
+ dev->irqmode = g_irqmode;
+ dev->hw_queue_depth = g_hw_queue_depth;
+ dev->use_lightnvm = g_use_lightnvm;
+ dev->blocking = g_blocking;
+ dev->use_per_node_hctx = g_use_per_node_hctx;
+ return dev;
+ }
+
+ static void null_free_dev(struct nullb_device *dev)
+ {
+ kfree(dev);
+ }
+
static void put_tag(struct nullb_queue *nq, unsigned int tag)
{
clear_bit_unlock(tag, nq->tag_map);
cmd = &nq->cmds[tag];
cmd->tag = tag;
cmd->nq = nq;
- if (irqmode == NULL_IRQ_TIMER) {
+ if (nq->dev->irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
static void end_cmd(struct nullb_cmd *cmd)
{
struct request_queue *q = NULL;
+ int queue_mode = cmd->nq->dev->queue_mode;
if (cmd->rq)
q = cmd->rq->q;
switch (queue_mode) {
case NULL_Q_MQ:
- blk_mq_end_request(cmd->rq, BLK_STS_OK);
+ blk_mq_end_request(cmd->rq, cmd->error);
return;
case NULL_Q_RQ:
INIT_LIST_HEAD(&cmd->rq->queuelist);
- blk_end_request_all(cmd->rq, BLK_STS_OK);
+ blk_end_request_all(cmd->rq, cmd->error);
break;
case NULL_Q_BIO:
+ cmd->bio->bi_status = cmd->error;
bio_endio(cmd->bio);
break;
}
static void null_cmd_end_timer(struct nullb_cmd *cmd)
{
- ktime_t kt = completion_nsec;
+ ktime_t kt = cmd->nq->dev->completion_nsec;
hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
}
static void null_softirq_done_fn(struct request *rq)
{
- if (queue_mode == NULL_Q_MQ)
+ struct nullb *nullb = rq->q->queuedata;
+
+ if (nullb->dev->queue_mode == NULL_Q_MQ)
end_cmd(blk_mq_rq_to_pdu(rq));
else
end_cmd(rq->special);
}
- static inline void null_handle_cmd(struct nullb_cmd *cmd)
+ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
+ {
+ struct nullb_page *t_page;
+
+ t_page = kmalloc(sizeof(struct nullb_page), gfp_flags);
+ if (!t_page)
+ goto out;
+
+ t_page->page = alloc_pages(gfp_flags, 0);
+ if (!t_page->page)
+ goto out_freepage;
+
+ t_page->bitmap = 0;
+ return t_page;
+ out_freepage:
+ kfree(t_page);
+ out:
+ return NULL;
+ }
+
+ static void null_free_page(struct nullb_page *t_page)
+ {
+ __set_bit(NULLB_PAGE_FREE, &t_page->bitmap);
+ if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap))
+ return;
+ __free_page(t_page->page);
+ kfree(t_page);
+ }
+
+ static void null_free_sector(struct nullb *nullb, sector_t sector,
+ bool is_cache)
+ {
+ unsigned int sector_bit;
+ u64 idx;
+ struct nullb_page *t_page, *ret;
+ struct radix_tree_root *root;
+
+ root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+ idx = sector >> PAGE_SECTORS_SHIFT;
+ sector_bit = (sector & SECTOR_MASK);
+
+ t_page = radix_tree_lookup(root, idx);
+ if (t_page) {
+ __clear_bit(sector_bit, &t_page->bitmap);
+
+ if (!t_page->bitmap) {
+ ret = radix_tree_delete_item(root, idx, t_page);
+ WARN_ON(ret != t_page);
+ null_free_page(ret);
+ if (is_cache)
+ nullb->dev->curr_cache -= PAGE_SIZE;
+ }
+ }
+ }
+
+ static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
+ struct nullb_page *t_page, bool is_cache)
+ {
+ struct radix_tree_root *root;
+
+ root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+
+ if (radix_tree_insert(root, idx, t_page)) {
+ null_free_page(t_page);
+ t_page = radix_tree_lookup(root, idx);
+ WARN_ON(!t_page || t_page->page->index != idx);
+ } else if (is_cache)
+ nullb->dev->curr_cache += PAGE_SIZE;
+
+ return t_page;
+ }
+
+ static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
+ {
+ unsigned long pos = 0;
+ int nr_pages;
+ struct nullb_page *ret, *t_pages[FREE_BATCH];
+ struct radix_tree_root *root;
+
+ root = is_cache ? &dev->cache : &dev->data;
+
+ do {
+ int i;
+
+ nr_pages = radix_tree_gang_lookup(root,
+ (void **)t_pages, pos, FREE_BATCH);
+
+ for (i = 0; i < nr_pages; i++) {
+ pos = t_pages[i]->page->index;
+ ret = radix_tree_delete_item(root, pos, t_pages[i]);
+ WARN_ON(ret != t_pages[i]);
+ null_free_page(ret);
+ }
+
+ pos++;
+ } while (nr_pages == FREE_BATCH);
+
+ if (is_cache)
+ dev->curr_cache = 0;
+ }
+
+ static struct nullb_page *__null_lookup_page(struct nullb *nullb,
+ sector_t sector, bool for_write, bool is_cache)
+ {
+ unsigned int sector_bit;
+ u64 idx;
+ struct nullb_page *t_page;
+ struct radix_tree_root *root;
+
+ idx = sector >> PAGE_SECTORS_SHIFT;
+ sector_bit = (sector & SECTOR_MASK);
+
+ root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+ t_page = radix_tree_lookup(root, idx);
+ WARN_ON(t_page && t_page->page->index != idx);
+
+ if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap)))
+ return t_page;
+
+ return NULL;
+ }
+
+ static struct nullb_page *null_lookup_page(struct nullb *nullb,
+ sector_t sector, bool for_write, bool ignore_cache)
+ {
+ struct nullb_page *page = NULL;
+
+ if (!ignore_cache)
+ page = __null_lookup_page(nullb, sector, for_write, true);
+ if (page)
+ return page;
+ return __null_lookup_page(nullb, sector, for_write, false);
+ }
+
+ static struct nullb_page *null_insert_page(struct nullb *nullb,
+ sector_t sector, bool ignore_cache)
+ {
+ u64 idx;
+ struct nullb_page *t_page;
+
+ t_page = null_lookup_page(nullb, sector, true, ignore_cache);
+ if (t_page)
+ return t_page;
+
+ spin_unlock_irq(&nullb->lock);
+
+ t_page = null_alloc_page(GFP_NOIO);
+ if (!t_page)
+ goto out_lock;
+
+ if (radix_tree_preload(GFP_NOIO))
+ goto out_freepage;
+
+ spin_lock_irq(&nullb->lock);
+ idx = sector >> PAGE_SECTORS_SHIFT;
+ t_page->page->index = idx;
+ t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
+ radix_tree_preload_end();
+
+ return t_page;
+ out_freepage:
+ null_free_page(t_page);
+ out_lock:
+ spin_lock_irq(&nullb->lock);
+ return null_lookup_page(nullb, sector, true, ignore_cache);
+ }
+
+ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
+ {
+ int i;
+ unsigned int offset;
+ u64 idx;
+ struct nullb_page *t_page, *ret;
+ void *dst, *src;
+
+ idx = c_page->page->index;
+
+ t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
+
+ __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap);
+ if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) {
+ null_free_page(c_page);
+ if (t_page && t_page->bitmap == 0) {
+ ret = radix_tree_delete_item(&nullb->dev->data,
+ idx, t_page);
+ null_free_page(t_page);
+ }
+ return 0;
+ }
+
+ if (!t_page)
+ return -ENOMEM;
+
+ src = kmap_atomic(c_page->page);
+ dst = kmap_atomic(t_page->page);
+
+ for (i = 0; i < PAGE_SECTORS;
+ i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
+ if (test_bit(i, &c_page->bitmap)) {
+ offset = (i << SECTOR_SHIFT);
+ memcpy(dst + offset, src + offset,
+ nullb->dev->blocksize);
+ __set_bit(i, &t_page->bitmap);
+ }
+ }
+
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+
+ ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
+ null_free_page(ret);
+ nullb->dev->curr_cache -= PAGE_SIZE;
+
+ return 0;
+ }
+
+ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
{
+ int i, err, nr_pages;
+ struct nullb_page *c_pages[FREE_BATCH];
+ unsigned long flushed = 0, one_round;
+
+ again:
+ if ((nullb->dev->cache_size * 1024 * 1024) >
+ nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
+ return 0;
+
+ nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
+ (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
+ /*
+ * nullb_flush_cache_page could unlock before using the c_pages. To
+ * avoid race, we don't allow page free
+ */
+ for (i = 0; i < nr_pages; i++) {
+ nullb->cache_flush_pos = c_pages[i]->page->index;
+ /*
+ * We found the page which is being flushed to disk by other
+ * threads
+ */
+ if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap))
+ c_pages[i] = NULL;
+ else
+ __set_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap);
+ }
+
+ one_round = 0;
+ for (i = 0; i < nr_pages; i++) {
+ if (c_pages[i] == NULL)
+ continue;
+ err = null_flush_cache_page(nullb, c_pages[i]);
+ if (err)
+ return err;
+ one_round++;
+ }
+ flushed += one_round << PAGE_SHIFT;
+
+ if (n > flushed) {
+ if (nr_pages == 0)
+ nullb->cache_flush_pos = 0;
+ if (one_round == 0) {
+ /* give other threads a chance */
+ spin_unlock_irq(&nullb->lock);
+ spin_lock_irq(&nullb->lock);
+ }
+ goto again;
+ }
+ return 0;
+ }
+
+ static int copy_to_nullb(struct nullb *nullb, struct page *source,
+ unsigned int off, sector_t sector, size_t n, bool is_fua)
+ {
+ size_t temp, count = 0;
+ unsigned int offset;
+ struct nullb_page *t_page;
+ void *dst, *src;
+
+ while (count < n) {
+ temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+ if (null_cache_active(nullb) && !is_fua)
+ null_make_cache_space(nullb, PAGE_SIZE);
+
+ offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+ t_page = null_insert_page(nullb, sector,
+ !null_cache_active(nullb) || is_fua);
+ if (!t_page)
+ return -ENOSPC;
+
+ src = kmap_atomic(source);
+ dst = kmap_atomic(t_page->page);
+ memcpy(dst + offset, src + off + count, temp);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+
+ __set_bit(sector & SECTOR_MASK, &t_page->bitmap);
+
+ if (is_fua)
+ null_free_sector(nullb, sector, true);
+
+ count += temp;
+ sector += temp >> SECTOR_SHIFT;
+ }
+ return 0;
+ }
+
+ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+ unsigned int off, sector_t sector, size_t n)
+ {
+ size_t temp, count = 0;
+ unsigned int offset;
+ struct nullb_page *t_page;
+ void *dst, *src;
+
+ while (count < n) {
+ temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+ offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+ t_page = null_lookup_page(nullb, sector, false,
+ !null_cache_active(nullb));
+
+ dst = kmap_atomic(dest);
+ if (!t_page) {
+ memset(dst + off + count, 0, temp);
+ goto next;
+ }
+ src = kmap_atomic(t_page->page);
+ memcpy(dst + off + count, src + offset, temp);
+ kunmap_atomic(src);
+ next:
+ kunmap_atomic(dst);
+
+ count += temp;
+ sector += temp >> SECTOR_SHIFT;
+ }
+ return 0;
+ }
+
+ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
+ {
+ size_t temp;
+
+ spin_lock_irq(&nullb->lock);
+ while (n > 0) {
+ temp = min_t(size_t, n, nullb->dev->blocksize);
+ null_free_sector(nullb, sector, false);
+ if (null_cache_active(nullb))
+ null_free_sector(nullb, sector, true);
+ sector += temp >> SECTOR_SHIFT;
+ n -= temp;
+ }
+ spin_unlock_irq(&nullb->lock);
+ }
+
+ static int null_handle_flush(struct nullb *nullb)
+ {
+ int err;
+
+ if (!null_cache_active(nullb))
+ return 0;
+
+ spin_lock_irq(&nullb->lock);
+ while (true) {
+ err = null_make_cache_space(nullb,
+ nullb->dev->cache_size * 1024 * 1024);
+ if (err || nullb->dev->curr_cache == 0)
+ break;
+ }
+
+ WARN_ON(!radix_tree_empty(&nullb->dev->cache));
+ spin_unlock_irq(&nullb->lock);
+ return err;
+ }
+
+ static int null_transfer(struct nullb *nullb, struct page *page,
+ unsigned int len, unsigned int off, bool is_write, sector_t sector,
+ bool is_fua)
+ {
+ int err = 0;
+
+ if (!is_write) {
+ err = copy_from_nullb(nullb, page, off, sector, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+ }
+
+ return err;
+ }
+
+ static int null_handle_rq(struct nullb_cmd *cmd)
+ {
+ struct request *rq = cmd->rq;
+ struct nullb *nullb = cmd->nq->dev->nullb;
+ int err;
+ unsigned int len;
+ sector_t sector;
+ struct req_iterator iter;
+ struct bio_vec bvec;
+
+ sector = blk_rq_pos(rq);
+
+ if (req_op(rq) == REQ_OP_DISCARD) {
+ null_handle_discard(nullb, sector, blk_rq_bytes(rq));
+ return 0;
+ }
+
+ spin_lock_irq(&nullb->lock);
+ rq_for_each_segment(bvec, rq, iter) {
+ len = bvec.bv_len;
+ err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+ op_is_write(req_op(rq)), sector,
+ req_op(rq) & REQ_FUA);
+ if (err) {
+ spin_unlock_irq(&nullb->lock);
+ return err;
+ }
+ sector += len >> SECTOR_SHIFT;
+ }
+ spin_unlock_irq(&nullb->lock);
+
+ return 0;
+ }
+
+ static int null_handle_bio(struct nullb_cmd *cmd)
+ {
+ struct bio *bio = cmd->bio;
+ struct nullb *nullb = cmd->nq->dev->nullb;
+ int err;
+ unsigned int len;
+ sector_t sector;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ sector = bio->bi_iter.bi_sector;
+
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ null_handle_discard(nullb, sector,
+ bio_sectors(bio) << SECTOR_SHIFT);
+ return 0;
+ }
+
+ spin_lock_irq(&nullb->lock);
+ bio_for_each_segment(bvec, bio, iter) {
+ len = bvec.bv_len;
+ err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+ op_is_write(bio_op(bio)), sector,
+ bio_op(bio) & REQ_FUA);
+ if (err) {
+ spin_unlock_irq(&nullb->lock);
+ return err;
+ }
+ sector += len >> SECTOR_SHIFT;
+ }
+ spin_unlock_irq(&nullb->lock);
+ return 0;
+ }
+
+ static void null_stop_queue(struct nullb *nullb)
+ {
+ struct request_queue *q = nullb->q;
+
+ if (nullb->dev->queue_mode == NULL_Q_MQ)
+ blk_mq_stop_hw_queues(q);
+ else {
+ spin_lock_irq(q->queue_lock);
+ blk_stop_queue(q);
+ spin_unlock_irq(q->queue_lock);
+ }
+ }
+
+ static void null_restart_queue_async(struct nullb *nullb)
+ {
+ struct request_queue *q = nullb->q;
+ unsigned long flags;
+
+ if (nullb->dev->queue_mode == NULL_Q_MQ)
+ blk_mq_start_stopped_hw_queues(q, true);
+ else {
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_start_queue_async(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ }
+ }
+
+ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
+ {
+ struct nullb_device *dev = cmd->nq->dev;
+ struct nullb *nullb = dev->nullb;
+ int err = 0;
+
+ if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
+ struct request *rq = cmd->rq;
+
+ if (!hrtimer_active(&nullb->bw_timer))
+ hrtimer_restart(&nullb->bw_timer);
+
+ if (atomic_long_sub_return(blk_rq_bytes(rq),
+ &nullb->cur_bytes) < 0) {
+ null_stop_queue(nullb);
+ /* race with timer */
+ if (atomic_long_read(&nullb->cur_bytes) > 0)
+ null_restart_queue_async(nullb);
+ if (dev->queue_mode == NULL_Q_RQ) {
+ struct request_queue *q = nullb->q;
+
+ spin_lock_irq(q->queue_lock);
+ rq->rq_flags |= RQF_DONTPREP;
+ blk_requeue_request(q, rq);
+ spin_unlock_irq(q->queue_lock);
+ return BLK_STS_OK;
+ } else
+ /* requeue request */
+ return BLK_STS_RESOURCE;
+ }
+ }
+
+ if (nullb->dev->badblocks.shift != -1) {
+ int bad_sectors;
+ sector_t sector, size, first_bad;
+ bool is_flush = true;
+
+ if (dev->queue_mode == NULL_Q_BIO &&
+ bio_op(cmd->bio) != REQ_OP_FLUSH) {
+ is_flush = false;
+ sector = cmd->bio->bi_iter.bi_sector;
+ size = bio_sectors(cmd->bio);
+ }
+ if (dev->queue_mode != NULL_Q_BIO &&
+ req_op(cmd->rq) != REQ_OP_FLUSH) {
+ is_flush = false;
+ sector = blk_rq_pos(cmd->rq);
+ size = blk_rq_sectors(cmd->rq);
+ }
+ if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
+ size, &first_bad, &bad_sectors)) {
+ cmd->error = BLK_STS_IOERR;
+ goto out;
+ }
+ }
+
+ if (dev->memory_backed) {
+ if (dev->queue_mode == NULL_Q_BIO) {
+ if (bio_op(cmd->bio) == REQ_OP_FLUSH)
+ err = null_handle_flush(nullb);
+ else
+ err = null_handle_bio(cmd);
+ } else {
+ if (req_op(cmd->rq) == REQ_OP_FLUSH)
+ err = null_handle_flush(nullb);
+ else
+ err = null_handle_rq(cmd);
+ }
+ }
+ cmd->error = errno_to_blk_status(err);
+ out:
/* Complete IO by inline, softirq or timer */
- switch (irqmode) {
+ switch (dev->irqmode) {
case NULL_IRQ_SOFTIRQ:
- switch (queue_mode) {
+ switch (dev->queue_mode) {
case NULL_Q_MQ:
blk_mq_complete_request(cmd->rq);
break;
null_cmd_end_timer(cmd);
break;
}
+ return BLK_STS_OK;
+ }
+
+ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
+ {
+ struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
+ ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+ unsigned int mbps = nullb->dev->mbps;
+
+ if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
+ return HRTIMER_NORESTART;
+
+ atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
+ null_restart_queue_async(nullb);
+
+ hrtimer_forward_now(&nullb->bw_timer, timer_interval);
+
+ return HRTIMER_RESTART;
+ }
+
+ static void nullb_setup_bwtimer(struct nullb *nullb)
+ {
+ ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+
+ hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ nullb->bw_timer.function = nullb_bwtimer_fn;
+ atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
+ hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
}
static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
const struct blk_mq_queue_data *bd)
{
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ struct nullb_queue *nq = hctx->driver_data;
might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- if (irqmode == NULL_IRQ_TIMER) {
+ if (nq->dev->irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
}
cmd->rq = bd->rq;
- cmd->nq = hctx->driver_data;
+ cmd->nq = nq;
blk_mq_start_request(bd->rq);
- null_handle_cmd(cmd);
- return BLK_STS_OK;
+ return null_handle_cmd(cmd);
}
static const struct blk_mq_ops null_mq_ops = {
static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
{
- sector_t size = gb * 1024 * 1024 * 1024ULL;
+ struct nullb *nullb = dev->q->queuedata;
+ sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
sector_t blksize;
struct nvm_id_group *grp;
id->ppaf.ch_offset = 56;
id->ppaf.ch_len = 8;
- sector_div(size, bs); /* convert size to pages */
+ sector_div(size, nullb->dev->blocksize); /* convert size to pages */
size >>= 8; /* concert size to pgs pr blk */
grp = &id->grp;
grp->mtype = 0;
grp->num_blk = blksize;
grp->num_pln = 1;
- grp->fpg_sz = bs;
- grp->csecs = bs;
+ grp->fpg_sz = nullb->dev->blocksize;
+ grp->csecs = nullb->dev->blocksize;
grp->trdt = 25000;
grp->trdm = 25000;
grp->tprt = 500000;
grp->tbet = 1500000;
grp->tbem = 1500000;
grp->mpos = 0x010101; /* single plane rwe */
- grp->cpar = hw_queue_depth;
+ grp->cpar = nullb->dev->hw_queue_depth;
return 0;
}
static void null_del_dev(struct nullb *nullb)
{
+ struct nullb_device *dev = nullb->dev;
+
+ ida_simple_remove(&nullb_indexes, nullb->index);
+
list_del_init(&nullb->list);
- if (use_lightnvm)
+ if (dev->use_lightnvm)
null_nvm_unregister(nullb);
else
del_gendisk(nullb->disk);
+
+ if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
+ hrtimer_cancel(&nullb->bw_timer);
+ atomic_long_set(&nullb->cur_bytes, LONG_MAX);
+ null_restart_queue_async(nullb);
+ }
+
blk_cleanup_queue(nullb->q);
- if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+ if (dev->queue_mode == NULL_Q_MQ &&
+ nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
- if (!use_lightnvm)
+ if (!dev->use_lightnvm)
put_disk(nullb->disk);
cleanup_queues(nullb);
+ if (null_cache_active(nullb))
+ null_free_device_storage(nullb->dev, true);
kfree(nullb);
+ dev->nullb = NULL;
+ }
+
+ static void null_config_discard(struct nullb *nullb)
+ {
+ if (nullb->dev->discard == false)
+ return;
+ nullb->q->limits.discard_granularity = nullb->dev->blocksize;
+ nullb->q->limits.discard_alignment = nullb->dev->blocksize;
+ blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q);
}
static int null_open(struct block_device *bdev, fmode_t mode)
init_waitqueue_head(&nq->wait);
nq->queue_depth = nullb->queue_depth;
+ nq->dev = nullb->dev;
}
static void null_init_queues(struct nullb *nullb)
static int setup_queues(struct nullb *nullb)
{
- nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
- GFP_KERNEL);
+ nullb->queues = kzalloc(nullb->dev->submit_queues *
+ sizeof(struct nullb_queue), GFP_KERNEL);
if (!nullb->queues)
return -ENOMEM;
nullb->nr_queues = 0;
- nullb->queue_depth = hw_queue_depth;
+ nullb->queue_depth = nullb->dev->hw_queue_depth;
return 0;
}
struct nullb_queue *nq;
int i, ret = 0;
- for (i = 0; i < submit_queues; i++) {
+ for (i = 0; i < nullb->dev->submit_queues; i++) {
nq = &nullb->queues[i];
null_init_queue(nullb, nq);
struct gendisk *disk;
sector_t size;
- disk = nullb->disk = alloc_disk_node(1, home_node);
+ disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
if (!disk)
return -ENOMEM;
- size = gb * 1024 * 1024 * 1024ULL;
+ size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
set_capacity(disk, size >> 9);
disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
return 0;
}
- static int null_init_tag_set(struct blk_mq_tag_set *set)
+ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
{
set->ops = &null_mq_ops;
- set->nr_hw_queues = submit_queues;
- set->queue_depth = hw_queue_depth;
- set->numa_node = home_node;
+ set->nr_hw_queues = nullb ? nullb->dev->submit_queues :
+ g_submit_queues;
+ set->queue_depth = nullb ? nullb->dev->hw_queue_depth :
+ g_hw_queue_depth;
+ set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
set->cmd_size = sizeof(struct nullb_cmd);
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->driver_data = NULL;
- if (blocking)
+ if ((nullb && nullb->dev->blocking) || g_blocking)
set->flags |= BLK_MQ_F_BLOCKING;
return blk_mq_alloc_tag_set(set);
}
- static int null_add_dev(void)
+ static void null_validate_conf(struct nullb_device *dev)
+ {
+ dev->blocksize = round_down(dev->blocksize, 512);
+ dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
+ if (dev->use_lightnvm && dev->blocksize != 4096)
+ dev->blocksize = 4096;
+
+ if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
+ dev->queue_mode = NULL_Q_MQ;
+
+ if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
+ if (dev->submit_queues != nr_online_nodes)
+ dev->submit_queues = nr_online_nodes;
+ } else if (dev->submit_queues > nr_cpu_ids)
+ dev->submit_queues = nr_cpu_ids;
+ else if (dev->submit_queues == 0)
+ dev->submit_queues = 1;
+
+ dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
+ dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+
+ /* Do memory allocation, so set blocking */
+ if (dev->memory_backed)
+ dev->blocking = true;
+ else /* cache is meaningless */
+ dev->cache_size = 0;
+ dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
+ dev->cache_size);
+ dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
+ /* can not stop a queue */
+ if (dev->queue_mode == NULL_Q_BIO)
+ dev->mbps = 0;
+ }
+
+ static int null_add_dev(struct nullb_device *dev)
{
struct nullb *nullb;
int rv;
- nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
+ null_validate_conf(dev);
+
+ nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
if (!nullb) {
rv = -ENOMEM;
goto out;
}
+ nullb->dev = dev;
+ dev->nullb = nullb;
spin_lock_init(&nullb->lock);
- if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
- submit_queues = nr_online_nodes;
-
rv = setup_queues(nullb);
if (rv)
goto out_free_nullb;
- if (queue_mode == NULL_Q_MQ) {
+ if (dev->queue_mode == NULL_Q_MQ) {
if (shared_tags) {
nullb->tag_set = &tag_set;
rv = 0;
} else {
nullb->tag_set = &nullb->__tag_set;
- rv = null_init_tag_set(nullb->tag_set);
+ rv = null_init_tag_set(nullb, nullb->tag_set);
}
if (rv)
goto out_cleanup_tags;
}
null_init_queues(nullb);
- } else if (queue_mode == NULL_Q_BIO) {
- nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+ } else if (dev->queue_mode == NULL_Q_BIO) {
+ nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
if (!nullb->q) {
rv = -ENOMEM;
goto out_cleanup_queues;
if (rv)
goto out_cleanup_blk_queue;
} else {
- nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+ nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock,
+ dev->home_node);
if (!nullb->q) {
rv = -ENOMEM;
goto out_cleanup_queues;
goto out_cleanup_blk_queue;
}
+ if (dev->mbps) {
+ set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
+ nullb_setup_bwtimer(nullb);
+ }
+
+ if (dev->cache_size > 0) {
+ set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+ blk_queue_write_cache(nullb->q, true, true);
+ blk_queue_flush_queueable(nullb->q, true);
+ }
+
nullb->q->queuedata = nullb;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
mutex_lock(&lock);
- nullb->index = nullb_indexes++;
+ nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+ dev->index = nullb->index;
mutex_unlock(&lock);
- blk_queue_logical_block_size(nullb->q, bs);
- blk_queue_physical_block_size(nullb->q, bs);
+ blk_queue_logical_block_size(nullb->q, dev->blocksize);
+ blk_queue_physical_block_size(nullb->q, dev->blocksize);
+
+ null_config_discard(nullb);
sprintf(nullb->disk_name, "nullb%d", nullb->index);
- if (use_lightnvm)
+ if (dev->use_lightnvm)
rv = null_nvm_register(nullb);
else
rv = null_gendisk_register(nullb);
out_cleanup_blk_queue:
blk_cleanup_queue(nullb->q);
out_cleanup_tags:
- if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+ if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
cleanup_queues(nullb);
int ret = 0;
unsigned int i;
struct nullb *nullb;
+ struct nullb_device *dev;
+
+ /* check for nullb_page.bitmap */
+ if (sizeof(unsigned long) * 8 - 2 < (PAGE_SIZE >> SECTOR_SHIFT))
+ return -EINVAL;
- if (bs > PAGE_SIZE) {
+ if (g_bs > PAGE_SIZE) {
pr_warn("null_blk: invalid block size\n");
pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
- bs = PAGE_SIZE;
+ g_bs = PAGE_SIZE;
}
- if (use_lightnvm && bs != 4096) {
+ if (g_use_lightnvm && g_bs != 4096) {
pr_warn("null_blk: LightNVM only supports 4k block size\n");
pr_warn("null_blk: defaults block size to 4k\n");
- bs = 4096;
+ g_bs = 4096;
}
- if (use_lightnvm && queue_mode != NULL_Q_MQ) {
+ if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
pr_warn("null_blk: LightNVM only supported for blk-mq\n");
pr_warn("null_blk: defaults queue mode to blk-mq\n");
- queue_mode = NULL_Q_MQ;
+ g_queue_mode = NULL_Q_MQ;
}
- if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
- if (submit_queues < nr_online_nodes) {
- pr_warn("null_blk: submit_queues param is set to %u.",
+ if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+ if (g_submit_queues != nr_online_nodes) {
+ pr_warn("null_blk: submit_queues param is set to %u.\n",
nr_online_nodes);
- submit_queues = nr_online_nodes;
+ g_submit_queues = nr_online_nodes;
}
- } else if (submit_queues > nr_cpu_ids)
- submit_queues = nr_cpu_ids;
- else if (!submit_queues)
- submit_queues = 1;
+ } else if (g_submit_queues > nr_cpu_ids)
+ g_submit_queues = nr_cpu_ids;
+ else if (g_submit_queues <= 0)
+ g_submit_queues = 1;
- if (queue_mode == NULL_Q_MQ && shared_tags) {
- ret = null_init_tag_set(&tag_set);
+ if (g_queue_mode == NULL_Q_MQ && shared_tags) {
+ ret = null_init_tag_set(NULL, &tag_set);
if (ret)
return ret;
}
+ config_group_init(&nullb_subsys.su_group);
+ mutex_init(&nullb_subsys.su_mutex);
+
+ ret = configfs_register_subsystem(&nullb_subsys);
+ if (ret)
+ goto err_tagset;
+
mutex_init(&lock);
null_major = register_blkdev(0, "nullb");
if (null_major < 0) {
ret = null_major;
- goto err_tagset;
+ goto err_conf;
}
- if (use_lightnvm) {
+ if (g_use_lightnvm) {
ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
0, 0, NULL);
if (!ppa_cache) {
}
for (i = 0; i < nr_devices; i++) {
- ret = null_add_dev();
- if (ret)
+ dev = null_alloc_dev();
+ if (!dev)
+ goto err_dev;
+ ret = null_add_dev(dev);
+ if (ret) {
+ null_free_dev(dev);
goto err_dev;
+ }
}
pr_info("null: module loaded\n");
err_dev:
while (!list_empty(&nullb_list)) {
nullb = list_entry(nullb_list.next, struct nullb, list);
+ dev = nullb->dev;
null_del_dev(nullb);
+ null_free_dev(dev);
}
kmem_cache_destroy(ppa_cache);
err_ppa:
unregister_blkdev(null_major, "nullb");
+ err_conf:
+ configfs_unregister_subsystem(&nullb_subsys);
err_tagset:
- if (queue_mode == NULL_Q_MQ && shared_tags)
+ if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
return ret;
}
{
struct nullb *nullb;
+ configfs_unregister_subsystem(&nullb_subsys);
+
unregister_blkdev(null_major, "nullb");
mutex_lock(&lock);
while (!list_empty(&nullb_list)) {
+ struct nullb_device *dev;
+
nullb = list_entry(nullb_list.next, struct nullb, list);
+ dev = nullb->dev;
null_del_dev(nullb);
+ null_free_dev(dev);
}
mutex_unlock(&lock);
- if (queue_mode == NULL_Q_MQ && shared_tags)
+ if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
kmem_cache_destroy(ppa_cache);
module_init(null_init);
module_exit(null_exit);
- MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
+ MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
MODULE_LICENSE("GPL");
}
spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
- if (req_op(req) == REQ_OP_SCSI_IN || req_op(req) == REQ_OP_SCSI_OUT)
+ if (blk_rq_is_scsi(req))
err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num);
else
err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
char *envp[] = { "RESIZE=1", NULL };
+ unsigned long long nblocks;
u64 capacity;
/* Host must always specify the capacity. */
capacity = (sector_t)-1;
}
- string_get_size(capacity, queue_logical_block_size(q),
+ nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
+
+ string_get_size(nblocks, queue_logical_block_size(q),
STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
- string_get_size(capacity, queue_logical_block_size(q),
+ string_get_size(nblocks, queue_logical_block_size(q),
STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
dev_notice(&vdev->dev,
- "new size: %llu %d-byte logical blocks (%s/%s)\n",
- (unsigned long long)capacity,
- queue_logical_block_size(q),
- cap_str_10, cap_str_2);
+ "new size: %llu %d-byte logical blocks (%s/%s)\n",
+ nblocks,
+ queue_logical_block_size(q),
+ cap_str_10,
+ cap_str_2);
set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk);
{
struct pending_req *req, *n;
unsigned int j, r;
+ bool busy = false;
for (r = 0; r < blkif->nr_rings; r++) {
struct xen_blkif_ring *ring = &blkif->rings[r];
* don't have any discard_io or other_io requests. So, checking
* for inflight IO is enough.
*/
- if (atomic_read(&ring->inflight) > 0)
- return -EBUSY;
+ if (atomic_read(&ring->inflight) > 0) {
+ busy = true;
+ continue;
+ }
if (ring->irq) {
unbind_from_irqhandler(ring->irq, ring);
WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
ring->active = false;
}
+ if (busy)
+ return -EBUSY;
+
blkif->nr_ring_pages = 0;
/*
* blkif->rings was allocated in connect_ring, so we should free it in
xenbus_switch_state(dev, XenbusStateClosed);
if (xenbus_dev_is_online(dev))
break;
- /* fall through if not online */
+ /* fall through */
+ /* if not online */
case XenbusStateUnknown:
/* implies xen_blkif_disconnect() via xen_blkbk_remove() */
device_unregister(&dev->dev);
/*
* Get the bios in the request so we can re-queue them.
*/
- if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
- req_op(shadow[i].request) == REQ_OP_DISCARD ||
- req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
+ if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
+ req_op(shadow[j].request) == REQ_OP_DISCARD ||
+ req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
shadow[j].request->cmd_flags & REQ_FUA) {
/*
* Flush operations don't contain bios, so
case XenbusStateClosed:
if (dev->state == XenbusStateClosed)
break;
- /* Missed the backend's Closing state -- fallthrough */
+ /* fall through */
case XenbusStateClosing:
if (info)
blkfront_closing(info);
return len;
}
- bio->bi_bdev = zram->bdev;
+#ifdef CONFIG_ZRAM_WRITEBACK
+static bool zram_wb_enabled(struct zram *zram)
+{
+ return zram->backing_dev;
+}
+
+static void reset_bdev(struct zram *zram)
+{
+ struct block_device *bdev;
+
+ if (!zram_wb_enabled(zram))
+ return;
+
+ bdev = zram->bdev;
+ if (zram->old_block_size)
+ set_blocksize(bdev, zram->old_block_size);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+ /* hope filp_close flush all of IO */
+ filp_close(zram->backing_dev, NULL);
+ zram->backing_dev = NULL;
+ zram->old_block_size = 0;
+ zram->bdev = NULL;
+
+ kvfree(zram->bitmap);
+ zram->bitmap = NULL;
+}
+
+static ssize_t backing_dev_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+ struct file *file = zram->backing_dev;
+ char *p;
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ if (!zram_wb_enabled(zram)) {
+ memcpy(buf, "none\n", 5);
+ up_read(&zram->init_lock);
+ return 5;
+ }
+
+ p = file_path(file, buf, PAGE_SIZE - 1);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
+ goto out;
+ }
+
+ ret = strlen(p);
+ memmove(buf, p, ret);
+ buf[ret++] = '\n';
+out:
+ up_read(&zram->init_lock);
+ return ret;
+}
+
+static ssize_t backing_dev_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ char *file_name;
+ struct file *backing_dev = NULL;
+ struct inode *inode;
+ struct address_space *mapping;
+ unsigned int bitmap_sz, old_block_size = 0;
+ unsigned long nr_pages, *bitmap = NULL;
+ struct block_device *bdev = NULL;
+ int err;
+ struct zram *zram = dev_to_zram(dev);
+
+ file_name = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!file_name)
+ return -ENOMEM;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ pr_info("Can't setup backing device for initialized device\n");
+ err = -EBUSY;
+ goto out;
+ }
+
+ strlcpy(file_name, buf, len);
+
+ backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
+ if (IS_ERR(backing_dev)) {
+ err = PTR_ERR(backing_dev);
+ backing_dev = NULL;
+ goto out;
+ }
+
+ mapping = backing_dev->f_mapping;
+ inode = mapping->host;
+
+ /* Support only block device in this moment */
+ if (!S_ISBLK(inode->i_mode)) {
+ err = -ENOTBLK;
+ goto out;
+ }
+
+ bdev = bdgrab(I_BDEV(inode));
+ err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+ if (err < 0)
+ goto out;
+
+ nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+ bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
+ bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
+ if (!bitmap) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ old_block_size = block_size(bdev);
+ err = set_blocksize(bdev, PAGE_SIZE);
+ if (err)
+ goto out;
+
+ reset_bdev(zram);
+ spin_lock_init(&zram->bitmap_lock);
+
+ zram->old_block_size = old_block_size;
+ zram->bdev = bdev;
+ zram->backing_dev = backing_dev;
+ zram->bitmap = bitmap;
+ zram->nr_pages = nr_pages;
+ up_write(&zram->init_lock);
+
+ pr_info("setup backing device %s\n", file_name);
+ kfree(file_name);
+
+ return len;
+out:
+ if (bitmap)
+ kvfree(bitmap);
+
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+ if (backing_dev)
+ filp_close(backing_dev, NULL);
+
+ up_write(&zram->init_lock);
+
+ kfree(file_name);
+
+ return err;
+}
+
+static unsigned long get_entry_bdev(struct zram *zram)
+{
+ unsigned long entry;
+
+ spin_lock(&zram->bitmap_lock);
+ /* skip 0 bit to confuse zram.handle = 0 */
+ entry = find_next_zero_bit(zram->bitmap, zram->nr_pages, 1);
+ if (entry == zram->nr_pages) {
+ spin_unlock(&zram->bitmap_lock);
+ return 0;
+ }
+
+ set_bit(entry, zram->bitmap);
+ spin_unlock(&zram->bitmap_lock);
+
+ return entry;
+}
+
+static void put_entry_bdev(struct zram *zram, unsigned long entry)
+{
+ int was_set;
+
+ spin_lock(&zram->bitmap_lock);
+ was_set = test_and_clear_bit(entry, zram->bitmap);
+ spin_unlock(&zram->bitmap_lock);
+ WARN_ON_ONCE(!was_set);
+}
+
+void zram_page_end_io(struct bio *bio)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ page_endio(page, op_is_write(bio_op(bio)),
+ blk_status_to_errno(bio->bi_status));
+ bio_put(bio);
+}
+
+/*
+ * Returns 1 if the submission is successful.
+ */
+static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *parent)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_ATOMIC, 1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
- bio->bi_bdev = zram->bdev;
++ bio_set_dev(bio, zram->bdev);
+ if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
+ bio_put(bio);
+ return -EIO;
+ }
+
+ if (!parent) {
+ bio->bi_opf = REQ_OP_READ;
+ bio->bi_end_io = zram_page_end_io;
+ } else {
+ bio->bi_opf = parent->bi_opf;
+ bio_chain(bio, parent);
+ }
+
+ submit_bio(bio);
+ return 1;
+}
+
+struct zram_work {
+ struct work_struct work;
+ struct zram *zram;
+ unsigned long entry;
+ struct bio *bio;
+};
+
+#if PAGE_SIZE != 4096
+static void zram_sync_read(struct work_struct *work)
+{
+ struct bio_vec bvec;
+ struct zram_work *zw = container_of(work, struct zram_work, work);
+ struct zram *zram = zw->zram;
+ unsigned long entry = zw->entry;
+ struct bio *bio = zw->bio;
+
+ read_from_bdev_async(zram, &bvec, entry, bio);
+}
+
+/*
+ * Block layer want one ->make_request_fn to be active at a time
+ * so if we use chained IO with parent IO in same context,
+ * it's a deadlock. To avoid, it, it uses worker thread context.
+ */
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *bio)
+{
+ struct zram_work work;
+
+ work.zram = zram;
+ work.entry = entry;
+ work.bio = bio;
+
+ INIT_WORK_ONSTACK(&work.work, zram_sync_read);
+ queue_work(system_unbound_wq, &work.work);
+ flush_work(&work.work);
+ destroy_work_on_stack(&work.work);
+
+ return 1;
+}
+#else
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *bio)
+{
+ WARN_ON(1);
+ return -EIO;
+}
+#endif
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *parent, bool sync)
+{
+ if (sync)
+ return read_from_bdev_sync(zram, bvec, entry, parent);
+ else
+ return read_from_bdev_async(zram, bvec, entry, parent);
+}
+
+static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+ u32 index, struct bio *parent,
+ unsigned long *pentry)
+{
+ struct bio *bio;
+ unsigned long entry;
+
+ bio = bio_alloc(GFP_ATOMIC, 1);
+ if (!bio)
+ return -ENOMEM;
+
+ entry = get_entry_bdev(zram);
+ if (!entry) {
+ bio_put(bio);
+ return -ENOSPC;
+ }
+
+ bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
++ bio_set_dev(bio, zram->bdev);
+ if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+ bvec->bv_offset)) {
+ bio_put(bio);
+ put_entry_bdev(zram, entry);
+ return -EIO;
+ }
+
+ if (!parent) {
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
+ bio->bi_end_io = zram_page_end_io;
+ } else {
+ bio->bi_opf = parent->bi_opf;
+ bio_chain(bio, parent);
+ }
+
+ submit_bio(bio);
+ *pentry = entry;
+
+ return 0;
+}
+
+static void zram_wb_clear(struct zram *zram, u32 index)
+{
+ unsigned long entry;
+
+ zram_clear_flag(zram, index, ZRAM_WB);
+ entry = zram_get_element(zram, index);
+ zram_set_element(zram, index, 0);
+ put_entry_bdev(zram, entry);
+}
+
+#else
+static bool zram_wb_enabled(struct zram *zram) { return false; }
+static inline void reset_bdev(struct zram *zram) {};
+static int write_to_bdev(struct zram *zram, struct bio_vec *bvec,
+ u32 index, struct bio *parent,
+ unsigned long *pentry)
+
+{
+ return -EIO;
+}
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+ unsigned long entry, struct bio *parent, bool sync)
+{
+ return -EIO;
+}
+static void zram_wb_clear(struct zram *zram, u32 index) {}
+#endif
+
+
/*
* We switched to per-cpu streams and this attr is not needed anymore.
* However, we will keep it around for some time, because:
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
- char compressor[CRYPTO_MAX_ALG_NAME];
+ char compressor[ARRAY_SIZE(zram->compressor)];
size_t sz;
strlcpy(compressor, buf, sizeof(compressor));
return -EBUSY;
}
- strlcpy(zram->compressor, compressor, sizeof(compressor));
+ strcpy(zram->compressor, compressor);
up_write(&zram->init_lock);
return len;
}
return false;
}
-static bool zram_same_page_write(struct zram *zram, u32 index,
- struct page *page)
-{
- unsigned long element;
- void *mem = kmap_atomic(page);
-
- if (page_same_filled(mem, &element)) {
- kunmap_atomic(mem);
- /* Free memory associated with this sector now. */
- zram_slot_lock(zram, index);
- zram_free_page(zram, index);
- zram_set_flag(zram, index, ZRAM_SAME);
- zram_set_element(zram, index, element);
- zram_slot_unlock(zram, index);
-
- atomic64_inc(&zram->stats.same_pages);
- atomic64_inc(&zram->stats.pages_stored);
- return true;
- }
- kunmap_atomic(mem);
-
- return false;
-}
-
static void zram_meta_free(struct zram *zram, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
*/
static void zram_free_page(struct zram *zram, size_t index)
{
- unsigned long handle = zram_get_handle(zram, index);
+ unsigned long handle;
+
+ if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) {
+ zram_wb_clear(zram, index);
+ atomic64_dec(&zram->stats.pages_stored);
+ return;
+ }
/*
* No memory is allocated for same element filled pages.
return;
}
+ handle = zram_get_handle(zram, index);
if (!handle)
return;
zram_set_obj_size(zram, index, 0);
}
-static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
+static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
+ struct bio *bio, bool partial_io)
{
int ret;
unsigned long handle;
unsigned int size;
void *src, *dst;
+ if (zram_wb_enabled(zram)) {
+ zram_slot_lock(zram, index);
+ if (zram_test_flag(zram, index, ZRAM_WB)) {
+ struct bio_vec bvec;
+
+ zram_slot_unlock(zram, index);
+
+ bvec.bv_page = page;
+ bvec.bv_len = PAGE_SIZE;
+ bvec.bv_offset = 0;
+ return read_from_bdev(zram, &bvec,
+ zram_get_element(zram, index),
+ bio, partial_io);
+ }
+ zram_slot_unlock(zram, index);
+ }
+
if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
return 0;
}
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
- u32 index, int offset)
+ u32 index, int offset, struct bio *bio)
{
int ret;
struct page *page;
return -ENOMEM;
}
- ret = zram_decompress_page(zram, page, index);
+ ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
if (unlikely(ret))
goto out;
return ret;
}
-static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
- struct page *page,
- unsigned long *out_handle, unsigned int *out_comp_len)
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+ u32 index, struct bio *bio)
{
- int ret;
- unsigned int comp_len;
- void *src;
+ int ret = 0;
unsigned long alloced_pages;
unsigned long handle = 0;
+ unsigned int comp_len = 0;
+ void *src, *dst, *mem;
+ struct zcomp_strm *zstrm;
+ struct page *page = bvec->bv_page;
+ unsigned long element = 0;
+ enum zram_pageflags flags = 0;
+ bool allow_wb = true;
+
+ mem = kmap_atomic(page);
+ if (page_same_filled(mem, &element)) {
+ kunmap_atomic(mem);
+ /* Free memory associated with this sector now. */
+ flags = ZRAM_SAME;
+ atomic64_inc(&zram->stats.same_pages);
+ goto out;
+ }
+ kunmap_atomic(mem);
compress_again:
+ zstrm = zcomp_stream_get(zram->comp);
src = kmap_atomic(page);
- ret = zcomp_compress(*zstrm, src, &comp_len);
+ ret = zcomp_compress(zstrm, src, &comp_len);
kunmap_atomic(src);
if (unlikely(ret)) {
+ zcomp_stream_put(zram->comp);
pr_err("Compression failed! err=%d\n", ret);
- if (handle)
- zs_free(zram->mem_pool, handle);
+ zs_free(zram->mem_pool, handle);
return ret;
}
- if (unlikely(comp_len > max_zpage_size))
+ if (unlikely(comp_len > max_zpage_size)) {
+ if (zram_wb_enabled(zram) && allow_wb) {
+ zcomp_stream_put(zram->comp);
+ ret = write_to_bdev(zram, bvec, index, bio, &element);
+ if (!ret) {
+ flags = ZRAM_WB;
+ ret = 1;
+ goto out;
+ }
+ allow_wb = false;
+ goto compress_again;
+ }
comp_len = PAGE_SIZE;
+ }
/*
* handle allocation has 2 paths:
handle = zs_malloc(zram->mem_pool, comp_len,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
- *zstrm = zcomp_stream_get(zram->comp);
if (handle)
goto compress_again;
return -ENOMEM;
update_used_max(zram, alloced_pages);
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+ zcomp_stream_put(zram->comp);
zs_free(zram->mem_pool, handle);
return -ENOMEM;
}
- *out_handle = handle;
- *out_comp_len = comp_len;
- return 0;
-}
-
-static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
-{
- int ret;
- unsigned long handle;
- unsigned int comp_len;
- void *src, *dst;
- struct zcomp_strm *zstrm;
- struct page *page = bvec->bv_page;
-
- if (zram_same_page_write(zram, index, page))
- return 0;
-
- zstrm = zcomp_stream_get(zram->comp);
- ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
- if (ret) {
- zcomp_stream_put(zram->comp);
- return ret;
- }
-
dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
src = zstrm->buffer;
zcomp_stream_put(zram->comp);
zs_unmap_object(zram->mem_pool, handle);
-
+ atomic64_add(comp_len, &zram->stats.compr_data_size);
+out:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
*/
zram_slot_lock(zram, index);
zram_free_page(zram, index);
- zram_set_handle(zram, index, handle);
- zram_set_obj_size(zram, index, comp_len);
+
+ if (flags) {
+ zram_set_flag(zram, index, flags);
+ zram_set_element(zram, index, element);
+ } else {
+ zram_set_handle(zram, index, handle);
+ zram_set_obj_size(zram, index, comp_len);
+ }
zram_slot_unlock(zram, index);
/* Update stats */
- atomic64_add(comp_len, &zram->stats.compr_data_size);
atomic64_inc(&zram->stats.pages_stored);
- return 0;
+ return ret;
}
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
- u32 index, int offset)
+ u32 index, int offset, struct bio *bio)
{
int ret;
struct page *page = NULL;
if (!page)
return -ENOMEM;
- ret = zram_decompress_page(zram, page, index);
+ ret = __zram_bvec_read(zram, page, index, bio, true);
if (ret)
goto out;
vec.bv_offset = 0;
}
- ret = __zram_bvec_write(zram, &vec, index);
+ ret = __zram_bvec_write(zram, &vec, index, bio);
out:
if (is_partial_io(bvec))
__free_page(page);
}
}
+/*
+ * Returns errno if it has some problem. Otherwise return 0 or 1.
+ * Returns 0 if IO request was done synchronously
+ * Returns 1 if IO request was successfully submitted.
+ */
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, bool is_write)
+ int offset, bool is_write, struct bio *bio)
{
unsigned long start_time = jiffies;
int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
+ struct request_queue *q = zram->disk->queue;
int ret;
- generic_start_io_acct(rw_acct, bvec->bv_len >> SECTOR_SHIFT,
+ generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT,
&zram->disk->part0);
if (!is_write) {
atomic64_inc(&zram->stats.num_reads);
- ret = zram_bvec_read(zram, bvec, index, offset);
+ ret = zram_bvec_read(zram, bvec, index, offset, bio);
flush_dcache_page(bvec->bv_page);
} else {
atomic64_inc(&zram->stats.num_writes);
- ret = zram_bvec_write(zram, bvec, index, offset);
+ ret = zram_bvec_write(zram, bvec, index, offset, bio);
}
- generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
+ generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time);
- if (unlikely(ret)) {
+ if (unlikely(ret < 0)) {
if (!is_write)
atomic64_inc(&zram->stats.failed_reads);
else
bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
unwritten);
if (zram_bvec_rw(zram, &bv, index, offset,
- op_is_write(bio_op(bio))) < 0)
+ op_is_write(bio_op(bio)), bio) < 0)
goto out;
bv.bv_offset += bv.bv_len;
static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, bool is_write)
{
- int offset, err = -EIO;
+ int offset, ret;
u32 index;
struct zram *zram;
struct bio_vec bv;
+ if (PageTransHuge(page))
+ return -ENOTSUPP;
zram = bdev->bd_disk->private_data;
if (!valid_io_request(zram, sector, PAGE_SIZE)) {
atomic64_inc(&zram->stats.invalid_io);
- err = -EINVAL;
+ ret = -EINVAL;
goto out;
}
bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0;
- err = zram_bvec_rw(zram, &bv, index, offset, is_write);
+ ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL);
out:
/*
* If I/O fails, just return error(ie, non-zero) without
* bio->bi_end_io does things to handle the error
* (e.g., SetPageError, set_page_dirty and extra works).
*/
- if (err == 0)
+ if (unlikely(ret < 0))
+ return ret;
+
+ switch (ret) {
+ case 0:
page_endio(page, is_write, 0);
- return err;
+ break;
+ case 1:
+ ret = 0;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ return ret;
}
static void zram_reset_device(struct zram *zram)
zram_meta_free(zram, disksize);
memset(&zram->stats, 0, sizeof(zram->stats));
zcomp_destroy(comp);
+ reset_bdev(zram);
}
static ssize_t disksize_store(struct device *dev,
static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_WRITEBACK
+static DEVICE_ATTR_RW(backing_dev);
+#endif
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+#ifdef CONFIG_ZRAM_WRITEBACK
+ &dev_attr_backing_dev.attr,
+#endif
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
int i, r;
/* xor whitening with sector number */
- memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
- crypto_xor(buf, (u8 *)§or, 8);
- crypto_xor(&buf[8], (u8 *)§or, 8);
+ crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8);
+ crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8);
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
}
/* Calculate IV */
- memcpy(iv, tcw->iv_seed, cc->iv_size);
- crypto_xor(iv, (u8 *)§or, 8);
+ crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)§or, 8);
if (cc->iv_size > 8)
- crypto_xor(&iv[8], (u8 *)§or, cc->iv_size - 8);
+ crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or,
+ cc->iv_size - 8);
return r;
}
bip->bip_iter.bi_size = tag_len;
bip->bip_iter.bi_sector = io->cc->start + io->sector;
- /* We own the metadata, do not let bio_free to release it */
- bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
-
ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
tag_len, offset_in_page(io->integrity_metadata));
if (unlikely(ret != tag_len))
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
- clone->bi_bdev = cc->dev->bdev;
+ bio_set_dev(clone, cc->dev->bdev);
clone->bi_opf = io->base_bio->bi_opf;
}
*/
if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
bio_op(bio) == REQ_OP_DISCARD)) {
- bio->bi_bdev = cc->dev->bdev;
+ bio_set_dev(bio, cc->dev->bdev);
if (bio_sectors(bio))
bio->bi_iter.bi_sector = cc->start +
dm_target_offset(ti, bio->bi_iter.bi_sector);
if (queue_dying) {
atomic_inc(&m->pg_init_in_progress);
activate_or_offline_path(pgpath);
- return DM_MAPIO_REQUEUE;
}
return DM_MAPIO_DELAY_REQUEUE;
}
mpio->nr_bytes = nr_bytes;
bio->bi_status = 0;
- bio->bi_bdev = pgpath->path.dev->bdev;
+ bio_set_dev(bio, pgpath->path.dev->bdev);
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
if (pgpath->pg->ps.type->start_io)
case BLK_STS_TARGET:
case BLK_STS_NEXUS:
case BLK_STS_MEDIUM:
- case BLK_STS_RESOURCE:
return 1;
}
#define DM_MSG_PREFIX "core"
-#ifdef CONFIG_PRINTK
-/*
- * ratelimit state to be used in DMXXX_LIMIT().
- */
-DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-EXPORT_SYMBOL(dm_ratelimit_state);
-#endif
-
/*
* Cookies are numeric values sent with CHANGE and REMOVE
* uevents while resuming, removing or renaming the device.
io->start_time = jiffies;
cpu = part_stat_lock();
- part_round_stats(cpu, &dm_disk(md)->part0);
+ part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
part_stat_unlock();
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
int pending;
int rw = bio_data_dir(bio);
- generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
+ generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
if (unlikely(error == BLK_STS_TARGET)) {
if (bio_op(bio) == REQ_OP_WRITE_SAME &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ !bio->bi_disk->queue->limits.max_write_same_sectors)
disable_write_same(md);
if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
disable_write_zeroes(md);
}
break;
case DM_MAPIO_REMAPPED:
/* the bio has been remapped so dispatch it */
- trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
- tio->io->bio->bi_bdev->bd_dev, sector);
+ trace_block_bio_remap(clone->bi_disk->queue, clone,
+ bio_dev(tio->io->bio), sector);
generic_make_request(clone);
break;
case DM_MAPIO_KILL:
}
/* drop the extra reference count */
- dec_pending(ci.io, error);
+ dec_pending(ci.io, errno_to_blk_status(error));
}
/*-----------------------------------------------------------------
* CRUD END
map = dm_get_live_table(md, &srcu_idx);
- generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
+ generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
goto bad;
bio_init(&md->flush_bio, NULL, 0);
- md->flush_bio.bi_bdev = md->bdev;
+ bio_set_dev(&md->flush_bio, md->bdev);
md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
dm_stats_init(&md->stats);
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
- bi->bi_bdev = rdev->bdev;
+ bio_set_dev(bi, rdev->bdev);
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
atomic_inc(&mddev->flush_pending);
submit_bio(bi);
atomic_inc(&rdev->nr_pending);
- bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
+ bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
bio->bi_iter.bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
struct bio *bio = md_bio_alloc_sync(rdev->mddev);
int ret;
- bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
- rdev->meta_bdev : rdev->bdev;
+ if (metadata_op && rdev->meta_bdev)
+ bio_set_dev(bio, rdev->meta_bdev);
+ else
+ bio_set_dev(bio, rdev->bdev);
bio_set_op_attrs(bio, op, op_flags);
if (metadata_op)
bio->bi_iter.bi_sector = sector + rdev->sb_start;
if (mddev->safemode == 1)
mddev->safemode = 0;
/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
- if (mddev->in_sync || !mddev->sync_checkers) {
+ if (mddev->in_sync || mddev->sync_checkers) {
spin_lock(&mddev->lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
if (mddev_trylock(mddev)) {
int spares = 0;
+ if (!mddev->external && mddev->safemode == 1)
+ mddev->safemode = 0;
+
if (mddev->ro) {
struct md_rdev *rdev;
if (!mddev->external && mddev->in_sync)
bool need_split_bio;
struct bio *split_bio;
- unsigned int has_flush:1; /* include flush request */
- unsigned int has_fua:1; /* include fua request */
- unsigned int has_null_flush:1; /* include empty flush request */
+ unsigned int has_flush:1; /* include flush request */
+ unsigned int has_fua:1; /* include fua request */
+ unsigned int has_null_flush:1; /* include null flush request */
+ unsigned int has_flush_payload:1; /* include flush payload */
/*
* io isn't sent yet, flush/fua request can only be submitted till it's
* the first IO in running_ios list
struct r5l_io_unit *io_deferred;
struct r5l_log *log = io->log;
unsigned long flags;
+ bool has_null_flush;
+ bool has_flush_payload;
if (bio->bi_status)
md_error(log->rdev->mddev, log->rdev);
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+
+ /*
+ * if the io doesn't not have null_flush or flush payload,
+ * it is not safe to access it after releasing io_list_lock.
+ * Therefore, it is necessary to check the condition with
+ * the lock held.
+ */
+ has_null_flush = io->has_null_flush;
+ has_flush_payload = io->has_flush_payload;
+
if (log->need_cache_flush && !list_empty(&io->stripe_list))
r5l_move_to_end_ios(log);
else
if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
- if (io->has_null_flush) {
+ /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
+ if (has_null_flush) {
struct bio *bi;
WARN_ON(bio_list_empty(&io->flush_barriers));
while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
bio_endio(bi);
- atomic_dec(&io->pending_stripe);
+ if (atomic_dec_and_test(&io->pending_stripe)) {
+ __r5l_stripe_write_finished(io);
+ return;
+ }
}
}
-
- /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
- if (atomic_read(&io->pending_stripe) == 0)
- __r5l_stripe_write_finished(io);
+ /* decrease pending_stripe for flush payload */
+ if (has_flush_payload)
+ if (atomic_dec_and_test(&io->pending_stripe))
+ __r5l_stripe_write_finished(io);
}
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- bio->bi_bdev = log->rdev->bdev;
+ bio_set_dev(bio, log->rdev->bdev);
bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
return bio;
payload->size = cpu_to_le32(sizeof(__le64));
payload->flush_stripes[0] = cpu_to_le64(sect);
io->meta_offset += meta_size;
+ /* multiple flush payloads count as one pending_stripe */
+ if (!io->has_flush_payload) {
+ io->has_flush_payload = 1;
+ atomic_inc(&io->pending_stripe);
+ }
mutex_unlock(&log->io_mutex);
}
if (!do_flush)
return;
bio_reset(&log->flush_bio);
- log->flush_bio.bi_bdev = log->rdev->bdev;
+ bio_set_dev(&log->flush_bio, log->rdev->bdev);
log->flush_bio.bi_end_io = r5l_log_flush_endio;
log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
submit_bio(&log->flush_bio);
sector_t offset)
{
bio_reset(ctx->ra_bio);
- ctx->ra_bio->bi_bdev = log->rdev->bdev;
+ bio_set_dev(ctx->ra_bio, log->rdev->bdev);
bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
*/
int r5c_journal_mode_set(struct mddev *mddev, int mode)
{
- struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
-
- if (!log)
- return -ENODEV;
+ struct r5conf *conf;
+ int err;
if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
mode > R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf || !conf->log) {
+ mddev_unlock(mddev);
+ return -ENODEV;
+ }
+
if (raid5_calc_degraded(conf) > 0 &&
- mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ mode == R5C_JOURNAL_MODE_WRITE_BACK) {
+ mddev_unlock(mddev);
return -EINVAL;
+ }
mddev_suspend(mddev);
conf->log->r5c_journal_mode = mode;
mddev_resume(mddev);
+ mddev_unlock(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
mdname(mddev), mode, r5c_journal_mode_str[mode]);
c.directive.opcode = nvme_admin_directive_recv;
c.directive.nsid = cpu_to_le32(nsid);
- c.directive.numd = cpu_to_le32(sizeof(*s));
+ c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
c.directive.dtype = NVME_DIR_STREAMS;
if (!disk)
goto submit;
- bio->bi_bdev = bdget_disk(disk, 0);
- if (!bio->bi_bdev) {
- ret = -ENODEV;
- goto out_unmap;
- }
+ bio->bi_disk = disk;
if (meta_buffer && meta_len) {
struct bio_integrity_payload *bip;
out_free_meta:
kfree(meta);
out_unmap:
- if (bio) {
- if (disk && bio->bi_bdev)
- bdput(bio->bi_bdev);
+ if (bio)
blk_rq_unmap_user(bio);
- }
out:
blk_mq_free_request(req);
return ret;
blk_queue_write_cache(q, vwc, vwc);
}
-static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+static int nvme_configure_apst(struct nvme_ctrl *ctrl)
{
/*
* APST (Autonomous Power State Transition) lets us program a
* then don't do anything.
*/
if (!ctrl->apsta)
- return;
+ return 0;
if (ctrl->npss > 31) {
dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
- return;
+ return 0;
}
table = kzalloc(sizeof(*table), GFP_KERNEL);
if (!table)
- return;
+ return 0;
if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
/* Turn off APST. */
dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
kfree(table);
+ return ret;
}
static void nvme_set_latency_tolerance(struct device *dev, s32 val)
* In fabrics we need to verify the cntlid matches the
* admin connect
*/
- if (ctrl->cntlid != le16_to_cpu(id->cntlid))
+ if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
ret = -EINVAL;
+ goto out_free;
+ }
if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
dev_err(ctrl->device,
"keep-alive support is mandatory for fabrics\n");
ret = -EINVAL;
+ goto out_free;
}
} else {
ctrl->cntlid = le16_to_cpu(id->cntlid);
else if (!ctrl->apst_enabled && prev_apst_enabled)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
- nvme_configure_apst(ctrl);
- nvme_configure_directives(ctrl);
+ ret = nvme_configure_apst(ctrl);
+ if (ret < 0)
+ return ret;
+
+ ret = nvme_configure_directives(ctrl);
+ if (ret < 0)
+ return ret;
ctrl->identified = true;
+ return 0;
+
+out_free:
+ kfree(id);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);
if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
return sprintf(buf, "eui.%8phN\n", ns->eui);
- while (ctrl->serial[serial_len - 1] == ' ')
+ while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
+ ctrl->serial[serial_len - 1] == '\0'))
serial_len--;
- while (ctrl->model[model_len - 1] == ' ')
+ while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
+ ctrl->model[model_len - 1] == '\0'))
model_len--;
return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
#include <linux/string.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
+#include <linux/blk-mq-rdma.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/mutex.h>
ibdev = queue->device->dev;
/*
- * The admin queue is barely used once the controller is live, so don't
- * bother to spread it out.
+ * Spread I/O queues completion vectors according their queue index.
+ * Admin queues can always go on completion vector 0.
*/
- if (idx == 0)
- comp_vector = 0;
- else
- comp_vector = idx % ibdev->num_comp_vectors;
-
+ comp_vector = idx == 0 ? idx : idx - 1;
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ struct ib_device *ibdev = ctrl->device->dev;
unsigned int nr_io_queues;
int i, ret;
nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
+
+ /*
+ * we map queues according to the device irq vectors for
+ * optimal locality so we don't need more queues than
+ * completion vectors.
+ */
+ nr_io_queues = min_t(unsigned int, nr_io_queues,
+ ibdev->num_comp_vectors);
+
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
return ret;
if (ctrl->ctrl.queue_count > 1) {
nvme_rdma_free_io_queues(ctrl);
- ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+ ret = blk_mq_reinit_tagset(&ctrl->tag_set,
+ nvme_rdma_reinit_request);
if (ret)
goto requeue;
}
nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
- ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
+ ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set,
+ nvme_rdma_reinit_request);
if (ret)
goto requeue;
struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
int nr;
- nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
+ /*
+ * Align the MR to a 4K page size to match the ctrl page size and
+ * the block virtual boundary.
+ */
+ nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
if (nr < count) {
if (nr < 0)
return nr;
nvme_complete_rq(rq);
}
+static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
+{
+ struct nvme_rdma_ctrl *ctrl = set->driver_data;
+
+ return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+}
+
static const struct blk_mq_ops nvme_rdma_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_request,
.exit_request = nvme_rdma_exit_request,
- .reinit_request = nvme_rdma_reinit_request,
.init_hctx = nvme_rdma_init_hctx,
.poll = nvme_rdma_poll,
.timeout = nvme_rdma_timeout,
+ .map_queues = nvme_rdma_map_queues,
};
static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_request,
.exit_request = nvme_rdma_exit_request,
- .reinit_request = nvme_rdma_reinit_request,
.init_hctx = nvme_rdma_init_admin_hctx,
.timeout = nvme_rdma_timeout,
};
goto out_cleanup_queue;
ctrl->ctrl.max_hw_sectors =
- (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
+ (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
}
if (ctrl->ctrl.queue_count > 1) {
- ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+ ret = blk_mq_reinit_tagset(&ctrl->tag_set,
+ nvme_rdma_reinit_request);
if (ret)
goto del_dead_ctrl;
.create_ctrl = nvme_rdma_create_ctrl,
};
-static void nvme_rdma_add_one(struct ib_device *ib_device)
-{
-}
-
static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
struct nvme_rdma_ctrl *ctrl;
static struct ib_client nvme_rdma_ib_client = {
.name = "nvme_rdma",
- .add = nvme_rdma_add_one,
.remove = nvme_rdma_remove_one
};
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
- bio->bi_bdev = device->bdev;
+ bio_set_dev(bio, device->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
struct bio *bio = device->flush_bio;
if (!device->flush_bio_sent)
- return 0;
+ return BLK_STS_OK;
device->flush_bio_sent = 0;
wait_for_completion_io(&device->flush_wait);
continue;
write_dev_flush(dev);
- dev->last_flush_error = 0;
+ dev->last_flush_error = BLK_STS_OK;
}
/* wait for all the barriers */
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- err = 0;
+ err = BLK_STS_OK;
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
*/
if (last_end == disk_start && stripe->dev->bdev &&
!last->bi_status &&
- last->bi_bdev == stripe->dev->bdev) {
+ last->bi_disk == stripe->dev->bdev->bd_disk &&
+ last->bi_partno == stripe->dev->bdev->bd_partno) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
return 0;
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
bio->bi_iter.bi_size = 0;
- bio->bi_bdev = stripe->dev->bdev;
+ bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
bio_add_page(bio, page, PAGE_SIZE, 0);
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
stripe_start = stripe->physical;
if (physical >= stripe_start &&
physical < stripe_start + rbio->stripe_len &&
- bio->bi_bdev == stripe->dev->bdev) {
+ bio->bi_disk == stripe->dev->bdev->bd_disk &&
+ bio->bi_partno == stripe->dev->bdev->bd_partno) {
return i;
}
}
cleanup:
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
return 0;
cleanup:
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
return -EIO;
finish:
void **pointers;
int faila = -1, failb = -1;
struct page *page;
- int err;
+ blk_status_t err;
int i;
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
- err = -ENOMEM;
+ err = BLK_STS_RESOURCE;
goto cleanup_io;
}
* a bad data or Q stripe.
* TODO, we should redo the xor here.
*/
- err = -EIO;
+ err = BLK_STS_IOERR;
goto cleanup;
}
/*
if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
if (rbio->bbio->raid_map[faila] ==
RAID5_P_STRIPE) {
- err = -EIO;
+ err = BLK_STS_IOERR;
goto cleanup;
}
/*
}
}
- err = 0;
+ err = BLK_STS_OK;
cleanup:
kfree(pointers);
cleanup_io:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- if (err == 0)
+ if (err == BLK_STS_OK)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
rbio_orig_end_io(rbio, err);
} else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
rbio_orig_end_io(rbio, err);
- } else if (err == 0) {
+ } else if (err == BLK_STS_OK) {
rbio->faila = -1;
rbio->failb = -1;
return;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
}
cleanup:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
return -EIO;
}
nr_data = bio_list_size(&bio_list);
if (!nr_data) {
/* Every parity is right */
- rbio_orig_end_io(rbio, 0);
+ rbio_orig_end_io(rbio, BLK_STS_OK);
return;
}
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO);
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
return;
finish:
rcu_read_unlock();
}
#endif
- bio->bi_bdev = dev->bdev;
+ bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
}
}
-int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit)
+blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+ int mirror_num, int async_submit)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
&map_length, &bbio, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
- return ret;
+ return errno_to_blk_status(ret);
}
total_devs = bbio->num_stripes;
}
btrfs_bio_counter_dec(fs_info);
- return ret;
+ return errno_to_blk_status(ret);
}
if (map_length < length) {
dev_nr, async_submit);
}
btrfs_bio_counter_dec(fs_info);
- return 0;
+ return BLK_STS_OK;
}
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
struct pagevec pvec;
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
pgoff_t end;
- int i;
+ int i, count;
struct buffer_head *bh;
struct buffer_head *head;
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
pagevec_init(&pvec, 0);
- while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
+ while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
+ count = pagevec_count(&pvec);
+ for (i = 0; i < count; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
- break;
if (!page_has_buffers(page))
continue;
/*
}
pagevec_release(&pvec);
cond_resched();
- index++;
+ /* End of range already reached? */
+ if (index > end || !index)
+ break;
}
}
EXPORT_SYMBOL(clean_bdev_aliases);
struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned truncated_bytes;
- maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+ maxsector = get_capacity(bio->bi_disk);
if (!maxsector)
return;
}
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
+ bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
pagevec_init(&pvec, 0);
do {
- unsigned want, nr_pages, i;
+ unsigned nr_pages, i;
- want = min_t(unsigned, end - index, PAGEVEC_SIZE);
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ end - 1);
if (nr_pages == 0)
break;
lastoff < page_offset(page))
goto check_range;
- /* Searching done if the page index is out of range. */
- if (page->index >= end)
- goto not_found;
-
lock_page(page);
if (likely(page->mapping == inode->i_mapping) &&
page_has_buffers(page)) {
unlock_page(page);
lastoff = page_offset(page) + PAGE_SIZE;
}
-
- /* Searching done if fewer pages returned than wanted. */
- if (nr_pages < want)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index < end);
struct page *page;
int i;
- if (bio->bi_status)
- fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
+ if (bio->bi_status) {
+ fs_err(sdp, "Error %d writing to journal, jid=%u\n",
+ bio->bi_status, sdp->sd_jdesc->jd_jid);
+ wake_up(&sdp->sd_logd_waitq);
+ }
bio_for_each_segment_all(bvec, bio, i) {
page = bvec->bv_page;
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
- bio->bi_bdev = sb->s_bdev;
+ bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = gfs2_end_log_write;
bio->bi_private = sdp;
bio = bio_alloc(GFP_NOIO, num);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
+ bio_set_dev(bio, bh->b_bdev);
while (num > 0) {
bh = *bhs;
if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
brelse(bh);
ret = -EIO;
+ } else {
+ *bhp = bh;
}
- *bhp = bh;
return ret;
}
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
+ ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
dblock++;
extlen--;
bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh);
+ ll_rw_block(REQ_OP_READ,
+ REQ_RAHEAD | REQ_META | REQ_PRIO,
+ 1, &bh);
brelse(bh);
dblock++;
extlen--;
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- bio->bi_bdev = sb->s_bdev;
+ bio_set_dev(bio, sb->s_bdev);
bio_add_page(bio, page, PAGE_SIZE, 0);
bio->bi_end_io = end_bio_io_page;
return error;
}
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+ snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
error = gfs2_sys_fs_add(sdp);
/*
}
if (sdp->sd_args.ar_spectator)
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+ snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s",
sdp->sd_table_name);
else
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+ snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u",
sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
error = init_inodes(sdp, DO);
sdp->sd_root_dir = NULL;
sdp->sd_master_dir = NULL;
shrink_dcache_sb(sb);
- gfs2_delete_debugfs_file(sdp);
free_percpu(sdp->sd_lkstats);
kill_block_super(sb);
}
unsigned long bytes; /* Bytes to write to page */
offset = (pos & (PAGE_SIZE - 1));
- bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+ bytes = min_t(loff_t, PAGE_SIZE - offset, length);
rpage = __iomap_read_page(inode, pos);
if (IS_ERR(rpage))
unsigned offset, bytes;
offset = pos & (PAGE_SIZE - 1); /* Within page */
- bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+ bytes = min_t(loff_t, PAGE_SIZE - offset, count);
if (IS_DAX(inode))
status = iomap_dax_zero(pos, offset, bytes, iomap);
set_page_dirty(page);
wait_for_stable_page(page);
- return 0;
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
- return ret;
+ return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
struct bio *bio;
bio = bio_alloc(GFP_KERNEL, 1);
- bio->bi_bdev = iomap->bdev;
+ bio_set_dev(bio, iomap->bdev);
bio->bi_iter.bi_sector =
iomap->blkno + ((pos - iomap->offset) >> 9);
bio->bi_private = dio;
return 0;
bio = bio_alloc(GFP_KERNEL, nr_pages);
- bio->bi_bdev = iomap->bdev;
+ bio_set_dev(bio, iomap->bdev);
bio->bi_iter.bi_sector =
iomap->blkno + ((pos - iomap->offset) >> 9);
bio->bi_write_hint = dio->iocb->ki_hint;
static int kernfs_fop_open(struct inode *inode, struct file *file)
{
- struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+ struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
const struct kernfs_ops *ops;
struct kernfs_open_file *of;
static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
- struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+ struct kernfs_node *kn = inode->i_private;
struct kernfs_open_file *of = kernfs_of(filp);
if (kn->flags & KERNFS_HAS_RELEASE) {
static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
- struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+ struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
struct kernfs_open_node *on = kn->attr.open;
if (!kernfs_get_active(kn))
* have the matching @file available. Look up the inodes
* and generate the events manually.
*/
- inode = ilookup(info->sb, kn->ino);
+ inode = ilookup(info->sb, kn->id.ino);
if (!inode)
continue;
if (parent) {
struct inode *p_inode;
- p_inode = ilookup(info->sb, parent->ino);
+ p_inode = ilookup(info->sb, parent->id.ino);
if (p_inode) {
fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (key) {
- lockdep_init_map(&kn->dep_map, "s_active", key, 0);
+ lockdep_init_map(&kn->dep_map, "kn->count", key, 0);
kn->flags |= KERNFS_LOCKDEP;
}
#endif
}
}
-static void o2hb_wait_on_io(struct o2hb_region *reg,
- struct o2hb_bio_wait_ctxt *wc)
+static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
{
o2hb_bio_wait_dec(wc, 1);
wait_for_completion(&wc->wc_io_complete);
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
- bio->bi_bdev = reg->hr_bdev;
+ bio_set_dev(bio, reg->hr_bdev);
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
bio_set_op_attrs(bio, op, op_flags);
status = 0;
bail_and_wait:
- o2hb_wait_on_io(reg, &wc);
+ o2hb_wait_on_io(&wc);
if (wc.wc_error && !status)
status = wc.wc_error;
* before we can go to steady state. This ensures that
* people we find in our steady state have seen us.
*/
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
if (write_wc.wc_error) {
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
o2hb_prepare_block(reg, 0);
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret == 0)
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
else
mlog_errno(ret);
}
}
EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
-int o2hb_check_node_heartbeating(u8 node_num)
-{
- unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-
- o2hb_fill_node_map(testing_map, sizeof(testing_map));
- if (!test_bit(node_num, testing_map)) {
- mlog(ML_HEARTBEAT,
- "node (%u) does not have heartbeating enabled.\n",
- node_num);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
-
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
}
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
-/* Makes sure our local node is configured with a node number, and is
- * heartbeating. */
-int o2hb_check_local_node_heartbeating(void)
-{
- u8 node_num;
-
- /* if this node was set then we have networking */
- node_num = o2nm_this_node();
- if (node_num == O2NM_MAX_NODES) {
- mlog(ML_HEARTBEAT, "this node has not been configured.\n");
- return 0;
- }
-
- return o2hb_check_node_heartbeating(node_num);
-}
-EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
-
/*
* this is just a hack until we get the plumbing which flips file systems
* read only and drops the hb ref instead of killing the node dead.
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
*
- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
- * the page at all, as we may be racing with memory reclaim and it can free both
- * the bufferhead chain and the page as it will see the page as clean and
- * unused.
+ * Note that we open code the action in end_buffer_async_write here so that we
+ * only have to iterate over the buffers attached to the page once. This is not
+ * only more efficient, but also ensures that we only calls end_page_writeback
+ * at the end of the iteration, and thus avoids the pitfall of having the page
+ * and buffers potentially freed after every call to end_buffer_async_write.
*/
static void
xfs_finish_page_writeback(
struct bio_vec *bvec,
int error)
{
- unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
- struct buffer_head *head, *bh, *next;
+ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
+ bool busy = false;
unsigned int off = 0;
- unsigned int bsize;
+ unsigned long flags;
ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
- ASSERT(end < PAGE_SIZE);
+ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
- bh = head = page_buffers(bvec->bv_page);
-
- bsize = bh->b_size;
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
- if (off > end)
- break;
- next = bh->b_this_page;
- if (off < bvec->bv_offset)
- goto next_bh;
- bh->b_end_io(bh, !error);
-next_bh:
- off += bsize;
- } while ((bh = next) != head);
+ if (off >= bvec->bv_offset &&
+ off < bvec->bv_offset + bvec->bv_len) {
+ ASSERT(buffer_async_write(bh));
+ ASSERT(bh->b_end_io == NULL);
+
+ if (error) {
+ mark_buffer_write_io_error(bh);
+ clear_buffer_uptodate(bh);
+ SetPageError(bvec->bv_page);
+ } else {
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_async_write(bh);
+ unlock_buffer(bh);
+ } else if (buffer_async_write(bh)) {
+ ASSERT(buffer_locked(bh));
+ busy = true;
+ }
+ off += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+
+ if (!busy)
+ end_page_writeback(bvec->bv_page);
}
/*
int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *last = ioend->io_bio;
- struct bio *bio, *next;
+ struct bio *bio = &ioend->io_inline_bio;
+ struct bio *last = ioend->io_bio, *next;
+ u64 start = bio->bi_iter.bi_sector;
+ bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
bio_put(bio);
}
+
+ if (unlikely(error && !quiet)) {
+ xfs_err_ratelimited(XFS_I(inode)->i_mount,
+ "writeback error on sector %llu", start);
+ }
}
/*
ASSERT(!buffer_delay(bh));
ASSERT(!buffer_unwritten(bh));
- mark_buffer_async_write(bh);
+ bh->b_end_io = NULL;
+ set_buffer_async_write(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
}
struct buffer_head *bh)
{
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
+ bio_set_dev(bio, bh->b_bdev);
}
static struct xfs_ioend *
#define BIO_BUG_ON
#endif
+#ifdef CONFIG_THP_SWAP
+#if HPAGE_PMD_NR > 256
+#define BIO_MAX_PAGES HPAGE_PMD_NR
+#else
#define BIO_MAX_PAGES 256
+#endif
+#else
+#define BIO_MAX_PAGES 256
+#endif
#define bio_prio(bio) (bio)->bi_ioprio
#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
- void generic_start_io_acct(int rw, unsigned long sectors,
- struct hd_struct *part);
- void generic_end_io_acct(int rw, struct hd_struct *part,
- unsigned long start_time);
+ void generic_start_io_acct(struct request_queue *q, int rw,
+ unsigned long sectors, struct hd_struct *part);
+ void generic_end_io_acct(struct request_queue *q, int rw,
+ struct hd_struct *part,
+ unsigned long start_time);
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
extern unsigned int bvec_nr_vecs(unsigned short idx);
+ #define bio_set_dev(bio, bdev) \
+ do { \
+ (bio)->bi_disk = (bdev)->bd_disk; \
+ (bio)->bi_partno = (bdev)->bd_partno; \
+ } while (0)
+
+ #define bio_copy_dev(dst, src) \
+ do { \
+ (dst)->bi_disk = (src)->bi_disk; \
+ (dst)->bi_partno = (src)->bi_partno; \
+ } while (0)
+
+ #define bio_dev(bio) \
+ disk_devt((bio)->bi_disk)
+
+ #define bio_devname(bio, buf) \
+ __bdevname(bio_dev(bio), (buf))
+
#ifdef CONFIG_BLK_CGROUP
int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
int bio_associate_current(struct bio *bio);
struct request {
struct list_head queuelist;
union {
- struct call_single_data csd;
+ call_single_data_t csd;
u64 fifo_time;
};
#if defined(CONFIG_BLK_DEV_BSG)
bsg_job_fn *bsg_job_fn;
- int bsg_job_size;
struct bsg_class_device bsg_dev;
#endif
u64 write_hints[BLK_MAX_WRITE_HINTS];
};
- #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
- #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */
- #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */
- #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */
- #define QUEUE_FLAG_DYING 5 /* queue being torn down */
- #define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */
- #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */
- #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */
- #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */
- #define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */
- #define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */
- #define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */
+ #define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */
+ #define QUEUE_FLAG_STOPPED 1 /* queue is stopped */
+ #define QUEUE_FLAG_DYING 2 /* queue being torn down */
+ #define QUEUE_FLAG_BYPASS 3 /* act as dumb FIFO queue */
+ #define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */
+ #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */
+ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */
+ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */
+ #define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */
+ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */
#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */
- #define QUEUE_FLAG_IO_STAT 13 /* do IO stats */
- #define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */
- #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */
- #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */
- #define QUEUE_FLAG_SECERASE 17 /* supports secure erase */
- #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
- #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
- #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
- #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/
- #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */
- #define QUEUE_FLAG_WC 23 /* Write back caching */
- #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */
- #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */
- #define QUEUE_FLAG_DAX 26 /* device supports DAX */
- #define QUEUE_FLAG_STATS 27 /* track rq completion times */
- #define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */
- #define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */
- #define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */
- #define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */
+ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */
+ #define QUEUE_FLAG_DISCARD 11 /* supports DISCARD */
+ #define QUEUE_FLAG_NOXMERGES 12 /* No extended merges */
+ #define QUEUE_FLAG_ADD_RANDOM 13 /* Contributes to random pool */
+ #define QUEUE_FLAG_SECERASE 14 /* supports secure erase */
+ #define QUEUE_FLAG_SAME_FORCE 15 /* force complete on same CPU */
+ #define QUEUE_FLAG_DEAD 16 /* queue tear-down finished */
+ #define QUEUE_FLAG_INIT_DONE 17 /* queue is initialized */
+ #define QUEUE_FLAG_NO_SG_MERGE 18 /* don't attempt to merge SG segments*/
+ #define QUEUE_FLAG_POLL 19 /* IO polling enabled if set */
+ #define QUEUE_FLAG_WC 20 /* Write back caching */
+ #define QUEUE_FLAG_FUA 21 /* device supports FUA writes */
+ #define QUEUE_FLAG_FLUSH_NQ 22 /* flush not queueuable */
+ #define QUEUE_FLAG_DAX 23 /* device supports DAX */
+ #define QUEUE_FLAG_STATS 24 /* track rq completion times */
+ #define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */
+ #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */
+ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */
+ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_STACKABLE) | \
#define CGROUP_WEIGHT_DFL 100
#define CGROUP_WEIGHT_MAX 10000
+/* walk only threadgroup leaders */
+#define CSS_TASK_ITER_PROCS (1U << 0)
+/* walk all threaded css_sets in the domain */
+#define CSS_TASK_ITER_THREADED (1U << 1)
+
/* a css_task_iter should be treated as an opaque object */
struct css_task_iter {
struct cgroup_subsys *ss;
+ unsigned int flags;
struct list_head *cset_pos;
struct list_head *cset_head;
+ struct list_head *tcset_pos;
+ struct list_head *tcset_head;
+
struct list_head *task_pos;
struct list_head *tasks_head;
struct list_head *mg_tasks_head;
struct css_set *cur_cset;
+ struct css_set *cur_dcset;
struct task_struct *cur_task;
struct list_head iters_node; /* css_set->task_iters */
};
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp);
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);
percpu_ref_put_many(&css->refcnt, n);
}
+static inline void cgroup_get(struct cgroup *cgrp)
+{
+ css_get(&cgrp->self);
+}
+
+static inline bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
static inline void cgroup_put(struct cgroup *cgrp)
{
css_put(&cgrp->self);
return task_css(task, subsys_id)->cgroup;
}
+static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
+{
+ return task_css_set(task)->dfl_cgrp;
+}
+
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
/**
* cgroup_is_descendant - test ancestry
* @cgrp: the cgroup to be tested
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
- return cgrp->populated_cnt;
+ return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
+ cgrp->nr_populated_threaded_children;
}
/* returns ino associated with a cgroup */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
- return cgrp->kn->ino;
+ return cgrp->kn->id.ino;
}
/* cft/css accessors for cftype->write() operation */
current->no_cgroup_migration = 0;
}
+ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
+ {
+ return &cgrp->kn->id;
+ }
+
+ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen);
#else /* !CONFIG_CGROUPS */
struct cgroup_subsys_state;
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}
+ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
+ {
+ return NULL;
+ }
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
return true;
}
+
+ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen) {}
#endif /* !CONFIG_CGROUPS */
/*
extern int sysctl_protected_symlinks;
extern int sysctl_protected_hardlinks;
+typedef __kernel_rwf_t rwf_t;
+
struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
#endif
struct block_device * bd_contains;
unsigned bd_block_size;
+ u8 bd_partno;
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
/* Page cache limit. The filesystems should put that into their s_maxbytes
limits, otherwise bad things can happen in VM. */
#if BITS_PER_LONG==32
-#define MAX_LFS_FILESIZE (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1)
+#define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
-#define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL)
+#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX)
#endif
#define FL_POSIX 1
unsigned char fl_type;
unsigned int fl_pid;
int fl_link_cpu; /* what cpu's list is this on? */
- struct pid *fl_nspid;
wait_queue_head_t fl_wait;
struct file *fl_file;
loff_t fl_start;
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
-struct mm_struct;
-
/*
* Umount options
*/
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
- unsigned long, loff_t *, int);
+ unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
- unsigned long, loff_t *, int);
+ unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
#endif
/* fs/char_dev.c */
-#define CHRDEV_MAJOR_HASH_SIZE 255
+#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
+/* Marks the top and bottom of the second segment of free char majors */
+#define CHRDEV_MAJOR_DYN_EXT_START 511
+#define CHRDEV_MAJOR_DYN_EXT_END 384
+
extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
#define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */
#ifdef CONFIG_BLOCK
-#define BLKDEV_MAJOR_HASH_SIZE 255
+#define BLKDEV_MAJOR_MAX 512
extern const char *__bdevname(dev_t, char *buffer);
extern const char *bdevname(struct block_device *bdev, char *buffer);
extern struct block_device *lookup_bdev(const char *);
extern void blkdev_show(struct seq_file *,off_t);
#else
-#define BLKDEV_MAJOR_HASH_SIZE 0
+#define BLKDEV_MAJOR_MAX 0
#endif
extern void init_special_inode(struct inode *, umode_t, dev_t);
extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
-extern int filemap_fdatawait(struct address_space *);
extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
+
+static inline int filemap_fdatawait(struct address_space *mapping)
+{
+ return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+}
+
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
loff_t lend);
+extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ loff_t lend);
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
extern int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
-
extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+
+extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
+ loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
+static inline int file_write_and_wait(struct file *file)
+{
+ return file_write_and_wait_range(file, 0, LLONG_MAX);
+}
+
/**
* filemap_set_wb_err - set a writeback error on an address_space
* @mapping: mapping in which to set writeback error
* When a writeback error occurs, most filesystems will want to call
* filemap_set_wb_err to record the error in the mapping so that it will be
* automatically reported whenever fsync is called on the file.
- *
- * FIXME: mention FS_* flag here?
*/
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
#endif
extern void unlock_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
+extern void evict_inodes(struct super_block *sb);
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- int flags);
+ rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- int flags);
+ rwf_t flags);
/* fs/block_dev.c */
extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to);
return res;
}
-static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
{
if (unlikely(flags & ~RWF_SUPPORTED))
return -EOPNOTSUPP;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
spin_unlock_bh(&cgroup_idr_lock);
}
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets;
+}
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o brekage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}
/* subsystems visibly enabled on a cgroup */
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
{
struct cgroup *parent = cgroup_parent(cgrp);
- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
return cgrp->root->subsys_mask;
}
return css;
}
-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
static int css_set_count = 1; /* 1 for init_css_set */
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
}
/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
*
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);
do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
if (!refcount_dec_and_test(&cset->refcount))
return;
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
kfree(link);
}
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
}
refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
spin_unlock_irq(&css_set_lock);
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
if (!cset->mg_src_cgrp)
return;
+ mgctx->tset.nr_tasks++;
+
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset->src_csets))
- return 0;
-
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->can_attach) {
- tset->ssid = ssid;
- ret = ss->can_attach(tset);
- if (ret) {
- failed_ssid = ssid;
- goto out_cancel_attach;
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
}
- }
- } while_each_subsys_mask();
+ } while_each_subsys_mask();
+ }
/*
* Now that we're guaranteed success, proceed to move all tasks to
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->attach) {
- tset->ssid = ssid;
- ss->attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
ret = 0;
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ssid == failed_ssid)
- break;
- if (ss->cancel_attach) {
- tset->ssid = ssid;
- ss->cancel_attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
}
/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}
/**
struct task_struct *task;
int ret;
- if (!cgroup_may_migrate_to(dst_cgrp))
- return -EBUSY;
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
return ret;
}
-static int cgroup_procs_write_permission(struct task_struct *task,
- struct cgroup *dst_cgrp,
- struct kernfs_open_file *of)
-{
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
- struct cgroup *src_cgrp, *com_cgrp;
- struct inode *inode;
- int ret;
-
- if (!cgroup_on_dfl(dst_cgrp)) {
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
-
- /*
- * even if we're attaching all tasks in the thread group,
- * we only need to check permissions on one of them.
- */
- if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->euid, tcred->suid))
- ret = 0;
- else
- ret = -EACCES;
-
- put_cred(tcred);
- return ret;
- }
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* and the common ancestor */
- com_cgrp = src_cgrp;
- while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
- com_cgrp = cgroup_parent(com_cgrp);
-
- /* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- if (ret)
- return ret;
-
- /*
- * If namespaces are delegation boundaries, %current must be able
- * to see both source and destination cgroups from its namespace.
- */
- if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
- (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
- !cgroup_is_descendant(dst_cgrp, root_cgrp)))
- return -ENOENT;
-
- return 0;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+ __acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
pid_t pid;
- int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
+ return ERR_PTR(-EINVAL);
percpu_down_write(&cgroup_threadgroup_rwsem);
+
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- ret = -ESRCH;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_threadgroup;
}
} else {
tsk = current;
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- goto out_unlock_rcu;
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_threadgroup;
}
get_task_struct(tsk);
+ goto out_unlock_rcu;
+
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+out_unlock_rcu:
rcu_read_unlock();
+ return tsk;
+}
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
- if (!ret)
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+void cgroup_procs_write_finish(struct task_struct *task)
+ __releases(&cgroup_threadgroup_rwsem)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
- put_task_struct(tsk);
- goto out_unlock_threadgroup;
+ /* release reference from cgroup_procs_write_start() */
+ put_task_struct(task);
-out_unlock_rcu:
- rcu_read_unlock();
-out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
- loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
cgroup_apply_control_disable(cgrp);
}
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
goto out_unlock;
}
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp)) {
- struct cgrp_cset_link *link;
-
- /*
- * Because namespaces pin csets too, @cgrp->cset_links
- * might not be empty even when @cgrp is empty. Walk and
- * verify each cset.
- */
- spin_lock_irq(&css_set_lock);
-
- ret = 0;
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
- if (css_set_populated(link->cset)) {
- ret = -EBUSY;
- break;
- }
- }
-
- spin_unlock_irq(&css_set_lock);
-
- if (ret)
- goto out_unlock;
- }
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
static int cgroup_events_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "populated %d\n",
return 0;
}
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+
+ return 0;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
- LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
return ret;
}
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* find the next threaded cset */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /* find the next cset */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /* initialize threaded css_set walking */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
/**
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
+ cset = css_task_iter_next_css_set(it);
+ if (!cset) {
it->task_pos = NULL;
return;
}
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
} while (!css_set_populated(cset));
- it->cset_pos = l;
-
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
else
lockdep_assert_held(&css_set_lock);
WARN_ON_ONCE(!l);
+repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
css_task_iter_advance_css_set(it);
else
it->task_pos = l;
+
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+ cg_list)))
+ goto repeat;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
-void css_task_iter_start(struct cgroup_subsys_state *css,
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
/* no one should try to iterate before mounting cgroups */
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
+ it->flags = flags;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
spin_unlock_irq(&css_set_lock);
}
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
if (it->cur_task)
put_task_struct(it->cur_task);
}
{
struct kernfs_open_file *of = s->private;
struct css_task_iter *it = of->priv;
- struct task_struct *task;
- do {
- task = css_task_iter_next(it);
- } while (task && !thread_group_leader(task));
-
- return task;
+ return css_task_iter_next(it);
}
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
if (!it)
return ERR_PTR(-ENOMEM);
of->priv = it;
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
} else if (!(*pos)++) {
css_task_iter_end(it);
- css_task_iter_start(&cgrp->self, it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
}
return cgroup_procs_next(s, NULL, NULL);
}
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
static int cgroup_procs_show(struct seq_file *s, void *v)
{
- seq_printf(s, "%d\n", task_tgid_vnr(v));
+ seq_printf(s, "%d\n", task_pid_vnr(v));
return 0;
}
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *com_cgrp = src_cgrp;
+ struct inode *inode;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
+ return 0;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, true);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, true);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, false);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* thread migrations follow the cgroup.procs delegation rule */
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb);
+ if (ret)
+ goto out_finish;
+
+ /* and must be contained in the same domain */
+ ret = -EOPNOTSUPP;
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, false);
+
+out_finish:
+ cgroup_procs_write_finish(task);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
.name = "cgroup.procs",
.flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
.write = cgroup_procs_write,
},
{
+ .name = "cgroup.threads",
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
{ } /* terminate */
};
if (ss->css_released)
ss->css_released(css);
} else {
+ struct cgroup *tcgrp;
+
/* cgroup release path */
trace_cgroup_release(cgrp);
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
if (!(css->flags & CSS_ONLINE))
return;
- if (ss->css_reset)
- ss->css_reset(css);
-
if (ss->css_offline)
ss->css_offline(css);
cgrp->root = root;
cgrp->level = level;
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ if (tcgrp != cgrp)
+ tcgrp->nr_descendants++;
+ }
+
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
return ERR_PTR(ret);
}
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int ret = false;
+ int level = 1;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ goto fail;
+
+ if (level > cgroup->max_depth)
+ goto fail;
+
+ level++;
+ }
+
+ ret = true;
+fail:
+ return ret;
+}
+
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
if (!parent)
return -ENODEV;
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
cgrp = cgroup_create(parent);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
*/
kernfs_remove(cgrp->kn);
- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
}
core_initcall(cgroup_wq_init);
+ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+ char *buf, size_t buflen)
+ {
+ struct kernfs_node *kn;
+
+ kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+ }
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
+#include <linux/sched/task.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
+ int i, nr = hpage_nr_pages(page);
struct bio *bio;
- bio = bio_alloc(gfp_flags, 1);
+ bio = bio_alloc(gfp_flags, nr);
if (bio) {
- bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+ struct block_device *bdev;
+
+ bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
+ bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_end_io = end_io;
- bio_add_page(bio, page, PAGE_SIZE, 0);
- BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
+ for (i = 0; i < nr; i++)
+ bio_add_page(bio, page + i, PAGE_SIZE, 0);
+ VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
}
return bio;
}
*/
set_page_dirty(page);
pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
SetPageError(page);
ClearPageUptodate(page);
pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
(unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
+ put_task_struct(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}
+static inline void count_swpout_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (unlikely(PageTransHuge(page)))
+ count_vm_event(THP_SWPOUT);
+#endif
+ count_vm_events(PSWPOUT, hpage_nr_pages(page));
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
- count_vm_event(PSWPOUT);
+ count_swpout_vm_event(page);
return 0;
}
goto out;
}
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- count_vm_event(PSWPOUT);
+ count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
- struct block_device *bdev;
+ struct gendisk *disk;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
ret = -ENOMEM;
goto out;
}
- bdev = bio->bi_bdev;
+ disk = bio->bi_disk;
+ /*
+ * Keep this task valid during swap readpage because the oom killer may
+ * attempt to access it in the page fault retry time check.
+ */
+ get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
if (!READ_ONCE(bio->bi_private))
break;
- if (!blk_mq_poll(bdev_get_queue(bdev), qc))
+ if (!blk_mq_poll(disk->queue, qc))
break;
}
__set_current_state(TASK_RUNNING);