#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
- #include <linux/lockdep.h>
#include "blk.h"
#include "blk-mq.h"
static void blk_account_io_flush(struct request *rq)
{
- struct hd_struct *part = &rq->rq_disk->part0;
+ struct block_device *part = rq->rq_disk->part0;
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
- WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
if (!refcount_dec_and_test(&flush_rq->ref)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return;
}
+ /*
+ * The flush request must only be marked IDLE once it has really ended,
+ * because its .end_io() is also called from the timeout code path (to
+ * avoid a use-after-free).
+ */
+ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
if (fq->rq_status != BLK_STS_OK)
error = fq->rq_status;
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
- lockdep_register_key(&fq->key);
- lockdep_set_class(&fq->mq_flush_lock, &fq->key);
-
return fq;
fail_rq:
if (!fq)
return;
- lockdep_unregister_key(&fq->key);
kfree(fq->flush_rq);
kfree(fq);
}
+
+ /*
+ * Allow a driver to set its own lock class for fq->mq_flush_lock in order
+ * to avoid a lockdep complaint.
+ *
+ * flush_end_io() may be called recursively by some drivers, such as
+ * nvme-loop, so lockdep may complain about 'possible recursive locking'
+ * because all 'struct blk_flush_queue' instances share the same
+ * mq_flush_lock lock class key. We need to assign a different lock class
+ * to these drivers' fq->mq_flush_lock to avoid the lockdep warning.
+ *
+ * Using a dynamically allocated lock class key for each 'blk_flush_queue'
+ * instance is overkill and, worse, it introduces a horrible boot delay
+ * because synchronize_rcu() is implied in lockdep_unregister_key(), which
+ * is called for each hctx release. SCSI probing may synchronously create and
+ * destroy lots of MQ request_queues for non-existent devices, and some robot
+ * test kernels always enable the lockdep option. More than half an hour has
+ * been observed to be spent in SCSI MQ probe with a per-fq lock class.
+ *
+ * hctx: hardware queue whose flush queue lock (hctx->fq->mq_flush_lock)
+ *       receives the new lock class.
+ * key:  lock class key handed to lockdep_set_class() for that lock.
+ */
+ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
+ struct lock_class_key *key)
+ {
+ lockdep_set_class(&hctx->fq->mq_flush_lock, key);
+ }
+ EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
- unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
+ unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
unsigned max_sectors = sectors;
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
return NULL;
split:
*segs = nsegs;
+
+ /*
+ * Bio splitting may cause subtle trouble, such as a hang, when doing sync
+ * iopoll in the direct IO routine. Given that the performance gain of
+ * iopoll for big IO can be trivial, disable iopoll when a split is needed.
+ */
+ bio->bi_opf &= ~REQ_HIPRI;
+
return bio_split(bio, sectors, GFP_NOIO, bs);
}
split->bi_opf |= REQ_NOMERGE;
bio_chain(split, *bio);
- trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
+ trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
}
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_unlock();
-
- hd_struct_put(req->part);
}
}
*/
blk_account_io_merge_request(next);
- trace_block_rq_merge(q, next);
+ trace_block_rq_merge(next);
/*
* ownership of bio passed from next to req, return 'next' for
if (!ll_back_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
- trace_block_bio_backmerge(req->q, req, bio);
+ trace_block_bio_backmerge(bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
- trace_block_bio_frontmerge(req->q, req, bio);
+ trace_block_bio_frontmerge(bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
}
struct mq_inflight {
- struct hd_struct *part;
+ struct block_device *part;
unsigned int inflight[2];
};
{
struct mq_inflight *mi = priv;
- if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+ if ((!mi->part->bd_partno || rq->part == mi->part) &&
+ blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
return true;
}
- unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+ unsigned int blk_mq_in_flight(struct request_queue *q,
+ struct block_device *part)
{
struct mq_inflight mi = { .part = part };
return mi.inflight[0] + mi.inflight[1];
}
- void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2])
+ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
+ unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
return false;
if (blk_mq_complete_need_ipi(rq)) {
- rq->csd.func = __blk_mq_complete_request_remote;
- rq->csd.info = rq;
- rq->csd.flags = 0;
+ INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
} else {
if (rq->q->nr_hw_queues > 1)
{
struct request_queue *q = rq->q;
- trace_block_rq_issue(q, rq);
+ trace_block_rq_issue(rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
blk_mq_put_driver_tag(rq);
- trace_block_rq_requeue(q, rq);
+ trace_block_rq_requeue(rq);
rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
* __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
* @hctx: Pointer to the hardware queue to run.
* @async: If we want to run the queue asynchronously.
- * @msecs: Microseconds of delay to wait before running the queue.
+ * @msecs: Milliseconds of delay to wait before running the queue.
*
* If !@async, try to run the queue now. Else, run the queue asynchronously and
* with a delay of @msecs.
/**
* blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
* @hctx: Pointer to the hardware queue to run.
- * @msecs: Microseconds of delay to wait before running the queue.
+ * @msecs: Milliseconds of delay to wait before running the queue.
*
* Run a hardware queue asynchronously with a delay of @msecs.
*/
/**
* blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
* @q: Pointer to the request queue to run.
- * @msecs: Microseconds of delay to wait before running the queues.
+ * @msecs: Milliseconds of delay to wait before running the queues.
*/
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
lockdep_assert_held(&ctx->lock);
- trace_block_rq_insert(hctx->queue, rq);
+ trace_block_rq_insert(rq);
if (at_head)
list_add(&rq->queuelist, &ctx->rq_lists[type]);
*/
list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
- trace_block_rq_insert(hctx->queue, rq);
+ trace_block_rq_insert(rq);
}
spin_lock(&ctx->lock);
unsigned int nr_segs;
blk_qc_t cookie;
blk_status_t ret;
+ bool hipri;
blk_queue_bounce(q, &bio);
__blk_queue_split(&bio, &nr_segs);
rq_qos_throttle(q, bio);
+ hipri = bio->bi_opf & REQ_HIPRI;
+
data.cmd_flags = bio->bi_opf;
rq = __blk_mq_alloc_request(&data);
if (unlikely(!rq)) {
goto queue_exit;
}
- trace_block_getrq(q, bio, bio->bi_opf);
+ trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
blk_mq_sched_insert_request(rq, false, true, true);
}
+ if (!hipri)
+ return BLK_QC_T_NONE;
return cookie;
queue_exit:
blk_queue_exit(q);
return 0;
}
+ static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
+ int new_nr_hw_queues) /* thin wrapper around blk_mq_realloc_tag_set_tags() */
+ {
+ return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); /* NOTE(review): the 0 presumably means "no hw queues allocated yet" - confirm against the callee */
+ }
+
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
- if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
+ if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
return -ENOMEM;
ret = -ENOMEM;
* the state. Like for the other success return cases, the
* caller is responsible for checking if the IO completed. If
* the IO isn't complete, we'll get called again and will go
- * straight to the busy poll loop.
+ * straight to the busy poll loop. If specified not to spin,
+ * we also should not sleep.
*/
- if (blk_mq_poll_hybrid(q, hctx, cookie))
+ if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
return 1;
hctx->poll_considered++;
struct work_struct persistent_purge_work;
/* Buffer of free pages to map grant refs. */
- spinlock_t free_pages_lock;
- int free_pages_num;
- struct list_head free_pages;
+ struct gnttab_page_cache free_pages;
struct work_struct free_work;
/* Thread shutdown wait queue. */
};
- #define vbd_sz(_v) ((_v)->bdev->bd_part ? \
- (_v)->bdev->bd_part->nr_sects : \
- get_capacity((_v)->bdev->bd_disk))
+ #define vbd_sz(_v) bdev_nr_sectors((_v)->bdev)
#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
#define xen_blkif_put(_b) \
static DEFINE_MUTEX(zram_index_mutex);
static int zram_major;
-static const char *default_compressor = "lzo-rle";
+static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
return;
bdev = zram->bdev;
- if (zram->old_block_size)
- set_blocksize(bdev, zram->old_block_size);
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
/* hope filp_close flush all of IO */
filp_close(zram->backing_dev, NULL);
zram->backing_dev = NULL;
- zram->old_block_size = 0;
zram->bdev = NULL;
zram->disk->fops = &zram_devops;
kvfree(zram->bitmap);
struct file *backing_dev = NULL;
struct inode *inode;
struct address_space *mapping;
- unsigned int bitmap_sz, old_block_size = 0;
+ unsigned int bitmap_sz;
unsigned long nr_pages, *bitmap = NULL;
struct block_device *bdev = NULL;
int err;
goto out;
}
- old_block_size = block_size(bdev);
- err = set_blocksize(bdev, PAGE_SIZE);
- if (err)
- goto out;
-
reset_bdev(zram);
- zram->old_block_size = old_block_size;
zram->bdev = bdev;
zram->backing_dev = backing_dev;
zram->bitmap = bitmap;
return 1;
}
+#define PAGE_WB_SIG "page_index="
+
+#define PAGE_WRITEBACK 0
#define HUGE_WRITEBACK 1
#define IDLE_WRITEBACK 2
+
static ssize_t writeback_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
- unsigned long index;
+ unsigned long index = 0;
struct bio bio;
struct bio_vec bio_vec;
struct page *page;
mode = IDLE_WRITEBACK;
else if (sysfs_streq(buf, "huge"))
mode = HUGE_WRITEBACK;
- else
- return -EINVAL;
+ else {
+ if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
+ return -EINVAL;
+
+ ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index);
+ if (ret || index >= nr_pages)
+ return -EINVAL;
+
+ nr_pages = 1;
+ mode = PAGE_WRITEBACK;
+ }
down_read(&zram->init_lock);
if (!init_done(zram)) {
goto release_init_lock;
}
- for (index = 0; index < nr_pages; index++) {
+ while (nr_pages--) {
struct bio_vec bvec;
bvec.bv_page = page;
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
pool_stats.pages_compacted,
- (u64)atomic64_read(&zram->stats.huge_pages));
+ (u64)atomic64_read(&zram->stats.huge_pages),
+ (u64)atomic64_read(&zram->stats.huge_pages_since));
up_read(&zram->init_lock);
return ret;
if (comp_len == PAGE_SIZE) {
zram_set_flag(zram, index, ZRAM_HUGE);
atomic64_inc(&zram->stats.huge_pages);
+ atomic64_inc(&zram->stats.huge_pages_since);
}
if (flags) {
disksize = zram->disksize;
zram->disksize = 0;
- set_capacity(zram->disk, 0);
- part_stat_set_all(&zram->disk->part0, 0);
+ set_capacity_and_notify(zram->disk, 0);
+ part_stat_set_all(zram->disk->part0, 0);
up_write(&zram->init_lock);
/* I/O operation under all of CPU are done so let's free */
zram->comp = comp;
zram->disksize = disksize;
- set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-
- revalidate_disk_size(zram->disk, true);
+ set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
up_write(&zram->init_lock);
return len;
return -EINVAL;
zram = dev_to_zram(dev);
- bdev = bdget_disk(zram->disk, 0);
- if (!bdev)
- return -ENOMEM;
+ bdev = zram->disk->part0;
mutex_lock(&bdev->bd_mutex);
/* Do not reset an active device or claimed device */
if (bdev->bd_openers || zram->claim) {
mutex_unlock(&bdev->bd_mutex);
- bdput(bdev);
return -EBUSY;
}
/* Make sure all the pending I/O are finished */
fsync_bdev(bdev);
zram_reset_device(zram);
- revalidate_disk_size(zram->disk, true);
- bdput(bdev);
mutex_lock(&bdev->bd_mutex);
zram->claim = false;
static int zram_remove(struct zram *zram)
{
- struct block_device *bdev;
-
- bdev = bdget_disk(zram->disk, 0);
- if (!bdev)
- return -ENOMEM;
+ struct block_device *bdev = zram->disk->part0;
mutex_lock(&bdev->bd_mutex);
if (bdev->bd_openers || zram->claim) {
mutex_unlock(&bdev->bd_mutex);
- bdput(bdev);
return -EBUSY;
}
/* Make sure all the pending I/O are finished */
fsync_bdev(bdev);
zram_reset_device(zram);
- bdput(bdev);
pr_info("Removed device: %s\n", zram->disk->disk_name);
atomic64_t notify_free; /* no. of swap slot free notifications */
atomic64_t same_pages; /* no. of same element filled pages */
atomic64_t huge_pages; /* no. of huge pages */
+ atomic64_t huge_pages_since; /* no. of huge pages since zram set up */
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
bool wb_limit_enable;
u64 bd_wb_limit;
struct block_device *bdev;
- unsigned int old_block_size;
unsigned long *bitmap;
unsigned long nr_pages;
#endif
return 1;
}
- static int ata_lock(dev_t dev, void *data)
+ static void ata_probe(dev_t dev)
{
- /* FIXME: we want to pin hwif down */
- return 0;
+ request_module("ide-disk");
+ request_module("ide-cd");
+ request_module("ide-tape");
+ request_module("ide-floppy");
}
- static struct kobject *ata_probe(dev_t dev, int *part, void *data)
- {
- ide_hwif_t *hwif = data;
- int unit = *part >> PARTN_BITS;
- ide_drive_t *drive = hwif->devices[unit];
-
- if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0)
- return NULL;
-
- if (drive->media == ide_disk)
- request_module("ide-disk");
- if (drive->media == ide_cdrom || drive->media == ide_optical)
- request_module("ide-cd");
- if (drive->media == ide_tape)
- request_module("ide-tape");
- if (drive->media == ide_floppy)
- request_module("ide-floppy");
-
- return NULL;
- }
-
- static struct kobject *exact_match(dev_t dev, int *part, void *data)
- {
- struct gendisk *p = data;
- *part &= (1 << PARTN_BITS) - 1;
- return &disk_to_dev(p)->kobj;
- }
-
- static int exact_lock(dev_t dev, void *data)
- {
- struct gendisk *p = data;
-
- if (!get_disk_and_module(p))
- return -1;
- return 0;
- }
-
- void ide_register_region(struct gendisk *disk)
- {
- blk_register_region(MKDEV(disk->major, disk->first_minor),
- disk->minors, NULL, exact_match, exact_lock, disk);
- }
-
- EXPORT_SYMBOL_GPL(ide_register_region);
-
- void ide_unregister_region(struct gendisk *disk)
- {
- blk_unregister_region(MKDEV(disk->major, disk->first_minor),
- disk->minors);
- }
-
- EXPORT_SYMBOL_GPL(ide_unregister_region);
-
void ide_init_disk(struct gendisk *disk, ide_drive_t *drive)
{
ide_hwif_t *hwif = drive->hwif;
return 0;
}
- if (register_blkdev(hwif->major, hwif->name))
+ if (__register_blkdev(hwif->major, hwif->name, ata_probe))
return 0;
if (!hwif->sg_max_nents)
goto out;
}
- blk_register_region(MKDEV(hwif->major, 0), MAX_DRIVES << PARTN_BITS,
- THIS_MODULE, ata_probe, ata_lock, hwif);
return 1;
out:
static void ide_unregister(ide_hwif_t *hwif)
{
- BUG_ON(in_interrupt());
- BUG_ON(irqs_disabled());
-
mutex_lock(&ide_cfg_mtx);
if (hwif->present) {
/*
* Remove us from the kernel's knowledge
*/
- blk_unregister_region(MKDEV(hwif->major, 0), MAX_DRIVES<<PARTN_BITS);
kfree(hwif->sg_table);
unregister_blkdev(hwif->major, hwif->name);
{
struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
- set_capacity(gendisk, rs->md.array_sectors);
- revalidate_disk_size(gendisk, true);
+ set_capacity_and_notify(gendisk, rs->md.array_sectors);
}
/*
blk_limits_io_min(limits, chunk_size_bytes);
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
+
+ /*
+ * RAID1 and RAID10 personalities require bio splitting,
+ * RAID0/4/5/6 don't and process large discard bios properly.
+ */
+ if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
+ limits->discard_granularity = chunk_size_bytes;
+ limits->max_discard_sectors = rs->md.chunk_sectors;
+ }
}
static void raid_postsuspend(struct dm_target *ti)
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <linux/lcm.h>
#include <linux/blk-mq.h>
#include <linux/mount.h>
#include <linux/dax.h>
dev_t dm_get_dev_t(const char *path)
{
dev_t dev;
- struct block_device *bdev;
- bdev = lookup_bdev(path);
- if (IS_ERR(bdev))
+ if (lookup_bdev(path, &dev))
dev = name_to_dev_t(path);
- else {
- dev = bdev->bd_dev;
- bdput(bdev);
- }
-
return dev;
}
EXPORT_SYMBOL_GPL(dm_get_dev_t);
void dm_table_event(struct dm_table *t)
{
- /*
- * You can no longer call dm_table_event() from interrupt
- * context, use a bottom half instead.
- */
- BUG_ON(in_interrupt());
-
mutex_lock(&_event_lock);
if (t->event_fn)
t->event_fn(t->event_context);
zone_sectors = ti_limits.chunk_sectors;
}
- /* Stack chunk_sectors if target-specific splitting is required */
- if (ti->max_io_len)
- ti_limits.chunk_sectors = lcm_not_zero(ti->max_io_len,
- ti_limits.chunk_sectors);
/* Set I/O hints portion of queue limits */
if (ti->type->io_hints)
ti->type->io_hints(ti, &ti_limits);
return -EAGAIN;
map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ if (!map) {
+ ret = -EIO;
+ goto out;
+ }
do {
struct dm_target *tgt;
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
- __acquires(md->io_barrier)
{
struct dm_target *tgt;
struct dm_table *map;
}
static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
- __releases(md->io_barrier)
{
dm_put_live_table(md, srcu_idx);
}
}
}
- r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ if (!bdev->bd_disk->fops->ioctl)
+ r = -ENOTTY;
+ else
+ r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
out:
dm_unprepare_ioctl(md, srcu_idx);
return r;
sector_t max_len;
/*
- * Does the target need to split even further?
- * - q->limits.chunk_sectors reflects ti->max_io_len so
- * blk_max_size_offset() provides required splitting.
- * - blk_max_size_offset() also respects q->limits.max_sectors
+ * Does the target need to split IO even further?
+ * - varied (per target) IO splitting is a tenet of DM; this
+ * explains why stacked chunk_sectors based splitting via
+ * blk_max_size_offset() isn't possible here. So pass in
+ * ti->max_io_len to override stacked chunk_sectors.
*/
- max_len = blk_max_size_offset(ti->table->md->queue,
- target_offset);
- if (len > max_len)
- len = max_len;
+ if (ti->max_io_len) {
+ max_len = blk_max_size_offset(ti->table->md->queue,
+ target_offset, ti->max_io_len);
+ if (len > max_len)
+ len = max_len;
+ }
return len;
}
* ->zero_page_range() is mandatory dax operation. If we are
* here, something is wrong.
*/
- dm_put_live_table(md, srcu_idx);
goto out;
}
ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
-
out:
dm_put_live_table(md, srcu_idx);
break;
case DM_MAPIO_REMAPPED:
/* the bio has been remapped so dispatch it */
- trace_block_bio_remap(clone->bi_disk->queue, clone,
- bio_dev(io->orig_bio), sector);
+ trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
ret = submit_bio_noacct(clone);
break;
case DM_MAPIO_KILL:
*/
bio_init(&flush_bio, NULL, 0);
flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ flush_bio.bi_disk = ci->io->md->disk;
+ bio_associate_blkg(&flush_bio);
+
ci->bio = &flush_bio;
ci->sector_count = 0;
- /*
- * Empty flush uses a statically initialized bio, as the base for
- * cloning. However, blkg association requires that a bdev is
- * associated with a gendisk, which doesn't happen until the bdev is
- * opened. So, blkg association is done at issue time of the flush
- * rather than when the device is created in alloc_dev().
- */
- bio_set_dev(ci->bio, ci->io->md->bdev);
-
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
* (by eliminating DM's splitting and just using bio_split)
*/
part_stat_lock();
- __dm_part_stat_sub(&dm_disk(md)->part0,
+ __dm_part_stat_sub(dm_disk(md)->part0,
sectors[op_stat_group(bio_op(bio))], ci.sector_count);
part_stat_unlock();
bio_chain(b, bio);
- trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
+ trace_block_split(b, bio->bi_iter.bi_sector);
ret = submit_bio_noacct(bio);
break;
}
cleanup_srcu_struct(&md->io_barrier);
- if (md->bdev) {
- bdput(md->bdev);
- md->bdev = NULL;
- }
-
mutex_destroy(&md->suspend_lock);
mutex_destroy(&md->type_lock);
mutex_destroy(&md->table_devices_lock);
if (!md->wq)
goto bad;
- md->bdev = bdget_disk(md->disk, 0);
- if (!md->bdev)
- goto bad;
-
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */
if (size != dm_get_size(md))
memset(&md->geometry, 0, sizeof(md->geometry));
- set_capacity(md->disk, size);
- bd_set_nr_sectors(md->bdev, size);
+ set_capacity_and_notify(md->disk, size);
dm_table_event_callback(t, event_callback, md);
static bool md_in_flight_bios(struct mapped_device *md)
{
int cpu;
- struct hd_struct *part = &dm_disk(md)->part0;
+ struct block_device *part = dm_disk(md)->part0;
long sum = 0;
for_each_possible_cpu(cpu) {
{
int r;
- WARN_ON(md->frozen_sb);
-
- md->frozen_sb = freeze_bdev(md->bdev);
- if (IS_ERR(md->frozen_sb)) {
- r = PTR_ERR(md->frozen_sb);
- md->frozen_sb = NULL;
- return r;
- }
-
- set_bit(DMF_FROZEN, &md->flags);
+ WARN_ON(test_bit(DMF_FROZEN, &md->flags));
- return 0;
+ r = freeze_bdev(md->disk->part0);
+ if (!r)
+ set_bit(DMF_FROZEN, &md->flags);
+ return r;
}
static void unlock_fs(struct mapped_device *md)
{
if (!test_bit(DMF_FROZEN, &md->flags))
return;
-
- thaw_bdev(md->bdev, md->frozen_sb);
- md->frozen_sb = NULL;
+ thaw_bdev(md->disk->part0);
clear_bit(DMF_FROZEN, &md->flags);
}
bio_end_io_t *orig_bi_end_io;
void *orig_bi_private;
unsigned long start_time;
- struct hd_struct *part;
+ struct block_device *part;
};
static void md_end_io(struct bio *bio)
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
char b[BDEVNAME_SIZE];
- struct kobject *ko;
int err;
/* prevent duplicates */
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
- ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
/* failure here is OK */
- err = sysfs_create_link(&rdev->kobj, ko, "block");
+ err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
rdev->sysfs_unack_badblocks =
sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
if (!err) {
mddev->array_sectors = sectors;
- if (mddev->pers) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk_size(mddev->gendisk, true);
- }
+ if (mddev->pers)
+ set_capacity_and_notify(mddev->gendisk,
+ mddev->array_sectors);
}
mddev_unlock(mddev);
return err ?: len;
return error;
}
- static struct kobject *md_probe(dev_t dev, int *part, void *data)
+ static void md_probe(dev_t dev)
{
+ if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
+ return;
if (create_on_open)
md_alloc(dev, NULL);
- return NULL;
}
static int add_named_array(const char *val, const struct kernel_param *kp)
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk_size(mddev->gendisk, true);
+ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
if (rdev->raid_disk >= 0)
sysfs_unlink_rdev(mddev, rdev);
- set_capacity(disk, 0);
+ set_capacity_and_notify(disk, 0);
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
- revalidate_disk_size(disk, true);
if (mddev->ro)
mddev->ro = 0;
break;
}
- md_probe(dev, NULL, NULL);
+ md_probe(dev);
mddev = mddev_find(dev);
if (!mddev || !mddev->gendisk) {
if (mddev)
if (mddev_is_clustered(mddev))
md_cluster_ops->update_size(mddev, old_dev_sectors);
else if (mddev->queue) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk_size(mddev->gendisk, true);
+ set_capacity_and_notify(mddev->gendisk,
+ mddev->array_sectors);
}
}
return rv;
{
switch (cmd) {
case ADD_NEW_DISK:
- case BLKROSET:
case GET_ARRAY_INFO:
case GET_BITMAP_FILE:
case GET_DISK_INFO:
int err = 0;
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
- int ro;
bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
goto unlock;
}
break;
-
- case BLKROSET:
- if (get_user(ro, (int __user *)(arg))) {
- err = -EFAULT;
- goto unlock;
- }
- err = -EINVAL;
-
- /* if the bdev is going readonly the value of mddev->ro
- * does not matter, no writes are coming
- */
- if (ro)
- goto unlock;
-
- /* are we are already prepared for writes? */
- if (mddev->ro != 1)
- goto unlock;
-
- /* transitioning to readauto need only happen for
- * arrays that call md_write_start
- */
- if (mddev->pers) {
- err = restart_array(mddev);
- if (err == 0) {
- mddev->ro = 2;
- set_disk_ro(mddev->gendisk, 0);
- }
- }
- goto unlock;
}
/*
}
#endif /* CONFIG_COMPAT */
+ static int md_set_read_only(struct block_device *bdev, bool ro)
+ {
+ struct mddev *mddev = bdev->bd_disk->private_data;
+ int err;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ if (!mddev->raid_disks && !mddev->external) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ /*
+ * Transitioning to read-auto need only happen for arrays that call
+ * md_write_start and which are not ready for writes yet.
+ *
+ * So the array is restarted only when read-only is being cleared (!ro),
+ * mddev->ro is 1 and mddev->pers is set; on success mddev->ro becomes 2.
+ * In every other case err is still 0 from the successful mddev_lock()
+ * above and nothing is changed here. A restart_array() failure is
+ * returned to the caller unchanged.
+ */
+ if (!ro && mddev->ro == 1 && mddev->pers) {
+ err = restart_array(mddev);
+ if (err)
+ goto out_unlock;
+ mddev->ro = 2;
+ }
+
+ out_unlock:
+ mddev_unlock(mddev);
+ return err;
+ }
+
static int md_open(struct block_device *bdev, fmode_t mode)
{
/*
#endif
.getgeo = md_getgeo,
.check_events = md_check_events,
+ .set_read_only = md_set_read_only,
};
static int md_thread(void *arg)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_disk;
- curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
+ curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and
EXPORT_SYMBOL(md_write_end);
-/* This is used by raid0 and raid10 */
-void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
- struct bio *bio, sector_t start, sector_t size)
-{
- struct bio *discard_bio = NULL;
-
- if (__blkdev_issue_discard(rdev->bdev, start, size,
- GFP_NOIO, 0, &discard_bio) || !discard_bio)
- return;
-
- bio_chain(discard_bio, bio);
- bio_clone_blkg_association(discard_bio, bio);
- if (mddev->gendisk)
- trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk),
- bio->bi_iter.bi_sector);
- submit_bio_noacct(discard_bio);
-}
-EXPORT_SYMBOL(md_submit_discard_bio);
-
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
- if (!mddev_is_clustered(mddev)) {
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk_size(mddev->gendisk, true);
- }
+ if (!mddev_is_clustered(mddev))
+ set_capacity_and_notify(mddev->gendisk,
+ mddev->array_sectors);
}
spin_lock(&mddev->lock);
if (!md_rdev_misc_wq)
goto err_rdev_misc_wq;
- if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+ ret = __register_blkdev(MD_MAJOR, "md", md_probe);
+ if (ret < 0)
goto err_md;
- if ((ret = register_blkdev(0, "mdp")) < 0)
+ ret = __register_blkdev(0, "mdp", md_probe);
+ if (ret < 0)
goto err_mdp;
mdp_major = ret;
- blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
- md_probe, NULL, NULL);
- blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
- md_probe, NULL, NULL);
-
register_reboot_notifier(&md_notifier);
raid_table_header = register_sysctl_table(raid_root_table);
struct list_head *tmp;
int delay = 1;
- blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
- blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
-
unregister_blkdev(MD_MAJOR,"md");
unregister_blkdev(mdp_major, "mdp");
unregister_reboot_notifier(&md_notifier);
for (disk = 0; disk < zone->nb_dev; disk++) {
sector_t dev_start, dev_end;
+ struct bio *discard_bio = NULL;
struct md_rdev *rdev;
if (disk < start_disk_index)
rdev = conf->devlist[(zone - conf->strip_zone) *
conf->strip_zone[0].nb_dev + disk];
- md_submit_discard_bio(mddev, rdev, bio,
+ if (__blkdev_issue_discard(rdev->bdev,
dev_start + zone->dev_start + rdev->data_offset,
- dev_end - dev_start);
+ dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+ !discard_bio)
+ continue;
+ bio_chain(discard_bio, bio);
+ bio_clone_blkg_association(discard_bio, bio);
+ if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(rdev->bdev),
- discard_bio, disk_devt(mddev->gendisk),
++ trace_block_bio_remap(discard_bio,
++ disk_devt(mddev->gendisk),
+ bio->bi_iter.bi_sector);
+ submit_bio_noacct(discard_bio);
}
bio_endio(bio);
}
tmp_dev->data_offset;
if (mddev->gendisk)
- trace_block_bio_remap(bio->bi_disk->queue, bio,
- disk_devt(mddev->gendisk), bio_sector);
+ trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+ bio_sector);
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
- int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
+ int size = offsetof(struct r10bio, devs[conf->copies]);
/* allocate a r10bio with room for raid_disks entries in the
* bios array */
{
int i;
- for (i = 0; i < conf->geo.raid_disks; i++) {
+ for (i = 0; i < conf->copies; i++) {
struct bio **bio = & r10_bio->devs[i].bio;
if (!BIO_SPECIAL(*bio))
bio_put(*bio);
int slot;
int repl = 0;
- for (slot = 0; slot < conf->geo.raid_disks; slot++) {
+ for (slot = 0; slot < conf->copies; slot++) {
if (r10_bio->devs[slot].bio == bio)
break;
if (r10_bio->devs[slot].repl_bio == bio) {
}
}
+ BUG_ON(slot == conf->copies);
update_head_pos(slot, r10_bio);
if (slotp)
read_bio->bi_private = r10_bio;
if (mddev->gendisk)
- trace_block_bio_remap(read_bio->bi_disk->queue,
- read_bio, disk_devt(mddev->gendisk),
+ trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
submit_bio_noacct(read_bio);
return;
mbio->bi_private = r10_bio;
if (conf->mddev->gendisk)
- trace_block_bio_remap(mbio->bi_disk->queue,
- mbio, disk_devt(conf->mddev->gendisk),
+ trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
r10_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_disk = (void *)rdev;
}
}
-static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
-{
- int i;
- struct r10conf *conf = mddev->private;
- struct md_rdev *blocked_rdev;
-
-retry_wait:
- blocked_rdev = NULL;
- rcu_read_lock();
- for (i = 0; i < conf->copies; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[i].replacement);
- if (rdev == rrdev)
- rrdev = NULL;
- if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- atomic_inc(&rdev->nr_pending);
- blocked_rdev = rdev;
- break;
- }
- if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
- atomic_inc(&rrdev->nr_pending);
- blocked_rdev = rrdev;
- break;
- }
-
- if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
- sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
- int is_bad;
-
- /* Discard request doesn't care the write result
- * so it doesn't need to wait blocked disk here.
- */
- if (!r10_bio->sectors)
- continue;
-
- is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
- &first_bad, &bad_sectors);
- if (is_bad < 0) {
- /* Mustn't write here until the bad block
- * is acknowledged
- */
- atomic_inc(&rdev->nr_pending);
- set_bit(BlockedBadBlocks, &rdev->flags);
- blocked_rdev = rdev;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- if (unlikely(blocked_rdev)) {
- /* Have to wait for this device to get unblocked, then retry */
- allow_barrier(conf);
- raid10_log(conf->mddev, "%s wait rdev %d blocked",
- __func__, blocked_rdev->raid_disk);
- md_wait_for_blocked_rdev(blocked_rdev, mddev);
- wait_barrier(conf);
- goto retry_wait;
- }
-}
-
static void raid10_write_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int i;
+ struct md_rdev *blocked_rdev;
sector_t sectors;
int max_sectors;
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
raid10_find_phys(conf, r10_bio);
-
- wait_blocked_dev(mddev, r10_bio);
-
+retry_write:
+ blocked_rdev = NULL;
rcu_read_lock();
max_sectors = r10_bio->sectors;
conf->mirrors[d].replacement);
if (rdev == rrdev)
rrdev = NULL;
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
is_bad = is_badblock(rdev, dev_sector, max_sectors,
&first_bad, &bad_sectors);
+ if (is_bad < 0) {
+ /* Mustn't write here until the bad block
+ * is acknowledged
+ */
+ atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ blocked_rdev = rdev;
+ break;
+ }
if (is_bad && first_bad <= dev_sector) {
/* Cannot write here at all */
bad_sectors -= (dev_sector - first_bad);
}
rcu_read_unlock();
+ if (unlikely(blocked_rdev)) {
+ /* Have to wait for this device to get unblocked, then retry */
+ int j;
+ int d;
+
+ for (j = 0; j < i; j++) {
+ if (r10_bio->devs[j].bio) {
+ d = r10_bio->devs[j].devnum;
+ rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+ }
+ if (r10_bio->devs[j].repl_bio) {
+ struct md_rdev *rdev;
+ d = r10_bio->devs[j].devnum;
+ rdev = conf->mirrors[d].replacement;
+ if (!rdev) {
+ /* Race with remove_disk */
+ smp_mb();
+ rdev = conf->mirrors[d].rdev;
+ }
+ rdev_dec_pending(rdev, mddev);
+ }
+ }
+ allow_barrier(conf);
+ raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ wait_barrier(conf);
+ goto retry_write;
+ }
+
if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
- memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
raid10_write_request(mddev, bio, r10_bio);
}
-static struct bio *raid10_split_bio(struct r10conf *conf,
- struct bio *bio, sector_t sectors, bool want_first)
-{
- struct bio *split;
-
- split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
- bio_chain(split, bio);
- allow_barrier(conf);
- if (want_first) {
- submit_bio_noacct(bio);
- bio = split;
- } else
- submit_bio_noacct(split);
- wait_barrier(conf);
-
- return bio;
-}
-
-static void raid_end_discard_bio(struct r10bio *r10bio)
-{
- struct r10conf *conf = r10bio->mddev->private;
- struct r10bio *first_r10bio;
-
- while (atomic_dec_and_test(&r10bio->remaining)) {
-
- allow_barrier(conf);
-
- if (!test_bit(R10BIO_Discard, &r10bio->state)) {
- first_r10bio = (struct r10bio *)r10bio->master_bio;
- free_r10bio(r10bio);
- r10bio = first_r10bio;
- } else {
- md_write_end(r10bio->mddev);
- bio_endio(r10bio->master_bio);
- free_r10bio(r10bio);
- break;
- }
- }
-}
-
-static void raid10_end_discard_request(struct bio *bio)
-{
- struct r10bio *r10_bio = bio->bi_private;
- struct r10conf *conf = r10_bio->mddev->private;
- struct md_rdev *rdev = NULL;
- int dev;
- int slot, repl;
-
- /*
- * We don't care the return value of discard bio
- */
- if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- set_bit(R10BIO_Uptodate, &r10_bio->state);
-
- dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
- if (repl)
- rdev = conf->mirrors[dev].replacement;
- if (!rdev) {
- /* raid10_remove_disk uses smp_mb to make sure rdev is set to
- * replacement before setting replacement to NULL. It can read
- * rdev first without barrier protect even replacment is NULL
- */
- smp_rmb();
- rdev = conf->mirrors[dev].rdev;
- }
-
- raid_end_discard_bio(r10_bio);
- rdev_dec_pending(rdev, conf->mddev);
-}
-
-/* There are some limitations to handle discard bio
- * 1st, the discard size is bigger than stripe_size*2.
- * 2st, if the discard bio spans reshape progress, we use the old way to
- * handle discard bio
- */
-static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
-{
- struct r10conf *conf = mddev->private;
- struct geom *geo = &conf->geo;
- struct r10bio *r10_bio, *first_r10bio;
- int far_copies = geo->far_copies;
- bool first_copy = true;
-
- int disk;
- sector_t chunk;
- unsigned int stripe_size;
- sector_t split_size;
-
- sector_t bio_start, bio_end;
- sector_t first_stripe_index, last_stripe_index;
- sector_t start_disk_offset;
- unsigned int start_disk_index;
- sector_t end_disk_offset;
- unsigned int end_disk_index;
- unsigned int remainder;
-
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- return -EAGAIN;
-
- wait_barrier(conf);
-
- /* Check reshape again to avoid reshape happens after checking
- * MD_RECOVERY_RESHAPE and before wait_barrier
- */
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- goto out;
-
- stripe_size = geo->raid_disks << geo->chunk_shift;
- bio_start = bio->bi_iter.bi_sector;
- bio_end = bio_end_sector(bio);
-
- /* Maybe one discard bio is smaller than strip size or across one stripe
- * and discard region is larger than one stripe size. For far offset layout,
- * if the discard region is not aligned with stripe size, there is hole
- * when we submit discard bio to member disk. For simplicity, we only
- * handle discard bio which discard region is bigger than stripe_size*2
- */
- if (bio_sectors(bio) < stripe_size*2)
- goto out;
-
- /* For far and far offset layout, if bio is not aligned with stripe size,
- * it splits the part that is not aligned with strip size.
- */
- div_u64_rem(bio_start, stripe_size, &remainder);
- if ((far_copies > 1) && remainder) {
- split_size = stripe_size - remainder;
- bio = raid10_split_bio(conf, bio, split_size, false);
- }
- div_u64_rem(bio_end, stripe_size, &remainder);
- if ((far_copies > 1) && remainder) {
- split_size = bio_sectors(bio) - remainder;
- bio = raid10_split_bio(conf, bio, split_size, true);
- }
-
- bio_start = bio->bi_iter.bi_sector;
- bio_end = bio_end_sector(bio);
-
- /* raid10 uses chunk as the unit to store data. It's similar like raid0.
- * One stripe contains the chunks from all member disk (one chunk from
- * one disk at the same HBA address). For layout detail, see 'man md 4'
- */
- chunk = bio_start >> geo->chunk_shift;
- chunk *= geo->near_copies;
- first_stripe_index = chunk;
- start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
- if (geo->far_offset)
- first_stripe_index *= geo->far_copies;
- start_disk_offset = (bio_start & geo->chunk_mask) +
- (first_stripe_index << geo->chunk_shift);
-
- chunk = bio_end >> geo->chunk_shift;
- chunk *= geo->near_copies;
- last_stripe_index = chunk;
- end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
- if (geo->far_offset)
- last_stripe_index *= geo->far_copies;
- end_disk_offset = (bio_end & geo->chunk_mask) +
- (last_stripe_index << geo->chunk_shift);
-
-retry_discard:
- r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
- r10_bio->mddev = mddev;
- r10_bio->state = 0;
- r10_bio->sectors = 0;
- memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
- wait_blocked_dev(mddev, r10_bio);
-
- /* For far layout it needs more than one r10bio to cover all regions.
- * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
- * to record the discard bio. Other r10bio->master_bio record the first
- * r10bio. The first r10bio only release after all other r10bios finish.
- * The discard bio returns only first r10bio finishes
- */
- if (first_copy) {
- r10_bio->master_bio = bio;
- set_bit(R10BIO_Discard, &r10_bio->state);
- first_copy = false;
- first_r10bio = r10_bio;
- } else
- r10_bio->master_bio = (struct bio *)first_r10bio;
-
- rcu_read_lock();
- for (disk = 0; disk < geo->raid_disks; disk++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[disk].replacement);
-
- r10_bio->devs[disk].bio = NULL;
- r10_bio->devs[disk].repl_bio = NULL;
-
- if (rdev && (test_bit(Faulty, &rdev->flags)))
- rdev = NULL;
- if (rrdev && (test_bit(Faulty, &rrdev->flags)))
- rrdev = NULL;
- if (!rdev && !rrdev)
- continue;
-
- if (rdev) {
- r10_bio->devs[disk].bio = bio;
- atomic_inc(&rdev->nr_pending);
- }
- if (rrdev) {
- r10_bio->devs[disk].repl_bio = bio;
- atomic_inc(&rrdev->nr_pending);
- }
- }
- rcu_read_unlock();
-
- atomic_set(&r10_bio->remaining, 1);
- for (disk = 0; disk < geo->raid_disks; disk++) {
- sector_t dev_start, dev_end;
- struct bio *mbio, *rbio = NULL;
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[disk].replacement);
-
- /*
- * Now start to calculate the start and end address for each disk.
- * The space between dev_start and dev_end is the discard region.
- *
- * For dev_start, it needs to consider three conditions:
- * 1st, the disk is before start_disk, you can imagine the disk in
- * the next stripe. So the dev_start is the start address of next
- * stripe.
- * 2st, the disk is after start_disk, it means the disk is at the
- * same stripe of first disk
- * 3st, the first disk itself, we can use start_disk_offset directly
- */
- if (disk < start_disk_index)
- dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
- else if (disk > start_disk_index)
- dev_start = first_stripe_index * mddev->chunk_sectors;
- else
- dev_start = start_disk_offset;
-
- if (disk < end_disk_index)
- dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
- else if (disk > end_disk_index)
- dev_end = last_stripe_index * mddev->chunk_sectors;
- else
- dev_end = end_disk_offset;
-
- /* It only handles discard bio which size is >= stripe size, so
- * dev_end > dev_start all the time
- */
- if (r10_bio->devs[disk].bio) {
- mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
- mbio->bi_end_io = raid10_end_discard_request;
- mbio->bi_private = r10_bio;
- r10_bio->devs[disk].bio = mbio;
- r10_bio->devs[disk].devnum = disk;
- atomic_inc(&r10_bio->remaining);
- md_submit_discard_bio(mddev, rdev, mbio,
- dev_start + choose_data_offset(r10_bio, rdev),
- dev_end - dev_start);
- bio_endio(mbio);
- }
- if (r10_bio->devs[disk].repl_bio) {
- rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
- rbio->bi_end_io = raid10_end_discard_request;
- rbio->bi_private = r10_bio;
- r10_bio->devs[disk].repl_bio = rbio;
- r10_bio->devs[disk].devnum = disk;
- atomic_inc(&r10_bio->remaining);
- md_submit_discard_bio(mddev, rrdev, rbio,
- dev_start + choose_data_offset(r10_bio, rrdev),
- dev_end - dev_start);
- bio_endio(rbio);
- }
- }
-
- if (!geo->far_offset && --far_copies) {
- first_stripe_index += geo->stride >> geo->chunk_shift;
- start_disk_offset += geo->stride;
- last_stripe_index += geo->stride >> geo->chunk_shift;
- end_disk_offset += geo->stride;
- atomic_inc(&first_r10bio->remaining);
- raid_end_discard_bio(r10_bio);
- wait_barrier(conf);
- goto retry_discard;
- }
-
- raid_end_discard_bio(r10_bio);
-
- return 0;
-out:
- allow_barrier(conf);
- return -EAGAIN;
-}
-
static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
if (!md_write_start(mddev, bio))
return false;
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
- if (!raid10_handle_discard(mddev, bio))
- return true;
-
/*
* If this request crosses a chunk boundary, we need to split
* it.
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
- UINT_MAX);
+ mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
- static void nvme_update_bdev_size(struct gendisk *disk)
- {
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (bdev) {
- bd_set_nr_sectors(bdev, get_capacity(disk));
- bdput(bdev);
- }
- }
-
/*
* Prepare a queue for teardown.
*
blk_set_queue_dying(ns->queue);
blk_mq_unquiesce_queue(ns->queue);
- set_capacity(ns->disk, 0);
- nvme_update_bdev_size(ns->disk);
+ set_capacity_and_notify(ns->disk, 0);
}
static void nvme_queue_scan(struct nvme_ctrl *ctrl)
capacity = 0;
}
- set_capacity_revalidate_and_notify(disk, capacity, false);
+ set_capacity_and_notify(disk, capacity);
nvme_config_discard(disk, ns);
nvme_config_write_zeroes(disk, ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
blk_queue_update_readahead(ns->head->disk->queue);
- nvme_update_bdev_size(ns->head->disk);
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
#endif
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
- struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
+ struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
int ret;
if (cel)
return -ENOMEM;
ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
- &cel->log, sizeof(cel->log), 0);
+ cel, sizeof(*cel), 0);
if (ret) {
kfree(cel);
return ret;
}
- cel->csi = csi;
- xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
+ xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
out:
- *log = &cel->log;
+ *log = cel;
return 0;
}
*/
if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
nvme_ns_remove(ns);
- else
- revalidate_disk_size(ns->disk, true);
}
static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
+/*
+ * Release every effects log stored in ctrl->cels: each entry is erased
+ * from the xarray and then kfree()d, after which the xarray itself is
+ * destroyed.
+ */
+static void nvme_free_cels(struct nvme_ctrl *ctrl)
+{
+	struct nvme_effects_log *log;
+	unsigned long csi;
+
+	xa_for_each(&ctrl->cels, csi, log) {
+		xa_erase(&ctrl->cels, csi);
+		kfree(log);
+	}
+
+	xa_destroy(&ctrl->cels);
+}
+
static void nvme_free_ctrl(struct device *dev)
{
struct nvme_ctrl *ctrl =
if (!subsys || ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
- xa_destroy(&ctrl->cels);
-
+ nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
static void dasd_device_tasklet(unsigned long);
static void dasd_block_tasklet(unsigned long);
static void do_kick_device(struct work_struct *);
-static void do_restore_device(struct work_struct *);
static void do_reload_device(struct work_struct *);
static void do_requeue_requests(struct work_struct *);
static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
INIT_LIST_HEAD(&device->ccw_queue);
timer_setup(&device->timer, dasd_device_timeout, 0);
INIT_WORK(&device->kick_work, do_kick_device);
- INIT_WORK(&device->restore_device, do_restore_device);
INIT_WORK(&device->reload_device, do_reload_device);
INIT_WORK(&device->requeue_requests, do_requeue_requests);
device->state = DASD_STATE_NEW;
{
struct gendisk *disk;
struct disk_part_iter piter;
- struct hd_struct *part;
+ struct block_device *part;
device->state = DASD_STATE_ONLINE;
if (device->block) {
disk = device->block->bdev->bd_disk;
disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
+ kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
disk_part_iter_exit(&piter);
}
return 0;
int rc;
struct gendisk *disk;
struct disk_part_iter piter;
- struct hd_struct *part;
+ struct block_device *part;
if (device->discipline->online_to_ready) {
rc = device->discipline->online_to_ready(device);
disk = device->block->bdev->bd_disk;
disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
+ kobject_uevent(bdev_kobj(part), KOBJ_CHANGE);
disk_part_iter_exit(&piter);
}
return 0;
EXPORT_SYMBOL(dasd_reload_device);
/*
- * dasd_restore_device will schedule a call do do_restore_device to the kernel
- * event daemon.
- */
-static void do_restore_device(struct work_struct *work)
-{
- struct dasd_device *device = container_of(work, struct dasd_device,
- restore_device);
- device->cdev->drv->restore(device->cdev);
- dasd_put_device(device);
-}
-
-void dasd_restore_device(struct dasd_device *device)
-{
- dasd_get_device(device);
- /* queue call to dasd_restore_device to the kernel event daemon. */
- if (!schedule_work(&device->restore_device))
- dasd_put_device(device);
-}
-
-/*
* Set the target state for a device and starts the state change.
*/
void dasd_set_target_state(struct dasd_device *device, int target)
"start_IO: -EIO device gone, retry");
break;
case -EINVAL:
- /* most likely caused in power management context */
DBF_DEV_EVENT(DBF_WARNING, device, "%s",
"start_IO: -EINVAL device currently "
"not accessible");
static int __dasd_device_is_unusable(struct dasd_device *device,
struct dasd_ccw_req *cqr)
{
- int mask = ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM | DASD_STOPPED_NOSPC);
+ int mask = ~(DASD_STOPPED_DC_WAIT | DASD_STOPPED_NOSPC);
if (test_bit(DASD_FLAG_OFFLINE, &device->flags) &&
!test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
if (!dasd_path_get_tbvpm(device))
return;
- if (device->stopped &
- ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM))
+ if (device->stopped & ~(DASD_STOPPED_DC_WAIT))
return;
rc = device->discipline->verify_path(device,
dasd_path_get_tbvpm(device));
if (!block)
return -EINVAL;
+ /*
+ * If the request is an ERP request there is nothing to requeue.
+ * This will be done with the remaining original request.
+ */
+ if (cqr->refers)
+ return 0;
spin_lock_irq(&cqr->dq->lock);
req = (struct request *) cqr->callback_data;
blk_mq_requeue_request(req, false);
.ioctl = dasd_ioctl,
.compat_ioctl = dasd_ioctl,
.getgeo = dasd_getgeo,
+ .set_read_only = dasd_set_read_only,
};
/*******************************************************************************
"operational\n");
DBF_DEV_EVENT(DBF_WARNING, device, "%s", "path operational");
dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT);
- if (device->stopped & DASD_UNRESUMED_PM) {
- dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM);
- dasd_restore_device(device);
- return 1;
- }
dasd_schedule_device_bh(device);
if (device->block) {
dasd_schedule_block_bh(device->block);
}
EXPORT_SYMBOL(dasd_schedule_requeue);
-int dasd_generic_pm_freeze(struct ccw_device *cdev)
-{
- struct dasd_device *device = dasd_device_from_cdev(cdev);
-
- if (IS_ERR(device))
- return PTR_ERR(device);
-
- /* mark device as suspended */
- set_bit(DASD_FLAG_SUSPENDED, &device->flags);
-
- if (device->discipline->freeze)
- device->discipline->freeze(device);
-
- /* disallow new I/O */
- dasd_device_set_stop_bits(device, DASD_STOPPED_PM);
-
- return dasd_generic_requeue_all_requests(device);
-}
-EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze);
-
-int dasd_generic_restore_device(struct ccw_device *cdev)
-{
- struct dasd_device *device = dasd_device_from_cdev(cdev);
- int rc = 0;
-
- if (IS_ERR(device))
- return PTR_ERR(device);
-
- /* allow new IO again */
- dasd_device_remove_stop_bits(device,
- (DASD_STOPPED_PM | DASD_UNRESUMED_PM));
-
- dasd_schedule_device_bh(device);
-
- /*
- * call discipline restore function
- * if device is stopped do nothing e.g. for disconnected devices
- */
- if (device->discipline->restore && !(device->stopped))
- rc = device->discipline->restore(device);
- if (rc || device->stopped)
- /*
- * if the resume failed for the DASD we put it in
- * an UNRESUMED stop state
- */
- device->stopped |= DASD_UNRESUMED_PM;
-
- if (device->block) {
- dasd_schedule_block_bh(device->block);
- if (device->block->request_queue)
- blk_mq_run_hw_queues(device->block->request_queue,
- true);
- }
-
- clear_bit(DASD_FLAG_SUSPENDED, &device->flags);
- dasd_put_device(device);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dasd_generic_restore_device);
-
static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device,
int rdc_buffer_size,
int magic)
int (*fill_info) (struct dasd_device *, struct dasd_information2_t *);
int (*ioctl) (struct dasd_block *, unsigned int, void __user *);
- /* suspend/resume functions */
- int (*freeze) (struct dasd_device *);
- int (*restore) (struct dasd_device *);
-
/* reload device after state change */
int (*reload) (struct dasd_device *);
atomic_t tasklet_scheduled;
struct tasklet_struct tasklet;
struct work_struct kick_work;
- struct work_struct restore_device;
struct work_struct reload_device;
struct work_struct kick_validate;
struct work_struct suc_work;
#define DASD_STOPPED_PENDING 4 /* long busy */
#define DASD_STOPPED_DC_WAIT 8 /* disconnected, wait */
#define DASD_STOPPED_SU 16 /* summary unit check handling */
-#define DASD_STOPPED_PM 32 /* pm state transition */
-#define DASD_UNRESUMED_PM 64 /* pm resume failed state */
#define DASD_STOPPED_NOSPC 128 /* no space left */
/* per device flags */
void dasd_enable_device(struct dasd_device *);
void dasd_set_target_state(struct dasd_device *, int);
void dasd_kick_device(struct dasd_device *);
-void dasd_restore_device(struct dasd_device *);
void dasd_reload_device(struct dasd_device *);
void dasd_schedule_requeue(struct dasd_device *);
void dasd_generic_shutdown(struct ccw_device *);
void dasd_generic_handle_state_change(struct dasd_device *);
-int dasd_generic_pm_freeze(struct ccw_device *);
-int dasd_generic_restore_device(struct ccw_device *);
enum uc_todo dasd_generic_uc_handler(struct ccw_device *, struct irb *);
void dasd_generic_path_event(struct ccw_device *, int *);
int dasd_generic_verify_path(struct dasd_device *, __u8);
void dasd_destroy_partitions(struct dasd_block *);
/* externals in dasd_ioctl.c */
- int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long);
+ int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long);
+ int dasd_set_read_only(struct block_device *bdev, bool ro);
/* externals in dasd_proc.c */
int dasd_proc_init(void);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+/* Remove once support for zoned allocation is feature complete */
+#ifdef CONFIG_BTRFS_DEBUG
+BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
NULL
};
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+/*
+ * Option names printed by supported_rescue_options_show().  The table is
+ * only ever read, so both the strings and the pointer slots are const.
+ */
+static const char * const rescue_opts[] = {
+	"usebackuproot",
+	"nologreplay",
+	"ignorebadroots",
+	"ignoredatacsums",
+	"all",
+};
+
+/*
+ * Write every entry of rescue_opts[] into @buf as a single line: entries
+ * are separated by one space and the line ends with a newline.  The
+ * return value is the sum of the scnprintf() results.
+ */
+static ssize_t supported_rescue_options_show(struct kobject *kobj,
+					     struct kobj_attribute *a,
+					     char *buf)
+{
+	ssize_t len = 0;
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(rescue_opts); idx++) {
+		/* No separator in front of the first entry. */
+		const char *sep = idx ? " " : "";
+
+		len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				 sep, rescue_opts[idx]);
+	}
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+
+	return len;
+}
+BTRFS_ATTR(static_feature, supported_rescue_options,
+ supported_rescue_options_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
+ BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
NULL
};
return -EINVAL;
WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
-
+ btrfs_discard_calc_delay(discard_ctl);
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
return -EINVAL;
WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
-
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+/*
+ * Print the generation field of the fs_info that owns @kobj into @buf as
+ * an unsigned decimal followed by a newline.
+ */
+static ssize_t btrfs_generation_show(struct kobject *kobj,
+				     struct kobj_attribute *a, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%llu\n",
+			 to_fs_info(kobj)->generation);
+}
+BTRFS_ATTR(, generation, btrfs_generation_show);
+
+/*
+ * Return true when @buffer consists of exactly @string, optionally
+ * surrounded by leading and/or trailing whitespace, and false otherwise.
+ */
+static bool strmatch(const char *buffer, const char *string)
+{
+	const size_t len = strlen(string);
+	const char *start = skip_spaces(buffer);
+
+	/* The text after the leading whitespace must begin with @string. */
+	if (strncmp(string, start, len) != 0)
+		return false;
+
+	/* Whatever follows the match must be whitespace only, or nothing. */
+	return *skip_spaces(start + len) == '\0';
+}
+
+static const char * const btrfs_read_policy_name[] = { "pid" };
+
+/*
+ * List all names in btrfs_read_policy_name[] on one line in @buf,
+ * separated by single spaces, with the entry matching
+ * fs_devices->read_policy wrapped in square brackets.  The line is
+ * terminated by a newline.
+ */
+static ssize_t btrfs_read_policy_show(struct kobject *kobj,
+				      struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+	ssize_t len = 0;
+	int i;
+
+	for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+		const bool active = (fs_devices->read_policy == i);
+		/* Nothing written yet means no separator is needed. */
+		const char *sep = (len == 0) ? "" : " ";
+
+		len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s%s%s",
+				 sep,
+				 active ? "[" : "",
+				 btrfs_read_policy_name[i],
+				 active ? "]" : "");
+	}
+
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+
+	return len;
+}
+
+/*
+ * Select the read policy whose name matches @buf (surrounding whitespace
+ * is ignored by strmatch()).  A message is logged only when the policy
+ * actually changes.  Returns @len on a match and -EINVAL when @buf names
+ * no known policy.
+ */
+static ssize_t btrfs_read_policy_store(struct kobject *kobj,
+				       struct kobj_attribute *a,
+				       const char *buf, size_t len)
+{
+	struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+	int policy;
+
+	for (policy = 0; policy < BTRFS_NR_READ_POLICY; policy++) {
+		if (!strmatch(buf, btrfs_read_policy_name[policy]))
+			continue;
+
+		if (fs_devices->read_policy != policy) {
+			fs_devices->read_policy = policy;
+			btrfs_info(fs_devices->fs_info,
+				   "read policy set to '%s'",
+				   btrfs_read_policy_name[policy]);
+		}
+		return len;
+	}
+
+	return -EINVAL;
+}
+BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
BTRFS_ATTR_PTR(, exclusive_operation),
+ BTRFS_ATTR_PTR(, generation),
+ BTRFS_ATTR_PTR(, read_policy),
NULL,
};
default:
WARN_ON(1);
return "invalid-combination";
- };
+ }
}
/*
void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
- struct hd_struct *disk;
- struct kobject *disk_kobj;
struct kobject *devices_kobj;
/*
devices_kobj = device->fs_info->fs_devices->devices_kobj;
ASSERT(devices_kobj);
- if (device->bdev) {
- disk = device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(devices_kobj, disk_kobj->name);
- }
+ if (device->bdev)
+ sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
if (device->devid_kobj.state_initialized) {
kobject_del(&device->devid_kobj);
nofs_flag = memalloc_nofs_save();
if (device->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
-
- disk = device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ struct kobject *disk_kobj = bdev_kobj(device->bdev);
ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
if (ret) {
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
+#include "zoned.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
+ btrfs_destroy_dev_zone_info(device);
kfree(device);
}
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret != 0)
+ goto error_free_page;
+
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL, false);
+ disk_super->dev_item.uuid, NULL);
/*
* If this disk has been pulled into an fs devices created by
* make sure it's the same device if the device is mounted
*/
if (device->bdev) {
- struct block_device *path_bdev;
+ int error;
+ dev_t path_dev;
- path_bdev = lookup_bdev(path);
- if (IS_ERR(path_bdev)) {
+ error = lookup_bdev(path, &path_dev);
+ if (error) {
mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_CAST(path_bdev);
+ return ERR_PTR(error);
}
- if (device->bdev != path_bdev) {
- bdput(path_bdev);
+ if (device->bdev->bd_dev != path_dev) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(device->fs_info,
+ /*
+ * device->fs_info may not be reliable here, so
+ * pass in a NULL instead. This avoids a
+ * possible use-after-free when the fs_info and
+ * fs_info->sb are already torn down.
+ */
+ btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
}
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
- int step, struct btrfs_device **latest_dev)
+ struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
* After we have read the system tree and know devids belonging to this
* filesystem, remove the device which does not belong there.
*/
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *latest_dev = NULL;
struct btrfs_fs_devices *seed_dev;
mutex_lock(&uuid_mutex);
- __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+ __btrfs_free_extra_devids(fs_devices, &latest_dev);
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
- __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
+ __btrfs_free_extra_devids(seed_dev, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
return 0;
}
}
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr)
+ u64 bytenr, u64 bytenr_orig)
{
struct btrfs_super_block *disk_super;
struct page *page;
/* align our pointer to the offset of the super block */
disk_super = p + offset_in_page(bytenr);
- if (btrfs_super_bytenr(disk_super) != bytenr ||
+ if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
btrfs_release_disk_super(p);
return ERR_PTR(-EINVAL);
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
+ int ret;
lockdep_assert_held(&uuid_mutex);
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
- bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- disk_super = btrfs_read_disk_super(bdev, bytenr);
+ bytenr_orig = btrfs_sb_offset(0);
+ ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
if (IS_ERR(disk_super))
continue;
+ if (bdev_is_zoned(bdev)) {
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ continue;
+ }
+
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
page = virt_to_page(disk_super);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid, true);
+ disk_super->metadata_uuid);
else
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid, true);
+ disk_super->fsid);
btrfs_release_disk_super(disk_super);
if (!device)
if (devid) {
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL, true);
+ NULL);
if (!device)
return ERR_PTR(-ENOENT);
return device;
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
}
rcu_assign_pointer(device->name, name);
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ goto error_free_device;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto error_free_device;
+ goto error_free_zone;
}
q = bdev_get_queue(bdev);
fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
- device->fs_info = fs_info;
- device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
sb->s_flags |= SB_RDONLY;
if (trans)
btrfs_end_transaction(trans);
+error_free_zone:
+ btrfs_destroy_dev_zone_info(device);
error_free_device:
btrfs_free_device(device);
error:
else
num_stripes = map->num_stripes;
- preferred_mirror = first + current->pid % num_stripes;
+ switch (fs_info->fs_devices->read_policy) {
+ default:
+ /* Shouldn't happen, just warn and use pid instead of failing */
+ btrfs_warn_rl(fs_info,
+ "unknown read_policy type %u, reset to pid",
+ fs_info->fs_devices->read_policy);
+ fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fallthrough;
+ case BTRFS_READ_POLICY_PID:
+ preferred_mirror = first + (current->pid % num_stripes);
+ break;
+ }
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
bio->bi_iter.bi_sector = physical >> 9;
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
+ bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
* If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid,
- bool seed)
+ u64 devid, u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL, true);
+ devid, uuid, NULL);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
}
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
}
fill_device_from_item(leaf, dev_item, device);
+ if (device->bdev) {
+ u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+
+ if (device->total_bytes > max_total_bytes) {
+ btrfs_err(fs_info,
+ "device total_bytes should be at most %llu but found %llu",
+ max_total_bytes, device->total_bytes);
+ return -EINVAL;
+ }
+ }
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
+ root->root_key.objectid, 0);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
int i;
const int nr_items = btrfs_header_nritems(node);
- for (i = 0; i < nr_items; i++) {
- u64 start;
-
- start = btrfs_node_blockptr(node, i);
- readahead_tree_block(node->fs_info, start);
- }
+ for (i = 0; i < nr_items; i++)
+ btrfs_readahead_node_child(node, i);
}
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
- true);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- /* It's possible this device is a dummy for seed device */
- if (dev->disk_total_bytes == 0) {
- struct btrfs_fs_devices *devs;
-
- devs = list_first_entry(&fs_info->fs_devices->seed_list,
- struct btrfs_fs_devices, seed_list);
- dev = btrfs_find_device(devs, devid, NULL, NULL, false);
- if (!dev) {
- btrfs_err(fs_info, "failed to find seed devid %llu",
- devid);
- ret = -EUCLEAN;
- goto out;
- }
- }
-
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
u64 prev_dev_ext_end = 0;
int ret = 0;
+ /*
+ * We don't have a dev_root because we mounted with ignorebadroots and
+ * failed to load the root, so we want to skip the verification in this
+ * case for sure.
+ *
+ * However if the dev root is fine, but the tree itself is corrupted
+ * we'd still fail to mount. This verification is only to make sure
+ * writes can happen safely, so instead just bypass this check
+ * completely in the case of IGNOREBADROOTS.
+ */
+ if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ return 0;
+
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
--- /dev/null
- nr_sectors = bdev->bd_part->nr_sects;
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "zoned.h"
+#include "rcu-string.h"
+
+/* Maximum number of zones to report per blkdev_report_zones() call */
+#define BTRFS_REPORT_NR_ZONES 4096
+
+/* Number of superblock log zones */
+#define BTRFS_NR_SB_LOG_ZONES 2
+
+/*
+ * Zone report callback: store the reported zone descriptor in slot @idx of
+ * the struct blk_zone array handed in through @data.
+ *
+ * Always returns 0.
+ */
+static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+	struct blk_zone *slot = (struct blk_zone *)data + idx;
+
+	memcpy(slot, zone, sizeof(struct blk_zone));
+
+	return 0;
+}
+
+/*
+ * Work out the current write position of the superblock log made of the two
+ * zones in @zones.
+ *
+ * On success returns 0 and stores the position, in bytes, in @wp_ret.
+ * Returns -ENOENT (with @wp_ret set to the start of zones[0]) when both zones
+ * are empty, -EUCLEAN when the zone conditions form an invalid combination,
+ * or the error from reading the page cache when both zones are full and the
+ * stored super blocks have to be compared.
+ *
+ * Neither zone may be conventional (asserted).
+ */
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
+			    u64 *wp_ret)
+{
+	bool empty[BTRFS_NR_SB_LOG_ZONES];
+	bool full[BTRFS_NR_SB_LOG_ZONES];
+	sector_t sector;
+
+	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
+	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
+
+	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
+	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
+	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
+	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+
+	/*
+	 * Possible states of log buffer zones
+	 *
+	 *           Empty[0]  In use[0]  Full[0]
+	 * Empty[1]         *          x        0
+	 * In use[1]        0          x        0
+	 * Full[1]          1          1        C
+	 *
+	 * Log position:
+	 *   *: Special case, no superblock is written
+	 *   0: Use write pointer of zones[0]
+	 *   1: Use write pointer of zones[1]
+	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
+	 *      one determined by generation
+	 *   x: Invalid state
+	 */
+
+	if (empty[0] && empty[1]) {
+		/* Special case to distinguish no superblock to read */
+		*wp_ret = zones[0].start << SECTOR_SHIFT;
+		return -ENOENT;
+	} else if (full[0] && full[1]) {
+		/* Compare two super blocks */
+		struct address_space *mapping = bdev->bd_inode->i_mapping;
+		struct page *page[BTRFS_NR_SB_LOG_ZONES];
+		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
+		int i;
+
+		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+			u64 bytenr;
+
+			/* The super block occupying the tail of the zone */
+			bytenr = ((zones[i].start + zones[i].len)
+				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
+
+			page[i] = read_cache_page_gfp(mapping,
+					bytenr >> PAGE_SHIFT, GFP_NOFS);
+			if (IS_ERR(page[i])) {
+				/* Drop the first page if the second read failed */
+				if (i == 1)
+					btrfs_release_disk_super(super[0]);
+				return PTR_ERR(page[i]);
+			}
+			super[i] = page_address(page[i]);
+		}
+
+		/*
+		 * The generation is stored in on-disk byte order, so go
+		 * through the accessor instead of comparing the raw fields;
+		 * a raw comparison gives the wrong answer on big-endian hosts.
+		 */
+		if (btrfs_super_generation(super[0]) >
+		    btrfs_super_generation(super[1]))
+			sector = zones[1].start;
+		else
+			sector = zones[0].start;
+
+		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+			btrfs_release_disk_super(super[i]);
+	} else if (!full[0] && (empty[1] || full[1])) {
+		sector = zones[0].wp;
+	} else if (full[0]) {
+		sector = zones[1].wp;
+	} else {
+		return -EUCLEAN;
+	}
+	*wp_ret = sector << SECTOR_SHIFT;
+	return 0;
+}
+
+/*
+ * Zones reserved for the superblock log (a circular buffer) on ZONED btrfs:
+ *   - primary superblock: zones 0 and 1
+ *   - first copy:         zones 16 and 17
+ *   - second copy:        zone 1024, or the zone at 256GB if that is lower,
+ *                         plus the zone following it
+ *
+ * Returns the number of the first zone of the pair used by @mirror.  @shift
+ * is the log2 of the zone size in bytes.
+ */
+static inline u32 sb_zone_number(int shift, int mirror)
+{
+	u32 zone = 0;
+
+	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+
+	if (mirror == 1)
+		zone = 16;
+	else if (mirror == 2)
+		zone = min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
+
+	return zone;
+}
+
+/*
+ * Report up to *@nr_zones zones of @device, starting at byte offset @pos,
+ * into @zones.
+ *
+ * On success returns 0 and updates *@nr_zones to the number of zones actually
+ * reported.  Returns the negative error from blkdev_report_zones(), or -EIO
+ * when it reported no zone at all.  Asking for zero zones is a no-op.
+ */
+static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+			       struct blk_zone *zones, unsigned int *nr_zones)
+{
+	int nr_reported;
+
+	if (*nr_zones == 0)
+		return 0;
+
+	nr_reported = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT,
+					  *nr_zones, copy_zone_info_cb, zones);
+	if (nr_reported < 0) {
+		btrfs_err_in_rcu(device->fs_info,
+				 "zoned: failed to read zone %llu on %s (devid %llu)",
+				 pos, rcu_str_deref(device->name),
+				 device->devid);
+		return nr_reported;
+	}
+
+	*nr_zones = nr_reported;
+
+	return nr_reported ? 0 : -EIO;
+}
+
+/*
+ * Collect the zone information of a zoned block device and attach it to
+ * @device->zone_info.
+ *
+ * Fills the sequential/empty zone bitmaps from the device's zone report,
+ * reads the zones backing each superblock log and validates their write
+ * pointers.  Does nothing when the device is not zoned or already carries
+ * zone info.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EIO when the number
+ * of reported zones does not match the device size, -EUCLEAN when a
+ * superblock log cannot be read completely or is corrupted, or the error
+ * returned while reporting zones.
+ */
+int btrfs_get_dev_zone_info(struct btrfs_device *device)
+{
+	struct btrfs_zoned_device_info *zone_info = NULL;
+	struct block_device *bdev = device->bdev;
+	struct request_queue *queue = bdev_get_queue(bdev);
+	sector_t nr_sectors;
+	sector_t sector = 0;
+	struct blk_zone *zones = NULL;
+	unsigned int i, nreported = 0, nr_zones;
+	/*
+	 * Must be wide enough that shifting by SECTOR_SHIFT below does not
+	 * truncate the byte size of large (>= 4GiB) zones.
+	 */
+	sector_t zone_sectors;
+	int ret;
+
+	if (!bdev_is_zoned(bdev))
+		return 0;
+
+	if (device->zone_info)
+		return 0;
+
+	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
+	if (!zone_info)
+		return -ENOMEM;
+
- nr_sectors = bdev->bd_part->nr_sects;
++ nr_sectors = bdev_nr_sectors(bdev);
+	zone_sectors = bdev_zone_sectors(bdev);
+	/* Check if it's power of 2 (see is_power_of_2) */
+	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
+	zone_info->max_zone_append_size =
+		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
+	/* A trailing partial zone still counts as a zone */
+	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+	if (!IS_ALIGNED(nr_sectors, zone_sectors))
+		zone_info->nr_zones++;
+
+	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+	if (!zone_info->seq_zones) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+	if (!zone_info->empty_zones) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+	if (!zones) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Get zones type, BTRFS_REPORT_NR_ZONES zones at a time */
+	while (sector < nr_sectors) {
+		nr_zones = BTRFS_REPORT_NR_ZONES;
+		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
+					  &nr_zones);
+		if (ret)
+			goto out;
+
+		for (i = 0; i < nr_zones; i++) {
+			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
+				__set_bit(nreported, zone_info->seq_zones);
+			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+				__set_bit(nreported, zone_info->empty_zones);
+			nreported++;
+		}
+		/* Continue right after the last zone reported in this batch */
+		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
+	}
+
+	if (nreported != zone_info->nr_zones) {
+		btrfs_err_in_rcu(device->fs_info,
+				 "inconsistent number of zones on %s (%u/%u)",
+				 rcu_str_deref(device->name), nreported,
+				 zone_info->nr_zones);
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Validate superblock log */
+	nr_zones = BTRFS_NR_SB_LOG_ZONES;
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		u32 sb_zone;
+		u64 sb_wp;
+		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
+
+		/* Skip mirrors whose zone pair lies beyond the device */
+		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
+		if (sb_zone + 1 >= zone_info->nr_zones)
+			continue;
+
+		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
+		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
+					  &zone_info->sb_zones[sb_pos],
+					  &nr_zones);
+		if (ret)
+			goto out;
+
+		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+			btrfs_err_in_rcu(device->fs_info,
+	"zoned: failed to read super block log zone info at devid %llu zone %u",
+					 device->devid, sb_zone);
+			ret = -EUCLEAN;
+			goto out;
+		}
+
+		/*
+		 * If zones[0] is conventional, always use the beginning of the
+		 * zone to record superblock. No need to validate in that case.
+		 */
+		if (zone_info->sb_zones[sb_pos].type ==
+		    BLK_ZONE_TYPE_CONVENTIONAL)
+			continue;
+
+		/* -ENOENT only means that no superblock was written yet */
+		ret = sb_write_pointer(device->bdev,
+				       &zone_info->sb_zones[sb_pos], &sb_wp);
+		if (ret != -ENOENT && ret) {
+			btrfs_err_in_rcu(device->fs_info,
+			"zoned: super block log zone corrupted devid %llu zone %u",
+					 device->devid, sb_zone);
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
+	kfree(zones);
+
+	device->zone_info = zone_info;
+
+	/* device->fs_info is not safe to use for printing messages */
+	btrfs_info_in_rcu(NULL,
+			"host-%s zoned block device %s, %u zones of %llu bytes",
+			bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
+			rcu_str_deref(device->name), zone_info->nr_zones,
+			zone_info->zone_size);
+
+	return 0;
+
+out:
+	kfree(zones);
+	bitmap_free(zone_info->empty_zones);
+	bitmap_free(zone_info->seq_zones);
+	kfree(zone_info);
+
+	return ret;
+}
+
+/*
+ * Free the zone information attached to @device, if any, and clear the
+ * pointer so the call can be repeated safely.
+ */
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+	if (zinfo) {
+		bitmap_free(zinfo->seq_zones);
+		bitmap_free(zinfo->empty_zones);
+		kfree(zinfo);
+		device->zone_info = NULL;
+	}
+}
+
+/*
+ * Report the single zone of @device found at byte offset @pos into @zone.
+ *
+ * Returns 0 on success, the error from btrfs_get_dev_zones(), or -EIO when no
+ * zone was reported.
+ */
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+		       struct blk_zone *zone)
+{
+	unsigned int count = 1;
+	int err;
+
+	err = btrfs_get_dev_zones(device, pos, zone, &count);
+	if (err)
+		return err;
+
+	return count ? 0 : -EIO;
+}
+
+/*
+ * Verify that the opened devices of @fs_info are compatible with its zoned
+ * setting and record the common zone parameters.
+ *
+ * Devices without a bdev are ignored.  A device counts as zoned when it is
+ * host-managed, or host-aware while the filesystem is zoned.  On success with
+ * zoned devices, fs_info->zone_size and fs_info->max_zone_append_size are
+ * set; the latter is the smallest non-zero limit among the devices.
+ *
+ * Returns 0 when the setup is valid (including a regular filesystem on
+ * regular devices) and -EINVAL otherwise.
+ */
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	u64 zoned_devices = 0;
+	u64 nr_devices = 0;
+	u64 zone_size = 0;
+	u64 max_zone_append_size = 0;
+	const bool incompat_zoned = btrfs_is_zoned(fs_info);
+
+	/* Count zoned devices */
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		struct btrfs_zoned_device_info *zone_info;
+		enum blk_zoned_model model;
+
+		if (!device->bdev)
+			continue;
+
+		nr_devices++;
+
+		model = bdev_zoned_model(device->bdev);
+		if (model != BLK_ZONED_HM &&
+		    !(model == BLK_ZONED_HA && incompat_zoned))
+			continue;
+
+		zone_info = device->zone_info;
+		zoned_devices++;
+
+		/* All zoned devices have to agree on one zone size */
+		if (zone_size == 0) {
+			zone_size = zone_info->zone_size;
+		} else if (zone_size != zone_info->zone_size) {
+			btrfs_err(fs_info,
+	"zoned: unequal block device zone sizes: have %llu found %llu",
+				  device->zone_info->zone_size,
+				  zone_size);
+			return -EINVAL;
+		}
+
+		if (!max_zone_append_size ||
+		    (zone_info->max_zone_append_size &&
+		     zone_info->max_zone_append_size < max_zone_append_size))
+			max_zone_append_size = zone_info->max_zone_append_size;
+	}
+
+	if (!zoned_devices) {
+		/* Regular filesystem on regular devices: nothing to set up */
+		if (!incompat_zoned)
+			return 0;
+
+		/* No zoned block device found on ZONED filesystem */
+		btrfs_err(fs_info,
+			  "zoned: no zoned devices found on a zoned filesystem");
+		return -EINVAL;
+	}
+
+	if (!incompat_zoned) {
+		btrfs_err(fs_info,
+			  "zoned: mode not enabled but zoned device found");
+		return -EINVAL;
+	}
+
+	if (zoned_devices != nr_devices) {
+		btrfs_err(fs_info,
+			  "zoned: cannot mix zoned and regular devices");
+		return -EINVAL;
+	}
+
+	/*
+	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
+	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+	 * check the alignment here.
+	 */
+	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
+		btrfs_err(fs_info,
+			  "zoned: zone size %llu not aligned to stripe %u",
+			  zone_size, BTRFS_STRIPE_LEN);
+		return -EINVAL;
+	}
+
+	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+		btrfs_err(fs_info, "zoned: mixed block groups not supported");
+		return -EINVAL;
+	}
+
+	fs_info->zone_size = zone_size;
+	fs_info->max_zone_append_size = max_zone_append_size;
+
+	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
+
+	return 0;
+}
+
+/*
+ * Reject mount options that cannot be used on a zoned filesystem.
+ *
+ * Returns 0 when @info is not zoned or no conflicting option is set, and
+ * -EINVAL when SPACE_CACHE or NODATACOW is enabled.
+ */
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+	int ret = 0;
+
+	if (!btrfs_is_zoned(info))
+		return ret;
+
+	if (btrfs_test_opt(info, SPACE_CACHE)) {
+		/*
+		 * Space cache writing is not COWed. Disable that to avoid
+		 * write errors in sequential zones.
+		 */
+		btrfs_err(info, "zoned: space cache v1 is not supported");
+		ret = -EINVAL;
+	} else if (btrfs_test_opt(info, NODATACOW)) {
+		btrfs_err(info, "zoned: NODATACOW not supported");
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/*
+ * Translate the state of the two superblock log zones in @zones into the byte
+ * offset to use for a superblock access of direction @rw.
+ *
+ * A conventional zones[0] always yields the start of that zone.  Otherwise:
+ *  - WRITE: the offset is the log's write position; if that position is the
+ *    start of a zone that is not empty, the zone is reset first and the
+ *    cached descriptor in @zones is updated to match.
+ *  - any other @rw: the offset is that of the super block written just before
+ *    the write position, wrapping to the end of zones[1] when the write
+ *    position is the start of zones[0].  When nothing was written yet the
+ *    start of zones[0] is returned unchanged.
+ *
+ * Returns 0 and sets @bytenr_ret on success, or a negative error from
+ * sb_write_pointer() (other than -ENOENT) or from the zone reset.
+ */
+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
+			   int rw, u64 *bytenr_ret)
+{
+	struct blk_zone *victim = NULL;
+	u64 pos;
+	int err;
+
+	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
+		return 0;
+	}
+
+	err = sb_write_pointer(bdev, zones, &pos);
+	if (err != -ENOENT && err < 0)
+		return err;
+
+	if (rw != WRITE) {
+		if (err != -ENOENT) {
+			/* For READ, we want the previous one */
+			if (pos == zones[0].start << SECTOR_SHIFT)
+				pos = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+			pos -= BTRFS_SUPER_INFO_SIZE;
+		}
+
+		*bytenr_ret = pos;
+		return 0;
+	}
+
+	/* About to write at the head of a zone: it may need a reset first */
+	if (pos == zones[0].start << SECTOR_SHIFT)
+		victim = &zones[0];
+	else if (pos == zones[1].start << SECTOR_SHIFT)
+		victim = &zones[1];
+
+	if (victim && victim->cond != BLK_ZONE_COND_EMPTY) {
+		ASSERT(victim->cond == BLK_ZONE_COND_FULL);
+
+		err = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+				       victim->start, victim->len,
+				       GFP_NOFS);
+		if (err)
+			return err;
+
+		victim->cond = BLK_ZONE_COND_EMPTY;
+		victim->wp = victim->start;
+	}
+
+	*bytenr_ret = pos;
+	return 0;
+}
+
+/*
+ * Find the byte offset of superblock copy @mirror on @bdev for an access of
+ * direction @rw (READ or WRITE, asserted), without needing a btrfs_device.
+ *
+ * On a non-zoned device this is simply btrfs_sb_offset(@mirror).  On a zoned
+ * device the two superblock log zones are reported from the device and the
+ * offset is derived from them by sb_log_location().
+ *
+ * Returns 0 and sets @bytenr_ret on success, -EINVAL when the zone size is
+ * not a power of two, -ENOENT when the device has too few zones to hold this
+ * mirror, -EIO when fewer than BTRFS_NR_SB_LOG_ZONES zones were reported, or
+ * the error from the zone report / sb_log_location().
+ */
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+			       u64 *bytenr_ret)
+{
+	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
+	sector_t zone_sectors;
+	u32 sb_zone;
+	int ret;
+	u8 zone_sectors_shift;
+	sector_t nr_sectors;
+	u32 nr_zones;
+
+	if (!bdev_is_zoned(bdev)) {
+		*bytenr_ret = btrfs_sb_offset(mirror);
+		return 0;
+	}
+
+	ASSERT(rw == READ || rw == WRITE);
+
+	zone_sectors = bdev_zone_sectors(bdev);
+	if (!is_power_of_2(zone_sectors))
+		return -EINVAL;
+	zone_sectors_shift = ilog2(zone_sectors);
- nr_sectors = bdev->bd_part->nr_sects;
++ nr_sectors = bdev_nr_sectors(bdev);
+	nr_zones = nr_sectors >> zone_sectors_shift;
+
+	/* Both zones of the pair have to exist on the device */
+	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+	if (sb_zone + 1 >= nr_zones)
+		return -ENOENT;
+
+	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
+				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
+				  zones);
+	if (ret < 0)
+		return ret;
+	if (ret != BTRFS_NR_SB_LOG_ZONES)
+		return -EIO;
+
+	return sb_log_location(bdev, zones, rw, bytenr_ret);
+}
+
+/*
+ * Find the byte offset of superblock copy @mirror on @device for an access of
+ * direction @rw, using the zone descriptors cached in device->zone_info.
+ *
+ * Without zone info the regular btrfs_sb_offset(@mirror) is used.  Returns 0
+ * and sets @bytenr_ret on success, -ENOENT when the device has too few zones
+ * to hold this mirror, or the error from sb_log_location().
+ */
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+			  u64 *bytenr_ret)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	struct blk_zone *log_zones;
+	u32 first_zone;
+
+	if (!zinfo) {
+		*bytenr_ret = btrfs_sb_offset(mirror);
+		return 0;
+	}
+
+	first_zone = sb_zone_number(zinfo->zone_size_shift, mirror);
+	if (first_zone + 1 >= zinfo->nr_zones)
+		return -ENOENT;
+
+	log_zones = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
+
+	return sb_log_location(device->bdev, log_zones, rw, bytenr_ret);
+}
+
+/*
+ * Tell whether superblock copy @mirror is kept in a superblock log: zone info
+ * must exist, the zone pair must fit on the device and the first zone of the
+ * pair must be marked in the seq_zones bitmap.
+ */
+static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
+				  int mirror)
+{
+	u32 first_zone;
+
+	if (!zinfo)
+		return false;
+
+	first_zone = sb_zone_number(zinfo->zone_size_shift, mirror);
+
+	return first_zone + 1 < zinfo->nr_zones &&
+	       test_bit(first_zone, zinfo->seq_zones);
+}
+
+/*
+ * Update the cached descriptors of the superblock log of @mirror to account
+ * for one more super block having been written.
+ *
+ * The first zone of the pair is advanced unless it is already full, in which
+ * case the second one is.  The advanced zone becomes implicitly open if it
+ * was empty, its write pointer moves by one super block, and it is marked
+ * full once the write pointer reaches the end of the zone.  Nothing happens
+ * when @mirror is not kept in a superblock log.
+ */
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	struct blk_zone *cur;
+
+	if (!is_sb_log_zone(zinfo, mirror))
+		return;
+
+	cur = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
+	if (cur->cond == BLK_ZONE_COND_FULL) {
+		/* First zone exhausted, continue in the second one */
+		cur++;
+		ASSERT(cur->cond != BLK_ZONE_COND_FULL);
+	}
+
+	if (cur->cond == BLK_ZONE_COND_EMPTY)
+		cur->cond = BLK_ZONE_COND_IMP_OPEN;
+
+	cur->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+
+	if (cur->wp == cur->start + cur->len)
+		cur->cond = BLK_ZONE_COND_FULL;
+}
+
+/*
+ * Reset both zones of the superblock log of @mirror on @bdev.
+ *
+ * Returns -ENOENT when the device has too few zones to hold this mirror,
+ * otherwise the result of the zone reset.
+ */
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+	const sector_t sectors_per_zone = bdev_zone_sectors(bdev);
+	const u8 shift = ilog2(sectors_per_zone);
+	sector_t nr_sectors;
+	u32 zone_count;
+	u32 first_zone;
+
++ nr_sectors = bdev_nr_sectors(bdev);
+	zone_count = nr_sectors >> shift;
+
+	first_zone = sb_zone_number(shift + SECTOR_SHIFT, mirror);
+	if (first_zone + 1 >= zone_count)
+		return -ENOENT;
+
+	/* One reset request spanning the whole zone pair */
+	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+				first_zone << shift,
+				sectors_per_zone * BTRFS_NR_SB_LOG_ZONES,
+				GFP_NOFS);
+}
void emergency_thaw_bdev(struct super_block *sb)
{
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) /* repeats while thaw_bdev() returns 0, warning each time; stops on a nonzero return or a NULL s_bdev */
printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
} while (bh != head);
}
/*
- * Lock out page->mem_cgroup migration to keep PageDirty
+ * Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
-
- if (test_opt2(sb, JOURNAL_FAST_COMMIT))
- SEQ_OPTS_PUTS("fast_commit");
-
ext4_show_quota_options(seq, sb);
return 0;
}
sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
- if (sb->s_bdev->bd_part)
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
/* Cleanup superblock name */
strreplace(sb->s_id, '/', '!');
*/
if (!(sb->s_flags & SB_RDONLY))
ext4_update_tstamp(es, s_wtime);
- if (sb->s_bdev->bd_part)
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1));
- else
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
ext4_free_blocks_count_set(es,
EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
* and the return value is in kbytes. s is of struct f2fs_sb_info.
*/
#define BD_PART_WRITTEN(s) \
- (((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \
+ (((u64)part_stat_read((s)->sb->s_bdev, sectors[STAT_WRITE]) - \
(s)->sectors_written_start) >> 1)
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
+ if (fscrypt_is_nokey_name(dentry)) /* no-key name: fail with -ENOKEY instead of calling f2fs_do_add_link() */
+ return -ENOKEY;
return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name,
inode, inode->i_ino, inode->i_mode);
}
extern int __sync_blockdev(struct block_device *bdev, int wait);
void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
- void bd_forget(struct inode *inode);
#else
static inline void bdev_cache_init(void)
{
{
return 0;
}
- static inline void bd_forget(struct inode *inode)
- {
- }
#endif /* CONFIG_BLOCK */
/*
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct path *link);
+int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
+ struct filename *newname, unsigned int flags);
/*
* namespace.c
*/
extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
- extern struct super_block *user_get_super(dev_t);
+ struct super_block *user_get_super(dev_t, bool excl);
+ void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
/*
struct list_head file_list;
struct fixed_file_data *file_data;
struct llist_node llist;
+ bool done;
};
struct fixed_file_data {
struct task_struct *thread;
struct wait_queue_head wait;
+
+ unsigned sq_thread_idle;
};
struct io_ring_ctx {
struct list_head timeout_list;
struct list_head cq_overflow_list;
- wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
- struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
*/
struct io_poll_iocb {
struct file *file;
- union {
- struct wait_queue_head *head;
- u64 addr;
- };
+ struct wait_queue_head *head;
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
+struct io_poll_remove { /* NOTE(review): presumably per-request data for removing a poll request -- confirm against the opcode handlers */
+ struct file *file;
+ u64 addr; /* split out of the union that struct io_poll_iocb used to share with its wait queue head pointer */
+};
+
struct io_close {
struct file *file;
struct file *put_file;
u32 off;
u32 target_seq;
struct list_head list;
+ /* head of the link, used by linked timeouts only */
+ struct io_kiocb *head;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
+
+ /* timeout update */
+ struct timespec64 ts;
+ u32 flags;
};
struct io_rw {
struct io_open {
struct file *file;
int dfd;
+ bool ignore_nonblock;
struct filename *filename;
struct open_how how;
unsigned long nofile;
struct statx __user *buffer;
};
+struct io_shutdown { /* NOTE(review): presumably per-request data for a socket shutdown operation -- confirm against the opcode handlers */
+ struct file *file;
+ int how; /* presumably the shutdown(2) "how" argument -- TODO confirm */
+};
+
+struct io_rename { /* NOTE(review): fields parallel the do_renameat2() parameters declared elsewhere in this patch; presumably forwarded to it -- confirm */
+ struct file *file;
+ int old_dfd; /* cf. do_renameat2() olddfd */
+ int new_dfd; /* cf. do_renameat2() newdfd */
+ struct filename *oldpath; /* cf. do_renameat2() oldname */
+ struct filename *newpath; /* cf. do_renameat2() newname */
+ int flags; /* signed here, while do_renameat2() takes unsigned int flags */
+};
+
+struct io_unlink { /* NOTE(review): presumably per-request data for an unlink operation -- confirm against the opcode handlers */
+ struct file *file;
+ int dfd; /* presumably the directory fd the path is resolved against, as in do_unlinkat(dfd, name) -- TODO confirm */
+ int flags;
+ struct filename *filename;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* head of a link */
- REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_poll_remove poll_remove;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ struct io_shutdown shutdown;
+ struct io_rename rename;
+ struct io_unlink unlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
struct task_struct *task;
u64 user_data;
- struct list_head link_list;
+ struct io_kiocb *link;
+ struct percpu_ref *fixed_file_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
-
- struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
+ bool plug_started;
+
/*
* Batch completion logic
*/
*/
struct file *file;
unsigned int fd;
- unsigned int has_refs;
+ unsigned int file_refs;
unsigned int ios_left;
};
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
- [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_TIMEOUT_REMOVE] = {
+ /* used by timeout updates' prep() */
+ .work_flags = IO_WQ_WORK_MM,
+ },
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG,
+ IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_SHUTDOWN] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_RENAMEAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
+ [IORING_OP_UNLINKAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
}
EXPORT_SYMBOL(io_uring_get_socket);
+#define io_for_each_link(pos, head) \
+ for (pos = (head); pos; pos = pos->link) /* visits head first, then follows ->link until it is NULL */
+
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
__io_clean_op(req);
}
-static void io_sq_thread_drop_mm(void)
+/*
+ * Attach the ring's current fixed-file node to @req. A request that already
+ * has ->fixed_file_refs set is left alone; otherwise the node's percpu ref
+ * is recorded in the request and one reference is taken on it.
+ */
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+ if (req->fixed_file_refs)
+ return;
+
+ req->fixed_file_refs = &req->ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_file_refs);
+}
+
+/*
+ * Check the chain starting at @head against a task/files filter.
+ *
+ * A non-NULL @task must be head->task. When @files is NULL that is the whole
+ * test. Otherwise the chain matches only if at least one request in it has
+ * initialized work flagged IO_WQ_WORK_FILES whose identity uses @files.
+ */
+static bool io_match_task(struct io_kiocb *head,
+ struct task_struct *task,
+ struct files_struct *files)
+{
+ struct io_kiocb *cur;
+
+ if (task && head->task != task)
+ return false;
+ if (!files)
+ return true;
+
+ /* walk head and every request linked behind it */
+ for (cur = head; cur; cur = cur->link) {
+ if (!(cur->flags & REQ_F_WORK_INITIALIZED))
+ continue;
+ if (!(cur->work.flags & IO_WQ_WORK_FILES))
+ continue;
+ if (cur->work.identity->files == files)
+ return true;
+ }
+ return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
{
+ struct files_struct *files = current->files;
struct mm_struct *mm = current->mm;
if (mm) {
mmput(mm);
current->mm = NULL;
}
+ if (files) {
+ struct nsproxy *nsproxy = current->nsproxy;
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
+ put_files_struct(files);
+ put_nsproxy(nsproxy);
+ }
+}
+
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
+{ /*
+ * Give the current task the files table and nsproxy of ctx->sqo_task when
+ * current has no files table yet. Returns 0 on success (or when nothing
+ * had to be done) and -EOWNERDEAD when ctx->sqo_task has no files table.
+ */
+ if (!current->files) {
+ struct files_struct *files;
+ struct nsproxy *nsproxy;
+
+ task_lock(ctx->sqo_task); /* ->files and ->nsproxy of the other task are read under its task lock */
+ files = ctx->sqo_task->files;
+ if (!files) {
+ task_unlock(ctx->sqo_task);
+ return -EOWNERDEAD;
+ }
+ atomic_inc(&files->count); /* reference is taken before the lock is dropped */
+ get_nsproxy(ctx->sqo_task->nsproxy);
+ nsproxy = ctx->sqo_task->nsproxy;
+ task_unlock(ctx->sqo_task);
+
+ task_lock(current); /* both pointers are installed on current under its own task lock */
+ current->files = files;
+ current->nsproxy = nsproxy;
+ task_unlock(current);
+ }
+ return 0;
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
return -EFAULT;
}
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
- return 0;
- return __io_sq_thread_acquire_mm(ctx);
+ const struct io_op_def *def = &io_op_defs[req->opcode]; /* per-opcode requirements decide what gets acquired */
+ int ret;
+
+ if (def->work_flags & IO_WQ_WORK_MM) { /* opcode declares it needs the mm */
+ ret = __io_sq_thread_acquire_mm(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { /* opcode takes a file or needs the files table */
+ ret = __io_sq_thread_acquire_files(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0; /* nothing needed, or everything needed was acquired */
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
*/
io_init_identity(id);
if (creds)
- req->work.identity->creds = creds;
+ id->creds = creds;
/* add one for this request */
refcount_inc(&id->count);
return false;
req->work.flags |= IO_WQ_WORK_FSIZE;
}
-
- if (!(req->work.flags & IO_WQ_WORK_FILES) &&
- (def->work_flags & IO_WQ_WORK_FILES) &&
- !(req->flags & REQ_F_NO_FILE_TABLE)) {
- if (id->files != current->files ||
- id->nsproxy != current->nsproxy)
- return false;
- atomic_inc(&id->files->count);
- get_nsproxy(id->nsproxy);
- req->flags |= REQ_F_INFLIGHT;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_add(&req->inflight_entry, &ctx->inflight_list);
- spin_unlock_irq(&ctx->inflight_lock);
- req->work.flags |= IO_WQ_WORK_FILES;
- }
#ifdef CONFIG_BLK_CGROUP
if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
(def->work_flags & IO_WQ_WORK_BLKCG)) {
}
spin_unlock(&current->fs->lock);
}
+ if (!(req->work.flags & IO_WQ_WORK_FILES) &&
+ (def->work_flags & IO_WQ_WORK_FILES) &&
+ !(req->flags & REQ_F_NO_FILE_TABLE)) {
+ if (id->files != current->files ||
+ id->nsproxy != current->nsproxy)
+ return false;
+ atomic_inc(&id->files->count);
+ get_nsproxy(id->nsproxy);
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+ req->work.flags |= IO_WQ_WORK_FILES;
+ }
return true;
}
{
struct io_kiocb *cur;
- io_prep_async_work(req);
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(cur, &req->link_list, link_list)
- io_prep_async_work(cur);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
}
}
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!tsk || req->task == tsk)
- return true;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (ctx->sq_data && req->task == ctx->sq_data->thread)
- return true;
- }
- return false;
-}
-
/*
* Returns true if we found and killed one or more timeouts
*/
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_task_match(req, tsk)) {
+ if (io_match_task(req, tsk, files)) {
io_kill_timeout(req);
canceled++;
}
}
}
-static inline bool __io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES)) &&
- req->work.identity->files == files;
-}
-
-static bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (!files)
- return true;
- if (__io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (__io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- if (!io_match_files(req, files))
+ if (!io_match_task(req, tsk, files))
continue;
cqe = io_get_cqring(ctx);
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
- if (fixed)
- percpu_ref_put(req->fixed_file_refs);
- else
+ if (!fixed) /* only a non-fixed file is put here; a fixed file is left untouched by this helper */
fput(file);
}
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+ if (req->fixed_file_refs)
+ percpu_ref_put(req->fixed_file_refs);
io_req_clean_work(req);
}
percpu_ref_put(&ctx->refs);
}
+/*
+ * Detach the request that directly follows @req in its link chain: @req
+ * takes over that request's successor and the detached request's ->link is
+ * cleared. req->link is dereferenced unconditionally, so it must be non-NULL.
+ */
+static inline void io_remove_next_linked(struct io_kiocb *req)
+{
+ struct io_kiocb *removed = req->link;
+
+ req->link = removed->link;
+ removed->link = NULL;
+}
+
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
+ link = req->link;
+
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
struct io_timeout_data *io = link->async_data;
int ret;
- list_del_init(&link->link_list);
+ io_remove_next_linked(req);
+ link->timeout.head = NULL;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(link, -ECANCELED);
}
}
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
-
- /*
- * The list should never be empty when we are called here. But could
- * potentially happen if the chain is messed up, check to be on the
- * safe side.
- */
- if (unlikely(list_empty(&req->link_list)))
- return NULL;
-
- nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- return nxt;
-}
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_kiocb *link, *nxt;
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *link = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
+ link = req->link;
+ req->link = NULL;
- list_del_init(&link->link_list);
- trace_io_uring_fail_link(req, link);
+ while (link) {
+ nxt = link->link;
+ link->link = NULL;
+ trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
/*
io_put_req_deferred(link, 2);
else
io_double_put_req(link);
+ link = nxt;
}
-
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- req->flags &= ~REQ_F_LINK_HEAD;
if (req->flags & REQ_F_LINK_TIMEOUT)
io_kill_linked_timeout(req);
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK)))
- return io_req_link_next(req);
+ if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+ struct io_kiocb *nxt = req->link;
+
+ req->link = NULL;
+ return nxt;
+ }
io_fail_links(req);
return NULL;
}
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) /* fast path: no linked request and no linked-timeout flag, so the slow helper is skipped */
return NULL;
return __io_req_find_next(req);
}
-static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
+static int io_req_task_work_add(struct io_kiocb *req)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
* will do the job.
*/
notify = TWA_NONE;
- if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
notify = TWA_SIGNAL;
ret = task_work_add(tsk, &req->task_work, notify);
{
struct io_ring_ctx *ctx = req->ctx;
- if (!__io_sq_thread_acquire_mm(ctx)) {
+ if (!__io_sq_thread_acquire_mm(ctx) &&
+ !__io_sq_thread_acquire_files(ctx)) {
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
init_task_work(&req->task_work, io_req_task_submit);
percpu_ref_get(&req->ctx->refs);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
}
}
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
io_free_req(req);
return;
}
- if (req->flags & REQ_F_LINK_HEAD)
- io_queue_next(req);
+ io_queue_next(req);
if (req->task != rb->task) {
if (rb->task) {
int ret;
init_task_work(&req->task_work, io_put_req_deferred_cb);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely to it from here.
*/
- if (noflush && !list_empty(&ctx->cq_overflow_list))
+ if (noflush)
return -1U;
io_cqring_overflow_flush(ctx, false, NULL, NULL);
}
end_req:
req_set_fail_links(req);
- io_req_complete(req, ret);
return false;
}
#endif
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- ret = io_sq_thread_acquire_mm(req->ctx, req);
+ ret = io_sq_thread_acquire_mm_files(req->ctx, req);
if (io_resubmit_prep(req, ret)) {
refcount_inc(&req->refs);
* find it from a io_iopoll_getevents() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
struct io_ring_ctx *ctx = req->ctx;
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
+ * task context or in io worker task context. If current task context is
+ * sq thread, we don't need to check whether should wake up sq thread.
+ */
+ if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
}
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
{
- if (state->has_refs)
- fput_many(state->file, state->has_refs);
- state->file = NULL;
+ fput_many(state->file, state->file_refs); /* hand back all state->file_refs references in one call */
+ state->file_refs = 0; /* a zero count is what io_state_file_put() tests before calling in here */
}
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file)
+ if (state->file_refs) /* only call the helper while cached references remain */
__io_state_file_put(state);
}
if (!state)
return fget(fd);
- if (state->file) {
+ if (state->file_refs) {
if (state->fd == fd) {
- state->has_refs--;
+ state->file_refs--;
return state->file;
}
__io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
- if (!state->file)
+ if (unlikely(!state->file))
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left - 1;
+ state->file_refs = state->ios_left - 1;
return state->file;
}
static bool io_bdev_nowait(struct block_device *bdev)
{
- #ifdef CONFIG_BLOCK
return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); /* true for a NULL bdev, otherwise whatever blk_queue_nowait() reports for its queue */
- #else
- return true;
- #endif
}
/*
umode_t mode = file_inode(file)->i_mode;
if (S_ISBLK(mode)) {
- if (io_bdev_nowait(file->f_inode->i_bdev))
+ if (IS_ENABLED(CONFIG_BLOCK) &&
+ io_bdev_nowait(I_BDEV(file->f_mapping->host)))
return true;
return false;
}
if (S_ISCHR(mode) || S_ISSOCK(mode))
return true;
if (S_ISREG(mode)) {
- if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+ if (IS_ENABLED(CONFIG_BLOCK) &&
+ io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
file->f_op != &io_uring_fops)
return true;
return false;
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret < 0 ? ret : sqe_len;
+ return ret;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
req->ctx->compat);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
-{
- struct io_async_rw *iorw = req->async_data;
-
- if (!iorw)
- return __io_import_iovec(rw, req, iovec, iter, needs_lock);
- *iovec = NULL;
- return iov_iter_count(&iorw->iter);
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
- if (iter->type == ITER_BVEC)
+ if (iov_iter_is_bvec(iter))
return;
if (!iovec) {
unsigned iov_off = 0;
struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- size_t iov_count;
bool no_async;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
if (no_async)
goto copy_iov;
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
goto copy_iov;
} else if (ret < 0) {
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- size_t iov_count;
ssize_t ret, ret2, io_size;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
/* Ensure we clear previously set non-block flag */
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
return ret;
}
-static int __io_splice_prep(struct io_kiocb *req,
+static int io_renameat_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- struct io_splice* sp = &req->splice;
- unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ struct io_rename *ren = &req->rename; /* per-request rename state filled in below */
+ const char __user *oldf, *newf;
- sp->file_in = NULL;
- sp->len = READ_ONCE(sqe->len);
- sp->flags = READ_ONCE(sqe->splice_flags);
+ if (unlikely(req->flags & REQ_F_FIXED_FILE)) /* requests flagged as fixed-file are rejected */
+ return -EBADF;
- if (unlikely(sp->flags & ~valid_flags))
- return -EINVAL;
+ ren->old_dfd = READ_ONCE(sqe->fd); /* SQE layout for rename: fd = old dirfd, addr = old path, addr2 = new path, len = new dirfd */
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ren->new_dfd = READ_ONCE(sqe->len);
+ ren->flags = READ_ONCE(sqe->rename_flags);
- sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (!sp->file_in)
- return -EBADF;
- req->flags |= REQ_F_NEED_CLEANUP;
+ ren->oldpath = getname(oldf);
+ if (IS_ERR(ren->oldpath))
+ return PTR_ERR(ren->oldpath);
- if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
- /*
- * Splice operation will be punted aync, and here need to
- * modify io_wq_work.flags, so initialize io_wq_work firstly.
- */
- io_req_init_async(req);
- req->work.flags |= IO_WQ_WORK_UNBOUND;
+ ren->newpath = getname(newf);
+ if (IS_ERR(ren->newpath)) {
+ putname(ren->oldpath); /* undo the first getname() so the old name is not leaked */
+ return PTR_ERR(ren->newpath);
}
+ req->flags |= REQ_F_NEED_CLEANUP; /* set only once both names are held */
return 0;
}
-static int io_tee_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
- return -EINVAL;
- return __io_splice_prep(req, sqe);
-}
-
-static int io_tee(struct io_kiocb *req, bool force_nonblock)
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
{
- struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
- struct file *out = sp->file_out;
- unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
- long ret = 0;
+ struct io_rename *ren = &req->rename; /* state prepared by io_renameat_prep() */
+ int ret;
if (force_nonblock)
return -EAGAIN;
- if (sp->len)
- ret = do_tee(in, out, sp->len, flags);
- io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+ ren->newpath, ren->flags);
- if (ret != sp->len)
+ req->flags &= ~REQ_F_NEED_CLEANUP; /* NOTE(review): presumably do_renameat2() consumes both names, so no cleanup is left to do - confirm */
+ if (ret < 0) /* a negative result marks the request's links as failed */
req_set_fail_links(req);
io_req_complete(req, ret);
return 0;
}
-static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_unlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
- struct io_splice* sp = &req->splice;
+ struct io_unlink *un = &req->unlink; /* per-request unlink state filled in below */
+ const char __user *fname;
- sp->off_in = READ_ONCE(sqe->splice_off_in);
- sp->off_out = READ_ONCE(sqe->off);
- return __io_splice_prep(req, sqe);
+ if (unlikely(req->flags & REQ_F_FIXED_FILE)) /* requests flagged as fixed-file are rejected */
+ return -EBADF;
+
+ un->dfd = READ_ONCE(sqe->fd);
+
+ un->flags = READ_ONCE(sqe->unlink_flags);
+ if (un->flags & ~AT_REMOVEDIR) /* AT_REMOVEDIR is the only flag bit accepted */
+ return -EINVAL;
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ un->filename = getname(fname);
+ if (IS_ERR(un->filename))
+ return PTR_ERR(un->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP; /* set only once the name is held */
+ return 0;
}
-static int io_splice(struct io_kiocb *req, bool force_nonblock)
+/*
+ * Issue an unlink request prepared by io_unlinkat_prep().
+ *
+ * Returns -EAGAIN when @force_nonblock is set, without touching the request.
+ * Otherwise removes a directory (AT_REMOVEDIR) or a file, clears
+ * REQ_F_NEED_CLEANUP, fails the links on a negative result, completes the
+ * request with that result and returns 0.
+ */
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_unlink *ul = &req->unlink;
+ int res;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ res = (ul->flags & AT_REMOVEDIR) ? do_rmdir(ul->dfd, ul->filename) :
+ do_unlinkat(ul->dfd, ul->filename);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (res < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, res);
+ return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{ /*
+ * Validate a shutdown SQE and record its "how" argument. Without
+ * CONFIG_NET the operation is rejected with -EOPNOTSUPP.
+ */
+#if defined(CONFIG_NET)
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) /* not accepted on IOPOLL rings */
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index) /* these SQE fields must all be zero for this opcode */
+ return -EINVAL;
+
+ req->shutdown.how = READ_ONCE(sqe->len); /* sqe->len carries the value later passed to __sys_shutdown_sock() */
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+/*
+ * Issue a shutdown request prepared by io_shutdown_prep().
+ *
+ * Returns -EAGAIN when @force_nonblock is set and -ENOTSOCK when req->file
+ * is not a socket. Otherwise calls __sys_shutdown_sock() with the recorded
+ * "how", completes the request with its result and returns 0. Without
+ * CONFIG_NET it returns -EOPNOTSUPP.
+ */
+static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_shutdown_sock(sock, req->shutdown.how);
+ /*
+ * A failed shutdown must fail the requests linked behind it, the same
+ * way io_renameat() and io_unlinkat() treat a negative result.
+ */
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int __io_splice_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{ /*
+ * Common prep for splice and tee: validates the flags, records length and
+ * flags, and resolves the input file. Returns 0, -EINVAL or -EBADF.
+ */
+ struct io_splice* sp = &req->splice;
+ unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) /* not accepted on IOPOLL rings */
+ return -EINVAL;
+
+ sp->file_in = NULL;
+ sp->len = READ_ONCE(sqe->len);
+ sp->flags = READ_ONCE(sqe->splice_flags);
+
+ if (unlikely(sp->flags & ~valid_flags))
+ return -EINVAL;
+
+ sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
+ (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (!sp->file_in)
+ return -EBADF;
+ req->flags |= REQ_F_NEED_CLEANUP; /* set once the input file is held */
+
+ if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
+ /*
+ * Splice operation will be punted async, and here need to
+ * modify io_wq_work.flags, so initialize io_wq_work firstly.
+ */
+ io_req_init_async(req);
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
+ }
+
+ return 0;
+}
+
+static int io_tee_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) /* tee accepts no offsets: both fields must be zero */
+ return -EINVAL;
+ return __io_splice_prep(req, sqe); /* the rest of the prep is shared with splice */
+}
+
+static int io_tee(struct io_kiocb *req, bool force_nonblock)
+{ /*
+ * Issue a tee request. Returns -EAGAIN when @force_nonblock is set;
+ * otherwise completes the request with do_tee()'s result (0 when the
+ * requested length is 0) and returns 0.
+ */
+ struct io_splice *sp = &req->splice;
+ struct file *in = sp->file_in;
+ struct file *out = sp->file_out;
+ unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; /* SPLICE_F_FD_IN_FIXED is masked out before the flags reach do_tee() */
+ long ret = 0;
+
+ if (force_nonblock)
+ return -EAGAIN;
+ if (sp->len)
+ ret = do_tee(in, out, sp->len, flags);
+
+ io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); /* release the input file obtained in __io_splice_prep() */
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ if (ret != sp->len) /* anything other than the full requested length fails the links */
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_splice* sp = &req->splice;
+
+ sp->off_in = READ_ONCE(sqe->splice_off_in); /* input offset comes from sqe->splice_off_in */
+ sp->off_out = READ_ONCE(sqe->off); /* output offset comes from sqe->off */
+ return __io_splice_prep(req, sqe); /* the rest of the prep is shared with tee */
+}
+
+static int io_splice(struct io_kiocb *req, bool force_nonblock)
{
struct io_splice *sp = &req->splice;
struct file *in = sp->file_in;
return ret;
}
req->open.nofile = rlimit(RLIMIT_NOFILE);
+ req->open.ignore_nonblock = false;
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
{
u64 flags, mode;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
size_t len;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
struct file *file;
int ret;
- if (force_nonblock)
+ if (force_nonblock && !req->open.ignore_nonblock)
return -EAGAIN;
ret = build_open_flags(&req->open.how, &op);
if (IS_ERR(file)) {
put_unused_fd(ret);
ret = PTR_ERR(file);
+ /*
+ * A work-around to ensure that /proc/self works that way
+ * that it should - if we get -EOPNOTSUPP back, then assume
+ * that proc_self_get_link() failed us because we're in async
+ * context. We should be safe to retry this from the task
+ * itself with force_nonblock == false set, as it should not
+ * block on lookup. Would be nice to know this upfront and
+ * avoid the async dance, but doesn't seem feasible.
+ */
+ if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
+ req->open.ignore_nonblock = true;
+ refcount_inc(&req->refs);
+ io_req_task_queue(req);
+ return 0;
+ }
} else {
fsnotify_open(file);
fd_install(ret, file);
head = idr_find(&ctx->io_buffer_idr, p->bgid);
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
- io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
}
}
out:
- io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
/* might be already done during nonblock submission */
if (!close->put_file) {
- ret = __close_fd_get_file(close->fd, &close->put_file);
+ ret = close_fd_get_file(close->fd, &close->put_file);
if (ret < 0)
return (ret == -ENOENT) ? -EBADF : ret;
}
unsigned flags;
int ret;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->async_data) {
kmsg = req->async_data;
unsigned flags;
int ret;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
return -EFAULT;
if (clen < 0)
return -EINVAL;
- sr->len = iomsg->iov[0].iov_len;
+ sr->len = clen;
+ iomsg->iov[0].iov_len = clen;
iomsg->iov = NULL;
} else {
ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
unsigned flags;
int ret, cflags = 0;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->async_data) {
kmsg = req->async_data;
unsigned flags;
int ret, cflags = 0;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
kbuf = io_recv_buffer_select(req, !force_nonblock);
{
struct io_accept *accept = &req->accept;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
struct io_connect *conn = &req->connect;
struct io_async_connect *io = req->async_data;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
- bool twa_signal_ok;
int ret;
/* for instances that support it check for an event match first: */
percpu_ref_get(&req->ctx->refs);
/*
- * If we using the signalfd wait_queue_head for this wakeup, then
- * it's not safe to use TWA_SIGNAL as we could be recursing on the
- * tsk->sighand->siglock on doing the wakeup. Should not be needed
- * either, as the normal wakeup will suffice.
- */
- twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
-
- /*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, twa_signal_ok);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct hlist_node *tmp;
struct io_kiocb *req;
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_task_match(req, tsk))
+ if (io_match_task(req, tsk, files))
posted += io_poll_remove_one(req);
}
}
sqe->poll_events)
return -EINVAL;
- req->poll.addr = READ_ONCE(sqe->addr);
+ req->poll_remove.addr = READ_ONCE(sqe->addr);
return 0;
}
static int io_poll_remove(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- u64 addr;
int ret;
- addr = req->poll.addr;
spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, addr);
+ ret = io_poll_cancel(ctx, req->poll_remove.addr);
spin_unlock_irq(&ctx->completion_lock);
if (ret < 0)
return HRTIMER_NORESTART;
}
-static int __io_timeout_cancel(struct io_kiocb *req)
+static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
+ __u64 user_data)
{
- struct io_timeout_data *io = req->async_data;
- int ret;
+ struct io_timeout_data *io;
+ struct io_kiocb *req;
+ int ret = -ENOENT;
+
+ list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
+ if (user_data == req->user_data) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret == -ENOENT)
+ return ERR_PTR(ret);
+
+ io = req->async_data;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret == -1)
- return -EALREADY;
+ return ERR_PTR(-EALREADY);
list_del_init(&req->timeout.list);
+ return req;
+}
+
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req_set_fail_links(req);
io_cqring_fill_event(req, -ECANCELED);
return 0;
}
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ struct timespec64 *ts, enum hrtimer_mode mode)
{
- struct io_kiocb *req;
- int ret = -ENOENT;
-
- list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (user_data == req->user_data) {
- ret = 0;
- break;
- }
- }
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_timeout_data *data;
- if (ret == -ENOENT)
- return ret;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- return __io_timeout_cancel(req);
+ req->timeout.off = 0; /* noseq */
+ data = req->async_data;
+ list_add_tail(&req->timeout.list, &ctx->timeout_list);
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
+ return 0;
}
static int io_timeout_remove_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
return -EINVAL;
- req->timeout_rem.addr = READ_ONCE(sqe->addr);
+ tr->addr = READ_ONCE(sqe->addr);
+ tr->flags = READ_ONCE(sqe->timeout_flags);
+ if (tr->flags & IORING_TIMEOUT_UPDATE) {
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ return -EINVAL;
+ if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+ return -EFAULT;
+ } else if (tr->flags) {
+ /* timeout removal doesn't support flags */
+ return -EINVAL;
+ }
+
return 0;
}
*/
static int io_timeout_remove(struct io_kiocb *req)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
struct io_ring_ctx *ctx = req->ctx;
int ret;
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
+ if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
+ enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
+ ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ } else {
+ ret = io_timeout_cancel(ctx, tr->addr);
+ }
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
return io_tee_prep(req, sqe);
+ case IORING_OP_SHUTDOWN:
+ return io_shutdown_prep(req, sqe);
+ case IORING_OP_RENAMEAT:
+ return io_renameat_prep(req, sqe);
+ case IORING_OP_UNLINKAT:
+ return io_unlinkat_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
{
struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 1;
+ u32 total_submitted, nr_reqs = 0;
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(pos, &req->link_list, link_list)
- nr_reqs++;
+ io_for_each_link(pos, req)
+ nr_reqs++;
total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
return total_submitted - nr_reqs;
static void io_req_drop_files(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
unsigned long flags;
spin_lock_irqsave(&ctx->inflight_lock, flags);
list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
+ if (atomic_read(&tctx->in_idle))
+ wake_up(&tctx->wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
req->flags &= ~REQ_F_INFLIGHT;
put_files_struct(req->work.identity->files);
if (req->open.filename)
putname(req->open.filename);
break;
+ case IORING_OP_RENAMEAT:
+ putname(req->rename.oldpath);
+ putname(req->rename.newpath);
+ break;
+ case IORING_OP_UNLINKAT:
+ putname(req->unlink.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
case IORING_OP_TEE:
ret = io_tee(req, force_nonblock);
break;
+ case IORING_OP_SHUTDOWN:
+ ret = io_shutdown(req, force_nonblock);
+ break;
+ case IORING_OP_RENAMEAT:
+ ret = io_renameat(req, force_nonblock);
+ break;
+ case IORING_OP_UNLINKAT:
+ ret = io_unlinkat(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
if (in_async)
mutex_lock(&ctx->uring_lock);
- io_iopoll_req_issued(req);
+ io_iopoll_req_issued(req, in_async);
if (in_async)
mutex_unlock(&ctx->uring_lock);
}
if (ret) {
- req_set_fail_links(req);
- io_req_complete(req, ret);
+ /*
+ * io_iopoll_complete() completes polled io without holding
+ * completion_lock, so for polled io just mark the request done here
+ * and leave the actual completion to io_iopoll_complete().
+ */
+ if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ kiocb_done(kiocb, ret, NULL);
+ } else {
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ }
}
return io_steal_work(req);
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (file) {
- req->fixed_file_refs = &ctx->file_data->node->refs;
- percpu_ref_get(req->fixed_file_refs);
- }
+ io_set_resource_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
return file;
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- int fd)
-{
- bool fixed;
-
- fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
- if (unlikely(!fixed && io_async_submit(req->ctx)))
- return -EBADF;
-
- req->file = io_file_get(state, req, fd, fixed);
- if (req->file || io_op_defs[req->opcode].needs_file_no_error)
- return 0;
- return -EBADF;
-}
-
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
struct io_timeout_data, timer);
- struct io_kiocb *req = data->req;
+ struct io_kiocb *prev, *req = data->req;
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *prev = NULL;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
+ prev = req->timeout.head;
+ req->timeout.head = NULL;
/*
* We don't expect the list to be empty, that will only happen if we
* race with the completion of the linked work.
*/
- if (!list_empty(&req->link_list)) {
- prev = list_entry(req->link_list.prev, struct io_kiocb,
- link_list);
- if (refcount_inc_not_zero(&prev->refs))
- list_del_init(&req->link_list);
- else
- prev = NULL;
- }
-
+ if (prev && refcount_inc_not_zero(&prev->refs))
+ io_remove_next_linked(prev);
+ else
+ prev = NULL;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
static void __io_queue_linked_timeout(struct io_kiocb *req)
{
/*
- * If the list is now empty, then our linked request finished before
- * we got a chance to setup the timer
+ * If the back reference is NULL, then our linked request finished
+ * before we got a chance to set up the timer.
*/
- if (!list_empty(&req->link_list)) {
+ if (req->timeout.head) {
struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
- struct io_kiocb *nxt;
+ struct io_kiocb *nxt = req->link;
- if (!(req->flags & REQ_F_LINK_HEAD))
- return NULL;
- if (req->flags & REQ_F_LINK_TIMEOUT)
- return NULL;
-
- nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
- if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
+ nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
+ nxt->timeout.head = req;
nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
io_queue_sqe(req, NULL, cs);
}
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **link, struct io_comp_state *cs)
+ struct io_submit_link *link, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
- if (*link) {
- struct io_kiocb *head = *link;
+ if (link->head) {
+ struct io_kiocb *head = link->head;
/*
* Taking sequential execution of a link, draining both sides
return ret;
}
trace_io_uring_link(ctx, req, head);
- list_add_tail(&req->link_list, &head->link_list);
+ link->last->link = req;
+ link->last = req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head, cs);
- *link = NULL;
+ link->head = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- req->flags |= REQ_F_LINK_HEAD;
- INIT_LIST_HEAD(&req->link_list);
-
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret))
req->flags |= REQ_F_FAIL_LINK;
- *link = req;
+ link->head = req;
+ link->last = req;
} else {
io_queue_sqe(req, sqe, cs);
}
{
if (!list_empty(&state->comp.list))
io_submit_flush_completions(&state->comp);
- blk_finish_plug(&state->plug);
+ if (state->plug_started)
+ blk_finish_plug(&state->plug);
io_state_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
- blk_start_plug(&state->plug);
+ state->plug_started = false;
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
state->free_reqs = 0;
- state->file = NULL;
+ state->file_refs = 0;
state->ios_left = max_ios;
}
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
+ req->link = NULL;
+ req->fixed_file_refs = NULL;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->task = current;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags;
- if (!io_op_defs[req->opcode].needs_file)
- return 0;
+ /*
+ * Plug now if we have more than 1 IO left after this, and the target
+ * is potentially a read/write to block based storage.
+ */
+ if (!state->plug_started && state->ios_left > 1 &&
+ io_op_defs[req->opcode].plug) {
+ blk_start_plug(&state->plug);
+ state->plug_started = true;
+ }
+
+ ret = 0;
+ if (io_op_defs[req->opcode].needs_file) {
+ bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+ req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ if (unlikely(!req->file &&
+ !io_op_defs[req->opcode].needs_file_no_error))
+ ret = -EBADF;
+ }
- ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
state->ios_left--;
return ret;
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state;
- struct io_kiocb *link = NULL;
+ struct io_submit_link link;
int i, submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
refcount_add(nr, ¤t->usage);
io_submit_state_start(&state, ctx, nr);
+ link.head = NULL;
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}
- if (link)
- io_queue_link_head(link, &state.comp);
+ if (link.head)
+ io_queue_link_head(link.head, &state.comp);
io_submit_state_end(&state);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
spin_unlock_irq(&ctx->completion_lock);
}
-static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
- int sync, void *key)
+static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
- struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
- int ret;
-
- ret = autoremove_wake_function(wqe, mode, sync, key);
- if (ret) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- }
- return ret;
-}
-
-enum sq_ret {
- SQT_IDLE = 1,
- SQT_SPIN = 2,
- SQT_DID_WORK = 4,
-};
-
-static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
- unsigned long start_jiffies, bool cap_entries)
-{
- unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
- struct io_sq_data *sqd = ctx->sq_data;
unsigned int to_submit;
int ret = 0;
-again:
- if (!list_empty(&ctx->iopoll_list)) {
+ to_submit = io_sqring_entries(ctx);
+ /* if we're handling multiple rings, cap submit size for fairness */
+ if (cap_entries && to_submit > 8)
+ to_submit = 8;
+
+ if (!list_empty(&ctx->iopoll_list) || to_submit) {
unsigned nr_events = 0;
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
+
+ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
- to_submit = io_sqring_entries(ctx);
-
- /*
- * If submit got -EBUSY, flag us as needing the application
- * to enter the kernel to reap and flush events.
- */
- if (!to_submit || ret == -EBUSY || need_resched()) {
- /*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
- */
- io_sq_thread_drop_mm();
-
- /*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep. The exception is if we got EBUSY doing
- * more IO, we should wait for the application to
- * reap events and wake us up.
- */
- if (!list_empty(&ctx->iopoll_list) || need_resched() ||
- (!time_after(jiffies, timeout) && ret != -EBUSY &&
- !percpu_ref_is_dying(&ctx->refs)))
- return SQT_SPIN;
+ if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
- prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
- TASK_INTERRUPTIBLE);
+ return ret;
+}
- /*
- * While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to iopoll_list,
- * it is because reqs may have been punted to io worker
- * and will be added to iopoll_list later, hence check
- * the iopoll_list again.
- */
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- goto again;
- }
+static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+{
+ struct io_ring_ctx *ctx;
+ unsigned sq_thread_idle = 0;
- to_submit = io_sqring_entries(ctx);
- if (!to_submit || ret == -EBUSY)
- return SQT_IDLE;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (sq_thread_idle < ctx->sq_thread_idle)
+ sq_thread_idle = ctx->sq_thread_idle;
}
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- io_ring_clear_wakeup_flag(ctx);
-
- /* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > 8)
- to_submit = 8;
-
- mutex_lock(&ctx->uring_lock);
- if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-
- if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
-
- return SQT_DID_WORK;
+ sqd->sq_thread_idle = sq_thread_idle;
}
static void io_sqd_init_new(struct io_sq_data *sqd)
while (!list_empty(&sqd->ctx_new_list)) {
ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
- init_wait(&ctx->sqo_wait_entry);
- ctx->sqo_wait_entry.func = io_sq_wake_function;
list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
complete(&ctx->sq_thread_comp);
}
+
+ io_sqd_update_thread_idle(sqd);
}
static int io_sq_thread(void *data)
{
struct cgroup_subsys_state *cur_css = NULL;
+ struct files_struct *old_files = current->files;
+ struct nsproxy *old_nsproxy = current->nsproxy;
const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- unsigned long start_jiffies;
+ unsigned long timeout = 0;
+ DEFINE_WAIT(wait);
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
- start_jiffies = jiffies;
while (!kthread_should_stop()) {
- enum sq_ret ret = 0;
- bool cap_entries;
+ int ret;
+ bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
* kthread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park())
+ if (kthread_should_park()) {
kthread_parkme();
+ /*
+ * The park operation we were just woken from may have come from
+ * io_put_sq_data(), which means the sq thread is about to be
+ * stopped, so check for that here after being unparked.
+ */
+ if (kthread_should_stop())
+ break;
+ }
- if (unlikely(!list_empty(&sqd->ctx_new_list)))
+ if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
+ timeout = jiffies + sqd->sq_thread_idle;
+ }
+ sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
-
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
if (current->cred != ctx->creds) {
if (old_cred)
current->sessionid = ctx->sessionid;
#endif
- ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
+ ret = __io_sq_thread(ctx, cap_entries);
+ if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ sqt_spin = true;
- io_sq_thread_drop_mm();
+ io_sq_thread_drop_mm_files();
}
- if (ret & SQT_SPIN) {
+ if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
cond_resched();
- } else if (ret == SQT_IDLE) {
- if (kthread_should_park())
- continue;
+ if (sqt_spin)
+ timeout = jiffies + sqd->sq_thread_idle;
+ continue;
+ }
+
+ if (kthread_should_park())
+ continue;
+
+ needs_sched = true;
+ prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ needs_sched = false;
+ break;
+ }
+ if (io_sqring_entries(ctx)) {
+ needs_sched = false;
+ break;
+ }
+ }
+
+ if (needs_sched) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
+
schedule();
- start_jiffies = jiffies;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
+
+ finish_wait(&sqd->wait, &wait);
+ timeout = jiffies + sqd->sq_thread_idle;
}
io_run_task_work();
if (old_cred)
revert_creds(old_cred);
+ task_lock(current);
+ current->files = old_files;
+ current->nsproxy = old_nsproxy;
+ task_unlock(current);
+
kthread_parkme();
return 0;
return 1;
if (!signal_pending(current))
return 0;
- if (current->jobctl & JOBCTL_TASK_WORK) {
- spin_lock_irq(¤t->sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
- recalc_sigpending();
- spin_unlock_irq(¤t->sighand->siglock);
- return 1;
- }
+ if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
+ return -ERESTARTSYS;
return -EINTR;
}
* application must reap them itself, as they reside on the shared cq ring.
*/
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz)
+ const sigset_t __user *sig, size_t sigsz,
+ struct __kernel_timespec __user *uts)
{
struct io_wait_queue iowq = {
.wq = {
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
+ struct timespec64 ts;
+ signed long timeout = 0;
int ret = 0;
do {
return ret;
}
+ if (uts) {
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
break;
if (io_should_wake(&iowq, false))
break;
- schedule();
+ if (uts) {
+ timeout = schedule_timeout(timeout);
+ if (timeout == 0) {
+ ret = -ETIME;
+ break;
+ }
+ } else {
+ schedule();
+ }
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
if (!data)
return -ENXIO;
- spin_lock(&data->lock);
- if (!list_empty(&data->ref_list))
- ref_node = list_first_entry(&data->ref_list,
- struct fixed_file_ref_node, node);
- spin_unlock(&data->lock);
+ spin_lock_bh(&data->lock);
+ ref_node = data->node;
+ spin_unlock_bh(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
mutex_lock(&sqd->ctx_lock);
list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
mutex_unlock(&sqd->ctx_lock);
- if (sqd->thread) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ if (sqd->thread)
io_sq_thread_unpark(sqd);
- }
io_put_sq_data(sqd);
ctx->sq_data = NULL;
kfree(pfile);
}
- spin_lock(&file_data->lock);
- list_del(&ref_node->node);
- spin_unlock(&file_data->lock);
-
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
percpu_ref_put(&file_data->refs);
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *data;
struct io_ring_ctx *ctx;
- bool first_add;
+ bool first_add = false;
int delay = HZ;
ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- ctx = ref_node->file_data->ctx;
+ data = ref_node->file_data;
+ ctx = data->ctx;
+
+ spin_lock_bh(&data->lock);
+ ref_node->done = true;
+
+ while (!list_empty(&data->ref_list)) {
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ /* recycle ref nodes in order */
+ if (!ref_node->done)
+ break;
+ list_del(&ref_node->node);
+ first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
+ }
+ spin_unlock_bh(&data->lock);
- if (percpu_ref_is_dying(&ctx->file_data->refs))
+ if (percpu_ref_is_dying(&data->refs))
delay = 0;
- first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
if (!delay)
mod_delayed_work(system_wq, &ctx->file_put_work, 0);
else if (first_add)
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->file_list);
ref_node->file_data = ctx->file_data;
+ ref_node->done = false;
return ref_node;
}
}
file_data->node = ref_node;
- spin_lock(&file_data->lock);
- list_add(&ref_node->node, &file_data->ref_list);
- spin_unlock(&file_data->lock);
+ spin_lock_bh(&file_data->lock);
+ list_add_tail(&ref_node->node, &file_data->ref_list);
+ spin_unlock_bh(&file_data->lock);
percpu_ref_get(&file_data->refs);
return ret;
out_fput:
if (needs_switch) {
percpu_ref_kill(&data->node->refs);
- spin_lock(&data->lock);
- list_add(&ref_node->node, &data->ref_list);
+ spin_lock_bh(&data->lock);
+ list_add_tail(&ref_node->node, &data->ref_list);
data->node = ref_node;
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
destroy_fixed_file_ref_node(ref_node);
struct io_sq_data *sqd;
ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
goto err;
sqd = io_get_sq_data(p);
* as nobody else will be looking for them.
*/
do {
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
{
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL);
- io_poll_remove_all(ctx, NULL);
+ io_kill_timeouts(ctx, NULL, NULL);
+ io_poll_remove_all(ctx, NULL, NULL);
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
/* if we failed setting up the ctx, we might not have any rings */
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
return 0;
}
-static bool io_wq_files_match(struct io_wq_work *work, void *data)
-{
- struct files_struct *files = data;
-
- return !files || ((work->flags & IO_WQ_WORK_FILES) &&
- work->identity->files == files);
-}
-
-/*
- * Returns true if 'preq' is the link parent of 'req'
- */
-static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
- if (!(preq->flags & REQ_F_LINK_HEAD))
- return false;
-
- list_for_each_entry(link, &preq->link_list, link_list) {
- if (link == req)
- return true;
- }
-
- return false;
-}
-
-/*
- * We're looking to cancel 'req' because it's holding on to our files, but
- * 'req' could be a link to another request. See if it is, and cancel that
- * parent request if so.
- */
-static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- struct hlist_node *tmp;
- struct io_kiocb *preq;
- bool found = false;
- int i;
-
- spin_lock_irq(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
- found = io_match_link(preq, req);
- if (found) {
- io_poll_remove_one(preq);
- break;
- }
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
-
-static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_kiocb *preq;
- bool found = false;
-
- spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
- found = io_match_link(preq, req);
- if (found) {
- __io_timeout_cancel(preq);
- break;
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
+struct io_task_cancel {
+ struct task_struct *task;
+ struct files_struct *files;
+};
-static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_task_cancel *cancel = data;
bool ret;
- if (req->flags & REQ_F_LINK_TIMEOUT) {
+ if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
unsigned long flags;
struct io_ring_ctx *ctx = req->ctx;
/* protect against races with linked timeouts */
spin_lock_irqsave(&ctx->completion_lock, flags);
- ret = io_match_link(req, data);
+ ret = io_match_task(req, cancel->task, cancel->files);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
} else {
- ret = io_match_link(req, data);
+ ret = io_match_task(req, cancel->task, cancel->files);
}
return ret;
}
-static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- enum io_wq_cancel cret;
-
- /* cancel this particular work, if it's running */
- cret = io_wq_cancel_work(ctx->io_wq, &req->work);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* find links that hold this pending, cancel those */
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* if we have a poll link holding this pending, cancel that */
- if (io_poll_remove_link(ctx, req))
- return;
-
- /* final option, timeout link is holding this req pending */
- io_timeout_remove_link(ctx, req);
-}
-
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files)
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_task_match(de->req, task) &&
- io_match_files(de->req, files)) {
+ if (io_match_task(de->req, task, files)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
}
-/*
- * Returns true if we found and killed one or more files pinning requests
- */
-static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
- if (list_empty_careful(&ctx->inflight_list))
- return false;
-
- /* cancel all at once, should be faster than doing it one by one*/
- io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
-
while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_kiocb *cancel_req = NULL, *req;
+ struct io_task_cancel cancel = { .task = task, .files = files };
+ struct io_kiocb *req;
DEFINE_WAIT(wait);
+ bool found = false;
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+ if (req->task != task ||
req->work.identity->files != files)
continue;
- /* req is being completed, ignore */
- if (!refcount_inc_not_zero(&req->refs))
- continue;
- cancel_req = req;
+ found = true;
break;
}
- if (cancel_req)
- prepare_to_wait(&ctx->inflight_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ if (found)
+ prepare_to_wait(&task->io_uring->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&ctx->inflight_lock);
/* We need to keep going until we don't find a matching req */
- if (!cancel_req)
+ if (!found)
break;
- /* cancel this request, or head link requests */
- io_attempt_cancel(ctx, cancel_req);
- io_put_req(cancel_req);
+
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
+ io_poll_remove_all(ctx, task, files);
+ io_kill_timeouts(ctx, task, files);
/* cancellations _may_ trigger task work */
io_run_task_work();
schedule();
- finish_wait(&ctx->inflight_wait, &wait);
+ finish_wait(&task->io_uring->wait, &wait);
}
-
- return true;
}
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task)
{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct task_struct *task = data;
-
- return io_task_match(req, task);
-}
-
-static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- bool ret;
-
- ret = io_uring_cancel_files(ctx, files);
- if (!files) {
+ while (1) {
+ struct io_task_cancel cancel = { .task = task, .files = NULL, };
enum io_wq_cancel cret;
+ bool ret = false;
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
if (cret != IO_WQ_CANCEL_NOTFOUND)
ret = true;
}
}
- ret |= io_poll_remove_all(ctx, task);
- ret |= io_kill_timeouts(ctx, task);
+ ret |= io_poll_remove_all(ctx, task, NULL);
+ ret |= io_kill_timeouts(ctx, task, NULL);
+ if (!ret)
+ break;
+ io_run_task_work();
+ cond_resched();
}
-
- return ret;
}
/*
io_sq_thread_park(ctx->sq_data);
}
- if (files)
- io_cancel_defer_files(ctx, NULL, files);
- else
- io_cancel_defer_files(ctx, task, NULL);
-
+ io_cancel_defer_files(ctx, task, files);
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
io_cqring_overflow_flush(ctx, true, task, files);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
- while (__io_uring_cancel_task_requests(ctx, task, files)) {
- io_run_task_work();
- cond_resched();
- }
+ if (!files)
+ __io_uring_cancel_task_requests(ctx, task);
+ else
+ io_uring_cancel_files(ctx, task, files);
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
atomic_dec(&task->io_uring->in_idle);
finish_wait(&ctx->sqo_sq_wait, &wait);
}
+static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+ struct __kernel_timespec __user **ts,
+ const sigset_t __user **sig)
+{
+ struct io_uring_getevents_arg arg;
+
+ /*
+ * Decode the trailing io_uring_enter() argument. On success *sig holds
+ * the user sigset pointer and *ts the user timespec pointer (or NULL).
+ * Returns 0, -EINVAL on a size mismatch, or -EFAULT when the extended
+ * argument cannot be copied in from userspace.
+ *
+ * If EXT_ARG isn't set, then we have no timespec and the argp pointer
+ * is just a pointer to the sigset_t. *argsz is left untouched here.
+ */
+ if (!(flags & IORING_ENTER_EXT_ARG)) {
+ *sig = (const sigset_t __user *) argp;
+ *ts = NULL;
+ return 0;
+ }
+
+ /*
+ * EXT_ARG is set - ensure we agree on the size of it and copy in our
+ * timespec and sigset_t pointers if good.
+ */
+ if (*argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+ *sig = u64_to_user_ptr(arg.sigmask);
+ *argsz = arg.sigmask_sz; /* *argsz now carries the sigset size, no longer the struct size */
+ *ts = u64_to_user_ptr(arg.ts);
+ return 0;
+}
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const sigset_t __user *, sig,
- size_t, sigsz)
+ u32, min_complete, u32, flags, const void __user *, argp,
+ size_t, argsz)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
return -EINVAL;
f = fdget(fd);
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
if (!list_empty_careful(&ctx->cq_overflow_list))
io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT)
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
+ const sigset_t __user *sig;
+ struct __kernel_timespec __user *ts;
+
+ ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ if (unlikely(ret))
+ goto out;
+
min_complete = min(min_complete, ctx->cq_entries);
/*
!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, min_complete);
} else {
- ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
}
}
{
struct file *file;
int ret;
+ int fd;
#if defined(CONFIG_UNIX)
ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (ret < 0)
goto err;
+ fd = ret;
file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC);
if (IS_ERR(file)) {
-err_fd:
- put_unused_fd(ret);
+ put_unused_fd(fd);
ret = PTR_ERR(file);
goto err;
}
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
#endif
- if (unlikely(io_uring_add_task_file(ctx, file))) {
- file = ERR_PTR(-ENOMEM);
- goto err_fd;
+ ret = io_uring_add_task_file(ctx, file);
+ if (ret) {
+ fput(file);
+ put_unused_fd(fd);
+ goto err;
}
- fd_install(ret, file);
- return ret;
+ fd_install(fd, file);
+ return fd;
err:
#if defined(CONFIG_UNIX)
sock_release(ctx->ring_sock);
* to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing.
*/
- p->cq_entries = roundup_pow_of_two(p->cq_entries);
- if (p->cq_entries < p->sq_entries)
+ if (!p->cq_entries)
return -EINVAL;
if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
p->cq_entries = IORING_MAX_CQ_ENTRIES;
}
+ p->cq_entries = roundup_pow_of_two(p->cq_entries);
+ if (p->cq_entries < p->sq_entries)
+ return -EINVAL;
} else {
p->cq_entries = 2 * p->sq_entries;
}
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS;
+ IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+ IORING_FEAT_EXT_ARG;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
static DEFINE_MUTEX(pstore_blk_lock);
static struct block_device *psblk_bdev;
static struct pstore_zone_info *pstore_zone_info;
-static pstore_blk_panic_write_op blkdev_panic_write;
struct bdev_info {
dev_t devt;
return bdev;
}
- nr_sects = part_nr_sects_read(bdev->bd_part);
+ nr_sects = bdev_nr_sectors(bdev);
if (!nr_sects) {
pr_err("not enough space for '%s'\n", blkdev);
blkdev_put(bdev, mode);
return ret;
}
-static ssize_t psblk_blk_panic_write(const char *buf, size_t size,
- loff_t off)
-{
- int ret;
-
- if (!blkdev_panic_write)
- return -EOPNOTSUPP;
-
- /* size and off must align to SECTOR_SIZE for block device */
- ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT,
- size >> SECTOR_SHIFT);
- /* try next zone */
- if (ret == -ENOMSG)
- return ret;
- return ret ? -EIO : size;
-}
-
-static int __register_pstore_blk(struct pstore_blk_info *info)
+/*
+ * This takes its configuration only from the module parameters now.
+ * See psblk_get_bdev() and blkdev.
+ */
+static int __register_pstore_blk(void)
{
char bdev_name[BDEVNAME_SIZE];
struct block_device *bdev;
}
/* only allow driver matching the @blkdev */
- if (!binfo.devt || (!best_effort &&
- MAJOR(binfo.devt) != info->major)) {
- pr_debug("invalid major %u (expect %u)\n",
- info->major, MAJOR(binfo.devt));
+ if (!binfo.devt) {
+ pr_debug("no major\n");
ret = -ENODEV;
goto err_put_bdev;
}
/* psblk_bdev must be assigned before register to pstore/blk */
psblk_bdev = bdev;
- blkdev_panic_write = info->panic_write;
-
- /* Copy back block device details. */
- info->devt = binfo.devt;
- info->nr_sects = binfo.nr_sects;
- info->start_sect = binfo.start_sect;
memset(&dev, 0, sizeof(dev));
- dev.total_size = info->nr_sects << SECTOR_SHIFT;
- dev.flags = info->flags;
+ dev.total_size = binfo.nr_sects << SECTOR_SHIFT;
dev.read = psblk_generic_blk_read;
dev.write = psblk_generic_blk_write;
- dev.erase = NULL;
- dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL;
ret = __register_pstore_device(&dev);
if (ret)
goto err_put_bdev;
bdevname(bdev, bdev_name);
- pr_info("attached %s%s\n", bdev_name,
- info->panic_write ? "" : " (no dedicated panic_write!)");
+ pr_info("attached %s (no dedicated panic_write!)\n", bdev_name);
return 0;
err_put_bdev:
psblk_bdev = NULL;
- blkdev_panic_write = NULL;
psblk_put_bdev(bdev, holder);
return ret;
}
-/**
- * register_pstore_blk() - register block device to pstore/blk
- *
- * @info: details on the desired block device interface
- *
- * Return:
- * * 0 - OK
- * * Others - something error.
- */
-int register_pstore_blk(struct pstore_blk_info *info)
-{
- int ret;
-
- mutex_lock(&pstore_blk_lock);
- ret = __register_pstore_blk(info);
- mutex_unlock(&pstore_blk_lock);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(register_pstore_blk);
-
static void __unregister_pstore_blk(unsigned int major)
{
struct pstore_device_info dev = { .read = psblk_generic_blk_read };
if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) {
__unregister_pstore_device(&dev);
psblk_put_bdev(psblk_bdev, holder);
- blkdev_panic_write = NULL;
psblk_bdev = NULL;
}
}
-/**
- * unregister_pstore_blk() - unregister block device from pstore/blk
- *
- * @major: the major device number of device
- */
-void unregister_pstore_blk(unsigned int major)
-{
- mutex_lock(&pstore_blk_lock);
- __unregister_pstore_blk(major);
- mutex_unlock(&pstore_blk_lock);
-}
-EXPORT_SYMBOL_GPL(unregister_pstore_blk);
-
/* get information of pstore/blk */
int pstore_blk_get_config(struct pstore_blk_config *info)
{
static int __init pstore_blk_init(void)
{
- struct pstore_blk_info info = { };
int ret = 0;
mutex_lock(&pstore_blk_lock);
if (!pstore_zone_info && best_effort && blkdev[0])
- ret = __register_pstore_blk(&info);
+ ret = __register_pstore_blk();
mutex_unlock(&pstore_blk_lock);
return ret;
};
struct gendisk *rq_disk;
- struct hd_struct *part;
+ struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
/* Time that the first bio started allocating this request. */
u64 alloc_time_ns;
* file system requests.
*/
static inline unsigned int blk_max_size_offset(struct request_queue *q,
- sector_t offset)
+ sector_t offset,
+ unsigned int chunk_sectors)
{
- unsigned int chunk_sectors = q->limits.chunk_sectors;
-
- if (!chunk_sectors)
- return q->limits.max_sectors;
+ if (!chunk_sectors) {
+ if (q->limits.chunk_sectors)
+ chunk_sectors = q->limits.chunk_sectors;
+ else
+ return q->limits.max_sectors;
+ }
if (likely(is_power_of_2(chunk_sectors)))
chunk_sectors -= offset & (chunk_sectors - 1);
req_op(rq) == REQ_OP_SECURE_ERASE)
return blk_queue_get_max_sectors(q, req_op(rq));
- return min(blk_max_size_offset(q, offset),
+ return min(blk_max_size_offset(q, offset, 0),
blk_queue_get_max_sectors(q, req_op(rq)));
}
return -1;
if (bdev_is_partition(bdev))
return queue_limit_alignment_offset(&q->limits,
- bdev->bd_part->start_sect);
+ bdev->bd_start_sect);
return q->limits.alignment_offset;
}
if (bdev_is_partition(bdev))
return queue_limit_discard_alignment(&q->limits,
- bdev->bd_part->start_sect);
+ bdev->bd_start_sect);
return q->limits.discard_alignment;
}
void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
int (*getgeo)(struct block_device *, struct hd_geometry *);
+ int (*set_read_only)(struct block_device *bdev, bool ro);
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
int (*report_zones)(struct gendisk *, sector_t sector,
#define blkdev_compat_ptr_ioctl NULL
#endif
- extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
- unsigned long);
extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time);
- unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
- struct bio *bio);
- void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long part_start_io_acct(struct gendisk *disk,
+ struct block_device **part, struct bio *bio);
+ void part_end_io_acct(struct block_device *part, struct bio *bio,
unsigned long start_time);
/**
int set_blocksize(struct block_device *bdev, int size);
const char *bdevname(struct block_device *bdev, char *buffer);
- struct block_device *lookup_bdev(const char *);
+ int lookup_bdev(const char *pathname, dev_t *dev);
void blkdev_show(struct seq_file *seqf, off_t offset);
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
void *holder);
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
- int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
- void *holder);
- void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
- void *holder);
+ int bd_prepare_to_claim(struct block_device *bdev, void *holder);
+ void bd_abort_claiming(struct block_device *bdev, void *holder);
void blkdev_put(struct block_device *bdev, fmode_t mode);
+ /* just for blk-cgroup, don't use elsewhere */
+ struct block_device *blkdev_get_no_open(dev_t dev);
+ void blkdev_put_no_open(struct block_device *bdev);
+
+ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
+ void bdev_add(struct block_device *bdev, dev_t dev);
struct block_device *I_BDEV(struct inode *inode);
- struct block_device *bdget_part(struct hd_struct *part);
struct block_device *bdgrab(struct block_device *bdev);
void bdput(struct block_device *);
#endif
int fsync_bdev(struct block_device *bdev);
- struct super_block *freeze_bdev(struct block_device *bdev);
- int thaw_bdev(struct block_device *bdev, struct super_block *sb);
+ int freeze_bdev(struct block_device *bdev);
+ int thaw_bdev(struct block_device *bdev);
#endif /* _LINUX_BLKDEV_H */
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
- struct block_device *i_bdev;
struct cdev *i_cdev;
char *i_link;
unsigned i_dir_seq;
const struct file_operations *f_op;
/*
- * Protects f_ep_links, f_flags.
+ * Protects f_ep, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
- struct list_head f_ep_links;
- struct list_head f_tfile_llink;
+ struct hlist_head *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
struct sb_writers {
int frozen; /* Is sb frozen? */
- wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */
+ wait_queue_head_t wait_unfrozen; /* wait for thaw */
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern struct super_block *get_super(struct block_device *);
- extern struct super_block *get_super_thawed(struct block_device *);
- extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev);
extern struct super_block *get_active_super(struct block_device *bdev);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
{
struct inode *inode;
- if (!vma->vm_file)
+ if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file)
return false;
if (!vma_is_dax(vma))
return false;
&relay_file_operations);
}
-static struct rchan_callbacks blk_relay_callbacks = {
+static const struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
static void blk_trace_setup_lba(struct blk_trace *bt,
struct block_device *bdev)
{
- struct hd_struct *part = NULL;
-
- if (bdev)
- part = bdev->bd_part;
-
- if (part) {
- bt->start_lba = part->start_sect;
- bt->end_lba = part->start_sect + part->nr_sects;
+ if (bdev) {
+ bt->start_lba = bdev->bd_start_sect;
+ bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
} else {
bt->start_lba = 0;
bt->end_lba = -1ULL;
#endif
static u64
- blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+ blk_trace_request_get_cgid(struct request *rq)
{
if (!rq->bio)
return 0;
/* Use the first bio */
- return blk_trace_bio_get_cgid(q, rq->bio);
+ return blk_trace_bio_get_cgid(rq->q, rq->bio);
}
/*
rcu_read_unlock();
}
- static void blk_add_trace_rq_insert(void *ignore,
- struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
- static void blk_add_trace_rq_issue(void *ignore,
- struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
- static void blk_add_trace_rq_merge(void *ignore,
- struct request_queue *q, struct request *rq)
+ static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
- static void blk_add_trace_rq_requeue(void *ignore,
- struct request_queue *q,
- struct request *rq)
+ static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
int error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
- blk_trace_request_get_cgid(rq->q, rq));
+ blk_trace_request_get_cgid(rq));
}
/**
rcu_read_unlock();
}
- static void blk_add_trace_bio_bounce(void *ignore,
- struct request_queue *q, struct bio *bio)
+ static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
blk_status_to_errno(bio->bi_status));
}
- static void blk_add_trace_bio_backmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+ static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0);
}
- static void blk_add_trace_bio_frontmerge(void *ignore,
- struct request_queue *q,
- struct request *rq,
- struct bio *bio)
+ static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0);
}
- static void blk_add_trace_bio_queue(void *ignore,
- struct request_queue *q, struct bio *bio)
+ static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0);
}
- static void blk_add_trace_getrq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
+ static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
- NULL, 0);
- rcu_read_unlock();
- }
- }
-
-
- static void blk_add_trace_sleeprq(void *ignore,
- struct request_queue *q,
- struct bio *bio, int rw)
- {
- if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
- else {
- struct blk_trace *bt;
-
- rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
- if (bt)
- __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
- 0, 0, NULL, 0);
- rcu_read_unlock();
- }
+ blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
rcu_read_unlock();
}
- static void blk_add_trace_split(void *ignore,
- struct request_queue *q, struct bio *bio,
- unsigned int pdu)
+ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
+ struct request_queue *q = bio->bi_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
/**
* blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @bio: the source bio
- * @dev: target device
+ * @dev: source device
* @from: source sector
*
- * Description:
- * Device mapper or raid target sometimes need to split a bio because
- * it spans a stripe (or similar). Add a trace for that action.
- *
+ * Called after a bio is remapped to a different device and/or sector.
**/
- static void blk_add_trace_bio_remap(void *ignore,
- struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
+ sector_t from)
{
+ struct request_queue *q = bio->bi_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
* @ignore: trace callback data parameter (not used)
- * @q: queue the io is for
* @rq: the source request
* @dev: target device
* @from: source sector
* Add a trace for that action.
*
**/
- static void blk_add_trace_rq_remap(void *ignore,
- struct request_queue *q,
- struct request *rq, dev_t dev,
+ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
sector_t from)
{
struct blk_trace *bt;
struct blk_io_trace_remap r;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
- sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
+ sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
/**
* blk_add_driver_data - Add binary message with driver-specific data
- * @q: queue the io is for
* @rq: io request
* @data: driver-specific data
* @len: length of driver-specific data
* Some drivers might want to write driver-specific data per request.
*
**/
- void blk_add_driver_data(struct request_queue *q,
- struct request *rq,
- void *data, size_t len)
+ void blk_add_driver_data(struct request *rq, void *data, size_t len)
{
struct blk_trace *bt;
rcu_read_lock();
- bt = rcu_dereference(q->blk_trace);
+ bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
- blk_trace_request_get_cgid(q, rq));
+ blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
- WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
return p - buf;
}
- static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
- {
- if (bdev->bd_disk == NULL)
- return NULL;
-
- return bdev_get_queue(bdev);
- }
-
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- struct block_device *bdev = bdget_part(dev_to_part(dev));
- struct request_queue *q;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
ssize_t ret = -ENXIO;
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
-
mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
- out_bdput:
- bdput(bdev);
- out:
return ret;
}
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct block_device *bdev;
- struct request_queue *q;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
goto out;
value = ret;
}
- } else if (kstrtoull(buf, 0, &value))
- goto out;
-
- ret = -ENXIO;
- bdev = bdget_part(dev_to_part(dev));
- if (bdev == NULL)
- goto out;
-
- q = blk_trace_get_queue(bdev);
- if (q == NULL)
- goto out_bdput;
+ } else {
+ if (kstrtoull(buf, 0, &value))
+ goto out;
+ }
mutex_lock(&q->debugfs_mutex);
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
- out_bdput:
- bdput(bdev);
out:
return ret ? ret : count;
}
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->pgdat->lru_lock (follow_page->mark_page_accessed)
- * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
+ * ->lruvec->lru_lock (follow_page->mark_page_accessed)
+ * ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
if (PageSwapBacked(page)) {
__mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
- __dec_node_page_state(page, NR_SHMEM_THPS);
+ __dec_lruvec_page_state(page, NR_SHMEM_THPS);
} else if (PageTransHuge(page)) {
- __dec_node_page_state(page, NR_FILE_THPS);
+ __dec_lruvec_page_state(page, NR_FILE_THPS);
filemap_nr_thps_dec(mapping);
}
else
ret = PageLocked(page);
/*
- * If we were succesful now, we know we're still on the
+ * If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
* safe to remove and return success, we know the callback
* isn't going to trigger.
rotate_reclaimable_page(page);
}
+ /*
+ * Writeback does not hold a page reference of its own, relying
+ * on truncation to wait for the clearing of PG_writeback.
+ * But here we must make sure that the page is not freed and
+ * reused before the wake_up_page().
+ */
+ get_page(page);
if (!test_clear_page_writeback(page))
BUG();
smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
+ put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
else
wait_on_page_locked(page);
return 0;
- } else {
- if (flags & FAULT_FLAG_KILLABLE) {
- int ret;
+ }
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
- ret = __lock_page_killable(page);
- if (ret) {
- mmap_read_unlock(mm);
- return 0;
- }
- } else
- __lock_page(page);
- return 1;
+ ret = __lock_page_killable(page);
+ if (ret) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
+ } else {
+ __lock_page(page);
}
+ return 1;
+
}
/**
ra->ra_pages /= 4;
}
+static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+{ /*
+ * Lock @page with the primitive chosen by iocb->ki_flags, checked in
+ * this order: IOCB_WAITQ -> lock_page_async() on iocb->ki_waitq,
+ * IOCB_NOWAIT -> trylock_page() (-EAGAIN if the trylock fails),
+ * otherwise lock_page_killable(). Callers treat a non-zero return
+ * as failure.
+ */
+ if (iocb->ki_flags & IOCB_WAITQ)
+ return lock_page_async(page, iocb->ki_waitq);
+ else if (iocb->ki_flags & IOCB_NOWAIT)
+ return trylock_page(page) ? 0 : -EAGAIN;
+ else
+ return lock_page_killable(page);
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct kiocb *iocb,
+ struct file *filp,
+ struct address_space *mapping,
+ struct page *page)
+{ /*
+ * Issue ->readpage() for @page and wait for it to become uptodate.
+ * The IOCB_NOIO/IOCB_NOWAIT bail-out below unlocks @page, so callers
+ * are expected to pass it locked.
+ *
+ * Returns @page (caller's reference kept) on success, NULL when
+ * ->readpage() returned AOP_TRUNCATED_PAGE or the page lost its
+ * mapping, or an ERR_PTR(). On the NULL and ERR_PTR() paths the page
+ * reference has already been dropped with put_page().
+ */
+ struct file_ra_state *ra = &filp->f_ra;
+ int error;
+
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, eg. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
+ /* Start the actual read. The read will unlock the page. */
+ error = mapping->a_ops->readpage(filp, page);
+
+ if (unlikely(error)) {
+ put_page(page);
+ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+ }
+
+ if (!PageUptodate(page)) {
+ error = lock_page_for_iocb(iocb, page);
+ if (unlikely(error)) {
+ put_page(page);
+ return ERR_PTR(error);
+ }
+ if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_mapping_pages got it
+ */
+ unlock_page(page);
+ put_page(page);
+ return NULL;
+ }
+ unlock_page(page);
+ shrink_readahead_size_eio(ra);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ unlock_page(page);
+ }
+
+ return page;
+}
+
+static struct page *
+generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
+ struct file *filp,
+ struct iov_iter *iter,
+ struct page *page,
+ loff_t pos, loff_t count)
+{ /*
+ * Handle a cached page that is not yet uptodate. Waits for the page
+ * lock to be released, then returns @page if it is uptodate or, where
+ * the mapping provides ->is_partially_uptodate(), uptodate for the
+ * range described by @pos/@count. Otherwise the page is locked and
+ * handed to generic_file_buffered_read_readpage().
+ *
+ * Returns the page on success, NULL if it lost its mapping, or an
+ * ERR_PTR(); on the NULL and ERR_PTR() paths the page reference has
+ * already been dropped.
+ */
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ int error;
+
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessary
+ * serialisation and why it's safe.
+ */
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ error = wait_on_page_locked_async(page,
+ iocb->ki_waitq);
+ } else {
+ error = wait_on_page_locked_killable(page);
+ }
+ if (unlikely(error)) {
+ put_page(page);
+ return ERR_PTR(error);
+ }
+ if (PageUptodate(page))
+ return page;
+
+ if (inode->i_blkbits == PAGE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ /* pipes can't handle partially uptodate pages */
+ if (unlikely(iov_iter_is_pipe(iter)))
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ pos & ~PAGE_MASK, count))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ return page;
+
+page_not_up_to_date:
+ /* Get exclusive access to the page ... */
+ error = lock_page_for_iocb(iocb, page);
+ if (unlikely(error)) {
+ put_page(page);
+ return ERR_PTR(error);
+ }
+
+page_not_up_to_date_locked:
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping) {
+ unlock_page(page);
+ put_page(page);
+ return NULL;
+ }
+
+ /* Did somebody else fill it already? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return page;
+ }
+
+ return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+}
+
+static struct page *
+generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
+ struct iov_iter *iter)
+{ /*
+ * Allocate a page for the index containing iocb->ki_pos, insert it
+ * into the page cache and start reading it.
+ *
+ * Returns the result of generic_file_buffered_read_readpage(), or
+ * ERR_PTR(-EAGAIN) for IOCB_NOIO, ERR_PTR(-ENOMEM) if the allocation
+ * fails, NULL if the insertion failed with -EEXIST, or an ERR_PTR()
+ * for any other insertion error. @iter is not used in this body.
+ */
+ struct file *filp = iocb->ki_filp;
+ struct address_space *mapping = filp->f_mapping;
+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+ struct page *page;
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOIO)
+ return ERR_PTR(-EAGAIN);
+
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ */
+ page = page_cache_alloc(mapping);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ error = add_to_page_cache_lru(page, mapping, index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (error) {
+ put_page(page);
+ return error != -EEXIST ? ERR_PTR(error) : NULL; /* -EEXIST is reported as NULL, not as an error */
+ }
+
+ return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+}
+
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+ struct iov_iter *iter,
+ struct page **pages,
+ unsigned int nr)
+{ /*
+ * Fill @pages with up to @nr contiguous page-cache pages starting at
+ * the page containing iocb->ki_pos. If nothing is cached it triggers
+ * synchronous readahead and, failing that, reads a single new page.
+ * Each page found is then checked; one that is not uptodate is waited
+ * for or read in, subject to the IOCB_NOIO/NOWAIT/WAITQ flags.
+ *
+ * Returns the number of usable pages left in @pages (> 0), or a
+ * negative errno when no page could be returned. Pages past the
+ * returned count have been released with put_page().
+ */
+ struct file *filp = iocb->ki_filp;
+ struct address_space *mapping = filp->f_mapping;
+ struct file_ra_state *ra = &filp->f_ra;
+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+ int i, j, nr_got, err = 0;
+
+ nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ nr_got = find_get_pages_contig(mapping, index, nr, pages);
+ if (nr_got)
+ goto got_pages;
+
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+
+ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+ nr_got = find_get_pages_contig(mapping, index, nr, pages);
+ if (nr_got)
+ goto got_pages;
+
+ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+ err = PTR_ERR_OR_ZERO(pages[0]);
+ if (!IS_ERR_OR_NULL(pages[0]))
+ nr_got = 1;
+got_pages:
+ for (i = 0; i < nr_got; i++) {
+ struct page *page = pages[i];
+ pgoff_t pg_index = index + i;
+ loff_t pg_pos = max(iocb->ki_pos,
+ (loff_t) pg_index << PAGE_SHIFT);
+ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
+
+ if (PageReadahead(page)) {
+ if (iocb->ki_flags & IOCB_NOIO) {
+ for (j = i; j < nr_got; j++)
+ put_page(pages[j]);
+ nr_got = i;
+ err = -EAGAIN;
+ break;
+ }
+ page_cache_async_readahead(mapping, ra, filp, page,
+ pg_index, last_index - pg_index);
+ }
+
+ if (!PageUptodate(page)) {
+ if ((iocb->ki_flags & IOCB_NOWAIT) ||
+ ((iocb->ki_flags & IOCB_WAITQ) && i)) { /* NOWAIT never waits; WAITQ may only wait on the first page (i == 0) */
+ for (j = i; j < nr_got; j++)
+ put_page(pages[j]);
+ nr_got = i;
+ err = -EAGAIN;
+ break;
+ }
+
+ page = generic_file_buffered_read_pagenotuptodate(iocb,
+ filp, iter, page, pg_pos, pg_count);
+ if (IS_ERR_OR_NULL(page)) { /* helper already dropped pages[i], so release from i + 1 */
+ for (j = i + 1; j < nr_got; j++)
+ put_page(pages[j]);
+ nr_got = i;
+ err = PTR_ERR_OR_ZERO(page);
+ break;
+ }
+ }
+ }
+
+ if (likely(nr_got))
+ return nr_got;
+ if (err)
+ return err;
+ /*
+ * No pages and no error means we raced and should retry:
+ */
+ goto find_page;
+}
+
/**
* generic_file_buffered_read - generic file read routine
* @iocb: the iocb to read
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
+ struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct file_ra_state *ra = &filp->f_ra;
- loff_t *ppos = &iocb->ki_pos;
- pgoff_t index;
- pgoff_t last_index;
- pgoff_t prev_index;
- unsigned long offset; /* offset into pagecache page */
- unsigned int prev_offset;
- int error = 0;
-
- if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+ struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
+ unsigned int nr_pages = min_t(unsigned int, 512,
+ ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (iocb->ki_pos >> PAGE_SHIFT));
+ int i, pg_nr, error = 0;
+ bool writably_mapped;
+ loff_t isize, end_offset;
+
+ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
- index = *ppos >> PAGE_SHIFT;
- prev_index = ra->prev_pos >> PAGE_SHIFT;
- prev_offset = ra->prev_pos & (PAGE_SIZE-1);
- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
- offset = *ppos & ~PAGE_MASK;
+ if (nr_pages > ARRAY_SIZE(pages_onstack))
+ pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
- /*
- * If we've already successfully copied some data, then we
- * can no longer safely return -EIOCBQUEUED. Hence mark
- * an async read NOWAIT at that point.
- */
- if (written && (iocb->ki_flags & IOCB_WAITQ))
- iocb->ki_flags |= IOCB_NOWAIT;
-
- for (;;) {
- struct page *page;
- pgoff_t end_index;
- loff_t isize;
- unsigned long nr, ret;
+ if (!pages) {
+ pages = pages_onstack;
+ nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
+ }
+ do {
cond_resched();
-find_page:
- if (fatal_signal_pending(current)) {
- error = -EINTR;
- goto out;
- }
- page = find_get_page(mapping, index);
- if (!page) {
- if (iocb->ki_flags & IOCB_NOIO)
- goto would_block;
- page_cache_sync_readahead(mapping,
- ra, filp,
- index, last_index - index);
- page = find_get_page(mapping, index);
- if (unlikely(page == NULL))
- goto no_cached_page;
- }
- if (PageReadahead(page)) {
- if (iocb->ki_flags & IOCB_NOIO) {
- put_page(page);
- goto out;
- }
- page_cache_async_readahead(mapping,
- ra, filp, page,
- index, last_index - index);
- }
- if (!PageUptodate(page)) {
- /*
- * See comment in do_read_cache_page on why
- * wait_on_page_locked is used to avoid unnecessarily
- * serialisations and why it's safe.
- */
- if (iocb->ki_flags & IOCB_WAITQ) {
- if (written) {
- put_page(page);
- goto out;
- }
- error = wait_on_page_locked_async(page,
- iocb->ki_waitq);
- } else {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- put_page(page);
- goto would_block;
- }
- error = wait_on_page_locked_killable(page);
- }
- if (unlikely(error))
- goto readpage_error;
- if (PageUptodate(page))
- goto page_ok;
-
- if (inode->i_blkbits == PAGE_SHIFT ||
- !mapping->a_ops->is_partially_uptodate)
- goto page_not_up_to_date;
- /* pipes can't handle partially uptodate pages */
- if (unlikely(iov_iter_is_pipe(iter)))
- goto page_not_up_to_date;
- if (!trylock_page(page))
- goto page_not_up_to_date;
- /* Did it get truncated before we got the lock? */
- if (!page->mapping)
- goto page_not_up_to_date_locked;
- if (!mapping->a_ops->is_partially_uptodate(page,
- offset, iter->count))
- goto page_not_up_to_date_locked;
- unlock_page(page);
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if ((iocb->ki_flags & IOCB_WAITQ) && written)
+ iocb->ki_flags |= IOCB_NOWAIT;
+
+ i = 0;
+ pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
+ pages, nr_pages);
+ if (pg_nr < 0) {
+ error = pg_nr;
+ break;
}
-page_ok:
+
/*
- * i_size must be checked after we know the page is Uptodate.
+ * i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
-
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- put_page(page);
- goto out;
- }
+ if (unlikely(iocb->ki_pos >= isize))
+ goto put_pages;
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_SIZE;
- if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_MASK) + 1;
- if (nr <= offset) {
- put_page(page);
- goto out;
- }
- }
- nr = nr - offset;
+ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
- /* If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
+ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
+ put_page(pages[--pg_nr]);
/*
- * When a sequential read accesses a page several times,
- * only mark it as accessed the first time.
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
*/
- if (prev_index != index || offset != prev_offset)
- mark_page_accessed(page);
- prev_index = index;
+ writably_mapped = mapping_writably_mapped(mapping);
/*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
+ * When a sequential read accesses a page several times, only
+ * mark it as accessed the first time.
*/
+ if (iocb->ki_pos >> PAGE_SHIFT !=
+ ra->prev_pos >> PAGE_SHIFT)
+ mark_page_accessed(pages[0]);
+ for (i = 1; i < pg_nr; i++)
+ mark_page_accessed(pages[i]);
+
+ for (i = 0; i < pg_nr; i++) {
+ unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
+ unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+ PAGE_SIZE - offset);
+ unsigned int copied;
- ret = copy_page_to_iter(page, offset, nr, iter);
- offset += ret;
- index += offset >> PAGE_SHIFT;
- offset &= ~PAGE_MASK;
- prev_offset = offset;
-
- put_page(page);
- written += ret;
- if (!iov_iter_count(iter))
- goto out;
- if (ret < nr) {
- error = -EFAULT;
- goto out;
- }
- continue;
-
-page_not_up_to_date:
- /* Get exclusive access to the page ... */
- if (iocb->ki_flags & IOCB_WAITQ)
- error = lock_page_async(page, iocb->ki_waitq);
- else
- error = lock_page_killable(page);
- if (unlikely(error))
- goto readpage_error;
-
-page_not_up_to_date_locked:
- /* Did it get truncated before we got the lock? */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
- continue;
- }
-
- /* Did somebody else fill it already? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto page_ok;
- }
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_page(pages[i]);
-readpage:
- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
- unlock_page(page);
- put_page(page);
- goto would_block;
- }
- /*
- * A previous I/O error may have been due to temporary
- * failures, eg. multipath errors.
- * PG_error will be set again if readpage fails.
- */
- ClearPageError(page);
- /* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(filp, page);
+ copied = copy_page_to_iter(pages[i], offset, bytes, iter);
- if (unlikely(error)) {
- if (error == AOP_TRUNCATED_PAGE) {
- put_page(page);
- error = 0;
- goto find_page;
- }
- goto readpage_error;
- }
+ written += copied;
+ iocb->ki_pos += copied;
+ ra->prev_pos = iocb->ki_pos;
- if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_WAITQ)
- error = lock_page_async(page, iocb->ki_waitq);
- else
- error = lock_page_killable(page);
-
- if (unlikely(error))
- goto readpage_error;
- if (!PageUptodate(page)) {
- if (page->mapping == NULL) {
- /*
- * invalidate_mapping_pages got it
- */
- unlock_page(page);
- put_page(page);
- goto find_page;
- }
- unlock_page(page);
- shrink_readahead_size_eio(ra);
- error = -EIO;
- goto readpage_error;
+ if (copied < bytes) {
+ error = -EFAULT;
+ break;
}
- unlock_page(page);
}
+put_pages:
+ for (i = 0; i < pg_nr; i++)
+ put_page(pages[i]);
+ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
- goto page_ok;
-
-readpage_error:
- /* UHHUH! A synchronous read error occurred. Report it */
- put_page(page);
- goto out;
-
-no_cached_page:
- /*
- * Ok, it wasn't cached, so we need to create a new
- * page..
- */
- page = page_cache_alloc(mapping);
- if (!page) {
- error = -ENOMEM;
- goto out;
- }
- error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (error) {
- put_page(page);
- if (error == -EEXIST) {
- error = 0;
- goto find_page;
- }
- goto out;
- }
- goto readpage;
- }
+ file_accessed(filp);
-would_block:
- error = -EAGAIN;
-out:
- ra->prev_pos = prev_index;
- ra->prev_pos <<= PAGE_SHIFT;
- ra->prev_pos |= prev_offset;
+ if (pages != pages_onstack)
+ kfree(pages);
- *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
- file_accessed(filp);
return written ? written : error;
}
EXPORT_SYMBOL_GPL(generic_file_buffered_read);
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
vm_fault_t ret = VM_FAULT_LOCKED;
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(mapping->host->i_sb);
file_update_time(vmf->vma->vm_file);
lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ if (page->mapping != mapping) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
goto out;
set_page_dirty(page);
wait_for_stable_page(page);
out:
- sb_end_pagefault(inode->i_sb);
+ sb_end_pagefault(mapping->host->i_sb);
return ret;
}
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
- struct inode *inode = file_inode(filp);
char *path;
- errseq_set(&inode->i_mapping->wb_err, -EIO);
+ errseq_set(&filp->f_mapping->wb_err, -EIO);
if (__ratelimit(&_rs)) {
path = file_path(filp, pathname, sizeof(pathname));
if (IS_ERR(path))
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
- if (filemap_range_has_page(inode->i_mapping, pos,
+ if (filemap_range_has_page(file->f_mapping, pos,
pos + write_len - 1))
return -EAGAIN;
} else {