2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include "config-host.h"
25 #include "qemu-common.h"
28 #include "block_int.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
36 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79 BlockDriverCompletionFunc *cb,
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
106 static int is_windows_drive_prefix(const char *filename)
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
113 int is_windows_drive(const char *filename)
115 if (is_windows_drive_prefix(filename) &&
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
125 /* throttling disk I/O limits */
/*
 * Disable I/O throttling on a block device.
 *
 * Wakes every coroutine still parked on the throttle queue, then tears
 * down the throttling timer and clears the accumulated accounting
 * baseline. NOTE(review): interior lines (braces) are missing from this
 * view of the file; code left byte-identical.
 */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
128     bs->io_limits_enabled = false;
/* drain the queue: each qemu_co_queue_next() wakes one throttled request */
130     while (qemu_co_queue_next(&bs->throttled_reqs));
132     if (bs->block_timer) {
133         qemu_del_timer(bs->block_timer);
134         qemu_free_timer(bs->block_timer);
135         bs->block_timer = NULL;
/* reset the I/O accounting baseline used by the rate-limit math */
141     memset(&bs->io_base, 0, sizeof(bs->io_base));
144 static void bdrv_block_timer(void *opaque)
146 BlockDriverState *bs = opaque;
148 qemu_co_queue_next(&bs->throttled_reqs);
/*
 * Enable I/O throttling on a block device.
 *
 * Initializes the throttle queue, arms a vm_clock timer that will kick
 * queued requests (bdrv_block_timer), and starts a fresh accounting
 * slice of 5 * BLOCK_IO_SLICE_TIME nanoseconds.
 */
151 void bdrv_io_limits_enable(BlockDriverState *bs)
153     qemu_co_queue_init(&bs->throttled_reqs);
154     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155     bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156     bs->slice_start = qemu_get_clock_ns(vm_clock);
157     bs->slice_end = bs->slice_start + bs->slice_time;
/* start accounting from a clean baseline */
158     memset(&bs->io_base, 0, sizeof(bs->io_base));
159     bs->io_limits_enabled = true;
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
/*
 * Throttle an incoming request according to the configured I/O limits.
 *
 * Called from the request path (coroutine context — it waits on CoQueues).
 * A request first queues behind any already-throttled requests to keep
 * FIFO ordering, then re-checks the limits in a loop, re-arming the block
 * timer and re-inserting itself at the queue head while it still exceeds
 * the limits. On exit it kicks the next waiter.
 */
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
176 int64_t wait_time = -1;
/* preserve FIFO order: wait behind requests that are already throttled */
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
182 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
183 * throttled requests will not be dequeued until the current request is
184 * allowed to be serviced. So if the current request still exceeds the
185 * limits, it will be inserted to the head. All requests followed it will
186 * be still in throttled_reqs queue.
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
/* wait_time is filled in by bdrv_exceed_io_limits(); sleep until then */
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
/* this request is now allowed; let the next queued request re-check */
195 qemu_co_queue_next(&bs->throttled_reqs);
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
202 if (is_windows_drive(path) ||
203 is_windows_drive_prefix(path)) {
208 return strchr(path, ':') != NULL;
211 int path_is_absolute(const char *path)
214 /* specific case for names like: "\\.\d:" */
215 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
218 return (*path == '/' || *path == '\\');
220 return (*path == '/');
224 /* if filename is absolute, just copy it to dest. Otherwise, build a
225 path to it by considering it is relative to base_path. URL are
227 void path_combine(char *dest, int dest_size,
228 const char *base_path,
229 const char *filename)
236 if (path_is_absolute(filename)) {
237 pstrcpy(dest, dest_size, filename);
239 p = strchr(base_path, ':');
244 p1 = strrchr(base_path, '/');
248 p2 = strrchr(base_path, '\\');
260 if (len > dest_size - 1)
262 memcpy(dest, base_path, len);
264 pstrcat(dest, dest_size, filename);
/*
 * Register a block driver with the block layer.
 *
 * Fills in emulation callbacks for drivers that implement only one of
 * the coroutine/AIO interfaces, then links the driver into the global
 * bdrv_drivers list so bdrv_find_format() and friends can see it.
 */
268 void bdrv_register(BlockDriver *bdrv)
270     /* Block drivers without coroutine functions need emulation */
271     if (!bdrv->bdrv_co_readv) {
272         bdrv->bdrv_co_readv = bdrv_co_readv_em;
273         bdrv->bdrv_co_writev = bdrv_co_writev_em;
275         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
276          * the block driver lacks aio we need to emulate that too.
278         if (!bdrv->bdrv_aio_readv) {
279             /* add AIO emulation layer */
280             bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
281             bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
285     QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
288 /* create a new block device (by default it is empty) */
289 BlockDriverState *bdrv_new(const char *device_name)
291 BlockDriverState *bs;
293 bs = g_malloc0(sizeof(BlockDriverState));
294 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
295 if (device_name[0] != '\0') {
296 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
298 bdrv_iostatus_disable(bs);
302 BlockDriver *bdrv_find_format(const char *format_name)
305 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
306 if (!strcmp(drv1->format_name, format_name)) {
313 static int bdrv_is_whitelisted(BlockDriver *drv)
315 static const char *whitelist[] = {
316 CONFIG_BDRV_WHITELIST
321 return 1; /* no whitelist, anything goes */
323 for (p = whitelist; *p; p++) {
324 if (!strcmp(drv->format_name, *p)) {
331 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
333 BlockDriver *drv = bdrv_find_format(format_name);
334 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
337 typedef struct CreateCo {
340 QEMUOptionParameter *options;
344 static void coroutine_fn bdrv_create_co_entry(void *opaque)
346 CreateCo *cco = opaque;
349 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
/*
 * Create an image using the given driver and creation options.
 *
 * Runs drv->bdrv_create() in coroutine context via a CreateCo bundle:
 * directly if already in a coroutine, otherwise by spawning one and
 * polling until cco.ret leaves the NOT_DONE sentinel.
 * NOTE(review): error-return lines and the main-loop wait call are
 * missing from this view; code left byte-identical.
 */
352 int bdrv_create(BlockDriver *drv, const char* filename,
353 QEMUOptionParameter *options)
/* own a copy of the filename so the coroutine can outlive the caller's arg */
360 .filename = g_strdup(filename),
/* driver cannot create images at all */
365 if (!drv->bdrv_create) {
369 if (qemu_in_coroutine()) {
370 /* Fast-path if already in coroutine context */
371 bdrv_create_co_entry(&cco);
373 co = qemu_coroutine_create(bdrv_create_co_entry);
374 qemu_coroutine_enter(co, &cco);
/* poll until the coroutine stores its result over the NOT_DONE sentinel */
375 while (cco.ret == NOT_DONE) {
381 g_free(cco.filename);
386 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
390 drv = bdrv_find_protocol(filename);
395 return bdrv_create(drv, filename, options);
399 void get_tmp_filename(char *filename, int size)
401 char temp_dir[MAX_PATH];
403 GetTempPath(MAX_PATH, temp_dir);
404 GetTempFileName(temp_dir, "qem", 0, filename);
407 void get_tmp_filename(char *filename, int size)
411 /* XXX: race condition possible */
412 tmpdir = getenv("TMPDIR");
415 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
416 fd = mkstemp(filename);
422 * Detect host devices. By convention, /dev/cdrom[N] is always
423 * recognized as a host CDROM.
425 static BlockDriver *find_hdev_driver(const char *filename)
427 int score_max = 0, score;
428 BlockDriver *drv = NULL, *d;
430 QLIST_FOREACH(d, &bdrv_drivers, list) {
431 if (d->bdrv_probe_device) {
432 score = d->bdrv_probe_device(filename);
433 if (score > score_max) {
/*
 * Find the protocol driver ("file", "nbd", ...) for a filename.
 *
 * Host-device probing runs first (see XXX below), then a filename with
 * no "<protocol>:" prefix falls back to the "file" driver; otherwise the
 * prefix is matched against each registered driver's protocol_name.
 */
443 BlockDriver *bdrv_find_protocol(const char *filename)
450 /* TODO Drivers without bdrv_file_open must be specified explicitly */
453 * XXX(hch): we really should not let host device detection
454 * override an explicit protocol specification, but moving this
455 * later breaks access to device names with colons in them.
456 * Thanks to the brain-dead persistent naming schemes on udev-
457 * based Linux systems those actually are quite common.
459 drv1 = find_hdev_driver(filename);
464 if (!path_has_protocol(filename)) {
465 return bdrv_find_format("file");
467 p = strchr(filename, ':');
/* clamp the protocol prefix so it fits the local buffer, NUL-terminated */
470 if (len > sizeof(protocol) - 1)
471 len = sizeof(protocol) - 1;
472 memcpy(protocol, filename, len);
473 protocol[len] = '\0';
474 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
475 if (drv1->protocol_name &&
476 !strcmp(drv1->protocol_name, protocol)) {
/*
 * Probe the image format of a file and return the matching driver.
 *
 * Opens the file through the protocol layer, reads a header buffer, and
 * asks every registered driver's bdrv_probe() to score it, keeping the
 * best match. scsi-generic devices and empty drives short-circuit to the
 * "raw" driver since their content cannot be probed.
 */
483 static int find_image_format(const char *filename, BlockDriver **pdrv)
485 int ret, score, score_max;
486 BlockDriver *drv1, *drv;
488 BlockDriverState *bs;
490 ret = bdrv_file_open(&bs, filename, 0);
496 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
497 if (bs->sg || !bdrv_is_inserted(bs)) {
499 drv = bdrv_find_format("raw");
/* read the image header; ret is the number of bytes actually read */
507 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
516 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
517 if (drv1->bdrv_probe) {
518 score = drv1->bdrv_probe(buf, ret, filename);
519 if (score > score_max) {
533 * Set the current 'total_sectors' value
535 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
537 BlockDriver *drv = bs->drv;
539 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
543 /* query actual device if possible, otherwise just trust the hint */
544 if (drv->bdrv_getlength) {
545 int64_t length = drv->bdrv_getlength(bs);
549 hint = length >> BDRV_SECTOR_BITS;
552 bs->total_sectors = hint;
557 * Set open flags for a given cache mode
559 * Return 0 on success, -1 if the cache mode was invalid.
/*
 * Translate a cache mode string into BDRV_O_* open flags.
 *
 * Clears any existing cache bits in *flags, then sets:
 *   off/none     -> no host cache, guest writeback (NOCACHE | CACHE_WB)
 *   directsync   -> no host cache, writethrough
 *   writeback    -> host cache, guest writeback
 *   unsafe       -> writeback and flushes ignored (NO_FLUSH)
 *   writethrough -> no extra bits (the default)
 * NOTE(review): the error return for an unknown mode is outside this view.
 */
561 int bdrv_parse_cache_flags(const char *mode, int *flags)
563 *flags &= ~BDRV_O_CACHE_MASK;
565 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
566 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
567 } else if (!strcmp(mode, "directsync")) {
568 *flags |= BDRV_O_NOCACHE;
569 } else if (!strcmp(mode, "writeback")) {
570 *flags |= BDRV_O_CACHE_WB;
571 } else if (!strcmp(mode, "unsafe")) {
572 *flags |= BDRV_O_CACHE_WB;
573 *flags |= BDRV_O_NO_FLUSH;
574 } else if (!strcmp(mode, "writethrough")) {
575 /* this is the default */
584 * The copy-on-read flag is actually a reference count so multiple users may
585 * use the feature without worrying about clobbering its previous state.
586 * Copy-on-read stays enabled until all users have called to disable it.
588 void bdrv_enable_copy_on_read(BlockDriverState *bs)
593 void bdrv_disable_copy_on_read(BlockDriverState *bs)
595 assert(bs->copy_on_read > 0);
600 * Common part for opening disk images and files
/*
 * Common part for opening disk images and files.
 *
 * Resets per-open state on bs, enforces the driver whitelist, allocates
 * the driver's private opaque struct, then opens the image either via
 * the driver's own bdrv_file_open (protocol drivers) or by opening a
 * protocol-layer bs->file and calling drv->bdrv_open on top of it.
 * Finally refreshes total_sectors. Error paths (including deleting
 * bs->file) are partially outside this view.
 */
602 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
603 int flags, BlockDriver *drv)
609 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
612 bs->total_sectors = 0;
616 bs->open_flags = flags;
618 bs->buffer_alignment = 512;
620 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
/* copy-on-read only makes sense for writable images */
621 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
622 bdrv_enable_copy_on_read(bs);
625 pstrcpy(bs->filename, sizeof(bs->filename), filename);
626 bs->backing_file[0] = '\0';
628 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
633 bs->opaque = g_malloc0(drv->instance_size);
635 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
638 * Clear flags that are internal to the block layer before opening the
641 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
644 * Snapshots should be writable.
646 if (bs->is_temporary) {
647 open_flags |= BDRV_O_RDWR;
648 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
652 /* Open the image, either directly or using a protocol */
653 if (drv->bdrv_file_open) {
654 ret = drv->bdrv_file_open(bs, filename, open_flags);
656 ret = bdrv_file_open(&bs->file, filename, open_flags);
658 ret = drv->bdrv_open(bs, open_flags);
/* establish bs->total_sectors from the driver or the current hint */
666 ret = refresh_total_sectors(bs, bs->total_sectors);
672 if (bs->is_temporary) {
/* error path: tear down the protocol layer we opened above */
680 bdrv_delete(bs->file);
690 * Opens a file using a protocol (file, host_device, nbd, ...)
692 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
694 BlockDriverState *bs;
698 drv = bdrv_find_protocol(filename);
704 ret = bdrv_open_common(bs, filename, flags, drv);
715 * Opens a disk image (raw, qcow2, vmdk, ...)
/*
 * Open a disk image (raw, qcow2, vmdk, ...).
 *
 * Three phases visible here:
 *  1. BDRV_O_SNAPSHOT: create a temporary qcow2 overlay backed by the
 *     real image and open that instead of 'filename'.
 *  2. Probe the format (find_image_format) if no driver was given, then
 *     bdrv_open_common().
 *  3. Open the backing file (unless BDRV_O_NO_BACKING), always read-only,
 *     and fire the media-change callback / re-enable I/O throttling.
 * NOTE(review): several declarations and error paths are missing from
 * this sampled view; code left byte-identical.
 */
717 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
721 char tmp_filename[PATH_MAX];
723 if (flags & BDRV_O_SNAPSHOT) {
724 BlockDriverState *bs1;
727 BlockDriver *bdrv_qcow2;
728 QEMUOptionParameter *options;
729 char backing_filename[PATH_MAX];
731 /* if snapshot, we create a temporary backing file and open it
732 instead of opening 'filename' directly */
734 /* if there is a backing file, use it */
/* open the real image once just to measure it for the overlay */
736 ret = bdrv_open(bs1, filename, 0, drv);
741 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
743 if (bs1->drv && bs1->drv->protocol_name)
748 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
750 /* Real path is meaningless for protocols */
752 snprintf(backing_filename, sizeof(backing_filename),
754 else if (!realpath(filename, backing_filename))
/* build the temporary overlay as qcow2 on top of the real image */
757 bdrv_qcow2 = bdrv_find_format("qcow2");
758 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
760 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
761 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
763 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
767 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
768 free_option_parameters(options);
/* from here on, open the overlay instead of the original image */
773 filename = tmp_filename;
775 bs->is_temporary = 1;
778 /* Find the right image format driver */
780 ret = find_image_format(filename, &drv);
784 goto unlink_and_fail;
788 ret = bdrv_open_common(bs, filename, flags, drv);
790 goto unlink_and_fail;
793 /* If there is a backing file, use it */
794 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
795 char backing_filename[PATH_MAX];
797 BlockDriver *back_drv = NULL;
799 bs->backing_hd = bdrv_new("");
/* a protocol-prefixed backing file is used verbatim; otherwise it is
 * resolved relative to the image's own path */
801 if (path_has_protocol(bs->backing_file)) {
802 pstrcpy(backing_filename, sizeof(backing_filename),
805 path_combine(backing_filename, sizeof(backing_filename),
806 filename, bs->backing_file);
809 if (bs->backing_format[0] != '\0') {
810 back_drv = bdrv_find_format(bs->backing_format);
813 /* backing files always opened read-only */
815 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
817 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
822 if (bs->is_temporary) {
823 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
825 /* base image inherits from "parent" */
826 bs->backing_hd->keep_read_only = bs->keep_read_only;
/* notify the device model only once any required encryption key is set */
830 if (!bdrv_key_required(bs)) {
831 bdrv_dev_change_media_cb(bs, true);
834 /* throttling disk I/O limits */
835 if (bs->io_limits_enabled) {
836 bdrv_io_limits_enable(bs);
/* error path: temporary snapshot overlays are unlinked on failure */
842 if (bs->is_temporary) {
/*
 * Close a block device.
 *
 * Cancels any running block job, drops the backing file, calls the
 * driver's close hook, unlinks temporary snapshot files, resets the
 * per-open state, closes the underlying protocol layer, notifies the
 * device model, and disables I/O throttling.
 */
848 void bdrv_close(BlockDriverState *bs)
853 block_job_cancel_sync(bs->job);
/* forget the cached snapshot device if it is this one */
857 if (bs == bs_snapshots) {
860 if (bs->backing_hd) {
861 bdrv_delete(bs->backing_hd);
862 bs->backing_hd = NULL;
864 bs->drv->bdrv_close(bs);
/* temporary overlays created by BDRV_O_SNAPSHOT are deleted on close */
867 if (bs->is_temporary) {
868 unlink(bs->filename);
873 bs->copy_on_read = 0;
874 bs->backing_file[0] = '\0';
875 bs->backing_format[0] = '\0';
877 if (bs->file != NULL) {
878 bdrv_delete(bs->file);
882 bdrv_dev_change_media_cb(bs, false);
885 /*throttling disk I/O limits*/
886 if (bs->io_limits_enabled) {
887 bdrv_io_limits_disable(bs);
891 void bdrv_close_all(void)
893 BlockDriverState *bs;
895 QTAILQ_FOREACH(bs, &bdrv_states, list) {
901 * Wait for pending requests to complete across all BlockDriverStates
903 * This function does not flush data to disk, use bdrv_flush_all() for that
904 * after calling this function.
906 * Note that completion of an asynchronous I/O operation can trigger any
907 * number of other I/O operations on other devices---for example a coroutine
908 * can be arbitrarily complex and a constant flow of I/O can come until the
909 * coroutine is complete. Because of this, it is not possible to have a
910 * function to drain a single device's I/O queue.
/*
 * Wait for all pending requests on all BlockDriverStates to complete.
 *
 * Repeatedly services AIO completions; throttled requests are restarted
 * directly because no timer support is available here (see FIXME).
 * Afterwards both the tracked-request list and the throttle queue must
 * be empty on every device, asserted below.
 */
912 void bdrv_drain_all(void)
914 BlockDriverState *bs;
918 busy = qemu_aio_wait();
920 /* FIXME: We do not have timer support here, so this is effectively
923 QTAILQ_FOREACH(bs, &bdrv_states, list) {
924 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
925 qemu_co_queue_restart_all(&bs->throttled_reqs);
931 /* If requests are still pending there is a bug somewhere */
932 QTAILQ_FOREACH(bs, &bdrv_states, list) {
933 assert(QLIST_EMPTY(&bs->tracked_requests));
934 assert(qemu_co_queue_empty(&bs->throttled_reqs));
938 /* make a BlockDriverState anonymous by removing from bdrv_state list.
939 Also, NULL terminate the device_name to prevent double remove */
940 void bdrv_make_anon(BlockDriverState *bs)
942 if (bs->device_name[0] != '\0') {
943 QTAILQ_REMOVE(&bdrv_states, bs, list);
945 bs->device_name[0] = '\0';
948 static void bdrv_rebind(BlockDriverState *bs)
950 if (bs->drv && bs->drv->bdrv_rebind) {
951 bs->drv->bdrv_rebind(bs);
956 * Add new bs contents at the top of an image chain while the chain is
957 * live, while keeping required fields on the top layer.
959 * This will modify the BlockDriverState fields, and swap contents
960 * between bs_new and bs_top. Both bs_new and bs_top are modified.
962 * bs_new is required to be anonymous.
964 * This function does not create any image files.
/*
 * Add new bs contents at the top of a live image chain.
 *
 * Swaps the contents of bs_new and bs_top so that bs_top keeps its
 * identity (device name, device ops, geometry, I/O limits, error policy,
 * bdrv_states linkage) while bs_new becomes its backing file. bs_new
 * must be anonymous. The 'tmp' staging struct collects every field that
 * has to stay on the top layer before the swap; afterwards the copied
 * fields are cleared on bs_new so state is not owned twice.
 */
966 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
968 BlockDriverState tmp;
970 /* bs_new must be anonymous */
971 assert(bs_new->device_name[0] == '\0');
975 /* there are some fields that need to stay on the top layer: */
976 tmp.open_flags = bs_top->open_flags;
979 tmp.dev_ops = bs_top->dev_ops;
980 tmp.dev_opaque = bs_top->dev_opaque;
981 tmp.dev = bs_top->dev;
982 tmp.buffer_alignment = bs_top->buffer_alignment;
983 tmp.copy_on_read = bs_top->copy_on_read;
985 /* i/o timing parameters */
986 tmp.slice_time = bs_top->slice_time;
987 tmp.slice_start = bs_top->slice_start;
988 tmp.slice_end = bs_top->slice_end;
989 tmp.io_limits = bs_top->io_limits;
990 tmp.io_base = bs_top->io_base;
991 tmp.throttled_reqs = bs_top->throttled_reqs;
992 tmp.block_timer = bs_top->block_timer;
993 tmp.io_limits_enabled = bs_top->io_limits_enabled;
/* disk geometry stays with the device-visible top layer */
996 tmp.cyls = bs_top->cyls;
997 tmp.heads = bs_top->heads;
998 tmp.secs = bs_top->secs;
999 tmp.translation = bs_top->translation;
/* error handling policy stays with the top layer too */
1002 tmp.on_read_error = bs_top->on_read_error;
1003 tmp.on_write_error = bs_top->on_write_error;
1006 tmp.iostatus_enabled = bs_top->iostatus_enabled;
1007 tmp.iostatus = bs_top->iostatus;
1009 /* keep the same entry in bdrv_states */
1010 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1011 tmp.list = bs_top->list;
1013 /* The contents of 'tmp' will become bs_top, as we are
1014 * swapping bs_new and bs_top contents. */
1015 tmp.backing_hd = bs_new;
1016 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1017 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1019 /* swap contents of the fixed new bs and the current top */
1023 /* device_name[] was carried over from the old bs_top. bs_new
1024 * shouldn't be in bdrv_states, so we need to make device_name[]
1025 * reflect the anonymity of bs_new
1027 bs_new->device_name[0] = '\0';
1029 /* clear the copied fields in the new backing file */
1030 bdrv_detach_dev(bs_new, bs_new->dev);
1032 qemu_co_queue_init(&bs_new->throttled_reqs);
1033 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1034 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1035 bdrv_iostatus_disable(bs_new);
1037 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1038 * to affect or delete the block_timer, as it has been moved to bs_top */
1039 bs_new->io_limits_enabled = false;
1040 bs_new->block_timer = NULL;
1041 bs_new->slice_time = 0;
1042 bs_new->slice_start = 0;
1043 bs_new->slice_end = 0;
/* let both drivers update any back-pointers into their BDS */
1045 bdrv_rebind(bs_new);
1046 bdrv_rebind(bs_top);
1049 void bdrv_delete(BlockDriverState *bs)
1053 assert(!bs->in_use);
1055 /* remove from list, if necessary */
1060 assert(bs != bs_snapshots);
1064 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1065 /* TODO change to DeviceState *dev when all users are qdevified */
1071 bdrv_iostatus_reset(bs);
1075 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1076 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1078 if (bdrv_attach_dev(bs, dev) < 0) {
1083 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1084 /* TODO change to DeviceState *dev when all users are qdevified */
1086 assert(bs->dev == dev);
1089 bs->dev_opaque = NULL;
1090 bs->buffer_alignment = 512;
1093 /* TODO change to return DeviceState * when all users are qdevified */
1094 void *bdrv_get_attached_dev(BlockDriverState *bs)
1099 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1103 bs->dev_opaque = opaque;
1104 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1105 bs_snapshots = NULL;
1109 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1110 BlockQMPEventAction action, int is_read)
1113 const char *action_str;
1116 case BDRV_ACTION_REPORT:
1117 action_str = "report";
1119 case BDRV_ACTION_IGNORE:
1120 action_str = "ignore";
1122 case BDRV_ACTION_STOP:
1123 action_str = "stop";
1129 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1132 is_read ? "read" : "write");
1133 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1135 qobject_decref(data);
1138 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1142 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1143 bdrv_get_device_name(bs), ejected);
1144 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1146 qobject_decref(data);
1149 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1151 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1152 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1153 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1154 if (tray_was_closed) {
1156 bdrv_emit_qmp_eject_event(bs, true);
1160 bdrv_emit_qmp_eject_event(bs, false);
1165 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1167 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1170 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1172 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1173 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1177 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1179 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1180 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1185 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1187 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1188 bs->dev_ops->resize_cb(bs->dev_opaque);
1192 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1194 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1195 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1201 * Run consistency checks on an image
1203 * Returns 0 if the check could be completed (it doesn't mean that the image is
1204 * free of errors) or -errno when an internal error occurred. The results of the
1205 * check are stored in res.
1207 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1209 if (bs->drv->bdrv_check == NULL) {
1213 memset(res, 0, sizeof(*res));
1214 return bs->drv->bdrv_check(bs, res);
1217 #define COMMIT_BUF_SECTORS 2048
1219 /* commit COW file into the raw image */
/*
 * Commit a COW overlay into its backing image.
 *
 * Refuses to run if there is no backing file, the backing file must stay
 * read-only, or either image is in use. If the backing file is currently
 * read-only it is reopened read-write (falling back to read-only on
 * failure), every allocated cluster range is copied down with
 * bdrv_read/bdrv_write in COMMIT_BUF_SECTORS chunks, the overlay is
 * emptied via bdrv_make_empty when supported, the backing device is
 * flushed, and finally the backing file is reopened read-only again.
 * NOTE(review): several error-path lines are missing from this view.
 */
1220 int bdrv_commit(BlockDriverState *bs)
1222 BlockDriver *drv = bs->drv;
1223 BlockDriver *backing_drv;
1224 int64_t sector, total_sectors;
1225 int n, ro, open_flags;
1226 int ret = 0, rw_ret = 0;
1228 char filename[1024];
1229 BlockDriverState *bs_rw, *bs_ro;
1234 if (!bs->backing_hd) {
1238 if (bs->backing_hd->keep_read_only) {
1242 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
/* remember how to reopen the backing file before deleting its BDS */
1246 backing_drv = bs->backing_hd->drv;
1247 ro = bs->backing_hd->read_only;
1248 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1249 open_flags = bs->backing_hd->open_flags;
/* reopen the backing file read-write for the duration of the commit */
1253 bdrv_delete(bs->backing_hd);
1254 bs->backing_hd = NULL;
1255 bs_rw = bdrv_new("");
1256 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1260 /* try to re-open read-only */
1261 bs_ro = bdrv_new("");
1262 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1266 /* drive not functional anymore */
1270 bs->backing_hd = bs_ro;
1273 bs->backing_hd = bs_rw;
1276 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1277 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
/* copy down only ranges the overlay has allocated; n is the run length */
1279 for (sector = 0; sector < total_sectors; sector += n) {
1280 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1282 if (bdrv_read(bs, sector, buf, n) != 0) {
1287 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
/* drop the now-redundant data from the overlay, if supported */
1294 if (drv->bdrv_make_empty) {
1295 ret = drv->bdrv_make_empty(bs);
1300 * Make sure all data we wrote to the backing device is actually
1304 bdrv_flush(bs->backing_hd);
/* restore the backing file to read-only if that is how we found it */
1311 bdrv_delete(bs->backing_hd);
1312 bs->backing_hd = NULL;
1313 bs_ro = bdrv_new("");
1314 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1318 /* drive not functional anymore */
1322 bs->backing_hd = bs_ro;
1323 bs->backing_hd->keep_read_only = 0;
1329 int bdrv_commit_all(void)
1331 BlockDriverState *bs;
1333 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1334 int ret = bdrv_commit(bs);
1342 struct BdrvTrackedRequest {
1343 BlockDriverState *bs;
1347 QLIST_ENTRY(BdrvTrackedRequest) list;
1348 Coroutine *co; /* owner, used for deadlock detection */
1349 CoQueue wait_queue; /* coroutines blocked on this request */
1353 * Remove an active request from the tracked requests list
1355 * This function should be called when a tracked request is completing.
1357 static void tracked_request_end(BdrvTrackedRequest *req)
1359 QLIST_REMOVE(req, list);
1360 qemu_co_queue_restart_all(&req->wait_queue);
1364 * Add an active request to the tracked requests list
1366 static void tracked_request_begin(BdrvTrackedRequest *req,
1367 BlockDriverState *bs,
1369 int nb_sectors, bool is_write)
1371 *req = (BdrvTrackedRequest){
1373 .sector_num = sector_num,
1374 .nb_sectors = nb_sectors,
1375 .is_write = is_write,
1376 .co = qemu_coroutine_self(),
1379 qemu_co_queue_init(&req->wait_queue);
1381 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1385 * Round a region to cluster boundaries
1387 static void round_to_clusters(BlockDriverState *bs,
1388 int64_t sector_num, int nb_sectors,
1389 int64_t *cluster_sector_num,
1390 int *cluster_nb_sectors)
1392 BlockDriverInfo bdi;
1394 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1395 *cluster_sector_num = sector_num;
1396 *cluster_nb_sectors = nb_sectors;
1398 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1399 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1400 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1405 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1406 int64_t sector_num, int nb_sectors) {
1408 if (sector_num >= req->sector_num + req->nb_sectors) {
1412 if (req->sector_num >= sector_num + nb_sectors) {
/*
 * Block the calling coroutine until no tracked request overlaps the
 * given sector range.
 *
 * The range is first rounded out to cluster boundaries so that
 * allocating writes to the same cluster serialize (see comment below).
 * The scan restarts after each wait because the list may have changed.
 * NOTE(review): the retry loop around the QLIST_FOREACH is partially
 * outside this view.
 */
1418 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1419 int64_t sector_num, int nb_sectors)
1421 BdrvTrackedRequest *req;
1422 int64_t cluster_sector_num;
1423 int cluster_nb_sectors;
1426 /* If we touch the same cluster it counts as an overlap. This guarantees
1427 * that allocating writes will be serialized and not race with each other
1428 * for the same cluster. For example, in copy-on-read it ensures that the
1429 * CoR read and write operations are atomic and guest writes cannot
1430 * interleave between them.
1432 round_to_clusters(bs, sector_num, nb_sectors,
1433 &cluster_sector_num, &cluster_nb_sectors);
1437 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1438 if (tracked_request_overlaps(req, cluster_sector_num,
1439 cluster_nb_sectors)) {
1440 /* Hitting this means there was a reentrant request, for
1441 * example, a block driver issuing nested requests. This must
1442 * never happen since it means deadlock.
1444 assert(qemu_coroutine_self() != req->co);
1446 qemu_co_queue_wait(&req->wait_queue);
1457 * -EINVAL - backing format specified, but no file
1458 * -ENOSPC - can't update the backing file because no space is left in the
1460 * -ENOTSUP - format driver doesn't support changing the backing file
1462 int bdrv_change_backing_file(BlockDriverState *bs,
1463 const char *backing_file, const char *backing_fmt)
1465 BlockDriver *drv = bs->drv;
1468 /* Backing file format doesn't make sense without a backing file */
1469 if (backing_fmt && !backing_file) {
1473 if (drv->bdrv_change_backing_file != NULL) {
1474 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1480 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1481 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1486 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1491 if (!bdrv_is_inserted(bs))
1497 len = bdrv_getlength(bs);
1502 if ((offset > len) || (len - offset < size))
1508 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1511 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1512 nb_sectors * BDRV_SECTOR_SIZE);
1515 typedef struct RwCo {
1516 BlockDriverState *bs;
1524 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1526 RwCo *rwco = opaque;
1528 if (!rwco->is_write) {
1529 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1530 rwco->nb_sectors, rwco->qiov, 0);
1532 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1533 rwco->nb_sectors, rwco->qiov, 0);
1538 * Process a synchronous request using coroutines
/* Wraps buf in a single-element iovec, then either runs the request inline
 * (already in coroutine context) or spawns a coroutine and spins until the
 * NOT_DONE sentinel is replaced by the real return value. */
1540 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1541 int nb_sectors, bool is_write)
1544 struct iovec iov = {
1545 .iov_base = (void *)buf,
1546 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1551 .sector_num = sector_num,
1552 .nb_sectors = nb_sectors,
1554 .is_write = is_write,
1558 qemu_iovec_init_external(&qiov, &iov, 1);
1561 * In sync call context, when the vcpu is blocked, this throttling timer
1562 * will not fire; so the I/O throttling function has to be disabled here
1563 * if it has been enabled.
1565 if (bs->io_limits_enabled) {
1566 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1567 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1568 bdrv_io_limits_disable(bs);
1571 if (qemu_in_coroutine()) {
1572 /* Fast-path if already in coroutine context */
1573 bdrv_rw_co_entry(&rwco);
1575 co = qemu_coroutine_create(bdrv_rw_co_entry);
1576 qemu_coroutine_enter(co, &rwco);
/* Busy-wait until the coroutine completes and overwrites NOT_DONE. */
1577 while (rwco.ret == NOT_DONE) {
1584 /* return < 0 if error. See bdrv_write() for the return codes */
/* Synchronous sector read: thin wrapper over bdrv_rw_co() with
 * is_write = false. */
1585 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1586 uint8_t *buf, int nb_sectors)
1588 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1591 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
/* Mark (or clear, depending on 'dirty') the dirty-bitmap bits covering
 * [sector_num, sector_num + nb_sectors). One bit represents one
 * BDRV_SECTORS_PER_DIRTY_CHUNK-sized chunk. */
1593 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1594 int nb_sectors, int dirty)
1597 unsigned long val, idx, bit;
1599 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1600 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1602 for (; start <= end; start++) {
1603 idx = start / BITS_PER_LONG;
1604 bit = start % BITS_PER_LONG;
1605 val = bs->dirty_bitmap[idx];
/* Only flip the bit on an actual state change, so any dirty counter stays
 * consistent with the bitmap contents. */
1607 if (!(val & (1UL << bit))) {
1612 if (val & (1UL << bit)) {
1614 val &= ~(1UL << bit);
1617 bs->dirty_bitmap[idx] = val;
1621 /* Return < 0 if error. Important errors are:
1622 -EIO generic I/O error (may happen for all errors)
1623 -ENOMEDIUM No media inserted.
1624 -EINVAL Invalid sector number or nb_sectors
1625 -EACCES Trying to write a read-only device
/* Synchronous sector write: wrapper over bdrv_rw_co() with is_write = true.
 * The const cast is safe because the write path never modifies the buffer. */
1627 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1628 const uint8_t *buf, int nb_sectors)
1630 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
/* Byte-granularity read built on sector reads: an unaligned head is read
 * through a one-sector bounce buffer, the aligned middle is read in place,
 * and an unaligned tail is read through the bounce buffer again. */
1633 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1634 void *buf, int count1)
1636 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1637 int len, nb_sectors, count;
1642 /* first read to align to sector start */
1643 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1646 sector_num = offset >> BDRV_SECTOR_BITS;
1648 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1650 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1658 /* read the sectors "in place" */
1659 nb_sectors = count >> BDRV_SECTOR_BITS;
1660 if (nb_sectors > 0) {
1661 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1663 sector_num += nb_sectors;
1664 len = nb_sectors << BDRV_SECTOR_BITS;
1669 /* add data from the last sector */
1671 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1673 memcpy(buf, tmp_buf, count);
/* Byte-granularity write, mirror of bdrv_pread(): unaligned head and tail
 * use read-modify-write through a one-sector bounce buffer so neighbouring
 * bytes in the partially covered sectors are preserved. */
1678 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1679 const void *buf, int count1)
1681 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1682 int len, nb_sectors, count;
1687 /* first write to align to sector start */
1688 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1691 sector_num = offset >> BDRV_SECTOR_BITS;
/* Read-modify-write of the first, partially covered sector. */
1693 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1695 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1696 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1705 /* write the sectors "in place" */
1706 nb_sectors = count >> BDRV_SECTOR_BITS;
1707 if (nb_sectors > 0) {
1708 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1710 sector_num += nb_sectors;
1711 len = nb_sectors << BDRV_SECTOR_BITS;
1716 /* add data from the last sector */
/* Read-modify-write of the final, partially covered sector. */
1718 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1720 memcpy(tmp_buf, buf, count)
1721 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1728 * Writes to the file and ensures that no writes are reordered across this
1729 * request (acts as a barrier)
1731 * Returns 0 on success, -errno in error cases.
1733 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1734 const void *buf, int count)
1738 ret = bdrv_pwrite(bs, offset, buf, count);
1743 /* No flush needed for cache modes that use O_DSYNC */
1744 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
/* Copy-on-read helper: read a whole cluster from the backing chain into a
 * bounce buffer, write it back into the top image (so future reads are
 * local), then copy the requested slice into the caller's qiov. */
1751 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1752 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1754 /* Perform I/O through a temporary buffer so that users who scribble over
1755 * their read buffer while the operation is in progress do not end up
1756 * modifying the image file. This is critical for zero-copy guest I/O
1757 * where anything might happen inside guest memory.
1759 void *bounce_buffer;
1761 BlockDriver *drv = bs->drv;
1763 QEMUIOVector bounce_qiov;
1764 int64_t cluster_sector_num;
1765 int cluster_nb_sectors;
1769 /* Cover entire cluster so no additional backing file I/O is required when
1770 * allocating cluster in the image file.
1772 round_to_clusters(bs, sector_num, nb_sectors,
1773 &cluster_sector_num, &cluster_nb_sectors);
1775 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1776 cluster_sector_num, cluster_nb_sectors);
1778 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1779 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1780 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1782 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
/* If the cluster is all zeroes and the driver has an efficient
 * write-zeroes operation, use it instead of writing the bounce buffer. */
1788 if (drv->bdrv_co_write_zeroes &&
1789 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1790 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1791 cluster_nb_sectors);
1793 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1798 /* It might be okay to ignore write errors for guest requests. If this
1799 * is a deliberate copy-on-read then we don't want to ignore the error.
1800 * Simply report it in all cases.
/* Copy only the originally requested window out of the cluster-sized
 * bounce buffer. */
1805 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1806 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1807 nb_sectors * BDRV_SECTOR_SIZE);
1810 qemu_vfree(bounce_buffer);
1815 * Handle a read request in coroutine context
/* Central read path: validates the request, applies I/O throttling,
 * serializes against in-flight copy-on-read, tracks the request, and
 * dispatches to either the copy-on-read helper or the driver. */
1817 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1818 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1819 BdrvRequestFlags flags)
1821 BlockDriver *drv = bs->drv;
1822 BdrvTrackedRequest req;
1828 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1832 /* throttling disk read I/O */
1833 if (bs->io_limits_enabled) {
1834 bdrv_io_limits_intercept(bs, false, nb_sectors);
/* A device-wide copy_on_read setting forces the COR flag for every read. */
1837 if (bs->copy_on_read) {
1838 flags |= BDRV_REQ_COPY_ON_READ;
1840 if (flags & BDRV_REQ_COPY_ON_READ) {
1841 bs->copy_on_read_in_flight++;
/* While any COR request is in flight, overlapping requests must wait to
 * avoid racing with the cluster write-back. */
1844 if (bs->copy_on_read_in_flight) {
1845 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1848 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1850 if (flags & BDRV_REQ_COPY_ON_READ) {
1853 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
/* Take the COR slow path only if part of the range is unallocated. */
1858 if (!ret || pnum != nb_sectors) {
1859 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1864 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1867 tracked_request_end(&req);
1869 if (flags & BDRV_REQ_COPY_ON_READ) {
1870 bs->copy_on_read_in_flight--;
/* Public coroutine read entry point: trace and forward with no flags. */
1876 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1877 int nb_sectors, QEMUIOVector *qiov)
1879 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1881 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
/* Public coroutine read entry point that forces copy-on-read behaviour. */
1884 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1885 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1887 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1889 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1890 BDRV_REQ_COPY_ON_READ);
/* Write zeroes to a sector range: prefer the driver's dedicated
 * write-zeroes hook, falling back to a zero-filled bounce buffer when the
 * hook is absent or returns -ENOTSUP. */
1893 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1894 int64_t sector_num, int nb_sectors)
1896 BlockDriver *drv = bs->drv;
1901 /* TODO Emulate only part of misaligned requests instead of letting block
1902 * drivers return -ENOTSUP and emulate everything */
1904 /* First try the efficient write zeroes operation */
1905 if (drv->bdrv_co_write_zeroes) {
1906 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1907 if (ret != -ENOTSUP) {
1912 /* Fall back to bounce buffer if write zeroes is unsupported */
1913 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1914 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1915 memset(iov.iov_base, 0, iov.iov_len);
1916 qemu_iovec_init_external(&qiov, &iov, 1);
1918 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1920 qemu_vfree(iov.iov_base);
1925 * Handle a write request in coroutine context
/* Central write path: rejects read-only devices and invalid requests,
 * throttles, serializes against copy-on-read, tracks the request, performs
 * the write (or zero-write), and updates dirty bitmap / high-water mark. */
1927 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1928 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1929 BdrvRequestFlags flags)
1931 BlockDriver *drv = bs->drv;
1932 BdrvTrackedRequest req;
1938 if (bs->read_only) {
1941 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1945 /* throttling disk write I/O */
1946 if (bs->io_limits_enabled) {
1947 bdrv_io_limits_intercept(bs, true, nb_sectors);
1950 if (bs->copy_on_read_in_flight) {
1951 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1954 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1956 if (flags & BDRV_REQ_ZERO_WRITE) {
1957 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1959 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
/* Record the written range for block migration / mirroring consumers. */
1962 if (bs->dirty_bitmap) {
1963 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
/* Track the highest sector ever written (reported via query-blockstats). */
1966 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1967 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1970 tracked_request_end(&req);
/* Public coroutine write entry point: trace and forward with no flags. */
1975 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1976 int nb_sectors, QEMUIOVector *qiov)
1978 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1980 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
/* Public coroutine zero-write entry point; qiov is NULL because the data
 * is implicitly all zeroes (BDRV_REQ_ZERO_WRITE). */
1983 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1984 int64_t sector_num, int nb_sectors)
1986 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1988 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1989 BDRV_REQ_ZERO_WRITE);
1993 * Truncate file to 'offset' bytes (needed only for file protocols)
1995 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1997 BlockDriver *drv = bs->drv;
2001 if (!drv->bdrv_truncate)
/* Refuse to resize while another subsystem holds the device in use. */
2005 if (bdrv_in_use(bs))
2007 ret = drv->bdrv_truncate(bs, offset);
/* On success, refresh the cached size and notify the attached device. */
2009 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
2010 bdrv_dev_resize_cb(bs);
2016 * Length of a allocated file in bytes. Sparse files are counted by actual
2017 * allocated space. Return < 0 if error or unknown.
2019 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2021 BlockDriver *drv = bs->drv;
2025 if (drv->bdrv_get_allocated_file_size) {
2026 return drv->bdrv_get_allocated_file_size(bs);
/* No driver hook: recurse into the underlying protocol file. */
2029 return bdrv_get_allocated_file_size(bs->file);
2035 * Length of a file in bytes. Return < 0 if error or unknown.
2037 int64_t bdrv_getlength(BlockDriverState *bs)
2039 BlockDriver *drv = bs->drv;
/* Growable images and removable media may change size, so ask the driver
 * each time instead of trusting the cached total_sectors. */
2043 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2044 if (drv->bdrv_getlength) {
2045 return drv->bdrv_getlength(bs);
2048 return bs->total_sectors * BDRV_SECTOR_SIZE;
2051 /* return 0 as number of sectors if no device present or error */
2052 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2055 length = bdrv_getlength(bs);
/* Convert byte length to a sector count. */
2059 length = length >> BDRV_SECTOR_BITS;
2060 *nb_sectors_ptr = length;
/* One 16-byte MSDOS/MBR partition table entry, as laid out on disk. */
2064 uint8_t boot_ind; /* 0x80 - active */
2065 uint8_t head; /* starting head */
2066 uint8_t sector; /* starting sector */
2067 uint8_t cyl; /* starting cylinder */
2068 uint8_t sys_ind; /* What partition type */
2069 uint8_t end_head; /* end head */
2070 uint8_t end_sector; /* end sector */
2071 uint8_t end_cyl; /* end cylinder */
2072 uint32_t start_sect; /* starting sector counting from 0 */
2073 uint32_t nr_sects; /* nr of sectors in partition */
2076 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
2077 static int guess_disk_lchs(BlockDriverState *bs,
2078 int *pcylinders, int *pheads, int *psectors)
2080 uint8_t buf[BDRV_SECTOR_SIZE];
2081 int ret, i, heads, sectors, cylinders;
2082 struct partition *p;
2084 uint64_t nb_sectors;
2087 bdrv_get_geometry(bs, &nb_sectors);
2090 * The function will be invoked during startup not only in sync I/O mode,
2091 * but also in async I/O mode. So the I/O throttling function has to
2092 * be disabled temporarily here, not permanently.
2094 enabled = bs->io_limits_enabled;
2095 bs->io_limits_enabled = false;
/* Read the MBR (sector 0) with throttling suspended. */
2096 ret = bdrv_read(bs, 0, buf, 1);
2097 bs->io_limits_enabled = enabled;
2100 /* test msdos magic */
2101 if (buf[510] != 0x55 || buf[511] != 0xaa)
2103 for(i = 0; i < 4; i++) {
/* Partition table entries start at offset 0x1be in the MBR. */
2104 p = ((struct partition *)(buf + 0x1be)) + i;
2105 nr_sects = le32_to_cpu(p->nr_sects);
2106 if (nr_sects && p->end_head) {
2107 /* We make the assumption that the partition terminates on
2108 a cylinder boundary */
2109 heads = p->end_head + 1;
/* CHS encoding keeps the sector number in the low 6 bits. */
2110 sectors = p->end_sector & 63;
2113 cylinders = nb_sectors / (heads * sectors);
/* 16383 is the traditional ATA cylinder-count cap. */
2114 if (cylinders < 1 || cylinders > 16383)
2117 *psectors = sectors;
2118 *pcylinders = cylinders;
2120 printf("guessed geometry: LCHS=%d %d %d\n",
2121 cylinders, heads, sectors);
/* Pick a CHS geometry for the device: use an explicit hint when present,
 * otherwise the partition-table guess, otherwise a standard 16-head /
 * 63-sector physical geometry; also update the BIOS translation hint. */
2129 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2131 int translation, lba_detected = 0;
2132 int cylinders, heads, secs;
2133 uint64_t nb_sectors;
2135 /* if a geometry hint is available, use it */
2136 bdrv_get_geometry(bs, &nb_sectors);
2137 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2138 translation = bdrv_get_translation_hint(bs);
2139 if (cylinders != 0) {
2144 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2146 /* if heads > 16, it means that a BIOS LBA
2147 translation was active, so the default
2148 hardware geometry is OK */
2150 goto default_geometry;
2155 /* disable any translation to be in sync with
2156 the logical geometry */
2157 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2158 bdrv_set_translation_hint(bs,
2159 BIOS_ATA_TRANSLATION_NONE);
2164 /* if no geometry, use a standard physical disk geometry */
2165 cylinders = nb_sectors / (16 * 63);
2167 if (cylinders > 16383)
2169 else if (cylinders < 2)
/* When LBA was detected, choose LARGE vs LBA translation based on the
 * 131072-cylinder*head product threshold. */
2174 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2175 if ((*pcyls * *pheads) <= 131072) {
2176 bdrv_set_translation_hint(bs,
2177 BIOS_ATA_TRANSLATION_LARGE);
2179 bdrv_set_translation_hint(bs,
2180 BIOS_ATA_TRANSLATION_LBA);
2184 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
/* Record an explicit CHS geometry hint on the device. */
2188 void bdrv_set_geometry_hint(BlockDriverState *bs,
2189 int cyls, int heads, int secs)
/* Record the BIOS ATA translation mode hint on the device. */
2196 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2198 bs->translation = translation;
/* Copy the stored CHS geometry hint into the caller's out-parameters. */
2201 void bdrv_get_geometry_hint(BlockDriverState *bs,
2202 int *pcyls, int *pheads, int *psecs)
2205 *pheads = bs->heads;
2209 /* throttling disk io limits */
/* Install new I/O limits and recompute whether throttling is active. */
2210 void bdrv_set_io_limits(BlockDriverState *bs,
2211 BlockIOLimit *io_limits)
2213 bs->io_limits = *io_limits;
2214 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2217 /* Recognize floppy formats */
2218 typedef struct FDFormat {
/* Table of known floppy geometries, matched by total sector count in
 * bdrv_get_floppy_geometry_hint(). Order matters: earlier entries win
 * when several formats share a size, and the first entry is the default. */
2226 static const FDFormat fd_formats[] = {
2227 /* First entry is default format */
2228 /* 1.44 MB 3"1/2 floppy disks */
2229 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2230 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2231 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2232 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2233 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2234 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2235 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2236 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2237 /* 2.88 MB 3"1/2 floppy disks */
2238 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2239 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2240 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2241 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2242 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2243 /* 720 kB 3"1/2 floppy disks */
2244 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2245 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2246 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2247 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2248 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2249 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2250 /* 1.2 MB 5"1/4 floppy disks */
2251 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2252 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2253 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2254 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2255 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2256 /* 720 kB 5"1/4 floppy disks */
2257 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2258 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2259 /* 360 kB 5"1/4 floppy disks */
2260 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2261 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2262 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2263 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2264 /* 320 kB 5"1/4 floppy disks */
2265 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2266 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2267 /* 360 kB must match 5"1/4 better than 3"1/2... */
2268 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
/* Sentinel terminating the table. */
2270 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
/* Determine the floppy geometry: honour a complete user-supplied hint,
 * otherwise scan fd_formats for an entry whose drive type is compatible
 * and whose total size matches the medium; fall back to the first
 * type-compatible entry when no exact size match exists. */
2273 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2274 int *max_track, int *last_sect,
2275 FDriveType drive_in, FDriveType *drive,
2278 const FDFormat *parse;
2279 uint64_t nb_sectors, size;
2280 int i, first_match, match;
2282 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2283 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2284 /* User defined disk */
2285 *rate = FDRIVE_RATE_500K;
2287 bdrv_get_geometry(bs, &nb_sectors);
2290 for (i = 0; ; i++) {
2291 parse = &fd_formats[i];
/* FDRIVE_DRV_NONE marks the end of the format table. */
2292 if (parse->drive == FDRIVE_DRV_NONE) {
2295 if (drive_in == parse->drive ||
2296 drive_in == FDRIVE_DRV_NONE) {
2297 size = (parse->max_head + 1) * parse->max_track *
2299 if (nb_sectors == size) {
/* Remember the first type-compatible entry as a fallback. */
2303 if (first_match == -1) {
2309 if (first_match == -1) {
2312 match = first_match;
2314 parse = &fd_formats[match];
2316 *nb_heads = parse->max_head + 1;
2317 *max_track = parse->max_track;
2318 *last_sect = parse->last_sect;
2319 *drive = parse->drive;
2320 *rate = parse->rate;
/* Return the stored BIOS ATA translation mode hint. */
2324 int bdrv_get_translation_hint(BlockDriverState *bs)
2326 return bs->translation;
/* Configure the error actions taken on failed reads and writes. */
2329 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2330 BlockErrorAction on_write_error)
2332 bs->on_read_error = on_read_error;
2333 bs->on_write_error = on_write_error;
/* Return the configured error action for reads or writes. */
2336 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2338 return is_read ? bs->on_read_error : bs->on_write_error;
/* True if the device was opened read-only. */
2341 int bdrv_is_read_only(BlockDriverState *bs)
2343 return bs->read_only;
2346 int bdrv_is_sg(BlockDriverState *bs)
/* True if the guest-visible write cache is enabled. */
2351 int bdrv_enable_write_cache(BlockDriverState *bs)
2353 return bs->enable_write_cache;
/* True if this image or its immediate backing file is encrypted. */
2356 int bdrv_is_encrypted(BlockDriverState *bs)
2358 if (bs->backing_hd && bs->backing_hd->encrypted)
2360 return bs->encrypted;
/* True if an encryption key still needs to be supplied for this image or
 * its immediate backing file. */
2363 int bdrv_key_required(BlockDriverState *bs)
2365 BlockDriverState *backing_hd = bs->backing_hd;
2367 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2369 return (bs->encrypted && !bs->valid_key);
/* Supply the encryption key, recursing into an encrypted backing file
 * first. On first successful key set, fire the deferred media-change
 * callback that was skipped at open time. */
2372 int bdrv_set_key(BlockDriverState *bs, const char *key)
2375 if (bs->backing_hd && bs->backing_hd->encrypted) {
2376 ret = bdrv_set_key(bs->backing_hd, key);
2382 if (!bs->encrypted) {
2384 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2387 ret = bs->drv->bdrv_set_key(bs, key);
2390 } else if (!bs->valid_key) {
2392 /* call the change callback now, we skipped it on open */
2393 bdrv_dev_change_media_cb(bs, true);
/* Copy the format driver's name into buf. */
2398 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2403 pstrcpy(buf, buf_size, bs->drv->format_name);
/* Invoke 'it' once per registered block driver with its format name. */
2407 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2412 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2413 it(opaque, drv->format_name);
/* Look up a BlockDriverState by device name; linear scan of bdrv_states. */
2417 BlockDriverState *bdrv_find(const char *name)
2419 BlockDriverState *bs;
2421 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2422 if (!strcmp(name, bs->device_name)) {
/* Iterate over all devices: NULL starts the iteration, and the successor
 * of each bs follows; returns NULL past the end. */
2429 BlockDriverState *bdrv_next(BlockDriverState *bs)
2432 return QTAILQ_FIRST(&bdrv_states);
2434 return QTAILQ_NEXT(bs, list);
/* Invoke 'it' once for every registered BlockDriverState. */
2437 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2439 BlockDriverState *bs;
2441 QTAILQ_FOREACH(bs, &bdrv_states, list) {
/* Return the device name the BlockDriverState was registered under. */
2446 const char *bdrv_get_device_name(BlockDriverState *bs)
2448 return bs->device_name;
/* Flush every registered block device. */
2451 void bdrv_flush_all(void)
2453 BlockDriverState *bs;
2455 QTAILQ_FOREACH(bs, &bdrv_states, list) {
/* Whether a freshly created image of this driver reads back as zeroes;
 * drivers without the hook fall through to a default below. */
2460 int bdrv_has_zero_init(BlockDriverState *bs)
2464 if (bs->drv->bdrv_has_zero_init) {
2465 return bs->drv->bdrv_has_zero_init(bs);
/* Argument/result bundle for the bdrv_is_allocated() coroutine wrapper. */
2471 typedef struct BdrvCoIsAllocatedData {
2472 BlockDriverState *bs;
2478 } BdrvCoIsAllocatedData;
2481 * Returns true iff the specified sector is present in the disk image. Drivers
2482 * not implementing the functionality are assumed to not support backing files,
2483 * hence all their sectors are reported as allocated.
2485 * If 'sector_num' is beyond the end of the disk image the return value is 0
2486 * and 'pnum' is set to 0.
2488 * 'pnum' is set to the number of sectors (including and immediately following
2489 * the specified sector) that are known to be in the same
2490 * allocated/unallocated state.
2492 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2493 * beyond the end of the disk image it will be clamped.
2495 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2496 int nb_sectors, int *pnum)
2500 if (sector_num >= bs->total_sectors) {
/* Clamp nb_sectors to what remains of the image. */
2505 n = bs->total_sectors - sector_num;
2506 if (n < nb_sectors) {
2510 if (!bs->drv->bdrv_co_is_allocated) {
2515 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2518 /* Coroutine wrapper for bdrv_is_allocated() */
/* Unpack the BdrvCoIsAllocatedData and store the result back into it. */
2519 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2521 BdrvCoIsAllocatedData *data = opaque;
2522 BlockDriverState *bs = data->bs;
2524 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2530 * Synchronous wrapper around bdrv_co_is_allocated().
2532 * See bdrv_co_is_allocated() for details.
2534 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2538 BdrvCoIsAllocatedData data = {
2540 .sector_num = sector_num,
2541 .nb_sectors = nb_sectors,
/* Run the query in a coroutine and spin until it signals completion. */
2546 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2547 qemu_coroutine_enter(co, &data);
2548 while (!data.done) {
/* QMP 'query-block' handler: build a BlockInfoList entry for every
 * registered device, including tray/io-status and, for inserted media,
 * file, format, encryption, backing file and I/O limit details. */
2554 BlockInfoList *qmp_query_block(Error **errp)
2556 BlockInfoList *head = NULL, *cur_item = NULL;
2557 BlockDriverState *bs;
2559 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2560 BlockInfoList *info = g_malloc0(sizeof(*info));
2562 info->value = g_malloc0(sizeof(*info->value));
2563 info->value->device = g_strdup(bs->device_name);
2564 info->value->type = g_strdup("unknown");
2565 info->value->locked = bdrv_dev_is_medium_locked(bs);
2566 info->value->removable = bdrv_dev_has_removable_media(bs);
2568 if (bdrv_dev_has_removable_media(bs)) {
2569 info->value->has_tray_open = true;
2570 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2573 if (bdrv_iostatus_is_enabled(bs)) {
2574 info->value->has_io_status = true;
2575 info->value->io_status = bs->iostatus;
/* Details below only exist when a medium is inserted. */
2579 info->value->has_inserted = true;
2580 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2581 info->value->inserted->file = g_strdup(bs->filename);
2582 info->value->inserted->ro = bs->read_only;
2583 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2584 info->value->inserted->encrypted = bs->encrypted;
2585 if (bs->backing_file[0]) {
2586 info->value->inserted->has_backing_file = true;
2587 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2590 if (bs->io_limits_enabled) {
2591 info->value->inserted->bps =
2592 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2593 info->value->inserted->bps_rd =
2594 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2595 info->value->inserted->bps_wr =
2596 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2597 info->value->inserted->iops =
2598 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2599 info->value->inserted->iops_rd =
2600 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2601 info->value->inserted->iops_wr =
2602 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2606 /* XXX: waiting for the qapi to support GSList */
/* Manual singly-linked-list append: remember head, chain via next. */
2608 head = cur_item = info;
2610 cur_item->next = info;
2618 /* Consider exposing this as a full fledged QMP command */
/* Build the BlockStats structure for one device from its accounting
 * counters; recurses into bs->file as the 'parent' statistics. */
2619 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2623 s = g_malloc0(sizeof(*s));
/* An anonymous (internal) bs has an empty device name; omit it then. */
2625 if (bs->device_name[0]) {
2626 s->has_device = true;
2627 s->device = g_strdup(bs->device_name);
2630 s->stats = g_malloc0(sizeof(*s->stats));
2631 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2632 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2633 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2634 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2635 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2636 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2637 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2638 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2639 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2642 s->has_parent = true;
2643 s->parent = qmp_query_blockstat(bs->file, NULL);
/* QMP 'query-blockstats' handler: one qmp_query_blockstat() entry per
 * registered device, collected into a manually chained list. */
2649 BlockStatsList *qmp_query_blockstats(Error **errp)
2651 BlockStatsList *head = NULL, *cur_item = NULL;
2652 BlockDriverState *bs;
2654 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2655 BlockStatsList *info = g_malloc0(sizeof(*info));
2656 info->value = qmp_query_blockstat(bs, NULL);
2658 /* XXX: waiting for the qapi to support GSList */
2660 head = cur_item = info;
2662 cur_item->next = info;
/* Return the filename whose encryption key is still needed: the backing
 * file takes precedence over the image itself. */
2670 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2672 if (bs->backing_hd && bs->backing_hd->encrypted)
2673 return bs->backing_file;
2674 else if (bs->encrypted)
2675 return bs->filename;
/* Copy the backing file name into the caller's buffer. */
2680 void bdrv_get_backing_filename(BlockDriverState *bs,
2681 char *filename, int filename_size)
2683 pstrcpy(filename, filename_size, bs->backing_file);
/* Compressed write through the driver hook; validates the request and
 * keeps the dirty bitmap in sync like the normal write path. */
2686 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2687 const uint8_t *buf, int nb_sectors)
2689 BlockDriver *drv = bs->drv;
2692 if (!drv->bdrv_write_compressed)
2694 if (bdrv_check_request(bs, sector_num, nb_sectors))
2697 if (bs->dirty_bitmap) {
2698 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2701 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
/* Fill *bdi with driver-specific image information; zeroed first so
 * fields the driver does not set are well-defined. */
2704 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2706 BlockDriver *drv = bs->drv;
2709 if (!drv->bdrv_get_info)
2711 memset(bdi, 0, sizeof(*bdi));
2712 return drv->bdrv_get_info(bs, bdi);
/* Save VM state data into the image: use the driver hook when present,
 * otherwise recurse into the underlying protocol file. */
2715 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2716 int64_t pos, int size)
2718 BlockDriver *drv = bs->drv;
2721 if (drv->bdrv_save_vmstate)
2722 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2724 return bdrv_save_vmstate(bs->file, buf, pos, size);
/* Load VM state data from the image; mirror of bdrv_save_vmstate(). */
2728 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2729 int64_t pos, int size)
2731 BlockDriver *drv = bs->drv;
2734 if (drv->bdrv_load_vmstate)
2735 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2737 return bdrv_load_vmstate(bs->file, buf, pos, size);
/* Forward a blkdebug event to the driver; silently ignored when the
 * driver has no debug-event hook. */
2741 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2743 BlockDriver *drv = bs->drv;
2745 if (!drv || !drv->bdrv_debug_event) {
2749 return drv->bdrv_debug_event(bs, event);
2753 /**************************************************************/
2754 /* handling of snapshots */
/* Whether internal snapshots are possible: requires a writable, inserted
 * medium and a driver (or underlying file) with snapshot support. */
2756 int bdrv_can_snapshot(BlockDriverState *bs)
2758 BlockDriver *drv = bs->drv;
2759 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2763 if (!drv->bdrv_snapshot_create) {
2764 if (bs->file != NULL) {
2765 return bdrv_can_snapshot(bs->file);
/* True if the device was opened with the temporary-snapshot flag. */
2773 int bdrv_is_snapshot(BlockDriverState *bs)
2775 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
/* Return the device used for VM snapshots: a cached choice if available,
 * otherwise the first snapshot-capable device found. */
2778 BlockDriverState *bdrv_snapshots(void)
2780 BlockDriverState *bs;
2783 return bs_snapshots;
2787 while ((bs = bdrv_next(bs))) {
2788 if (bdrv_can_snapshot(bs)) {
/* Create an internal snapshot via the driver hook, falling back to the
 * underlying protocol file. */
2796 int bdrv_snapshot_create(BlockDriverState *bs,
2797 QEMUSnapshotInfo *sn_info)
2799 BlockDriver *drv = bs->drv;
2802 if (drv->bdrv_snapshot_create)
2803 return drv->bdrv_snapshot_create(bs, sn_info);
2805 return bdrv_snapshot_create(bs->file, sn_info);
/* Revert to an internal snapshot. Without a driver hook, the format layer
 * is closed, the snapshot is applied on the protocol file, and the format
 * layer is reopened on the reverted contents. */
2809 int bdrv_snapshot_goto(BlockDriverState *bs,
2810 const char *snapshot_id)
2812 BlockDriver *drv = bs->drv;
2817 if (drv->bdrv_snapshot_goto)
2818 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2821 drv->bdrv_close(bs);
2822 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2823 open_ret = drv->bdrv_open(bs, bs->open_flags);
/* If reopening failed, the bs is unusable; tear down the file layer. */
2825 bdrv_delete(bs->file);
/* Delete an internal snapshot via the driver hook, falling back to the
 * underlying protocol file. */
2835 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2837 BlockDriver *drv = bs->drv;
2840 if (drv->bdrv_snapshot_delete)
2841 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2843 return bdrv_snapshot_delete(bs->file, snapshot_id);
/* List internal snapshots via the driver hook, falling back to the
 * underlying protocol file. */
2847 int bdrv_snapshot_list(BlockDriverState *bs,
2848 QEMUSnapshotInfo **psn_info)
2850 BlockDriver *drv = bs->drv;
2853 if (drv->bdrv_snapshot_list)
2854 return drv->bdrv_snapshot_list(bs, psn_info);
2856 return bdrv_snapshot_list(bs->file, psn_info);
/* Load a snapshot without making it current; only valid on read-only
 * devices and only when the driver implements the hook. */
2860 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2861 const char *snapshot_name)
2863 BlockDriver *drv = bs->drv;
2867 if (!bs->read_only) {
2870 if (drv->bdrv_snapshot_load_tmp) {
2871 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
/* Walk the backing chain looking for the image whose recorded backing
 * file name equals 'backing_file'; NULL when the chain ends. */
2876 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2877 const char *backing_file)
2883 if (bs->backing_hd) {
2884 if (strcmp(bs->backing_file, backing_file) == 0) {
2885 return bs->backing_hd;
2887 return bdrv_find_backing_image(bs->backing_hd, backing_file);
#define NB_SUFFIXES 4

/*
 * Format 'size' (bytes) into 'buf' as a short human-readable string.
 *
 * Values up to 999 are printed exactly with no suffix.  Larger values are
 * scaled by powers of 1024 and tagged with K/M/G/T: below ten units the
 * value keeps one decimal place, otherwise it is rounded to the nearest
 * whole unit.  Returns 'buf' for convenient inline use.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base = 1024;
    int i;

    if (size <= 999) {
        /* Small enough to print exactly. */
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (i = 0; i < NB_SUFFIXES; i++) {
        if (size < 10 * base) {
            /* Below ten units: keep one decimal of precision. */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / base,
                     suffixes[i]);
            return buf;
        }
        if (size < 1000 * base || i == NB_SUFFIXES - 1) {
            /* Round half-up to a whole number of units; the last
             * suffix (T) absorbs everything larger. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (base >> 1)) / base),
                     suffixes[i]);
            return buf;
        }
        base = base * 1024;
    }
    return buf;
}
/* Format one snapshot entry for 'info snapshots': with sn == NULL a
 * column-header row is produced, otherwise id, tag, VM state size,
 * creation date and VM clock are laid out in fixed-width columns. */
2924 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2926 char buf1[128], date_buf[128], clock_buf[128];
2936 snprintf(buf, buf_size,
2937 "%-10s%-20s%7s%20s%15s",
2938 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2942 ptm = localtime(&ti);
2943 strftime(date_buf, sizeof(date_buf),
2944 "%Y-%m-%d %H:%M:%S", ptm);
/* Reentrant variant used on the other build configuration. */
2946 localtime_r(&ti, &tm);
2947 strftime(date_buf, sizeof(date_buf),
2948 "%Y-%m-%d %H:%M:%S", &tm);
/* Convert the nanosecond VM clock into h:m:s.ms. */
2950 secs = sn->vm_clock_nsec / 1000000000;
2951 snprintf(clock_buf, sizeof(clock_buf),
2952 "%02d:%02d:%02d.%03d",
2954 (int)((secs / 60) % 60),
2956 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2957 snprintf(buf, buf_size,
2958 "%-10s%-20s%7s%20s%15s",
2959 sn->id_str, sn->name,
2960 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2967 /**************************************************************/
/* Asynchronous vectored read: trace, then route through the common
 * coroutine-based AIO path. */
2970 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2971 QEMUIOVector *qiov, int nb_sectors,
2972 BlockDriverCompletionFunc *cb, void *opaque)
2974 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2976 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
/* Asynchronous vectored write: trace, then route through the common
 * coroutine-based AIO path. */
2980 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2981 QEMUIOVector *qiov, int nb_sectors,
2982 BlockDriverCompletionFunc *cb, void *opaque)
2984 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2986 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
/*
 * Book-keeping for a batch of merged writes: one per-request callback entry
 * plus (not visible in this listing) counters for outstanding requests and
 * an aggregated error code.
 */
2991 typedef struct MultiwriteCB {
2996         BlockDriverCompletionFunc *cb;
/* qiov allocated by multiwrite_merge() that must be freed on completion. */
2998         QEMUIOVector *free_qiov;
/*
 * Deliver the batch result to every original caller and release any
 * merge-allocated qiovs.  mcb->error fans out to all callbacks.
 */
3002 static void multiwrite_user_cb(MultiwriteCB *mcb)
3006     for (i = 0; i < mcb->num_callbacks; i++) {
3007         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3008         if (mcb->callbacks[i].free_qiov) {
3009             qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
/* g_free(NULL) is a no-op, so unconditional free of the container is safe. */
3011         g_free(mcb->callbacks[i].free_qiov);
/*
 * Per-submitted-request completion: record the first error only, and fire
 * the user callbacks once the last outstanding request finishes.
 */
3015 static void multiwrite_cb(void *opaque, int ret)
3017     MultiwriteCB *mcb = opaque;
3019     trace_multiwrite_cb(mcb, ret);
/* Keep only the first failure; later errors would overwrite it otherwise. */
3021     if (ret < 0 && !mcb->error) {
3025     mcb->num_requests--;
3026     if (mcb->num_requests == 0) {
3027         multiwrite_user_cb(mcb);
3032 static int multiwrite_req_compare(const void *a, const void *b)
3034 const BlockRequest *req1 = a, *req2 = b;
3037 * Note that we can't simply subtract req2->sector from req1->sector
3038 * here as that could overflow the return value.
3040 if (req1->sector > req2->sector) {
3042 } else if (req1->sector < req2->sector) {
3050  * Takes a bunch of requests and tries to merge them. Returns the number of
3051  * requests that remain after merging.
/*
 * Adjacent (sequential or overlapping) requests are combined into a single
 * larger request with a freshly allocated qiov; the caller's callbacks are
 * still invoked individually via the MultiwriteCB.
 */
3053 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3054                             int num_reqs, MultiwriteCB *mcb)
3058     // Sort requests by start sector
3059     qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3061     // Check if adjacent requests touch the same clusters. If so, combine them,
3062     // filling up gaps with zero sectors.
3064     for (i = 1; i < num_reqs; i++) {
/* Last sector (exclusive) of the request we are currently accumulating into. */
3066         int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3068         // Handle exactly sequential writes and overlapping writes.
3069         if (reqs[i].sector <= oldreq_last) {
/* Don't merge past the host's per-request iovec limit. */
3073             if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3079             QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3080             qemu_iovec_init(qiov,
3081                 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3083             // Add the first request to the merged one. If the requests are
3084             // overlapping, drop the last sectors of the first request.
3085             size = (reqs[i].sector - reqs[outidx].sector) << 9;
3086             qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3088             // We should not need to add any zeros between the two requests
3089             assert (reqs[i].sector <= oldreq_last);
3091             // Add the second request
3092             qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3094             reqs[outidx].nb_sectors = qiov->size >> 9;
3095             reqs[outidx].qiov = qiov;
/* Remember the merged qiov so multiwrite_user_cb() can free it later. */
3097             mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
/* Not mergeable: compact the surviving request down into slot outidx. */
3100             reqs[outidx].sector     = reqs[i].sector;
3101             reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3102             reqs[outidx].qiov       = reqs[i].qiov;
3110  * Submit multiple AIO write requests at once.
3112  * On success, the function returns 0 and all requests in the reqs array have
3113  * been submitted. In error case this function returns -1, and any of the
3114  * requests may or may not be submitted yet. In particular, this means that the
3115  * callback will be called for some of the requests, for others it won't. The
3116  * caller must check the error field of the BlockRequest to wait for the right
3117  * callbacks (if error != 0, no callback will be called).
3119  * The implementation may modify the contents of the reqs array, e.g. to merge
3120  * requests. However, the fields opaque and error are left unmodified as they
3121  * are used to signal failure for a single request to the caller.
3123 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3128     /* don't submit writes if we don't have a medium */
3129     if (bs->drv == NULL) {
3130         for (i = 0; i < num_reqs; i++) {
3131             reqs[i].error = -ENOMEDIUM;
/* Nothing to do for an empty batch. */
3136     if (num_reqs == 0) {
3140     // Create MultiwriteCB structure
/* One flexible-array callback slot per original request. */
3141     mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3142     mcb->num_requests = 0;
3143     mcb->num_callbacks = num_reqs;
3145     for (i = 0; i < num_reqs; i++) {
3146         mcb->callbacks[i].cb = reqs[i].cb;
3147         mcb->callbacks[i].opaque = reqs[i].opaque;
3150     // Check for mergable requests
3151     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3153     trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3155     /* Run the aio requests. */
/* Set the counter before submitting so early completions can't hit zero. */
3156     mcb->num_requests = num_reqs;
3157     for (i = 0; i < num_reqs; i++) {
3158         bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3159                         reqs[i].nb_sectors, multiwrite_cb, mcb);
/* Cancel an in-flight AIO request by dispatching to its pool's handler. */
3165 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3167     acb->pool->cancel(acb);
3170 /* block I/O throttling */
/*
 * Decide whether this request would exceed the configured bytes-per-second
 * limit in the current time slice.  Returns true (and the suggested delay in
 * *wait, in ns) when the request must be throttled, false when it may run.
 */
3171 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3172                  bool is_write, double elapsed_time, uint64_t *wait)
3174     uint64_t bps_limit = 0;
3175     double   bytes_limit, bytes_base, bytes_res;
3176     double   slice_time, wait_time;
/* A total (read+write) limit takes precedence over the per-direction one. */
3178     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3179         bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3180     } else if (bs->io_limits.bps[is_write]) {
3181         bps_limit = bs->io_limits.bps[is_write];
/* Convert the slice length from ns to seconds for the budget computation. */
3190     slice_time = bs->slice_end - bs->slice_start;
3191     slice_time /= (NANOSECONDS_PER_SECOND);
3192     bytes_limit = bps_limit * slice_time;
3193     bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3194     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3195         bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3198     /* bytes_base: the bytes of data which have been read/written; and
3199      * it is obtained from the history statistic info.
3200      * bytes_res: the remaining bytes of data which need to be read/written.
3201      * (bytes_base + bytes_res) / bps_limit: used to calculate
3202      * the total time for completing reading/writing all data.
3204     bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
/* Within budget: let the request through with no delay. */
3206     if (bytes_base + bytes_res <= bytes_limit) {
3214     /* Calc approx time to dispatch */
3215     wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3217     /* When the I/O rate at runtime exceeds the limits,
3218      * bs->slice_end need to be extended in order that the current statistic
3219      * info can be kept until the timer fire, so it is increased and tuned
3220      * based on the result of experiment.
3222     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3223     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3225         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
/*
 * IOPS counterpart of bdrv_exceed_bps_limits(): returns true (with a
 * suggested delay in *wait) when issuing one more operation would exceed
 * the configured I/O-operations-per-second limit for this slice.
 */
3231 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3232                              double elapsed_time, uint64_t *wait)
3234     uint64_t iops_limit = 0;
3235     double   ios_limit, ios_base;
3236     double   slice_time, wait_time;
/* Total limit wins over the per-direction (read/write) limit. */
3238     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3239         iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3240     } else if (bs->io_limits.iops[is_write]) {
3241         iops_limit = bs->io_limits.iops[is_write];
3250     slice_time = bs->slice_end - bs->slice_start;
3251     slice_time /= (NANOSECONDS_PER_SECOND);
3252     ios_limit  = iops_limit * slice_time;
3253     ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3254     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3255         ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
/* "+1" accounts for the operation we are about to issue. */
3258     if (ios_base + 1 <= ios_limit) {
3266     /* Calc approx time to dispatch */
3267     wait_time = (ios_base + 1) / iops_limit;
3268     if (wait_time > elapsed_time) {
3269         wait_time = wait_time - elapsed_time;
/* Stretch the slice so the current statistics survive until the timer fires
 * (same experimentally tuned factors as in bdrv_exceed_bps_limits). */
3274     bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3275     bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3277         *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
/*
 * Combined throttling check: consult both the bps and iops limiters and
 * report the larger of the two suggested delays.  Also maintains the
 * current accounting slice (extends it while active, restarts it when it
 * has expired).
 */
3283 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3284                            bool is_write, int64_t *wait)
3286     int64_t  now, max_wait;
3287     uint64_t bps_wait = 0, iops_wait = 0;
3288     double   elapsed_time;
3289     int      bps_ret, iops_ret;
3291     now = qemu_get_clock_ns(vm_clock);
/* Still inside the current slice: just push the slice end forward. */
3292     if ((bs->slice_start < now)
3293         && (bs->slice_end > now)) {
3294         bs->slice_end = now + bs->slice_time;
/* Slice expired: start a fresh one and snapshot the counters as its base. */
3296         bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
3297         bs->slice_start = now;
3298         bs->slice_end   = bs->slice_start + bs->slice_time;
3300         bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
3301         bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3303         bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
3304         bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
3307     elapsed_time  = now - bs->slice_start;
3308     elapsed_time  /= (NANOSECONDS_PER_SECOND);
3310     bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
3311                                       is_write, elapsed_time, &bps_wait);
3312     iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3313                                        elapsed_time, &iops_wait);
/* Throttle if either limiter asked to; wait for the stricter one. */
3314     if (bps_ret || iops_ret) {
3315         max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
/* Make sure the slice covers the whole enforced wait period. */
3320         now = qemu_get_clock_ns(vm_clock);
3321         if (bs->slice_end < now + max_wait) {
3322             bs->slice_end = now + max_wait;
3335 /**************************************************************/
3336 /* async block device emulation */
/*
 * AIOCB used to emulate async I/O on top of a driver's synchronous
 * bdrv_read/bdrv_write: the request runs synchronously into a bounce
 * buffer and completion is deferred to a bottom half.
 */
3338 typedef struct BlockDriverAIOCBSync {
3339     BlockDriverAIOCB common;
3342     /* vector translation state */
3346 } BlockDriverAIOCBSync;
/* "Cancel" for the sync emulation: the I/O already happened synchronously,
 * so just drop the pending completion BH and release the AIOCB. */
3348 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3350     BlockDriverAIOCBSync *acb =
3351         container_of(blockacb, BlockDriverAIOCBSync, common);
3352     qemu_bh_delete(acb->bh);
3354     qemu_aio_release(acb);
3357 static AIOPool bdrv_em_aio_pool = {
3358     .aiocb_size         = sizeof(BlockDriverAIOCBSync),
3359     .cancel             = bdrv_aio_cancel_em,
/*
 * Bottom half: copy read data out of the bounce buffer, invoke the user
 * callback, and tear down the AIOCB.
 */
3362 static void bdrv_aio_bh_cb(void *opaque)
3364     BlockDriverAIOCBSync *acb = opaque;
3367         qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3368     qemu_vfree(acb->bounce);
3369     acb->common.cb(acb->common.opaque, acb->ret);
3370     qemu_bh_delete(acb->bh);
3372     qemu_aio_release(acb);
/*
 * Emulated async read/write: run the driver's synchronous handler through
 * a bounce buffer and schedule a BH so completion still looks asynchronous
 * to the caller.
 */
3375 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3379                                             BlockDriverCompletionFunc *cb,
3384     BlockDriverAIOCBSync *acb;
3386     acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3387     acb->is_write = is_write;
3389     acb->bounce = qemu_blockalign(bs, qiov->size);
3390     acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
/* Write path: gather the scattered iovec into the bounce buffer first. */
3393         qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3394         acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3396         acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3399     qemu_bh_schedule(acb->bh);
3401     return &acb->common;
/* Thin read wrapper: is_write = 0. */
3404 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3405         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3406         BlockDriverCompletionFunc *cb, void *opaque)
3408     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
/* Thin write wrapper: is_write = 1. */
3411 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3412         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3413         BlockDriverCompletionFunc *cb, void *opaque)
3415     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
/*
 * AIOCB used to run AIO requests through the coroutine-based I/O path.
 * The request parameters live in acb->req; completion is delivered from a
 * bottom half after the coroutine finishes.
 */
3419 typedef struct BlockDriverAIOCBCoroutine {
3420     BlockDriverAIOCB common;
3424 } BlockDriverAIOCBCoroutine;
/* Cancellation handler for the coroutine pool (body not in this listing). */
3426 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3431 static AIOPool bdrv_em_co_aio_pool = {
3432     .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
3433     .cancel             = bdrv_aio_co_cancel_em,
/* BH run after the coroutine completes: invoke the user callback with the
 * stored error code, then free the BH and the AIOCB. */
3436 static void bdrv_co_em_bh(void *opaque)
3438     BlockDriverAIOCBCoroutine *acb = opaque;
3440     acb->common.cb(acb->common.opaque, acb->req.error);
3441     qemu_bh_delete(acb->bh);
3442     qemu_aio_release(acb);
3445 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3446 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3448     BlockDriverAIOCBCoroutine *acb = opaque;
3449     BlockDriverState *bs = acb->common.bs;
3451     if (!acb->is_write) {
3452         acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3453             acb->req.nb_sectors, acb->req.qiov, 0);
3455         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3456             acb->req.nb_sectors, acb->req.qiov, 0);
/* Defer the completion callback to a BH so it never runs in coroutine
 * context. */
3459     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3460     qemu_bh_schedule(acb->bh);
/*
 * Common AIO front-end over the coroutine I/O path: stash the request in
 * the AIOCB, then spawn a coroutine running bdrv_co_do_rw().
 */
3463 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3467                                                BlockDriverCompletionFunc *cb,
3472     BlockDriverAIOCBCoroutine *acb;
3474     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3475     acb->req.sector = sector_num;
3476     acb->req.nb_sectors = nb_sectors;
3477     acb->req.qiov = qiov;
3478     acb->is_write = is_write;
3480     co = qemu_coroutine_create(bdrv_co_do_rw);
3481     qemu_coroutine_enter(co, acb);
3483     return &acb->common;
/* Coroutine entry point for async flush: run the flush, then complete via
 * the shared BH handler. */
3486 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3488     BlockDriverAIOCBCoroutine *acb = opaque;
3489     BlockDriverState *bs = acb->common.bs;
3491     acb->req.error = bdrv_co_flush(bs);
3492     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3493     qemu_bh_schedule(acb->bh);
/* Public async flush: allocate an AIOCB and kick off the flush coroutine. */
3496 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3497         BlockDriverCompletionFunc *cb, void *opaque)
3499     trace_bdrv_aio_flush(bs, opaque);
3502     BlockDriverAIOCBCoroutine *acb;
3504     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3505     co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3506     qemu_coroutine_enter(co, acb);
3508     return &acb->common;
/* Coroutine entry point for async discard. */
3511 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3513     BlockDriverAIOCBCoroutine *acb = opaque;
3514     BlockDriverState *bs = acb->common.bs;
3516     acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3517     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3518     qemu_bh_schedule(acb->bh);
/* Public async discard: same pattern as bdrv_aio_flush() with a sector
 * range stored in the AIOCB. */
3521 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3522         int64_t sector_num, int nb_sectors,
3523         BlockDriverCompletionFunc *cb, void *opaque)
3526     BlockDriverAIOCBCoroutine *acb;
3528     trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3530     acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3531     acb->req.sector = sector_num;
3532     acb->req.nb_sectors = nb_sectors;
3533     co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3534     qemu_coroutine_enter(co, acb);
3536     return &acb->common;
/* Register all built-in block drivers via the module init mechanism. */
3539 void bdrv_init(void)
3541     module_call_init(MODULE_INIT_BLOCK);
/* Same as bdrv_init(), but additionally enables the format whitelist. */
3544 void bdrv_init_with_whitelist(void)
3546     use_bdrv_whitelist = 1;
/*
 * Allocate an AIOCB from @pool's free list, falling back to g_malloc0 when
 * the list is empty, and initialise the common fields.
 */
3550 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3551                    BlockDriverCompletionFunc *cb, void *opaque)
3553     BlockDriverAIOCB *acb;
3555     if (pool->free_aiocb) {
/* Pop the head of the per-pool free list. */
3556         acb = pool->free_aiocb;
3557         pool->free_aiocb = acb->next;
3559         acb = g_malloc0(pool->aiocb_size);
3564     acb->opaque = opaque;
/* Return an AIOCB to its pool's free list (memory is never freed back to
 * the allocator, only recycled). */
3568 void qemu_aio_release(void *p)
3570     BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3571     AIOPool *pool = acb->pool;
3572     acb->next = pool->free_aiocb;
3573     pool->free_aiocb = acb;
3576 /**************************************************************/
3577 /* Coroutine block device emulation */
/* Completion token: lets an AIO callback wake the coroutine that issued
 * the request. */
3579 typedef struct CoroutineIOCompletion {
3580     Coroutine *coroutine;
3582 } CoroutineIOCompletion;
/* AIO completion callback: store the result and re-enter the waiting
 * coroutine. */
3584 static void bdrv_co_io_em_complete(void *opaque, int ret)
3586     CoroutineIOCompletion *co = opaque;
3589     qemu_coroutine_enter(co->coroutine, NULL);
/*
 * Emulate coroutine I/O on top of a driver's AIO interface: submit the
 * request with bdrv_co_io_em_complete() as callback, then yield until it
 * re-enters us.
 */
3592 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3593                                       int nb_sectors, QEMUIOVector *iov,
3596     CoroutineIOCompletion co = {
3597         .coroutine = qemu_coroutine_self(),
3599     BlockDriverAIOCB *acb;
3602         acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3603                                        bdrv_co_io_em_complete, &co);
3605         acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3606                                       bdrv_co_io_em_complete, &co);
3609     trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
/* Sleep here until bdrv_co_io_em_complete() re-enters the coroutine. */
3613     qemu_coroutine_yield();
/* Read wrapper for the emulation above (is_write = false). */
3618 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3619                                          int64_t sector_num, int nb_sectors,
3622     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
/* Write wrapper for the emulation above (is_write = true). */
3625 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3626                                          int64_t sector_num, int nb_sectors,
3629     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
/* Coroutine trampoline used by the synchronous bdrv_flush() wrapper. */
3632 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3634     RwCo *rwco = opaque;
3636     rwco->ret = bdrv_co_flush(rwco->bs);
/*
 * Flush a device in coroutine context.  Order of operations:
 * 1. flush driver-internal caches to the OS (always, even cache=unsafe);
 * 2. stop if BDRV_O_NO_FLUSH (cache=unsafe skips the disk flush);
 * 3. flush to disk via bdrv_co_flush_to_disk, or emulate it over
 *    bdrv_aio_flush; drivers with neither are treated as success (see the
 *    long comment below);
 * 4. recurse into bs->file to flush the underlying protocol.
 */
3639 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
/* Nothing to flush without a medium, and read-only devices have no dirty
 * data of ours. */
3643     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3647     /* Write back cached data to the OS even with cache=unsafe */
3648     if (bs->drv->bdrv_co_flush_to_os) {
3649         ret = bs->drv->bdrv_co_flush_to_os(bs);
3655     /* But don't actually force it to the disk with cache=unsafe */
3656     if (bs->open_flags & BDRV_O_NO_FLUSH) {
3660     if (bs->drv->bdrv_co_flush_to_disk) {
3661         ret = bs->drv->bdrv_co_flush_to_disk(bs);
3662     } else if (bs->drv->bdrv_aio_flush) {
/* Emulate a coroutine flush on top of the driver's AIO flush. */
3663         BlockDriverAIOCB *acb;
3664         CoroutineIOCompletion co = {
3665             .coroutine = qemu_coroutine_self(),
3668         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3672             qemu_coroutine_yield();
3677          * Some block drivers always operate in either writethrough or unsafe
3678          * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3679          * know how the server works (because the behaviour is hardcoded or
3680          * depends on server-side configuration), so we can't ensure that
3681          * everything is safe on disk. Returning an error doesn't work because
3682          * that would break guests even if the server operates in writethrough
3685          * Let's hope the user knows what he's doing.
3693     /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3694      * in the case of cache=unsafe, so there are no useless flushes.
3696     return bdrv_co_flush(bs->file);
/* Ask the driver to drop any cached metadata/data for this device, if it
 * supports doing so (used e.g. after incoming migration). */
3699 void bdrv_invalidate_cache(BlockDriverState *bs)
3701     if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3702         bs->drv->bdrv_invalidate_cache(bs);
/* Invalidate caches on every registered BlockDriverState. */
3706 void bdrv_invalidate_cache_all(void)
3708     BlockDriverState *bs;
3710     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3711         bdrv_invalidate_cache(bs);
/* Clear the "incoming migration" flag on all devices once migration has
 * completed. */
3715 void bdrv_clear_incoming_migration_all(void)
3717     BlockDriverState *bs;
3719     QTAILQ_FOREACH(bs, &bdrv_states, list) {
3720         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
/*
 * Synchronous flush wrapper around bdrv_co_flush(): run directly if we are
 * already in a coroutine, otherwise spawn one and poll until it finishes.
 */
3724 int bdrv_flush(BlockDriverState *bs)
3732     if (qemu_in_coroutine()) {
3733         /* Fast-path if already in coroutine context */
3734         bdrv_flush_co_entry(&rwco);
3736         co = qemu_coroutine_create(bdrv_flush_co_entry);
3737         qemu_coroutine_enter(co, &rwco);
/* NOT_DONE sentinel: keep servicing events until the coroutine stores a
 * real result. */
3738         while (rwco.ret == NOT_DONE) {
/* Coroutine trampoline used by the synchronous bdrv_discard() wrapper. */
3746 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3748     RwCo *rwco = opaque;
3750     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
/*
 * Discard (unmap) a sector range in coroutine context.  Validates the
 * request and read-only state, then uses the driver's native coroutine
 * discard, or emulates it over the AIO discard interface; drivers with
 * neither simply succeed (discard is advisory).
 */
3753 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3758     } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3760     } else if (bs->read_only) {
3762     } else if (bs->drv->bdrv_co_discard) {
3763         return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3764     } else if (bs->drv->bdrv_aio_discard) {
/* Emulate: submit the AIO discard and yield until the completion callback
 * re-enters this coroutine. */
3765         BlockDriverAIOCB *acb;
3766         CoroutineIOCompletion co = {
3767             .coroutine = qemu_coroutine_self(),
3770         acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3771                                         bdrv_co_io_em_complete, &co);
3775             qemu_coroutine_yield();
/*
 * Synchronous discard wrapper, same pattern as bdrv_flush(): direct call
 * in coroutine context, otherwise spawn a coroutine and poll to completion.
 */
3783 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3788         .sector_num = sector_num,
3789         .nb_sectors = nb_sectors,
3793     if (qemu_in_coroutine()) {
3794         /* Fast-path if already in coroutine context */
3795         bdrv_discard_co_entry(&rwco);
3797         co = qemu_coroutine_create(bdrv_discard_co_entry);
3798         qemu_coroutine_enter(co, &rwco);
3799         while (rwco.ret == NOT_DONE) {
3807 /**************************************************************/
3808 /* removable device support */
3811  * Return TRUE if the media is present
3813 int bdrv_is_inserted(BlockDriverState *bs)
3815     BlockDriver *drv = bs->drv;
/* Drivers that don't implement the hook are treated as always inserted. */
3819     if (!drv->bdrv_is_inserted)
3821     return drv->bdrv_is_inserted(bs);
3825  * Return whether the media changed since the last call to this
3826  * function, or -ENOTSUP if we don't know. Most drivers don't know.
3828 int bdrv_media_changed(BlockDriverState *bs)
3830     BlockDriver *drv = bs->drv;
3832     if (drv && drv->bdrv_media_changed) {
3833         return drv->bdrv_media_changed(bs);
3839  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3841 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3843     BlockDriver *drv = bs->drv;
3845     if (drv && drv->bdrv_eject) {
3846         drv->bdrv_eject(bs, eject_flag);
/* Emit the QMP DEVICE_TRAY_MOVED event only for named (guest-visible)
 * devices. */
3849     if (bs->device_name[0] != '\0') {
3850         bdrv_emit_qmp_eject_event(bs, eject_flag);
3855  * Lock or unlock the media (if it is locked, the user won't be able
3856  * to eject it manually).
3858 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3860     BlockDriver *drv = bs->drv;
3862     trace_bdrv_lock_medium(bs, locked);
3864     if (drv && drv->bdrv_lock_medium) {
3865         drv->bdrv_lock_medium(bs, locked);
3869 /* needed for generic scsi interface */
/* Synchronous passthrough ioctl; delegates to the driver when supported. */
3871 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3873     BlockDriver *drv = bs->drv;
3875     if (drv && drv->bdrv_ioctl)
3876         return drv->bdrv_ioctl(bs, req, buf);
/* Asynchronous passthrough ioctl counterpart of bdrv_ioctl(). */
3880 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3881         unsigned long int req, void *buf,
3882         BlockDriverCompletionFunc *cb, void *opaque)
3884     BlockDriver *drv = bs->drv;
3886     if (drv && drv->bdrv_aio_ioctl)
3887         return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
/* Record the buffer alignment the device requires for I/O buffers. */
3891 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3893     bs->buffer_alignment = align;
/* Allocate an I/O buffer honouring the device's alignment requirement;
 * falls back to 512-byte alignment when none is set (or bs is NULL). */
3896 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3898     return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
/*
 * Enable or disable dirty-sector tracking.  Enabling allocates a bitmap
 * with one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors; disabling frees
 * it.  The dirty count is reset in both cases.
 */
3901 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3903     int64_t bitmap_size;
3905     bs->dirty_count = 0;
3907         if (!bs->dirty_bitmap) {
/* Round the chunk count up so the last partial chunk gets a bit too. */
3908             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3909                     BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3910             bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3912             bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
3915         if (bs->dirty_bitmap) {
3916             g_free(bs->dirty_bitmap);
3917             bs->dirty_bitmap = NULL;
/* Test whether the chunk containing @sector is marked dirty; sectors past
 * EOF (or with tracking disabled) report clean. */
3922 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3924     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3926     if (bs->dirty_bitmap &&
3927         (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3928         return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3929             (1UL << (chunk % (sizeof(unsigned long) * 8))));
/* Clear the dirty bits covering the given sector range. */
3935 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3938     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
/* Number of dirty chunks currently set in the bitmap. */
3941 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3943     return bs->dirty_count;
/* Mark/unmark the device as owned by a block job; the assert catches
 * double acquisition or release. */
3946 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3948     assert(bs->in_use != in_use);
3949     bs->in_use = in_use;
3952 int bdrv_in_use(BlockDriverState *bs)
/* Turn on I/O status reporting for this device, starting in the OK state. */
3957 void bdrv_iostatus_enable(BlockDriverState *bs)
3959     bs->iostatus_enabled = true;
3960     bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3963 /* The I/O status is only enabled if the drive explicitly
3964  * enables it _and_ the VM is configured to stop on errors */
3965 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3967     return (bs->iostatus_enabled &&
3968            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3969             bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
3970             bs->on_read_error == BLOCK_ERR_STOP_ANY));
/* Turn off I/O status reporting. */
3973 void bdrv_iostatus_disable(BlockDriverState *bs)
3975     bs->iostatus_enabled = false;
/* Reset the status back to OK (e.g. after the user resumes the VM). */
3978 void bdrv_iostatus_reset(BlockDriverState *bs)
3980     if (bdrv_iostatus_is_enabled(bs)) {
3981         bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3985 /* XXX: Today this is set by device models because it makes the implementation
3986    quite simple. However, the block layer knows about the error, so it's
3987    possible to implement this without device models being involved */
/* Latch the first error only: once failed/nospace, later errors don't
 * overwrite the status until it is reset. */
3988 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3990     if (bdrv_iostatus_is_enabled(bs) &&
3991         bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3993         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3994                                          BLOCK_DEVICE_IO_STATUS_FAILED;
/* Begin accounting one I/O: stash size, start timestamp and type in the
 * caller-provided cookie. */
3999 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4000         enum BlockAcctType type)
4002     assert(type < BDRV_MAX_IOTYPE);
4004     cookie->bytes = bytes;
4005     cookie->start_time_ns = get_clock();
4006     cookie->type = type;
/* Finish accounting: fold the cookie's byte count, one op, and elapsed
 * time into the per-type device statistics. */
4010 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4012     assert(cookie->type < BDRV_MAX_IOTYPE);
4014     bs->nr_bytes[cookie->type] += cookie->bytes;
4015     bs->nr_ops[cookie->type]++;
4016     bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
/*
 * Create a new disk image (the backend of "qemu-img create"): resolve
 * format and protocol drivers, build and validate the option list (size,
 * backing file/format), infer the size from the backing file when needed,
 * and finally call the driver's create routine.  Returns 0 or -errno;
 * errors are also reported to the user via error_report().
 */
4019 int bdrv_img_create(const char *filename, const char *fmt,
4020                     const char *base_filename, const char *base_fmt,
4021                     char *options, uint64_t img_size, int flags)
4023     QEMUOptionParameter *param = NULL, *create_options = NULL;
4024     QEMUOptionParameter *backing_fmt, *backing_file, *size;
4025     BlockDriverState *bs = NULL;
4026     BlockDriver *drv, *proto_drv;
4027     BlockDriver *backing_drv = NULL;
4030     /* Find driver and parse its options */
4031     drv = bdrv_find_format(fmt);
4033         error_report("Unknown file format '%s'", fmt);
4038     proto_drv = bdrv_find_protocol(filename);
4040         error_report("Unknown protocol '%s'", filename);
/* The option set is the union of format-level and protocol-level options. */
4045     create_options = append_option_parameters(create_options,
4046                                               drv->create_options);
4047     create_options = append_option_parameters(create_options,
4048                                               proto_drv->create_options);
4050     /* Create parameter list with default values */
4051     param = parse_option_parameters("", create_options, param);
4053     set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4055     /* Parse -o options */
4057         param = parse_option_parameters(options, create_options, param);
4058         if (param == NULL) {
4059             error_report("Invalid options for file format '%s'.", fmt);
/* Explicit -b/-F arguments override anything given via -o. */
4065     if (base_filename) {
4066         if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4068             error_report("Backing file not supported for file format '%s'",
4076         if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4077             error_report("Backing file format not supported for file "
4078                          "format '%s'", fmt);
/* An image backed by itself would recurse forever — reject it. */
4084     backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4085     if (backing_file && backing_file->value.s) {
4086         if (!strcmp(filename, backing_file->value.s)) {
4087             error_report("Error: Trying to create an image with the "
4088                          "same filename as the backing file");
4094     backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4095     if (backing_fmt && backing_fmt->value.s) {
4096         backing_drv = bdrv_find_format(backing_fmt->value.s);
4098             error_report("Unknown backing file format '%s'",
4099                          backing_fmt->value.s);
4105     // The size for the image must always be specified, with one exception:
4106     // If we are using a backing file, we can obtain the size from there
4107     size = get_option_parameter(param, BLOCK_OPT_SIZE);
4108     if (size && size->value.n == -1) {
4109         if (backing_file && backing_file->value.s) {
4114             /* backing files always opened read-only */
4116                 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4120             ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4122                 error_report("Could not open '%s'", backing_file->value.s);
/* Inherit the backing file's size for the new image. */
4125             bdrv_get_geometry(bs, &size);
4128             snprintf(buf, sizeof(buf), "%" PRId64, size);
4129             set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4131             error_report("Image creation needs a size parameter");
4137     printf("Formatting '%s', fmt=%s ", filename, fmt);
4138     print_option_parameters(param);
4141     ret = bdrv_create(drv, filename, param);
/* Map the common failure modes to friendlier messages. */
4144         if (ret == -ENOTSUP) {
4145             error_report("Formatting or formatting option not supported for "
4146                          "file format '%s'", fmt);
4147         } else if (ret == -EFBIG) {
4148             error_report("The image size is too large for file format '%s'",
4151             error_report("%s: error while creating %s: %s", filename, fmt,
/* Common cleanup path: release option lists (and, not visible here, any
 * opened backing BlockDriverState). */
4157     free_option_parameters(create_options);
4158     free_option_parameters(param);
/*
 * Create a block job on @bs: fails with DEVICE_IN_USE if a job is already
 * running or the device is otherwise in use; on success marks the device
 * in-use and returns the newly allocated job (size taken from @job_type).
 * A non-zero @speed is applied immediately and may fail the creation.
 */
4167 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4168                        int64_t speed, BlockDriverCompletionFunc *cb,
4169                        void *opaque, Error **errp)
4173     if (bs->job || bdrv_in_use(bs)) {
4174         error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4177     bdrv_set_in_use(bs, 1);
4179     job = g_malloc0(job_type->instance_size);
4180     job->job_type      = job_type;
4183     job->opaque        = opaque;
4187     /* Only set speed when necessary to avoid NotSupported error */
4189         Error *local_err = NULL;
4191         block_job_set_speed(job, speed, &local_err);
/* Speed rejected: undo the creation entirely before propagating. */
4192         if (error_is_set(&local_err)) {
4195             bdrv_set_in_use(bs, 0);
4196             error_propagate(errp, local_err);
/* Deliver the job's final result to its owner and release the device. */
4203 void block_job_complete(BlockJob *job, int ret)
4205     BlockDriverState *bs = job->bs;
4207     assert(bs->job == job);
4208     job->cb(job->opaque, ret);
4211     bdrv_set_in_use(bs, 0);
/* Change the job's throttling speed via its type-specific hook; reports
 * NOT_SUPPORTED when the job type has none. */
4214 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4216     Error *local_err = NULL;
4218     if (!job->job_type->set_speed) {
4219         error_set(errp, QERR_NOT_SUPPORTED);
4222     job->job_type->set_speed(job, speed, &local_err);
4223     if (error_is_set(&local_err)) {
4224         error_propagate(errp, local_err);
/* Request cancellation; if the job's coroutine is sleeping (not busy),
 * wake it so it can observe the flag promptly. */
4231 void block_job_cancel(BlockJob *job)
4233     job->cancelled = true;
4234     if (job->co && !job->busy) {
4235         qemu_coroutine_enter(job->co, NULL);
4239 bool block_job_is_cancelled(BlockJob *job)
4241     return job->cancelled;
/* State shared between block_job_cancel_sync() and its interposed
 * completion callback. */
4244 struct BlockCancelData {
4246     BlockDriverCompletionFunc *cb;
/* Interposed callback: record whether the job was cancelled and its return
 * value, then chain to the original callback. */
4252 static void block_job_cancel_cb(void *opaque, int ret)
4254     struct BlockCancelData *data = opaque;
4256     data->cancelled = block_job_is_cancelled(data->job);
4258     data->cb(data->opaque, ret);
/*
 * Cancel a job and busy-wait (servicing events) until it completes.
 * Returns -ECANCELED when the job was cancelled before finishing its work,
 * otherwise the job's own return value.
 */
4261 int block_job_cancel_sync(BlockJob *job)
4263     struct BlockCancelData data;
4264     BlockDriverState *bs = job->bs;
4266     assert(bs->job == job);
4268     /* Set up our own callback to store the result and chain to
4269      * the original callback.
4273     data.opaque = job->opaque;
4274     data.ret = -EINPROGRESS;
4275     job->cb = block_job_cancel_cb;
4276     job->opaque = &data;
4277     block_job_cancel(job);
/* -EINPROGRESS doubles as the "not finished yet" sentinel. */
4278     while (data.ret == -EINPROGRESS) {
4281     return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
4284 void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4286 /* Check cancellation *before* setting busy = false, too! */
4287 if (!block_job_is_cancelled(job)) {
4289 co_sleep_ns(clock, ns);