2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include "config-host.h"
25 #include "qemu-common.h"
28 #include "block_int.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
36 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
49 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79 BlockDriverCompletionFunc *cb,
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
106 static int is_windows_drive_prefix(const char *filename)
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
113 int is_windows_drive(const char *filename)
115 if (is_windows_drive_prefix(filename) &&
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
125 /* throttling disk I/O limits */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
128 bs->io_limits_enabled = false;
130 while (qemu_co_queue_next(&bs->throttled_reqs));
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
144 static void bdrv_block_timer(void *opaque)
146 BlockDriverState *bs = opaque;
148 qemu_co_queue_next(&bs->throttled_reqs);
151 void bdrv_io_limits_enable(BlockDriverState *bs)
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
176 int64_t wait_time = -1;
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
182 /* We want to preserve each request's ordering, FIFO style. The next
183 * throttled request will not be dequeued until the current request is
184 * allowed to be serviced. So if the current request still exceeds the
185 * limits, it is re-inserted at the head of the queue, and all requests
186 * following it stay in the throttled_reqs queue.
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
195 qemu_co_queue_next(&bs->throttled_reqs);
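/*
 * Illustrative sketch of how the request paths use the intercept helper
 * (mirroring bdrv_co_do_readv()/bdrv_co_do_writev() below): a request only
 * proceeds once the configured limits allow it.
 *
 *     if (bs->io_limits_enabled) {
 *         bdrv_io_limits_intercept(bs, is_write, nb_sectors);
 *     }
 *     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 */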
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
202 if (is_windows_drive(path) ||
203 is_windows_drive_prefix(path)) {
208 return strchr(path, ':') != NULL;
211 int path_is_absolute(const char *path)
215 /* specific case for names like: "\\.\d:" */
216 if (*path == '/' || *path == '\\')
219 p = strchr(path, ':');
225 return (*p == '/' || *p == '\\');
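/*
 * For example (illustrative only):
 *
 *     path_has_protocol("nbd:localhost:10809")  -> 1
 *     path_has_protocol("/images/disk.img")     -> 0
 *     path_is_absolute("/images/disk.img")      -> 1
 *     path_is_absolute("backing.qcow2")         -> 0
 */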
231 /* if filename is absolute, just copy it to dest. Otherwise, build a
232 path to it by considering it relative to base_path. URLs are
234 void path_combine(char *dest, int dest_size,
235 const char *base_path,
236 const char *filename)
243 if (path_is_absolute(filename)) {
244 pstrcpy(dest, dest_size, filename);
246 p = strchr(base_path, ':');
251 p1 = strrchr(base_path, '/');
255 p2 = strrchr(base_path, '\\');
267 if (len > dest_size - 1)
269 memcpy(dest, base_path, len);
271 pstrcat(dest, dest_size, filename);
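/*
 * For example (illustrative only): combining a relative backing file name
 * with the path of its overlay,
 *
 *     path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
 *
 * yields "/images/base.qcow2"; an absolute filename is copied to dest
 * unchanged.
 */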
275 void bdrv_register(BlockDriver *bdrv)
277 /* Block drivers without coroutine functions need emulation */
278 if (!bdrv->bdrv_co_readv) {
279 bdrv->bdrv_co_readv = bdrv_co_readv_em;
280 bdrv->bdrv_co_writev = bdrv_co_writev_em;
282 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
283 * the block driver lacks aio we need to emulate that too.
285 if (!bdrv->bdrv_aio_readv) {
286 /* add AIO emulation layer */
287 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
288 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
292 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
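/*
 * A format driver typically registers itself from a module init function.
 * Minimal sketch, using hypothetical names ("mydisk" is not a real driver):
 *
 *     static BlockDriver bdrv_mydisk = {
 *         .format_name   = "mydisk",
 *         .instance_size = sizeof(BDRVMydiskState),
 *         .bdrv_open     = mydisk_open,
 *         .bdrv_close    = mydisk_close,
 *         .bdrv_co_readv = mydisk_co_readv,
 *     };
 *
 *     static void bdrv_mydisk_init(void)
 *     {
 *         bdrv_register(&bdrv_mydisk);
 *     }
 *     block_init(bdrv_mydisk_init);
 */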
295 /* create a new block device (by default it is empty) */
296 BlockDriverState *bdrv_new(const char *device_name)
298 BlockDriverState *bs;
300 bs = g_malloc0(sizeof(BlockDriverState));
301 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
302 if (device_name[0] != '\0') {
303 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
305 bdrv_iostatus_disable(bs);
309 BlockDriver *bdrv_find_format(const char *format_name)
312 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
313 if (!strcmp(drv1->format_name, format_name)) {
320 static int bdrv_is_whitelisted(BlockDriver *drv)
322 static const char *whitelist[] = {
323 CONFIG_BDRV_WHITELIST
328 return 1; /* no whitelist, anything goes */
330 for (p = whitelist; *p; p++) {
331 if (!strcmp(drv->format_name, *p)) {
338 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
340 BlockDriver *drv = bdrv_find_format(format_name);
341 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
344 typedef struct CreateCo {
347 QEMUOptionParameter *options;
351 static void coroutine_fn bdrv_create_co_entry(void *opaque)
353 CreateCo *cco = opaque;
356 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
359 int bdrv_create(BlockDriver *drv, const char* filename,
360 QEMUOptionParameter *options)
367 .filename = g_strdup(filename),
372 if (!drv->bdrv_create) {
376 if (qemu_in_coroutine()) {
377 /* Fast-path if already in coroutine context */
378 bdrv_create_co_entry(&cco);
380 co = qemu_coroutine_create(bdrv_create_co_entry);
381 qemu_coroutine_enter(co, &cco);
382 while (cco.ret == NOT_DONE) {
388 g_free(cco.filename);
393 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
397 drv = bdrv_find_protocol(filename);
402 return bdrv_create(drv, filename, options);
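/*
 * Illustrative sketch of creating an image through a named format driver,
 * mirroring the temporary-snapshot path in bdrv_open() below (size and
 * file name are made up):
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *options =
 *         parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(options, BLOCK_OPT_SIZE, 64 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", options);
 *     free_option_parameters(options);
 */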
406 void get_tmp_filename(char *filename, int size)
408 char temp_dir[MAX_PATH];
410 GetTempPath(MAX_PATH, temp_dir);
411 GetTempFileName(temp_dir, "qem", 0, filename);
414 void get_tmp_filename(char *filename, int size)
418 /* XXX: race condition possible */
419 tmpdir = getenv("TMPDIR");
422 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
423 fd = mkstemp(filename);
429 * Detect host devices. By convention, /dev/cdrom[N] is always
430 * recognized as a host CDROM.
432 static BlockDriver *find_hdev_driver(const char *filename)
434 int score_max = 0, score;
435 BlockDriver *drv = NULL, *d;
437 QLIST_FOREACH(d, &bdrv_drivers, list) {
438 if (d->bdrv_probe_device) {
439 score = d->bdrv_probe_device(filename);
440 if (score > score_max) {
450 BlockDriver *bdrv_find_protocol(const char *filename)
457 /* TODO Drivers without bdrv_file_open must be specified explicitly */
460 * XXX(hch): we really should not let host device detection
461 * override an explicit protocol specification, but moving this
462 * later breaks access to device names with colons in them.
463 * Thanks to the brain-dead persistent naming schemes on udev-
464 * based Linux systems those actually are quite common.
466 drv1 = find_hdev_driver(filename);
471 if (!path_has_protocol(filename)) {
472 return bdrv_find_format("file");
474 p = strchr(filename, ':');
477 if (len > sizeof(protocol) - 1)
478 len = sizeof(protocol) - 1;
479 memcpy(protocol, filename, len);
480 protocol[len] = '\0';
481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
482 if (drv1->protocol_name &&
483 !strcmp(drv1->protocol_name, protocol)) {
490 static int find_image_format(const char *filename, BlockDriver **pdrv)
492 int ret, score, score_max;
493 BlockDriver *drv1, *drv;
495 BlockDriverState *bs;
497 ret = bdrv_file_open(&bs, filename, 0);
503 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
504 if (bs->sg || !bdrv_is_inserted(bs)) {
506 drv = bdrv_find_format("raw");
514 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
523 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
524 if (drv1->bdrv_probe) {
525 score = drv1->bdrv_probe(buf, ret, filename);
526 if (score > score_max) {
540 * Set the current 'total_sectors' value
542 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
544 BlockDriver *drv = bs->drv;
546 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
550 /* query actual device if possible, otherwise just trust the hint */
551 if (drv->bdrv_getlength) {
552 int64_t length = drv->bdrv_getlength(bs);
556 hint = length >> BDRV_SECTOR_BITS;
559 bs->total_sectors = hint;
564 * Set open flags for a given cache mode
566 * Return 0 on success, -1 if the cache mode was invalid.
568 int bdrv_parse_cache_flags(const char *mode, int *flags)
570 *flags &= ~BDRV_O_CACHE_MASK;
572 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
573 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
574 } else if (!strcmp(mode, "directsync")) {
575 *flags |= BDRV_O_NOCACHE;
576 } else if (!strcmp(mode, "writeback")) {
577 *flags |= BDRV_O_CACHE_WB;
578 } else if (!strcmp(mode, "unsafe")) {
579 *flags |= BDRV_O_CACHE_WB;
580 *flags |= BDRV_O_NO_FLUSH;
581 } else if (!strcmp(mode, "writethrough")) {
582 /* this is the default */
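/*
 * For example (illustrative only):
 *
 *     int flags = 0;
 *     bdrv_parse_cache_flags("unsafe", &flags);
 *
 * leaves BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH set in flags, whereas "none"
 * would have set BDRV_O_NOCACHE | BDRV_O_CACHE_WB instead.
 */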
591 * The copy-on-read flag is actually a reference count so multiple users may
592 * use the feature without worrying about clobbering its previous state.
593 * Copy-on-read stays enabled until all users have disabled it again.
595 void bdrv_enable_copy_on_read(BlockDriverState *bs)
600 void bdrv_disable_copy_on_read(BlockDriverState *bs)
602 assert(bs->copy_on_read > 0);
607 * Common part for opening disk images and files
609 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
610 int flags, BlockDriver *drv)
616 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
619 bs->total_sectors = 0;
623 bs->open_flags = flags;
625 bs->buffer_alignment = 512;
627 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
628 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
629 bdrv_enable_copy_on_read(bs);
632 pstrcpy(bs->filename, sizeof(bs->filename), filename);
633 bs->backing_file[0] = '\0';
635 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
640 bs->opaque = g_malloc0(drv->instance_size);
642 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
645 * Clear flags that are internal to the block layer before opening the
648 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
651 * Snapshots should be writable.
653 if (bs->is_temporary) {
654 open_flags |= BDRV_O_RDWR;
657 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
659 /* Open the image, either directly or using a protocol */
660 if (drv->bdrv_file_open) {
661 ret = drv->bdrv_file_open(bs, filename, open_flags);
663 ret = bdrv_file_open(&bs->file, filename, open_flags);
665 ret = drv->bdrv_open(bs, open_flags);
673 ret = refresh_total_sectors(bs, bs->total_sectors);
679 if (bs->is_temporary) {
687 bdrv_delete(bs->file);
697 * Opens a file using a protocol (file, host_device, nbd, ...)
699 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
701 BlockDriverState *bs;
705 drv = bdrv_find_protocol(filename);
711 ret = bdrv_open_common(bs, filename, flags, drv);
722 * Opens a disk image (raw, qcow2, vmdk, ...)
724 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
728 char tmp_filename[PATH_MAX];
730 if (flags & BDRV_O_SNAPSHOT) {
731 BlockDriverState *bs1;
734 BlockDriver *bdrv_qcow2;
735 QEMUOptionParameter *options;
736 char backing_filename[PATH_MAX];
738 /* if a snapshot is requested, we create a temporary backing file and
739 open it instead of opening 'filename' directly */
741 /* if there is a backing file, use it */
743 ret = bdrv_open(bs1, filename, 0, drv);
748 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
750 if (bs1->drv && bs1->drv->protocol_name)
755 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
757 /* Real path is meaningless for protocols */
759 snprintf(backing_filename, sizeof(backing_filename),
761 else if (!realpath(filename, backing_filename))
764 bdrv_qcow2 = bdrv_find_format("qcow2");
765 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
767 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
768 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
770 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
774 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
775 free_option_parameters(options);
780 filename = tmp_filename;
782 bs->is_temporary = 1;
785 /* Find the right image format driver */
787 ret = find_image_format(filename, &drv);
791 goto unlink_and_fail;
795 ret = bdrv_open_common(bs, filename, flags, drv);
797 goto unlink_and_fail;
800 /* If there is a backing file, use it */
801 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
802 char backing_filename[PATH_MAX];
804 BlockDriver *back_drv = NULL;
806 bs->backing_hd = bdrv_new("");
808 if (path_has_protocol(bs->backing_file)) {
809 pstrcpy(backing_filename, sizeof(backing_filename),
812 path_combine(backing_filename, sizeof(backing_filename),
813 filename, bs->backing_file);
816 if (bs->backing_format[0] != '\0') {
817 back_drv = bdrv_find_format(bs->backing_format);
820 /* backing files always opened read-only */
822 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
824 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
829 if (bs->is_temporary) {
830 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
832 /* base image inherits from "parent" */
833 bs->backing_hd->keep_read_only = bs->keep_read_only;
837 if (!bdrv_key_required(bs)) {
838 bdrv_dev_change_media_cb(bs, true);
841 /* throttling disk I/O limits */
842 if (bs->io_limits_enabled) {
843 bdrv_io_limits_enable(bs);
849 if (bs->is_temporary) {
855 void bdrv_close(BlockDriverState *bs)
860 block_job_cancel_sync(bs->job);
864 if (bs == bs_snapshots) {
867 if (bs->backing_hd) {
868 bdrv_delete(bs->backing_hd);
869 bs->backing_hd = NULL;
871 bs->drv->bdrv_close(bs);
874 if (bs->is_temporary) {
875 unlink(bs->filename);
880 bs->copy_on_read = 0;
882 if (bs->file != NULL) {
883 bdrv_close(bs->file);
886 bdrv_dev_change_media_cb(bs, false);
889 /* throttling disk I/O limits */
890 if (bs->io_limits_enabled) {
891 bdrv_io_limits_disable(bs);
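/*
 * Typical lifecycle of a device, as a rough sketch (error handling and the
 * file name are illustrative):
 *
 *     BlockDriverState *bs = bdrv_new("drive0");
 *     ret = bdrv_open(bs, "/images/disk.qcow2", BDRV_O_RDWR, NULL);
 *     ...
 *     bdrv_close(bs);
 *     bdrv_delete(bs);
 */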
895 void bdrv_close_all(void)
897 BlockDriverState *bs;
899 QTAILQ_FOREACH(bs, &bdrv_states, list) {
905 * Wait for pending requests to complete across all BlockDriverStates
907 * This function does not flush data to disk, use bdrv_flush_all() for that
908 * after calling this function.
910 * Note that completion of an asynchronous I/O operation can trigger any
911 * number of other I/O operations on other devices---for example a coroutine
912 * can be arbitrarily complex and a constant flow of I/O can come until the
913 * coroutine is complete. Because of this, it is not possible to have a
914 * function to drain a single device's I/O queue.
916 void bdrv_drain_all(void)
918 BlockDriverState *bs;
922 busy = qemu_aio_wait();
924 /* FIXME: We do not have timer support here, so this is effectively
927 QTAILQ_FOREACH(bs, &bdrv_states, list) {
928 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
929 qemu_co_queue_restart_all(&bs->throttled_reqs);
935 /* If requests are still pending there is a bug somewhere */
936 QTAILQ_FOREACH(bs, &bdrv_states, list) {
937 assert(QLIST_EMPTY(&bs->tracked_requests));
938 assert(qemu_co_queue_empty(&bs->throttled_reqs));
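/*
 * For example (illustrative only), a clean shutdown path quiesces all
 * devices before flushing and closing them:
 *
 *     bdrv_drain_all();
 *     bdrv_flush_all();
 *     bdrv_close_all();
 */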
942 /* make a BlockDriverState anonymous by removing it from the bdrv_states
943 list. Also, NUL-terminate device_name to prevent a double remove */
944 void bdrv_make_anon(BlockDriverState *bs)
946 if (bs->device_name[0] != '\0') {
947 QTAILQ_REMOVE(&bdrv_states, bs, list);
949 bs->device_name[0] = '\0';
953 * Add new bs contents at the top of an image chain while the chain is
954 * live, while keeping required fields on the top layer.
956 * This will modify the BlockDriverState fields, and swap contents
957 * between bs_new and bs_top. Both bs_new and bs_top are modified.
959 * bs_new is required to be anonymous.
961 * This function does not create any image files.
963 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
965 BlockDriverState tmp;
967 /* bs_new must be anonymous */
968 assert(bs_new->device_name[0] == '\0');
972 /* there are some fields that need to stay on the top layer: */
975 tmp.dev_ops = bs_top->dev_ops;
976 tmp.dev_opaque = bs_top->dev_opaque;
977 tmp.dev = bs_top->dev;
978 tmp.buffer_alignment = bs_top->buffer_alignment;
979 tmp.copy_on_read = bs_top->copy_on_read;
981 /* i/o timing parameters */
982 tmp.slice_time = bs_top->slice_time;
983 tmp.slice_start = bs_top->slice_start;
984 tmp.slice_end = bs_top->slice_end;
985 tmp.io_limits = bs_top->io_limits;
986 tmp.io_base = bs_top->io_base;
987 tmp.throttled_reqs = bs_top->throttled_reqs;
988 tmp.block_timer = bs_top->block_timer;
989 tmp.io_limits_enabled = bs_top->io_limits_enabled;
992 tmp.cyls = bs_top->cyls;
993 tmp.heads = bs_top->heads;
994 tmp.secs = bs_top->secs;
995 tmp.translation = bs_top->translation;
998 tmp.on_read_error = bs_top->on_read_error;
999 tmp.on_write_error = bs_top->on_write_error;
1002 tmp.iostatus_enabled = bs_top->iostatus_enabled;
1003 tmp.iostatus = bs_top->iostatus;
1005 /* keep the same entry in bdrv_states */
1006 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1007 tmp.list = bs_top->list;
1009 /* The contents of 'tmp' will become bs_top, as we are
1010 * swapping bs_new and bs_top contents. */
1011 tmp.backing_hd = bs_new;
1012 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1013 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1015 /* swap contents of the fixed new bs and the current top */
1019 /* device_name[] was carried over from the old bs_top. bs_new
1020 * shouldn't be in bdrv_states, so we need to make device_name[]
1021 * reflect the anonymity of bs_new
1023 bs_new->device_name[0] = '\0';
1025 /* clear the copied fields in the new backing file */
1026 bdrv_detach_dev(bs_new, bs_new->dev);
1028 qemu_co_queue_init(&bs_new->throttled_reqs);
1029 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1030 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1031 bdrv_iostatus_disable(bs_new);
1033 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1034 * to affect or delete the block_timer, as it has been moved to bs_top */
1035 bs_new->io_limits_enabled = false;
1036 bs_new->block_timer = NULL;
1037 bs_new->slice_time = 0;
1038 bs_new->slice_start = 0;
1039 bs_new->slice_end = 0;
1042 void bdrv_delete(BlockDriverState *bs)
1046 assert(!bs->in_use);
1048 /* remove from list, if necessary */
1052 if (bs->file != NULL) {
1053 bdrv_delete(bs->file);
1056 assert(bs != bs_snapshots);
1060 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1061 /* TODO change to DeviceState *dev when all users are qdevified */
1067 bdrv_iostatus_reset(bs);
1071 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1072 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1074 if (bdrv_attach_dev(bs, dev) < 0) {
1079 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1080 /* TODO change to DeviceState *dev when all users are qdevified */
1082 assert(bs->dev == dev);
1085 bs->dev_opaque = NULL;
1086 bs->buffer_alignment = 512;
1089 /* TODO change to return DeviceState * when all users are qdevified */
1090 void *bdrv_get_attached_dev(BlockDriverState *bs)
1095 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1099 bs->dev_opaque = opaque;
1100 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1101 bs_snapshots = NULL;
1105 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1106 BlockQMPEventAction action, int is_read)
1109 const char *action_str;
1112 case BDRV_ACTION_REPORT:
1113 action_str = "report";
1115 case BDRV_ACTION_IGNORE:
1116 action_str = "ignore";
1118 case BDRV_ACTION_STOP:
1119 action_str = "stop";
1125 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1128 is_read ? "read" : "write");
1129 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1131 qobject_decref(data);
1134 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1138 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1139 bdrv_get_device_name(bs), ejected);
1140 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1142 qobject_decref(data);
1145 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1147 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1148 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1149 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1150 if (tray_was_closed) {
1152 bdrv_emit_qmp_eject_event(bs, true);
1156 bdrv_emit_qmp_eject_event(bs, false);
1161 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1163 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1166 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1168 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1169 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1173 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1175 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1176 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1181 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1183 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1184 bs->dev_ops->resize_cb(bs->dev_opaque);
1188 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1190 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1191 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1197 * Run consistency checks on an image
1199 * Returns 0 if the check could be completed (it doesn't mean that the image is
1200 * free of errors) or -errno when an internal error occurred. The results of the
1201 * check are stored in res.
1203 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1205 if (bs->drv->bdrv_check == NULL) {
1209 memset(res, 0, sizeof(*res));
1210 return bs->drv->bdrv_check(bs, res);
1213 #define COMMIT_BUF_SECTORS 2048
1215 /* commit COW file into the raw image */
1216 int bdrv_commit(BlockDriverState *bs)
1218 BlockDriver *drv = bs->drv;
1219 BlockDriver *backing_drv;
1220 int64_t sector, total_sectors;
1221 int n, ro, open_flags;
1222 int ret = 0, rw_ret = 0;
1224 char filename[1024];
1225 BlockDriverState *bs_rw, *bs_ro;
1230 if (!bs->backing_hd) {
1234 if (bs->backing_hd->keep_read_only) {
1238 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1242 backing_drv = bs->backing_hd->drv;
1243 ro = bs->backing_hd->read_only;
1244 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1245 open_flags = bs->backing_hd->open_flags;
1249 bdrv_delete(bs->backing_hd);
1250 bs->backing_hd = NULL;
1251 bs_rw = bdrv_new("");
1252 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1256 /* try to re-open read-only */
1257 bs_ro = bdrv_new("");
1258 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1262 /* drive not functional anymore */
1266 bs->backing_hd = bs_ro;
1269 bs->backing_hd = bs_rw;
1272 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1273 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1275 for (sector = 0; sector < total_sectors; sector += n) {
1276 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1278 if (bdrv_read(bs, sector, buf, n) != 0) {
1283 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1290 if (drv->bdrv_make_empty) {
1291 ret = drv->bdrv_make_empty(bs);
1296 * Make sure all data we wrote to the backing device is actually
1300 bdrv_flush(bs->backing_hd);
1307 bdrv_delete(bs->backing_hd);
1308 bs->backing_hd = NULL;
1309 bs_ro = bdrv_new("");
1310 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1314 /* drive not functional anymore */
1318 bs->backing_hd = bs_ro;
1319 bs->backing_hd->keep_read_only = 0;
1325 int bdrv_commit_all(void)
1327 BlockDriverState *bs;
1329 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1330 int ret = bdrv_commit(bs);
1338 struct BdrvTrackedRequest {
1339 BlockDriverState *bs;
1343 QLIST_ENTRY(BdrvTrackedRequest) list;
1344 Coroutine *co; /* owner, used for deadlock detection */
1345 CoQueue wait_queue; /* coroutines blocked on this request */
1349 * Remove an active request from the tracked requests list
1351 * This function should be called when a tracked request is completing.
1353 static void tracked_request_end(BdrvTrackedRequest *req)
1355 QLIST_REMOVE(req, list);
1356 qemu_co_queue_restart_all(&req->wait_queue);
1360 * Add an active request to the tracked requests list
1362 static void tracked_request_begin(BdrvTrackedRequest *req,
1363 BlockDriverState *bs,
1365 int nb_sectors, bool is_write)
1367 *req = (BdrvTrackedRequest){
1369 .sector_num = sector_num,
1370 .nb_sectors = nb_sectors,
1371 .is_write = is_write,
1372 .co = qemu_coroutine_self(),
1375 qemu_co_queue_init(&req->wait_queue);
1377 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1381 * Round a region to cluster boundaries
1383 static void round_to_clusters(BlockDriverState *bs,
1384 int64_t sector_num, int nb_sectors,
1385 int64_t *cluster_sector_num,
1386 int *cluster_nb_sectors)
1388 BlockDriverInfo bdi;
1390 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1391 *cluster_sector_num = sector_num;
1392 *cluster_nb_sectors = nb_sectors;
1394 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1395 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1396 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1401 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1402 int64_t sector_num, int nb_sectors) {
1404 if (sector_num >= req->sector_num + req->nb_sectors) {
1408 if (req->sector_num >= sector_num + nb_sectors) {
1414 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1415 int64_t sector_num, int nb_sectors)
1417 BdrvTrackedRequest *req;
1418 int64_t cluster_sector_num;
1419 int cluster_nb_sectors;
1422 /* If we touch the same cluster it counts as an overlap. This guarantees
1423 * that allocating writes will be serialized and not race with each other
1424 * for the same cluster. For example, in copy-on-read it ensures that the
1425 * CoR read and write operations are atomic and guest writes cannot
1426 * interleave between them.
1428 round_to_clusters(bs, sector_num, nb_sectors,
1429 &cluster_sector_num, &cluster_nb_sectors);
1433 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1434 if (tracked_request_overlaps(req, cluster_sector_num,
1435 cluster_nb_sectors)) {
1436 /* Hitting this means there was a reentrant request, for
1437 * example, a block driver issuing nested requests. This must
1438 * never happen since it means deadlock.
1440 assert(qemu_coroutine_self() != req->co);
1442 qemu_co_queue_wait(&req->wait_queue);
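/*
 * Worked example (illustrative, assuming a 64 KB cluster size, i.e. 128
 * sectors): a 4-sector request starting at sector 130 is rounded to
 * cluster_sector_num = 128 and cluster_nb_sectors = 128, so any other
 * request touching sectors 128..255 counts as overlapping and waits here.
 */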
1453 * -EINVAL - backing format specified, but no file
1454 * -ENOSPC - can't update the backing file because no space is left in the
1456 * -ENOTSUP - format driver doesn't support changing the backing file
1458 int bdrv_change_backing_file(BlockDriverState *bs,
1459 const char *backing_file, const char *backing_fmt)
1461 BlockDriver *drv = bs->drv;
1463 if (drv->bdrv_change_backing_file != NULL) {
1464 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1470 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1475 if (!bdrv_is_inserted(bs))
1481 len = bdrv_getlength(bs);
1486 if ((offset > len) || (len - offset < size))
1492 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1495 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1496 nb_sectors * BDRV_SECTOR_SIZE);
1499 typedef struct RwCo {
1500 BlockDriverState *bs;
1508 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1510 RwCo *rwco = opaque;
1512 if (!rwco->is_write) {
1513 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1514 rwco->nb_sectors, rwco->qiov, 0);
1516 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1517 rwco->nb_sectors, rwco->qiov, 0);
1522 * Process a synchronous request using coroutines
1524 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1525 int nb_sectors, bool is_write)
1528 struct iovec iov = {
1529 .iov_base = (void *)buf,
1530 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1535 .sector_num = sector_num,
1536 .nb_sectors = nb_sectors,
1538 .is_write = is_write,
1542 qemu_iovec_init_external(&qiov, &iov, 1);
1545 * In a sync call context the vcpu is blocked and the throttling timer
1546 * will not fire; so the I/O throttling function has to be disabled here
1547 * if it has been enabled.
1549 if (bs->io_limits_enabled) {
1550 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1551 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1552 bdrv_io_limits_disable(bs);
1555 if (qemu_in_coroutine()) {
1556 /* Fast-path if already in coroutine context */
1557 bdrv_rw_co_entry(&rwco);
1559 co = qemu_coroutine_create(bdrv_rw_co_entry);
1560 qemu_coroutine_enter(co, &rwco);
1561 while (rwco.ret == NOT_DONE) {
1568 /* return < 0 if error. See bdrv_write() for the return codes */
1569 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1570 uint8_t *buf, int nb_sectors)
1572 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1575 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1576 int nb_sectors, int dirty)
1579 unsigned long val, idx, bit;
1581 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1582 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1584 for (; start <= end; start++) {
1585 idx = start / (sizeof(unsigned long) * 8);
1586 bit = start % (sizeof(unsigned long) * 8);
1587 val = bs->dirty_bitmap[idx];
1589 if (!(val & (1UL << bit))) {
1594 if (val & (1UL << bit)) {
1596 val &= ~(1UL << bit);
1599 bs->dirty_bitmap[idx] = val;
1603 /* Return < 0 if error. Important errors are:
1604 -EIO generic I/O error (may happen for all errors)
1605 -ENOMEDIUM No media inserted.
1606 -EINVAL Invalid sector number or nb_sectors
1607 -EACCES Trying to write a read-only device
1609 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1610 const uint8_t *buf, int nb_sectors)
1612 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1615 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1616 void *buf, int count1)
1618 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1619 int len, nb_sectors, count;
1624 /* first read to align to sector start */
1625 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1628 sector_num = offset >> BDRV_SECTOR_BITS;
1630 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1632 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1640 /* read the sectors "in place" */
1641 nb_sectors = count >> BDRV_SECTOR_BITS;
1642 if (nb_sectors > 0) {
1643 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1645 sector_num += nb_sectors;
1646 len = nb_sectors << BDRV_SECTOR_BITS;
1651 /* add data from the last sector */
1653 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1655 memcpy(buf, tmp_buf, count);
1660 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1661 const void *buf, int count1)
1663 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1664 int len, nb_sectors, count;
1669 /* first write to align to sector start */
1670 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1673 sector_num = offset >> BDRV_SECTOR_BITS;
1675 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1677 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1678 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1687 /* write the sectors "in place" */
1688 nb_sectors = count >> BDRV_SECTOR_BITS;
1689 if (nb_sectors > 0) {
1690 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1692 sector_num += nb_sectors;
1693 len = nb_sectors << BDRV_SECTOR_BITS;
1698 /* add data from the last sector */
1700 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1702 memcpy(tmp_buf, buf, count);
1703 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1710 * Writes to the file and ensures that no writes are reordered across this
1711 * request (acts as a barrier)
1713 * Returns 0 on success, -errno in error cases.
1715 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1716 const void *buf, int count)
1720 ret = bdrv_pwrite(bs, offset, buf, count);
1725 /* No flush needed for cache modes that use O_DSYNC */
1726 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1733 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1734 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1736 /* Perform I/O through a temporary buffer so that users who scribble over
1737 * their read buffer while the operation is in progress do not end up
1738 * modifying the image file. This is critical for zero-copy guest I/O
1739 * where anything might happen inside guest memory.
1741 void *bounce_buffer;
1743 BlockDriver *drv = bs->drv;
1745 QEMUIOVector bounce_qiov;
1746 int64_t cluster_sector_num;
1747 int cluster_nb_sectors;
1751 /* Cover entire cluster so no additional backing file I/O is required when
1752 * allocating cluster in the image file.
1754 round_to_clusters(bs, sector_num, nb_sectors,
1755 &cluster_sector_num, &cluster_nb_sectors);
1757 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1758 cluster_sector_num, cluster_nb_sectors);
1760 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1761 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1762 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1764 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1770 if (drv->bdrv_co_write_zeroes &&
1771 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1772 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1773 cluster_nb_sectors);
1775 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1780 /* It might be okay to ignore write errors for guest requests. If this
1781 * is a deliberate copy-on-read then we don't want to ignore the error.
1782 * Simply report it in all cases.
1787 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1788 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1789 nb_sectors * BDRV_SECTOR_SIZE);
1792 qemu_vfree(bounce_buffer);
1797 * Handle a read request in coroutine context
1799 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1800 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1801 BdrvRequestFlags flags)
1803 BlockDriver *drv = bs->drv;
1804 BdrvTrackedRequest req;
1810 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1814 /* throttling disk read I/O */
1815 if (bs->io_limits_enabled) {
1816 bdrv_io_limits_intercept(bs, false, nb_sectors);
1819 if (bs->copy_on_read) {
1820 flags |= BDRV_REQ_COPY_ON_READ;
1822 if (flags & BDRV_REQ_COPY_ON_READ) {
1823 bs->copy_on_read_in_flight++;
1826 if (bs->copy_on_read_in_flight) {
1827 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1830 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1832 if (flags & BDRV_REQ_COPY_ON_READ) {
1835 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1840 if (!ret || pnum != nb_sectors) {
1841 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1846 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1849 tracked_request_end(&req);
1851 if (flags & BDRV_REQ_COPY_ON_READ) {
1852 bs->copy_on_read_in_flight--;
1858 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1859 int nb_sectors, QEMUIOVector *qiov)
1861 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1863 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1866 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1867 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1869 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1871 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1872 BDRV_REQ_COPY_ON_READ);
1875 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1876 int64_t sector_num, int nb_sectors)
1878 BlockDriver *drv = bs->drv;
1883 /* TODO Emulate only part of misaligned requests instead of letting block
1884 * drivers return -ENOTSUP and emulate everything */
1886 /* First try the efficient write zeroes operation */
1887 if (drv->bdrv_co_write_zeroes) {
1888 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1889 if (ret != -ENOTSUP) {
1894 /* Fall back to bounce buffer if write zeroes is unsupported */
1895 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1896 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1897 memset(iov.iov_base, 0, iov.iov_len);
1898 qemu_iovec_init_external(&qiov, &iov, 1);
1900 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1902 qemu_vfree(iov.iov_base);
1907 * Handle a write request in coroutine context
1909 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1910 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1911 BdrvRequestFlags flags)
1913 BlockDriver *drv = bs->drv;
1914 BdrvTrackedRequest req;
1920 if (bs->read_only) {
1923 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1927 /* throttling disk write I/O */
1928 if (bs->io_limits_enabled) {
1929 bdrv_io_limits_intercept(bs, true, nb_sectors);
1932 if (bs->copy_on_read_in_flight) {
1933 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1936 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1938 if (flags & BDRV_REQ_ZERO_WRITE) {
1939 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1941 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1944 if (bs->dirty_bitmap) {
1945 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1948 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1949 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1952 tracked_request_end(&req);
1957 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1958 int nb_sectors, QEMUIOVector *qiov)
1960 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1962 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1965 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1966 int64_t sector_num, int nb_sectors)
1968 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1970 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1971 BDRV_REQ_ZERO_WRITE);
1975 * Truncate file to 'offset' bytes (needed only for file protocols)
1977 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1979 BlockDriver *drv = bs->drv;
1983 if (!drv->bdrv_truncate)
1987 if (bdrv_in_use(bs))
1989 ret = drv->bdrv_truncate(bs, offset);
1991 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1992 bdrv_dev_resize_cb(bs);
1998 * Length of an allocated file in bytes. Sparse files are counted by actual
1999 * allocated space. Return < 0 if error or unknown.
2001 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2003 BlockDriver *drv = bs->drv;
2007 if (drv->bdrv_get_allocated_file_size) {
2008 return drv->bdrv_get_allocated_file_size(bs);
2011 return bdrv_get_allocated_file_size(bs->file);
2017 * Length of a file in bytes. Return < 0 if error or unknown.
2019 int64_t bdrv_getlength(BlockDriverState *bs)
2021 BlockDriver *drv = bs->drv;
2025 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2026 if (drv->bdrv_getlength) {
2027 return drv->bdrv_getlength(bs);
2030 return bs->total_sectors * BDRV_SECTOR_SIZE;
2033 /* return 0 as the number of sectors if no device is present or on error */
2034 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2037 length = bdrv_getlength(bs);
2041 length = length >> BDRV_SECTOR_BITS;
2042 *nb_sectors_ptr = length;
2046 uint8_t boot_ind; /* 0x80 - active */
2047 uint8_t head; /* starting head */
2048 uint8_t sector; /* starting sector */
2049 uint8_t cyl; /* starting cylinder */
2050 uint8_t sys_ind; /* What partition type */
2051 uint8_t end_head; /* end head */
2052 uint8_t end_sector; /* end sector */
2053 uint8_t end_cyl; /* end cylinder */
2054 uint32_t start_sect; /* starting sector counting from 0 */
2055 uint32_t nr_sects; /* nr of sectors in partition */
2058 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if it could not be guessed */
2059 static int guess_disk_lchs(BlockDriverState *bs,
2060 int *pcylinders, int *pheads, int *psectors)
2062 uint8_t buf[BDRV_SECTOR_SIZE];
2063 int ret, i, heads, sectors, cylinders;
2064 struct partition *p;
2066 uint64_t nb_sectors;
2069 bdrv_get_geometry(bs, &nb_sectors);
2072 * This function may be invoked during startup both in sync I/O mode
2073 * and in async I/O mode, so the I/O throttling function has to
2074 * be disabled temporarily here, not permanently.
2076 enabled = bs->io_limits_enabled;
2077 bs->io_limits_enabled = false;
2078 ret = bdrv_read(bs, 0, buf, 1);
2079 bs->io_limits_enabled = enabled;
2082 /* test msdos magic */
2083 if (buf[510] != 0x55 || buf[511] != 0xaa)
2085 for(i = 0; i < 4; i++) {
2086 p = ((struct partition *)(buf + 0x1be)) + i;
2087 nr_sects = le32_to_cpu(p->nr_sects);
2088 if (nr_sects && p->end_head) {
2089 /* We make the assumption that the partition terminates on
2090 a cylinder boundary */
2091 heads = p->end_head + 1;
2092 sectors = p->end_sector & 63;
2095 cylinders = nb_sectors / (heads * sectors);
2096 if (cylinders < 1 || cylinders > 16383)
2099 *psectors = sectors;
2100 *pcylinders = cylinders;
2102 printf("guessed geometry: LCHS=%d %d %d\n",
2103 cylinders, heads, sectors);
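/*
 * Worked example (illustrative): if the last partition entry reports
 * end_head = 15 and end_sector = 63 on a 1032192-sector image, the guess
 * is heads = 16, sectors = 63 and cylinders = 1032192 / (16 * 63) = 1024,
 * which passes the 1..16383 cylinder sanity check above.
 */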
2111 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2113 int translation, lba_detected = 0;
2114 int cylinders, heads, secs;
2115 uint64_t nb_sectors;
2117 /* if a geometry hint is available, use it */
2118 bdrv_get_geometry(bs, &nb_sectors);
2119 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2120 translation = bdrv_get_translation_hint(bs);
2121 if (cylinders != 0) {
2126 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2128 /* if heads > 16, it means that a BIOS LBA
2129 translation was active, so the default
2130 hardware geometry is OK */
2132 goto default_geometry;
2137 /* disable any translation to be in sync with
2138 the logical geometry */
2139 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2140 bdrv_set_translation_hint(bs,
2141 BIOS_ATA_TRANSLATION_NONE);
2146 /* if no geometry, use a standard physical disk geometry */
2147 cylinders = nb_sectors / (16 * 63);
2149 if (cylinders > 16383)
2151 else if (cylinders < 2)
2156 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2157 if ((*pcyls * *pheads) <= 131072) {
2158 bdrv_set_translation_hint(bs,
2159 BIOS_ATA_TRANSLATION_LARGE);
2161 bdrv_set_translation_hint(bs,
2162 BIOS_ATA_TRANSLATION_LBA);
2166 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2170 void bdrv_set_geometry_hint(BlockDriverState *bs,
2171 int cyls, int heads, int secs)
2178 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2180 bs->translation = translation;
2183 void bdrv_get_geometry_hint(BlockDriverState *bs,
2184 int *pcyls, int *pheads, int *psecs)
2187 *pheads = bs->heads;
2191 /* throttling disk I/O limits */
2192 void bdrv_set_io_limits(BlockDriverState *bs,
2193 BlockIOLimit *io_limits)
2195 bs->io_limits = *io_limits;
2196 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
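/*
 * For example (illustrative only), capping a drive at 1 MB/s of total
 * throughput and 100 write operations per second:
 *
 *     BlockIOLimit limits;
 *
 *     memset(&limits, 0, sizeof(limits));
 *     limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 1024 * 1024;
 *     limits.iops[BLOCK_IO_LIMIT_WRITE] = 100;
 *     bdrv_set_io_limits(bs, &limits);
 */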
2199 /* Recognize floppy formats */
2200 typedef struct FDFormat {
2208 static const FDFormat fd_formats[] = {
2209 /* First entry is default format */
2210 /* 1.44 MB 3"1/2 floppy disks */
2211 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2212 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2213 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2214 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2215 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2216 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2217 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2218 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2219 /* 2.88 MB 3"1/2 floppy disks */
2220 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2221 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2222 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2223 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2224 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2225 /* 720 kB 3"1/2 floppy disks */
2226 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2227 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2228 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2229 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2230 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2231 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2232 /* 1.2 MB 5"1/4 floppy disks */
2233 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2234 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2235 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2236 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2237 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2238 /* 720 kB 5"1/4 floppy disks */
2239 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2240 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2241 /* 360 kB 5"1/4 floppy disks */
2242 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2243 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2244 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2245 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2246 /* 320 kB 5"1/4 floppy disks */
2247 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2248 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2249 /* 360 kB must match 5"1/4 better than 3"1/2... */
2250 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2252 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2255 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2256 int *max_track, int *last_sect,
2257 FDriveType drive_in, FDriveType *drive,
2260 const FDFormat *parse;
2261 uint64_t nb_sectors, size;
2262 int i, first_match, match;
2264 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2265 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2266 /* User defined disk */
2267 *rate = FDRIVE_RATE_500K;
2269 bdrv_get_geometry(bs, &nb_sectors);
2272 for (i = 0; ; i++) {
2273 parse = &fd_formats[i];
2274 if (parse->drive == FDRIVE_DRV_NONE) {
2277 if (drive_in == parse->drive ||
2278 drive_in == FDRIVE_DRV_NONE) {
2279 size = (parse->max_head + 1) * parse->max_track *
2281 if (nb_sectors == size) {
2285 if (first_match == -1) {
2291 if (first_match == -1) {
2294 match = first_match;
2296 parse = &fd_formats[match];
2298 *nb_heads = parse->max_head + 1;
2299 *max_track = parse->max_track;
2300 *last_sect = parse->last_sect;
2301 *drive = parse->drive;
2302 *rate = parse->rate;
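/*
 * Worked example (illustrative): a raw 1474560-byte image is 2880 sectors,
 * which matches the first 1.44 MB entry in fd_formats above (last_sect 18,
 * max_track 80, max_head 1), since (1 + 1) * 80 * 18 == 2880.
 */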
2306 int bdrv_get_translation_hint(BlockDriverState *bs)
2308 return bs->translation;
2311 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2312 BlockErrorAction on_write_error)
2314 bs->on_read_error = on_read_error;
2315 bs->on_write_error = on_write_error;
2318 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2320 return is_read ? bs->on_read_error : bs->on_write_error;
2323 int bdrv_is_read_only(BlockDriverState *bs)
2325 return bs->read_only;
2328 int bdrv_is_sg(BlockDriverState *bs)
2333 int bdrv_enable_write_cache(BlockDriverState *bs)
2335 return bs->enable_write_cache;
2338 int bdrv_is_encrypted(BlockDriverState *bs)
2340 if (bs->backing_hd && bs->backing_hd->encrypted)
2342 return bs->encrypted;
2345 int bdrv_key_required(BlockDriverState *bs)
2347 BlockDriverState *backing_hd = bs->backing_hd;
2349 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2351 return (bs->encrypted && !bs->valid_key);
2354 int bdrv_set_key(BlockDriverState *bs, const char *key)
2357 if (bs->backing_hd && bs->backing_hd->encrypted) {
2358 ret = bdrv_set_key(bs->backing_hd, key);
2364 if (!bs->encrypted) {
2366 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2369 ret = bs->drv->bdrv_set_key(bs, key);
2372 } else if (!bs->valid_key) {
2374 /* call the change callback now, we skipped it on open */
2375 bdrv_dev_change_media_cb(bs, true);
2380 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2385 pstrcpy(buf, buf_size, bs->drv->format_name);
2389 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2394 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2395 it(opaque, drv->format_name);
2399 BlockDriverState *bdrv_find(const char *name)
2401 BlockDriverState *bs;
2403 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2404 if (!strcmp(name, bs->device_name)) {
2411 BlockDriverState *bdrv_next(BlockDriverState *bs)
2414 return QTAILQ_FIRST(&bdrv_states);
2416 return QTAILQ_NEXT(bs, list);
2419 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2421 BlockDriverState *bs;
2423 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2428 const char *bdrv_get_device_name(BlockDriverState *bs)
2430 return bs->device_name;
2433 void bdrv_flush_all(void)
2435 BlockDriverState *bs;
2437 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2442 int bdrv_has_zero_init(BlockDriverState *bs)
2446 if (bs->drv->bdrv_has_zero_init) {
2447 return bs->drv->bdrv_has_zero_init(bs);
2453 typedef struct BdrvCoIsAllocatedData {
2454 BlockDriverState *bs;
2460 } BdrvCoIsAllocatedData;
2463 * Returns true iff the specified sector is present in the disk image. Drivers
2464 * not implementing the functionality are assumed to not support backing files,
2465 * hence all their sectors are reported as allocated.
2467 * If 'sector_num' is beyond the end of the disk image the return value is 0
2468 * and 'pnum' is set to 0.
2470 * 'pnum' is set to the number of sectors (including and immediately following
2471 * the specified sector) that are known to be in the same
2472 * allocated/unallocated state.
2474 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2475 * beyond the end of the disk image it will be clamped.
2477 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2478 int nb_sectors, int *pnum)
2482 if (sector_num >= bs->total_sectors) {
2487 n = bs->total_sectors - sector_num;
2488 if (n < nb_sectors) {
2492 if (!bs->drv->bdrv_co_is_allocated) {
2497 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
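/*
 * For example (illustrative only): on an overlay where just the first
 * 64 KB cluster has been written, a query such as
 *
 *     ret = bdrv_co_is_allocated(bs, 0, 1024, &pnum);
 *
 * may return 1 with pnum == 128, telling the caller that sectors 128..1023
 * have to be read from the backing file instead.
 */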
2500 /* Coroutine wrapper for bdrv_is_allocated() */
2501 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2503 BdrvCoIsAllocatedData *data = opaque;
2504 BlockDriverState *bs = data->bs;
2506 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2512 * Synchronous wrapper around bdrv_co_is_allocated().
2514 * See bdrv_co_is_allocated() for details.
2516 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2520 BdrvCoIsAllocatedData data = {
2522 .sector_num = sector_num,
2523 .nb_sectors = nb_sectors,
2528 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2529 qemu_coroutine_enter(co, &data);
2530 while (!data.done) {
2536 BlockInfoList *qmp_query_block(Error **errp)
2538 BlockInfoList *head = NULL, *cur_item = NULL;
2539 BlockDriverState *bs;
2541 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2542 BlockInfoList *info = g_malloc0(sizeof(*info));
2544 info->value = g_malloc0(sizeof(*info->value));
2545 info->value->device = g_strdup(bs->device_name);
2546 info->value->type = g_strdup("unknown");
2547 info->value->locked = bdrv_dev_is_medium_locked(bs);
2548 info->value->removable = bdrv_dev_has_removable_media(bs);
2550 if (bdrv_dev_has_removable_media(bs)) {
2551 info->value->has_tray_open = true;
2552 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2555 if (bdrv_iostatus_is_enabled(bs)) {
2556 info->value->has_io_status = true;
2557 info->value->io_status = bs->iostatus;
2561 info->value->has_inserted = true;
2562 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2563 info->value->inserted->file = g_strdup(bs->filename);
2564 info->value->inserted->ro = bs->read_only;
2565 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2566 info->value->inserted->encrypted = bs->encrypted;
2567 if (bs->backing_file[0]) {
2568 info->value->inserted->has_backing_file = true;
2569 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2572 if (bs->io_limits_enabled) {
2573 info->value->inserted->bps =
2574 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2575 info->value->inserted->bps_rd =
2576 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2577 info->value->inserted->bps_wr =
2578 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2579 info->value->inserted->iops =
2580 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2581 info->value->inserted->iops_rd =
2582 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2583 info->value->inserted->iops_wr =
2584 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2588 /* XXX: waiting for the qapi to support GSList */
2590 head = cur_item = info;
2592 cur_item->next = info;
2600 /* Consider exposing this as a full-fledged QMP command */
2601 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2605 s = g_malloc0(sizeof(*s));
2607 if (bs->device_name[0]) {
2608 s->has_device = true;
2609 s->device = g_strdup(bs->device_name);
2612 s->stats = g_malloc0(sizeof(*s->stats));
2613 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2614 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2615 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2616 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2617 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2618 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2619 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2620 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2621 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2624 s->has_parent = true;
2625 s->parent = qmp_query_blockstat(bs->file, NULL);
2631 BlockStatsList *qmp_query_blockstats(Error **errp)
2633 BlockStatsList *head = NULL, *cur_item = NULL;
2634 BlockDriverState *bs;
2636 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2637 BlockStatsList *info = g_malloc0(sizeof(*info));
2638 info->value = qmp_query_blockstat(bs, NULL);
2640 /* XXX: waiting for the qapi to support GSList */
2642 head = cur_item = info;
2644 cur_item->next = info;
2652 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2654 if (bs->backing_hd && bs->backing_hd->encrypted)
2655 return bs->backing_file;
2656 else if (bs->encrypted)
2657 return bs->filename;
2662 void bdrv_get_backing_filename(BlockDriverState *bs,
2663 char *filename, int filename_size)
2665 pstrcpy(filename, filename_size, bs->backing_file);
2668 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2669 const uint8_t *buf, int nb_sectors)
2671 BlockDriver *drv = bs->drv;
2674 if (!drv->bdrv_write_compressed)
2676 if (bdrv_check_request(bs, sector_num, nb_sectors))
2679 if (bs->dirty_bitmap) {
2680 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2683 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2686 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2688 BlockDriver *drv = bs->drv;
2691 if (!drv->bdrv_get_info)
2693 memset(bdi, 0, sizeof(*bdi));
2694 return drv->bdrv_get_info(bs, bdi);
2697 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2698 int64_t pos, int size)
2700 BlockDriver *drv = bs->drv;
2703 if (drv->bdrv_save_vmstate)
2704 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2706 return bdrv_save_vmstate(bs->file, buf, pos, size);
2710 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2711 int64_t pos, int size)
2713 BlockDriver *drv = bs->drv;
2716 if (drv->bdrv_load_vmstate)
2717 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2719 return bdrv_load_vmstate(bs->file, buf, pos, size);
2723 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2725 BlockDriver *drv = bs->drv;
2727 if (!drv || !drv->bdrv_debug_event) {
2731 return drv->bdrv_debug_event(bs, event);
2735 /**************************************************************/
2736 /* handling of snapshots */
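/*
 * The snapshot operations below follow a common pattern: if the format
 * driver implements the hook (e.g. bdrv_snapshot_create), it is called
 * directly; otherwise the request is forwarded to bs->file, the underlying
 * protocol layer.
 */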
2738 int bdrv_can_snapshot(BlockDriverState *bs)
2740 BlockDriver *drv = bs->drv;
2741 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2745 if (!drv->bdrv_snapshot_create) {
2746 if (bs->file != NULL) {
2747 return bdrv_can_snapshot(bs->file);
2755 int bdrv_is_snapshot(BlockDriverState *bs)
2757 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2760 BlockDriverState *bdrv_snapshots(void)
2762 BlockDriverState *bs;
2765 return bs_snapshots;
2769 while ((bs = bdrv_next(bs))) {
2770 if (bdrv_can_snapshot(bs)) {
2778 int bdrv_snapshot_create(BlockDriverState *bs,
2779 QEMUSnapshotInfo *sn_info)
2781 BlockDriver *drv = bs->drv;
2784 if (drv->bdrv_snapshot_create)
2785 return drv->bdrv_snapshot_create(bs, sn_info);
2787 return bdrv_snapshot_create(bs->file, sn_info);
2791 int bdrv_snapshot_goto(BlockDriverState *bs,
2792 const char *snapshot_id)
2794 BlockDriver *drv = bs->drv;
2799 if (drv->bdrv_snapshot_goto)
2800 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2803 drv->bdrv_close(bs);
2804 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2805 open_ret = drv->bdrv_open(bs, bs->open_flags);
2807 bdrv_delete(bs->file);
2817 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2819 BlockDriver *drv = bs->drv;
2822 if (drv->bdrv_snapshot_delete)
2823 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2825 return bdrv_snapshot_delete(bs->file, snapshot_id);
2829 int bdrv_snapshot_list(BlockDriverState *bs,
2830 QEMUSnapshotInfo **psn_info)
2832 BlockDriver *drv = bs->drv;
2835 if (drv->bdrv_snapshot_list)
2836 return drv->bdrv_snapshot_list(bs, psn_info);
2838 return bdrv_snapshot_list(bs->file, psn_info);
2842 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2843 const char *snapshot_name)
2845 BlockDriver *drv = bs->drv;
2849 if (!bs->read_only) {
2852 if (drv->bdrv_snapshot_load_tmp) {
2853 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2858 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2859 const char *backing_file)
2865 if (bs->backing_hd) {
2866 if (strcmp(bs->backing_file, backing_file) == 0) {
2867 return bs->backing_hd;
2869 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2876 #define NB_SUFFIXES 4
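/*
 * Formats a byte count with a K/M/G/T suffix. Assuming a base of 1024,
 * small values are printed verbatim and larger ones get either one decimal
 * place or a rounded integer, e.g. 1536 -> "1.5K" and 300 MiB -> "300M".
 */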
2878 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2880 static const char suffixes[NB_SUFFIXES] = "KMGT";
2885 snprintf(buf, buf_size, "%" PRId64, size);
2888 for (i = 0; i < NB_SUFFIXES; i++) {
2889 if (size < (10 * base)) {
2890 snprintf(buf, buf_size, "%0.1f%c",
2891 (double)size / base,
2894 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2895 snprintf(buf, buf_size, "%" PRId64 "%c",
2896 ((size + (base >> 1)) / base),
2906 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2908 char buf1[128], date_buf[128], clock_buf[128];
2918 snprintf(buf, buf_size,
2919 "%-10s%-20s%7s%20s%15s",
2920 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2924 ptm = localtime(&ti);
2925 strftime(date_buf, sizeof(date_buf),
2926 "%Y-%m-%d %H:%M:%S", ptm);
2928 localtime_r(&ti, &tm);
2929 strftime(date_buf, sizeof(date_buf),
2930 "%Y-%m-%d %H:%M:%S", &tm);
2932 secs = sn->vm_clock_nsec / 1000000000;
2933 snprintf(clock_buf, sizeof(clock_buf),
2934 "%02d:%02d:%02d.%03d",
2936 (int)((secs / 60) % 60),
2938 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2939 snprintf(buf, buf_size,
2940 "%-10s%-20s%7s%20s%15s",
2941 sn->id_str, sn->name,
2942 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2949 /**************************************************************/
2952 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2953 QEMUIOVector *qiov, int nb_sectors,
2954 BlockDriverCompletionFunc *cb, void *opaque)
2956 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2958 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2962 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2963 QEMUIOVector *qiov, int nb_sectors,
2964 BlockDriverCompletionFunc *cb, void *opaque)
2966 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2968 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2973 typedef struct MultiwriteCB {
2978 BlockDriverCompletionFunc *cb;
2980 QEMUIOVector *free_qiov;
2984 static void multiwrite_user_cb(MultiwriteCB *mcb)
2988 for (i = 0; i < mcb->num_callbacks; i++) {
2989 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2990 if (mcb->callbacks[i].free_qiov) {
2991 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2993 g_free(mcb->callbacks[i].free_qiov);
2997 static void multiwrite_cb(void *opaque, int ret)
2999 MultiwriteCB *mcb = opaque;
3001 trace_multiwrite_cb(mcb, ret);
3003 if (ret < 0 && !mcb->error) {
3007 mcb->num_requests--;
3008 if (mcb->num_requests == 0) {
3009 multiwrite_user_cb(mcb);
3014 static int multiwrite_req_compare(const void *a, const void *b)
3016 const BlockRequest *req1 = a, *req2 = b;
3019 * Note that we can't simply subtract req2->sector from req1->sector
3020 * here as that could overflow the return value.
3022 if (req1->sector > req2->sector) {
3024 } else if (req1->sector < req2->sector) {
3032 * Takes a bunch of requests and tries to merge them. Returns the number of
3033 * requests that remain after merging.
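/*
 * For example, two writes covering sectors [0, 8) and [8, 16) are exactly
 * sequential and are collapsed into a single 16-sector request whose qiov
 * concatenates both original vectors; if the second request overlaps the
 * first, the overlapping tail of the first request is dropped in its favour.
 */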
3035 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3036 int num_reqs, MultiwriteCB *mcb)
3040 // Sort requests by start sector
3041 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3043 // Check if adjacent requests touch the same clusters. If so, combine them,
3044 // filling up gaps with zero sectors.
3046 for (i = 1; i < num_reqs; i++) {
3048 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3050 // Handle exactly sequential writes and overlapping writes.
3051 if (reqs[i].sector <= oldreq_last) {
3055 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3061 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3062 qemu_iovec_init(qiov,
3063 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3065 // Add the first request to the merged one. If the requests are
3066 // overlapping, drop the last sectors of the first request.
3067 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3068 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3070 // We shouldn't need to add any zeros between the two requests
3071 assert(reqs[i].sector <= oldreq_last);
3073 // Add the second request
3074 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3076 reqs[outidx].nb_sectors = qiov->size >> 9;
3077 reqs[outidx].qiov = qiov;
3079 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3082 reqs[outidx].sector = reqs[i].sector;
3083 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3084 reqs[outidx].qiov = reqs[i].qiov;
3092 * Submit multiple AIO write requests at once.
3094 * On success, the function returns 0 and all requests in the reqs array have
3095 * been submitted. In the error case this function returns -1, and any of the
3096 * requests may or may not have been submitted yet. In particular, this means
3097 * that the callback will be called for some of the requests but not for
3098 * others. The caller must check the error field of each BlockRequest to know
3099 * which callbacks to wait for (if error != 0, no callback will be called).
3101 * The implementation may modify the contents of the reqs array, e.g. to merge
3102 * requests. However, the fields opaque and error are left unmodified as they
3103 * are used to signal failure for a single request to the caller.
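/*
 * A minimal caller sketch (illustrative only; my_cb, dev, qiov0 and qiov1
 * are hypothetical):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = &qiov0,
 *           .cb = my_cb, .opaque = dev },
 *         { .sector = 8, .nb_sectors = 8, .qiov = &qiov1,
 *           .cb = my_cb, .opaque = dev },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // Inspect reqs[i].error for requests that will get no callback.
 *     }
 */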
3105 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3110 /* don't submit writes if we don't have a medium */
3111 if (bs->drv == NULL) {
3112 for (i = 0; i < num_reqs; i++) {
3113 reqs[i].error = -ENOMEDIUM;
3118 if (num_reqs == 0) {
3122 // Create MultiwriteCB structure
3123 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3124 mcb->num_requests = 0;
3125 mcb->num_callbacks = num_reqs;
3127 for (i = 0; i < num_reqs; i++) {
3128 mcb->callbacks[i].cb = reqs[i].cb;
3129 mcb->callbacks[i].opaque = reqs[i].opaque;
3132 // Check for mergeable requests
3133 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3135 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3137 /* Run the aio requests. */
3138 mcb->num_requests = num_reqs;
3139 for (i = 0; i < num_reqs; i++) {
3140 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3141 reqs[i].nb_sectors, multiwrite_cb, mcb);
3147 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3149 acb->pool->cancel(acb);
3152 /* block I/O throttling */
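/*
 * Throttling works on time slices: bdrv_exceed_io_limits() opens a new slice
 * when needed and records the byte/op counts at its start (bs->io_base).
 * Within a slice, bdrv_exceed_bps_limits() and bdrv_exceed_iops_limits()
 * compare the work done so far against bps/iops budgets scaled by the slice
 * length and, if a request would exceed them, report how long it should wait.
 */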
3153 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3154 bool is_write, double elapsed_time, uint64_t *wait)
3156 uint64_t bps_limit = 0;
3157 double bytes_limit, bytes_base, bytes_res;
3158 double slice_time, wait_time;
3160 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3161 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3162 } else if (bs->io_limits.bps[is_write]) {
3163 bps_limit = bs->io_limits.bps[is_write];
3172 slice_time = bs->slice_end - bs->slice_start;
3173 slice_time /= (NANOSECONDS_PER_SECOND);
3174 bytes_limit = bps_limit * slice_time;
3175 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3176 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3177 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3180 /* bytes_base: the number of bytes already read/written in this slice,
3181 * obtained from the accounting statistics.
3182 * bytes_res: the remaining bytes of data which need to be read/written.
3183 * (bytes_base + bytes_res) / bps_limit: used to calculate
3184 * the total time for completing reading/writing all data.
3186 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
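/*
 * Worked example (numbers illustrative): with bps_limit = 1 MiB/s and a
 * 0.1 s slice, bytes_limit is ~104858 bytes. If 90000 bytes were already
 * transferred in this slice and a 64-sector (32768-byte) request arrives,
 * bytes_base + bytes_res = 122768 > bytes_limit, so the request is throttled
 * and wait_time becomes 122768 / 1048576 - elapsed_time seconds.
 */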
3188 if (bytes_base + bytes_res <= bytes_limit) {
3196 /* Calc approx time to dispatch */
3197 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3199 /* When the I/O rate at runtime exceeds the limits,
3200 * bs->slice_end needs to be extended so that the current statistics
3201 * are kept until the timer fires; the extension factor was tuned
3202 * experimentally.
3204 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3205 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3207 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3213 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3214 double elapsed_time, uint64_t *wait)
3216 uint64_t iops_limit = 0;
3217 double ios_limit, ios_base;
3218 double slice_time, wait_time;
3220 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3221 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3222 } else if (bs->io_limits.iops[is_write]) {
3223 iops_limit = bs->io_limits.iops[is_write];
3232 slice_time = bs->slice_end - bs->slice_start;
3233 slice_time /= (NANOSECONDS_PER_SECOND);
3234 ios_limit = iops_limit * slice_time;
3235 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3236 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3237 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3240 if (ios_base + 1 <= ios_limit) {
3248 /* Calc approx time to dispatch */
3249 wait_time = (ios_base + 1) / iops_limit;
3250 if (wait_time > elapsed_time) {
3251 wait_time = wait_time - elapsed_time;
3256 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3257 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3259 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3265 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3266 bool is_write, int64_t *wait)
3268 int64_t now, max_wait;
3269 uint64_t bps_wait = 0, iops_wait = 0;
3270 double elapsed_time;
3271 int bps_ret, iops_ret;
3273 now = qemu_get_clock_ns(vm_clock);
3274 if ((bs->slice_start < now)
3275 && (bs->slice_end > now)) {
3276 bs->slice_end = now + bs->slice_time;
3278 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3279 bs->slice_start = now;
3280 bs->slice_end = now + bs->slice_time;
3282 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3283 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3285 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3286 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3289 elapsed_time = now - bs->slice_start;
3290 elapsed_time /= (NANOSECONDS_PER_SECOND);
3292 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3293 is_write, elapsed_time, &bps_wait);
3294 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3295 elapsed_time, &iops_wait);
3296 if (bps_ret || iops_ret) {
3297 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3302 now = qemu_get_clock_ns(vm_clock);
3303 if (bs->slice_end < now + max_wait) {
3304 bs->slice_end = now + max_wait;
3317 /**************************************************************/
3318 /* async block device emulation */
3320 typedef struct BlockDriverAIOCBSync {
3321 BlockDriverAIOCB common;
3324 /* vector translation state */
3328 } BlockDriverAIOCBSync;
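/*
 * Emulates AIO on top of the driver's synchronous bdrv_read/bdrv_write:
 * the request's qiov is copied through a bounce buffer, the synchronous
 * call is made immediately, and completion is reported from a bottom half
 * (bdrv_aio_bh_cb) so the callback still runs asynchronously from the
 * caller's point of view.
 */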
3330 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3332 BlockDriverAIOCBSync *acb =
3333 container_of(blockacb, BlockDriverAIOCBSync, common);
3334 qemu_bh_delete(acb->bh);
3336 qemu_aio_release(acb);
3339 static AIOPool bdrv_em_aio_pool = {
3340 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3341 .cancel = bdrv_aio_cancel_em,
3344 static void bdrv_aio_bh_cb(void *opaque)
3346 BlockDriverAIOCBSync *acb = opaque;
3349 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3350 qemu_vfree(acb->bounce);
3351 acb->common.cb(acb->common.opaque, acb->ret);
3352 qemu_bh_delete(acb->bh);
3354 qemu_aio_release(acb);
3357 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3361 BlockDriverCompletionFunc *cb,
3366 BlockDriverAIOCBSync *acb;
3368 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3369 acb->is_write = is_write;
3371 acb->bounce = qemu_blockalign(bs, qiov->size);
3372 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3375 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3376 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3378 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3381 qemu_bh_schedule(acb->bh);
3383 return &acb->common;
3386 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3387 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3388 BlockDriverCompletionFunc *cb, void *opaque)
3390 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3393 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3394 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3395 BlockDriverCompletionFunc *cb, void *opaque)
3397 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3401 typedef struct BlockDriverAIOCBCoroutine {
3402 BlockDriverAIOCB common;
3406 } BlockDriverAIOCBCoroutine;
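/*
 * The reverse emulation: bdrv_aio_readv/writev are implemented by creating
 * a coroutine that runs bdrv_co_do_rw() and delivering the completion
 * callback from a bottom half (bdrv_co_em_bh) once the coroutine finishes.
 */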
3408 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3413 static AIOPool bdrv_em_co_aio_pool = {
3414 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3415 .cancel = bdrv_aio_co_cancel_em,
3418 static void bdrv_co_em_bh(void *opaque)
3420 BlockDriverAIOCBCoroutine *acb = opaque;
3422 acb->common.cb(acb->common.opaque, acb->req.error);
3423 qemu_bh_delete(acb->bh);
3424 qemu_aio_release(acb);
3427 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3428 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3430 BlockDriverAIOCBCoroutine *acb = opaque;
3431 BlockDriverState *bs = acb->common.bs;
3433 if (!acb->is_write) {
3434 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3435 acb->req.nb_sectors, acb->req.qiov, 0);
3437 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3438 acb->req.nb_sectors, acb->req.qiov, 0);
3441 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3442 qemu_bh_schedule(acb->bh);
3445 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3449 BlockDriverCompletionFunc *cb,
3454 BlockDriverAIOCBCoroutine *acb;
3456 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3457 acb->req.sector = sector_num;
3458 acb->req.nb_sectors = nb_sectors;
3459 acb->req.qiov = qiov;
3460 acb->is_write = is_write;
3462 co = qemu_coroutine_create(bdrv_co_do_rw);
3463 qemu_coroutine_enter(co, acb);
3465 return &acb->common;
3468 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3470 BlockDriverAIOCBCoroutine *acb = opaque;
3471 BlockDriverState *bs = acb->common.bs;
3473 acb->req.error = bdrv_co_flush(bs);
3474 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3475 qemu_bh_schedule(acb->bh);
3478 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3479 BlockDriverCompletionFunc *cb, void *opaque)
3481 trace_bdrv_aio_flush(bs, opaque);
3484 BlockDriverAIOCBCoroutine *acb;
3486 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3487 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3488 qemu_coroutine_enter(co, acb);
3490 return &acb->common;
3493 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3495 BlockDriverAIOCBCoroutine *acb = opaque;
3496 BlockDriverState *bs = acb->common.bs;
3498 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3499 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3500 qemu_bh_schedule(acb->bh);
3503 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3504 int64_t sector_num, int nb_sectors,
3505 BlockDriverCompletionFunc *cb, void *opaque)
3508 BlockDriverAIOCBCoroutine *acb;
3510 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3512 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3513 acb->req.sector = sector_num;
3514 acb->req.nb_sectors = nb_sectors;
3515 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3516 qemu_coroutine_enter(co, acb);
3518 return &acb->common;
3521 void bdrv_init(void)
3523 module_call_init(MODULE_INIT_BLOCK);
3526 void bdrv_init_with_whitelist(void)
3528 use_bdrv_whitelist = 1;
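/*
 * AIOCB allocation: qemu_aio_get() reuses an entry from the pool's free list
 * when one is available and otherwise allocates pool->aiocb_size bytes;
 * qemu_aio_release() pushes the AIOCB back onto that free list instead of
 * freeing it.
 */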
3532 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3533 BlockDriverCompletionFunc *cb, void *opaque)
3535 BlockDriverAIOCB *acb;
3537 if (pool->free_aiocb) {
3538 acb = pool->free_aiocb;
3539 pool->free_aiocb = acb->next;
3541 acb = g_malloc0(pool->aiocb_size);
3546 acb->opaque = opaque;
3550 void qemu_aio_release(void *p)
3552 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3553 AIOPool *pool = acb->pool;
3554 acb->next = pool->free_aiocb;
3555 pool->free_aiocb = acb;
3558 /**************************************************************/
3559 /* Coroutine block device emulation */
3561 typedef struct CoroutineIOCompletion {
3562 Coroutine *coroutine;
3564 } CoroutineIOCompletion;
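/*
 * Bridges the driver's AIO interface back into coroutine context: the
 * issuing coroutine stores itself in CoroutineIOCompletion, yields, and is
 * re-entered by bdrv_co_io_em_complete() with the request's return value.
 */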
3566 static void bdrv_co_io_em_complete(void *opaque, int ret)
3568 CoroutineIOCompletion *co = opaque;
3571 qemu_coroutine_enter(co->coroutine, NULL);
3574 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3575 int nb_sectors, QEMUIOVector *iov,
3578 CoroutineIOCompletion co = {
3579 .coroutine = qemu_coroutine_self(),
3581 BlockDriverAIOCB *acb;
3584 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3585 bdrv_co_io_em_complete, &co);
3587 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3588 bdrv_co_io_em_complete, &co);
3591 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3595 qemu_coroutine_yield();
3600 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3601 int64_t sector_num, int nb_sectors,
3604 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3607 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3608 int64_t sector_num, int nb_sectors,
3611 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3614 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3616 RwCo *rwco = opaque;
3618 rwco->ret = bdrv_co_flush(rwco->bs);
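/*
 * Flush ordering: cached data is first written back to the OS
 * (bdrv_co_flush_to_os), then, unless BDRV_O_NO_FLUSH is set, forced to
 * stable storage via bdrv_co_flush_to_disk or the driver's bdrv_aio_flush,
 * and finally the underlying protocol (bs->file) is flushed as well.
 */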
3621 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3625 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3629 /* Write back cached data to the OS even with cache=unsafe */
3630 if (bs->drv->bdrv_co_flush_to_os) {
3631 ret = bs->drv->bdrv_co_flush_to_os(bs);
3637 /* But don't actually force it to the disk with cache=unsafe */
3638 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3642 if (bs->drv->bdrv_co_flush_to_disk) {
3643 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3644 } else if (bs->drv->bdrv_aio_flush) {
3645 BlockDriverAIOCB *acb;
3646 CoroutineIOCompletion co = {
3647 .coroutine = qemu_coroutine_self(),
3650 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3654 qemu_coroutine_yield();
3659 * Some block drivers always operate in either writethrough or unsafe
3660 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3661 * know how the server works (because the behaviour is hardcoded or
3662 * depends on server-side configuration), so we can't ensure that
3663 * everything is safe on disk. Returning an error doesn't work because
3664 * that would break guests even if the server operates in writethrough
3667 * Let's hope the user knows what he's doing.
3675 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3676 * in the case of cache=unsafe, so there are no useless flushes.
3678 return bdrv_co_flush(bs->file);
3681 void bdrv_invalidate_cache(BlockDriverState *bs)
3683 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3684 bs->drv->bdrv_invalidate_cache(bs);
3688 void bdrv_invalidate_cache_all(void)
3690 BlockDriverState *bs;
3692 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3693 bdrv_invalidate_cache(bs);
3697 void bdrv_clear_incoming_migration_all(void)
3699 BlockDriverState *bs;
3701 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3702 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3706 int bdrv_flush(BlockDriverState *bs)
3714 if (qemu_in_coroutine()) {
3715 /* Fast-path if already in coroutine context */
3716 bdrv_flush_co_entry(&rwco);
3718 co = qemu_coroutine_create(bdrv_flush_co_entry);
3719 qemu_coroutine_enter(co, &rwco);
3720 while (rwco.ret == NOT_DONE) {
3728 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3730 RwCo *rwco = opaque;
3732 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3735 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3740 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3742 } else if (bs->read_only) {
3744 } else if (bs->drv->bdrv_co_discard) {
3745 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3746 } else if (bs->drv->bdrv_aio_discard) {
3747 BlockDriverAIOCB *acb;
3748 CoroutineIOCompletion co = {
3749 .coroutine = qemu_coroutine_self(),
3752 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3753 bdrv_co_io_em_complete, &co);
3757 qemu_coroutine_yield();
3765 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3770 .sector_num = sector_num,
3771 .nb_sectors = nb_sectors,
3775 if (qemu_in_coroutine()) {
3776 /* Fast-path if already in coroutine context */
3777 bdrv_discard_co_entry(&rwco);
3779 co = qemu_coroutine_create(bdrv_discard_co_entry);
3780 qemu_coroutine_enter(co, &rwco);
3781 while (rwco.ret == NOT_DONE) {
3789 /**************************************************************/
3790 /* removable device support */
3793 * Return TRUE if the media is present
3795 int bdrv_is_inserted(BlockDriverState *bs)
3797 BlockDriver *drv = bs->drv;
3801 if (!drv->bdrv_is_inserted)
3803 return drv->bdrv_is_inserted(bs);
3807 * Return whether the media changed since the last call to this
3808 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3810 int bdrv_media_changed(BlockDriverState *bs)
3812 BlockDriver *drv = bs->drv;
3814 if (drv && drv->bdrv_media_changed) {
3815 return drv->bdrv_media_changed(bs);
3821 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3823 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3825 BlockDriver *drv = bs->drv;
3827 if (drv && drv->bdrv_eject) {
3828 drv->bdrv_eject(bs, eject_flag);
3831 if (bs->device_name[0] != '\0') {
3832 bdrv_emit_qmp_eject_event(bs, eject_flag);
3837 * Lock or unlock the media (if it is locked, the user won't be able
3838 * to eject it manually).
3840 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3842 BlockDriver *drv = bs->drv;
3844 trace_bdrv_lock_medium(bs, locked);
3846 if (drv && drv->bdrv_lock_medium) {
3847 drv->bdrv_lock_medium(bs, locked);
3851 /* needed for the generic SCSI interface */
3853 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3855 BlockDriver *drv = bs->drv;
3857 if (drv && drv->bdrv_ioctl)
3858 return drv->bdrv_ioctl(bs, req, buf);
3862 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3863 unsigned long int req, void *buf,
3864 BlockDriverCompletionFunc *cb, void *opaque)
3866 BlockDriver *drv = bs->drv;
3868 if (drv && drv->bdrv_aio_ioctl)
3869 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3873 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3875 bs->buffer_alignment = align;
3878 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3880 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
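/*
 * Dirty tracking uses one bit per chunk of BDRV_SECTORS_PER_DIRTY_CHUNK
 * sectors; the bitmap size below is the device length in sectors divided by
 * (chunk size * 8 bits per byte), rounded up to a whole number of bytes.
 */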
3883 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3885 int64_t bitmap_size;
3887 bs->dirty_count = 0;
3889 if (!bs->dirty_bitmap) {
3890 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3891 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3892 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3894 bs->dirty_bitmap = g_malloc0(bitmap_size);
3897 if (bs->dirty_bitmap) {
3898 g_free(bs->dirty_bitmap);
3899 bs->dirty_bitmap = NULL;
3904 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3906 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3908 if (bs->dirty_bitmap &&
3909 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3910 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3911 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3917 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3920 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3923 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3925 return bs->dirty_count;
3928 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3930 assert(bs->in_use != in_use);
3931 bs->in_use = in_use;
3934 int bdrv_in_use(BlockDriverState *bs)
3939 void bdrv_iostatus_enable(BlockDriverState *bs)
3941 bs->iostatus_enabled = true;
3942 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3945 /* The I/O status is only enabled if the drive explicitly
3946 * enables it _and_ the VM is configured to stop on errors */
3947 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3949 return (bs->iostatus_enabled &&
3950 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3951 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3952 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3955 void bdrv_iostatus_disable(BlockDriverState *bs)
3957 bs->iostatus_enabled = false;
3960 void bdrv_iostatus_reset(BlockDriverState *bs)
3962 if (bdrv_iostatus_is_enabled(bs)) {
3963 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3967 /* XXX: Today this is set by device models because it makes the implementation
3968 quite simple. However, the block layer knows about the error, so it's
3969 possible to implement this without device models being involved */
3970 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3972 if (bdrv_iostatus_is_enabled(bs) &&
3973 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3975 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3976 BLOCK_DEVICE_IO_STATUS_FAILED;
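/*
 * I/O accounting: callers (typically device models) invoke bdrv_acct_start()
 * before issuing a request and bdrv_acct_done() when it completes; the cookie
 * carries the byte count, account type and start timestamp so the per-device
 * totals used by query-blockstats can be updated.
 */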
3981 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3982 enum BlockAcctType type)
3984 assert(type < BDRV_MAX_IOTYPE);
3986 cookie->bytes = bytes;
3987 cookie->start_time_ns = get_clock();
3988 cookie->type = type;
3992 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3994 assert(cookie->type < BDRV_MAX_IOTYPE);
3996 bs->nr_bytes[cookie->type] += cookie->bytes;
3997 bs->nr_ops[cookie->type]++;
3998 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4001 int bdrv_img_create(const char *filename, const char *fmt,
4002 const char *base_filename, const char *base_fmt,
4003 char *options, uint64_t img_size, int flags)
4005 QEMUOptionParameter *param = NULL, *create_options = NULL;
4006 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4007 BlockDriverState *bs = NULL;
4008 BlockDriver *drv, *proto_drv;
4009 BlockDriver *backing_drv = NULL;
4012 /* Find driver and parse its options */
4013 drv = bdrv_find_format(fmt);
4015 error_report("Unknown file format '%s'", fmt);
4020 proto_drv = bdrv_find_protocol(filename);
4022 error_report("Unknown protocol '%s'", filename);
4027 create_options = append_option_parameters(create_options,
4028 drv->create_options);
4029 create_options = append_option_parameters(create_options,
4030 proto_drv->create_options);
4032 /* Create parameter list with default values */
4033 param = parse_option_parameters("", create_options, param);
4035 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4037 /* Parse -o options */
4039 param = parse_option_parameters(options, create_options, param);
4040 if (param == NULL) {
4041 error_report("Invalid options for file format '%s'.", fmt);
4047 if (base_filename) {
4048 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4050 error_report("Backing file not supported for file format '%s'",
4058 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4059 error_report("Backing file format not supported for file "
4060 "format '%s'", fmt);
4066 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4067 if (backing_file && backing_file->value.s) {
4068 if (!strcmp(filename, backing_file->value.s)) {
4069 error_report("Error: Trying to create an image with the "
4070 "same filename as the backing file");
4076 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4077 if (backing_fmt && backing_fmt->value.s) {
4078 backing_drv = bdrv_find_format(backing_fmt->value.s);
4080 error_report("Unknown backing file format '%s'",
4081 backing_fmt->value.s);
4087 // The size for the image must always be specified, with one exception:
4088 // If we are using a backing file, we can obtain the size from there
4089 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4090 if (size && size->value.n == -1) {
4091 if (backing_file && backing_file->value.s) {
4097 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4099 error_report("Could not open '%s'", backing_file->value.s);
4102 bdrv_get_geometry(bs, &size);
4105 snprintf(buf, sizeof(buf), "%" PRId64, size);
4106 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4108 error_report("Image creation needs a size parameter");
4114 printf("Formatting '%s', fmt=%s ", filename, fmt);
4115 print_option_parameters(param);
4118 ret = bdrv_create(drv, filename, param);
4121 if (ret == -ENOTSUP) {
4122 error_report("Formatting or formatting option not supported for "
4123 "file format '%s'", fmt);
4124 } else if (ret == -EFBIG) {
4125 error_report("The image size is too large for file format '%s'",
4128 error_report("%s: error while creating %s: %s", filename, fmt,
4134 free_option_parameters(create_options);
4135 free_option_parameters(param);
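/*
 * Block jobs: block_job_create() refuses to start if the device already has
 * a job or is otherwise in use, marks the BlockDriverState as in use,
 * allocates job_type->instance_size bytes for the job and optionally applies
 * an initial speed limit; block_job_complete() runs the completion callback
 * and releases the device again.
 */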
4144 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4145 int64_t speed, BlockDriverCompletionFunc *cb,
4146 void *opaque, Error **errp)
4150 if (bs->job || bdrv_in_use(bs)) {
4151 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4154 bdrv_set_in_use(bs, 1);
4156 job = g_malloc0(job_type->instance_size);
4157 job->job_type = job_type;
4160 job->opaque = opaque;
4163 /* Only set speed when necessary to avoid NotSupported error */
4165 Error *local_err = NULL;
4167 block_job_set_speed(job, speed, &local_err);
4168 if (error_is_set(&local_err)) {
4171 bdrv_set_in_use(bs, 0);
4172 error_propagate(errp, local_err);
4179 void block_job_complete(BlockJob *job, int ret)
4181 BlockDriverState *bs = job->bs;
4183 assert(bs->job == job);
4184 job->cb(job->opaque, ret);
4187 bdrv_set_in_use(bs, 0);
4190 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4192 Error *local_err = NULL;
4194 if (!job->job_type->set_speed) {
4195 error_set(errp, QERR_NOT_SUPPORTED);
4198 job->job_type->set_speed(job, speed, &local_err);
4199 if (error_is_set(&local_err)) {
4200 error_propagate(errp, local_err);
4207 void block_job_cancel(BlockJob *job)
4209 job->cancelled = true;
4212 bool block_job_is_cancelled(BlockJob *job)
4214 return job->cancelled;
4217 void block_job_cancel_sync(BlockJob *job)
4219 BlockDriverState *bs = job->bs;
4221 assert(bs->job == job);
4222 block_job_cancel(job);
4223 while (bs->job != NULL && bs->job->busy) {