/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "blockjob.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
    BDRV_REQ_ZERO_WRITE   = 0x2,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end   = 0;
    bs->slice_time  = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

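/*
 * Illustrative sketch (not part of the original file): capping a device at
 * 10 MB/s aggregate bandwidth and 100 aggregate IOPS.  BlockIOLimit and the
 * BLOCK_IO_LIMIT_* indices are assumed from block_int.h; the helper name
 * example_throttle_device is hypothetical.
 */
static void __attribute__((unused)) example_throttle_device(BlockDriverState *bs)
{
    BlockIOLimit limits;

    memset(&limits, 0, sizeof(limits));
    limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 10 * 1024 * 1024;
    limits.iops[BLOCK_IO_LIMIT_TOTAL] = 100;

    /* bdrv_set_io_limits() (defined later in this file) copies the limits
     * and flips io_limits_enabled via bdrv_io_limits_enabled(). */
    bdrv_set_io_limits(bs, &limits);
}
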
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are kept in FIFO order.  A throttled request is not dequeued
     * until the request ahead of it has been allowed to proceed, so if the
     * current request still exceeds the limits it is re-inserted at the head
     * of the queue and everything behind it stays queued.
     */
    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *p;

#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
    p = path + strcspn(path, ":/\\");
#else
    p = path + strcspn(path, ":/");
#endif

    return *p == ':';
}

int path_is_absolute(const char *path)
{
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return (*path == '/' || *path == '\\');
#else
    return (*path == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}

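/*
 * Worked example (sketch, not in the original file): resolving a relative
 * backing file name against the image that references it, as
 * bdrv_get_full_backing_filename() below does.  The paths are illustrative.
 */
static void __attribute__((unused)) example_path_combine(void)
{
    char dest[PATH_MAX];

    /* dest becomes "/images/base.qcow2": the file name component of
     * base_path is dropped and the relative name is appended. */
    path_combine(dest, sizeof(dest), "/images/overlay.qcow2", "base.qcow2");
}
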
void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
{
    if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
        pstrcpy(dest, sz, bs->backing_file);
    } else {
        path_combine(dest, sz, bs->filename, bs->backing_file);
    }
}

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

typedef struct CreateCo {
    BlockDriver *drv;
    char *filename;
    QEMUOptionParameter *options;
    int ret;
} CreateCo;

static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}

int bdrv_create(BlockDriver *drv, const char* filename,
                QEMUOptionParameter *options)
{
    int ret;

    Coroutine *co;
    CreateCo cco = {
        .drv = drv,
        .filename = g_strdup(filename),
        .options = options,
        .ret = NOT_DONE,
    };

    if (!drv->bdrv_create) {
        g_free(cco.filename);
        return -ENOTSUP;
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_create_co_entry(&cco);
    } else {
        co = qemu_coroutine_create(bdrv_create_co_entry);
        qemu_coroutine_enter(co, &cco);
        while (cco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    ret = cco.ret;
    g_free(cco.filename);

    return ret;
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

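/*
 * Sketch (not in the original file): creating a 1 GiB qcow2 image
 * programmatically, mirroring what qemu-img create does.  BLOCK_OPT_SIZE and
 * the option-parameter helpers are the ones this file also uses in
 * bdrv_open(); the helper name example_create_qcow2 is hypothetical and
 * error handling is minimal.
 */
static int __attribute__((unused)) example_create_qcow2(const char *filename)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *options;
    int ret;

    if (!drv) {
        return -ENOENT;             /* qcow2 driver not compiled in */
    }

    options = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(options, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);

    ret = bdrv_create(drv, filename, options);
    free_option_parameters(options);
    return ret;
}
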
/*
 * Create a uniquely-named empty temporary file.
 * Return 0 upon success, otherwise a negative errno value.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater. */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0 || close(fd)) {
        return -errno;
    }
    return 0;
#endif
}

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

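/*
 * Sketch (not in the original file): the reference-count contract.  Each
 * enable must be paired with exactly one disable; the feature stays active
 * while any user still holds a reference.
 */
static void __attribute__((unused)) example_copy_on_read_user(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);    /* refcount 0 -> 1, feature on  */
    /* ... guest reads now populate the top image from the backing file ... */
    bdrv_disable_copy_on_read(bs);   /* refcount 1 -> 0, feature off */
}
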
/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);
    assert(bs->file == NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->open_flags = flags;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
    open_flags = flags | BDRV_O_CACHE_WB;

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
        if (ret < 0) {
            return ret;
        }

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");
        bdrv_get_full_backing_filename(bs, backing_filename,
                                       sizeof(backing_filename));

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

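/*
 * Sketch (not in the original file): the typical open/use/close cycle.
 * Passing drv == NULL lets find_image_format() probe the image format;
 * the helper name example_open_image is hypothetical.
 */
static int __attribute__((unused)) example_open_image(const char *filename)
{
    BlockDriverState *bs = bdrv_new("");        /* anonymous device */
    int ret = bdrv_open(bs, filename, BDRV_O_RDWR, NULL);

    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }

    /* ... I/O via bdrv_read()/bdrv_write()/bdrv_pread()/bdrv_pwrite() ... */

    bdrv_delete(bs);                            /* closes and frees bs */
    return 0;
}
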
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->job) {
        block_job_cancel_sync(bs->job);
    }
    bdrv_drain_all();

    if (bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
    if (bs->backing_hd) {
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
    }
    if (bs->drv) {
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';
        bs->total_sectors = 0;
        bs->encrypted = 0;
        bs->valid_key = 0;
        bs->sg = 0;
        bs->growable = 0;

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }
    }

    bdrv_dev_change_media_cb(bs, false);

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete.  Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

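/*
 * Sketch (not in the original file): quiescing the block layer before a
 * state-changing operation.  Drain first so no requests are in flight, then
 * flush so completed writes are stable on disk.
 */
static void __attribute__((unused)) example_quiesce_all(void)
{
    bdrv_drain_all();
    bdrv_flush_all();
}
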
/* make a BlockDriverState anonymous by removing it from the bdrv_states list.
   Also, clear the device_name so a double removal is impossible. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}

static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
                                     BlockDriverState *bs_src)
{
    /* move some fields that need to stay attached to the device */
    bs_dest->open_flags         = bs_src->open_flags;

    /* dev info */
    bs_dest->dev_ops            = bs_src->dev_ops;
    bs_dest->dev_opaque         = bs_src->dev_opaque;
    bs_dest->dev                = bs_src->dev;
    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
    bs_dest->copy_on_read       = bs_src->copy_on_read;

    bs_dest->enable_write_cache = bs_src->enable_write_cache;

    /* i/o timing parameters */
    bs_dest->slice_time         = bs_src->slice_time;
    bs_dest->slice_start        = bs_src->slice_start;
    bs_dest->slice_end          = bs_src->slice_end;
    bs_dest->io_limits          = bs_src->io_limits;
    bs_dest->io_base            = bs_src->io_base;
    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
    bs_dest->block_timer        = bs_src->block_timer;
    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;

    /* geometry */
    bs_dest->cyls               = bs_src->cyls;
    bs_dest->heads              = bs_src->heads;
    bs_dest->secs               = bs_src->secs;
    bs_dest->translation        = bs_src->translation;

    /* r/w error */
    bs_dest->on_read_error      = bs_src->on_read_error;
    bs_dest->on_write_error     = bs_src->on_write_error;

    /* i/o status */
    bs_dest->iostatus_enabled   = bs_src->iostatus_enabled;
    bs_dest->iostatus           = bs_src->iostatus;

    /* dirty bitmap */
    bs_dest->dirty_count        = bs_src->dirty_count;
    bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;

    /* job */
    bs_dest->in_use             = bs_src->in_use;
    bs_dest->job                = bs_src->job;

    /* keep the same entry in bdrv_states */
    pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
            bs_src->device_name);
    bs_dest->list               = bs_src->list;
}

/*
 * Swap bs contents for two image chains while they are live,
 * while keeping required fields on the BlockDriverState that is
 * actually attached to a device.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_old. Both bs_new and bs_old are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous and shouldn't have anything fancy enabled */
    assert(bs_new->device_name[0] == '\0');
    assert(bs_new->dirty_bitmap == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->dev == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    tmp = *bs_new;
    *bs_new = *bs_old;
    *bs_old = tmp;

    /* there are some fields that should not be swapped, move them back */
    bdrv_move_feature_fields(&tmp, bs_old);
    bdrv_move_feature_fields(bs_old, bs_new);
    bdrv_move_feature_fields(bs_new, &tmp);

    /* bs_new shouldn't be in bdrv_states even after the swap!  */
    assert(bs_new->device_name[0] == '\0');

    /* Check a few fields that should remain attached to the device */
    assert(bs_new->dev == NULL);
    assert(bs_new->job == NULL);
    assert(bs_new->in_use == 0);
    assert(bs_new->io_limits_enabled == false);
    assert(bs_new->block_timer == NULL);

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_old);
}

/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    bdrv_swap(bs_new, bs_top);

    /* After the swap, bs_new holds what used to be bs_top's contents;
     * link it in as the backing file of bs_top. */
    bs_top->backing_hd = bs_new;
    bs_top->open_flags &= ~BDRV_O_NO_BACKING;
    pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
            bs_new->filename);
    pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
            bs_new->drv ? bs_new->drv->format_name : "");
}

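/*
 * Sketch (not in the original file): how a live external snapshot uses
 * bdrv_append().  The overlay is opened on a new anonymous BDS and spliced
 * on top; afterwards the device-attached bs_top refers to the overlay and
 * the old top has become its backing file.  Overlay creation and error
 * handling are elided; example_live_snapshot is a hypothetical name.
 */
static void __attribute__((unused)) example_live_snapshot(BlockDriverState *bs_top,
                                                          const char *overlay)
{
    BlockDriverState *bs_new = bdrv_new("");    /* must be anonymous */

    /* BDRV_O_NO_BACKING: bdrv_append() wires up the in-memory chain */
    bdrv_open(bs_new, overlay, BDRV_O_RDWR | BDRV_O_NO_BACKING,
              bdrv_find_format("qcow2"));
    bdrv_append(bs_new, bs_top);
}
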
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of
 * the check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res, fix);
}

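/*
 * Sketch (not in the original file): running a report-only consistency
 * check, similar to qemu-img check.  The BdrvCheckResult fields are assumed
 * from block.h; example_check_image is a hypothetical name.
 */
static void __attribute__((unused)) example_check_image(BlockDriverState *bs)
{
    BdrvCheckResult res;

    if (bdrv_check(bs, &res, 0) < 0) {   /* fix == 0: report only */
        return;                          /* unsupported or internal error */
    }
    printf("%d corruptions, %d leaked clusters\n",
           res.corruptions, res.leaks);
}
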
#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

int bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        int ret = bdrv_commit(bs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

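/* Worked example (not in the original file): with 64 KiB clusters one
 * cluster covers 128 sectors, so a request for sectors [130, 135) rounds
 * out to the cluster-aligned range [128, 256). */
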
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

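/*
 * Sketch (not in the original file): the serialization pattern used by the
 * read/write paths below.  The real paths only call
 * wait_for_overlapping_requests() while copy-on-read requests are in
 * flight; example_tracked_io is a hypothetical name.
 */
static void coroutine_fn __attribute__((unused)) example_tracked_io(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest req;

    wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
    /* ... perform the actual I/O ... */
    tracked_request_end(&req);           /* wakes coroutines queued on req */
}
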
/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;
    int ret;

    /* Backing file format doesn't make sense without a backing file */
    if (backing_fmt && !backing_file) {
        return -EINVAL;
    }

    if (drv->bdrv_change_backing_file != NULL) {
        ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        ret = -ENOTSUP;
    }

    if (ret == 0) {
        pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
        pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
    }
    return ret;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
        bdrv_flush(bs);
    }

    return 0;
}

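/*
 * Sketch (not in the original file): durably updating a metadata field, the
 * way format drivers update image headers.  The offset and value below are
 * illustrative only.
 */
static int __attribute__((unused)) example_update_header(BlockDriverState *bs)
{
    uint32_t magic = cpu_to_be32(0x514649fb);   /* example big-endian field */

    return bdrv_pwrite_sync(bs, 0, &magic, sizeof(magic));
}
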
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
    } else {
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode.
         */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {
            return ret;
        }
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);
    }

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;

    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;

/* try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;
    bool enabled;

    bdrv_get_geometry(bs, &nb_sectors);

    /**
     * The function will be invoked during startup not only in sync I/O mode,
     * but also in async I/O mode. So the I/O throttling function has to
     * be disabled temporarily here, not permanently.
     */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, 0, buf, 1);
    bs->io_limits_enabled = enabled;
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for(i = 0; i < 4; i++) {
        /* 1st entry in partition table is at 0x1be */
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}

void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
    FDriveRate rate;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
};

void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
                                   FDriveRate *rate)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
        *rate = parse->rate;
    }
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
{
    bs->enable_write_cache = wce;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

const char *bdrv_get_format_name(BlockDriverState *bs)
{
    return bs->drv ? bs->drv->format_name : NULL;
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

int bdrv_get_flags(BlockDriverState *bs)
{
    return bs->open_flags;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_flush(bs);
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}

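/*
 * Sketch (not in the original file): walking an image's allocation map with
 * the synchronous wrapper, qemu-img map style.  example_dump_allocation is
 * a hypothetical name.
 */
static void __attribute__((unused)) example_dump_allocation(BlockDriverState *bs)
{
    int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    int64_t sector_num = 0;

    while (sector_num < total) {
        int pnum;
        /* nb_sectors beyond the end of the image is clamped internally */
        int allocated = bdrv_is_allocated(bs, sector_num, 65536, &pnum);

        printf("%" PRId64 ": %d sectors %s\n", sector_num, pnum,
               allocated ? "allocated" : "unallocated");
        if (pnum == 0) {
            break;                       /* defensive: should not happen */
        }
        sector_num += pnum;
    }
}
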
2619 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2621 * Return true if the given sector is allocated in any image between
2622 * BASE and TOP (inclusive). BASE can be NULL to check if the given
2623 * sector is allocated in any image of the chain. Return false otherwise.
2625 * 'pnum' is set to the number of sectors (including and immediately following
2626 * the specified sector) that are known to be in the same
2627 * allocated/unallocated state.
2630 int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2631 BlockDriverState *base,
2633 int nb_sectors, int *pnum)
2635 BlockDriverState *intermediate;
2636 int ret, n = nb_sectors;
2639 while (intermediate && intermediate != base) {
2641 ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2651      * [sector_num, nb_sectors] is unallocated on top, but an intermediate
2654      * image might have [sector_num+x, nb_sectors] allocated.
2656 if (n > pnum_inter) {
2660 intermediate = intermediate->backing_hd;
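/*
 * Usage sketch (illustrative): from coroutine context, passing NULL as
 * 'base' asks whether any image in the whole chain allocates the sector.
 *
 *     int pnum;
 *     if (bdrv_co_is_allocated_above(bs, NULL, sector_num, 1, &pnum)) {
 *         // at least one image in the chain backs this sector
 *     }
 */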
2667 BlockInfoList *qmp_query_block(Error **errp)
2669 BlockInfoList *head = NULL, *cur_item = NULL;
2670 BlockDriverState *bs;
2672 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2673 BlockInfoList *info = g_malloc0(sizeof(*info));
2675 info->value = g_malloc0(sizeof(*info->value));
2676 info->value->device = g_strdup(bs->device_name);
2677 info->value->type = g_strdup("unknown");
2678 info->value->locked = bdrv_dev_is_medium_locked(bs);
2679 info->value->removable = bdrv_dev_has_removable_media(bs);
2681 if (bdrv_dev_has_removable_media(bs)) {
2682 info->value->has_tray_open = true;
2683 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2686 if (bdrv_iostatus_is_enabled(bs)) {
2687 info->value->has_io_status = true;
2688 info->value->io_status = bs->iostatus;
2692 info->value->has_inserted = true;
2693 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2694 info->value->inserted->file = g_strdup(bs->filename);
2695 info->value->inserted->ro = bs->read_only;
2696 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2697 info->value->inserted->encrypted = bs->encrypted;
2698 if (bs->backing_file[0]) {
2699 info->value->inserted->has_backing_file = true;
2700 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2703 if (bs->io_limits_enabled) {
2704 info->value->inserted->bps =
2705 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2706 info->value->inserted->bps_rd =
2707 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2708 info->value->inserted->bps_wr =
2709 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2710 info->value->inserted->iops =
2711 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2712 info->value->inserted->iops_rd =
2713 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2714 info->value->inserted->iops_wr =
2715 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2719 /* XXX: waiting for the qapi to support GSList */
2721 head = cur_item = info;
2723 cur_item->next = info;
2731 /* Consider exposing this as a full fledged QMP command */
2732 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2736 s = g_malloc0(sizeof(*s));
2738 if (bs->device_name[0]) {
2739 s->has_device = true;
2740 s->device = g_strdup(bs->device_name);
2743 s->stats = g_malloc0(sizeof(*s->stats));
2744 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2745 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2746 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2747 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2748 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2749 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2750 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2751 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2752 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2755 s->has_parent = true;
2756 s->parent = qmp_query_blockstat(bs->file, NULL);
2762 BlockStatsList *qmp_query_blockstats(Error **errp)
2764 BlockStatsList *head = NULL, *cur_item = NULL;
2765 BlockDriverState *bs;
2767 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2768 BlockStatsList *info = g_malloc0(sizeof(*info));
2769 info->value = qmp_query_blockstat(bs, NULL);
2771 /* XXX: waiting for the qapi to support GSList */
2773 head = cur_item = info;
2775 cur_item->next = info;
2783 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2785 if (bs->backing_hd && bs->backing_hd->encrypted)
2786 return bs->backing_file;
2787 else if (bs->encrypted)
2788 return bs->filename;
2793 void bdrv_get_backing_filename(BlockDriverState *bs,
2794 char *filename, int filename_size)
2796 pstrcpy(filename, filename_size, bs->backing_file);
2799 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2800 const uint8_t *buf, int nb_sectors)
2802 BlockDriver *drv = bs->drv;
2805 if (!drv->bdrv_write_compressed)
2807 if (bdrv_check_request(bs, sector_num, nb_sectors))
2810 if (bs->dirty_bitmap) {
2811 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2814 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2817 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2819 BlockDriver *drv = bs->drv;
2822 if (!drv->bdrv_get_info)
2824 memset(bdi, 0, sizeof(*bdi));
2825 return drv->bdrv_get_info(bs, bdi);
2828 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2829 int64_t pos, int size)
2831 BlockDriver *drv = bs->drv;
2834 if (drv->bdrv_save_vmstate)
2835 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2837 return bdrv_save_vmstate(bs->file, buf, pos, size);
2841 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2842 int64_t pos, int size)
2844 BlockDriver *drv = bs->drv;
2847 if (drv->bdrv_load_vmstate)
2848 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2850 return bdrv_load_vmstate(bs->file, buf, pos, size);
2854 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2856 BlockDriver *drv = bs->drv;
2858 if (!drv || !drv->bdrv_debug_event) {
2862 return drv->bdrv_debug_event(bs, event);
2866 /**************************************************************/
2867 /* handling of snapshots */
2869 int bdrv_can_snapshot(BlockDriverState *bs)
2871 BlockDriver *drv = bs->drv;
2872 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2876 if (!drv->bdrv_snapshot_create) {
2877 if (bs->file != NULL) {
2878 return bdrv_can_snapshot(bs->file);
2886 int bdrv_is_snapshot(BlockDriverState *bs)
2888 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2891 BlockDriverState *bdrv_snapshots(void)
2893 BlockDriverState *bs;
2896 return bs_snapshots;
2900 while ((bs = bdrv_next(bs))) {
2901 if (bdrv_can_snapshot(bs)) {
2909 int bdrv_snapshot_create(BlockDriverState *bs,
2910 QEMUSnapshotInfo *sn_info)
2912 BlockDriver *drv = bs->drv;
2915 if (drv->bdrv_snapshot_create)
2916 return drv->bdrv_snapshot_create(bs, sn_info);
2918 return bdrv_snapshot_create(bs->file, sn_info);
2922 int bdrv_snapshot_goto(BlockDriverState *bs,
2923 const char *snapshot_id)
2925 BlockDriver *drv = bs->drv;
2930 if (drv->bdrv_snapshot_goto)
2931 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2934 drv->bdrv_close(bs);
2935 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2936 open_ret = drv->bdrv_open(bs, bs->open_flags);
2938 bdrv_delete(bs->file);
2948 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2950 BlockDriver *drv = bs->drv;
2953 if (drv->bdrv_snapshot_delete)
2954 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2956 return bdrv_snapshot_delete(bs->file, snapshot_id);
2960 int bdrv_snapshot_list(BlockDriverState *bs,
2961 QEMUSnapshotInfo **psn_info)
2963 BlockDriver *drv = bs->drv;
2966 if (drv->bdrv_snapshot_list)
2967 return drv->bdrv_snapshot_list(bs, psn_info);
2969 return bdrv_snapshot_list(bs->file, psn_info);
2973 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2974 const char *snapshot_name)
2976 BlockDriver *drv = bs->drv;
2980 if (!bs->read_only) {
2983 if (drv->bdrv_snapshot_load_tmp) {
2984 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2989 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2990 const char *backing_file)
2996 if (bs->backing_hd) {
2997 if (strcmp(bs->backing_file, backing_file) == 0) {
2998 return bs->backing_hd;
3000 return bdrv_find_backing_image(bs->backing_hd, backing_file);
3007 #define NB_SUFFIXES 4
3009 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
3011 static const char suffixes[NB_SUFFIXES] = "KMGT";
3016 snprintf(buf, buf_size, "%" PRId64, size);
3019     for (i = 0; i < NB_SUFFIXES; i++) {
3020 if (size < (10 * base)) {
3021 snprintf(buf, buf_size, "%0.1f%c",
3022 (double)size / base,
3025 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
3026 snprintf(buf, buf_size, "%" PRId64 "%c",
3027 ((size + (base >> 1)) / base),
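/*
 * Usage sketch: formatting a raw byte count for display.
 *
 *     char buf[16];
 *     get_human_readable_size(buf, sizeof(buf), 1536 * 1024);
 *     // buf now contains "1.5M"
 */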
3037 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3039 char buf1[128], date_buf[128], clock_buf[128];
3049 snprintf(buf, buf_size,
3050 "%-10s%-20s%7s%20s%15s",
3051 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3055 ptm = localtime(&ti);
3056 strftime(date_buf, sizeof(date_buf),
3057 "%Y-%m-%d %H:%M:%S", ptm);
3059 localtime_r(&ti, &tm);
3060 strftime(date_buf, sizeof(date_buf),
3061 "%Y-%m-%d %H:%M:%S", &tm);
3063 secs = sn->vm_clock_nsec / 1000000000;
3064 snprintf(clock_buf, sizeof(clock_buf),
3065 "%02d:%02d:%02d.%03d",
3067 (int)((secs / 60) % 60),
3069 (int)((sn->vm_clock_nsec / 1000000) % 1000));
3070 snprintf(buf, buf_size,
3071 "%-10s%-20s%7s%20s%15s",
3072 sn->id_str, sn->name,
3073 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3080 /**************************************************************/
3083 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3084 QEMUIOVector *qiov, int nb_sectors,
3085 BlockDriverCompletionFunc *cb, void *opaque)
3087 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3089 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3093 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3094 QEMUIOVector *qiov, int nb_sectors,
3095 BlockDriverCompletionFunc *cb, void *opaque)
3097 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3099 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
3104 typedef struct MultiwriteCB {
3109 BlockDriverCompletionFunc *cb;
3111 QEMUIOVector *free_qiov;
3115 static void multiwrite_user_cb(MultiwriteCB *mcb)
3119 for (i = 0; i < mcb->num_callbacks; i++) {
3120 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3121 if (mcb->callbacks[i].free_qiov) {
3122 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3124 g_free(mcb->callbacks[i].free_qiov);
3128 static void multiwrite_cb(void *opaque, int ret)
3130 MultiwriteCB *mcb = opaque;
3132 trace_multiwrite_cb(mcb, ret);
3134 if (ret < 0 && !mcb->error) {
3138 mcb->num_requests--;
3139 if (mcb->num_requests == 0) {
3140 multiwrite_user_cb(mcb);
3145 static int multiwrite_req_compare(const void *a, const void *b)
3147 const BlockRequest *req1 = a, *req2 = b;
3150 * Note that we can't simply subtract req2->sector from req1->sector
3151 * here as that could overflow the return value.
3153 if (req1->sector > req2->sector) {
3155 } else if (req1->sector < req2->sector) {
3163 * Takes a bunch of requests and tries to merge them. Returns the number of
3164 * requests that remain after merging.
3166 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3167 int num_reqs, MultiwriteCB *mcb)
3171 // Sort requests by start sector
3172 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3174     // Check if adjacent requests touch the same clusters. If so, combine them;
3175     // only exactly sequential or overlapping requests are merged.
3177 for (i = 1; i < num_reqs; i++) {
3179 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3181 // Handle exactly sequential writes and overlapping writes.
3182 if (reqs[i].sector <= oldreq_last) {
3186 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3192 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3193 qemu_iovec_init(qiov,
3194 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3196 // Add the first request to the merged one. If the requests are
3197 // overlapping, drop the last sectors of the first request.
3198 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3199 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3201             // We shouldn't need to add any zeros between the two requests
3202             assert(reqs[i].sector <= oldreq_last);
3204 // Add the second request
3205 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3207 reqs[outidx].nb_sectors = qiov->size >> 9;
3208 reqs[outidx].qiov = qiov;
3210 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3213 reqs[outidx].sector = reqs[i].sector;
3214 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3215 reqs[outidx].qiov = reqs[i].qiov;
3223 * Submit multiple AIO write requests at once.
3225 * On success, the function returns 0 and all requests in the reqs array have
3226  * been submitted. On error, this function returns -1, and any of the
3227  * requests may or may not have been submitted yet. In particular, this means
3228  * that the callback will be called for some of the requests but not for
3229  * others. The caller must check the error field of each BlockRequest to know
3230  * which callbacks to wait for (if error != 0, no callback will be called).
3232 * The implementation may modify the contents of the reqs array, e.g. to merge
3233 * requests. However, the fields opaque and error are left unmodified as they
3234 * are used to signal failure for a single request to the caller.
3236 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3241 /* don't submit writes if we don't have a medium */
3242 if (bs->drv == NULL) {
3243 for (i = 0; i < num_reqs; i++) {
3244 reqs[i].error = -ENOMEDIUM;
3249 if (num_reqs == 0) {
3253 // Create MultiwriteCB structure
3254 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3255 mcb->num_requests = 0;
3256 mcb->num_callbacks = num_reqs;
3258 for (i = 0; i < num_reqs; i++) {
3259 mcb->callbacks[i].cb = reqs[i].cb;
3260 mcb->callbacks[i].opaque = reqs[i].opaque;
3263 // Check for mergable requests
3264 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3266 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3268 /* Run the aio requests. */
3269 mcb->num_requests = num_reqs;
3270 for (i = 0; i < num_reqs; i++) {
3271 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3272 reqs[i].nb_sectors, multiwrite_cb, mcb);
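/*
 * Usage sketch (illustrative; my_write_cb, ctx0/ctx1 and the qiovs are
 * hypothetical and assumed to be set up elsewhere):
 *
 *     BlockRequest reqs[2] = {
 *         { .sector = 0, .nb_sectors = 8, .qiov = qiov0,
 *           .cb = my_write_cb, .opaque = ctx0 },
 *         { .sector = 8, .nb_sectors = 8, .qiov = qiov1,
 *           .cb = my_write_cb, .opaque = ctx1 },
 *     };
 *     if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *         // inspect reqs[i].error to see which callbacks will still run
 *     }
 */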
3278 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3280 acb->pool->cancel(acb);
3283 /* block I/O throttling */
3284 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3285 bool is_write, double elapsed_time, uint64_t *wait)
3287 uint64_t bps_limit = 0;
3288 double bytes_limit, bytes_base, bytes_res;
3289 double slice_time, wait_time;
3291 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3292 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3293 } else if (bs->io_limits.bps[is_write]) {
3294 bps_limit = bs->io_limits.bps[is_write];
3303 slice_time = bs->slice_end - bs->slice_start;
3304 slice_time /= (NANOSECONDS_PER_SECOND);
3305 bytes_limit = bps_limit * slice_time;
3306 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3307 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3308 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3311     /* bytes_base: the number of bytes already read/written, taken from the
3312      * accumulated statistics for this slice.
3313      * bytes_res: the remaining bytes of data which need to be read/written.
3314      * (bytes_base + bytes_res) / bps_limit: used to calculate the total
3315      * time needed to complete reading/writing all of the data.
3317 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3319 if (bytes_base + bytes_res <= bytes_limit) {
3327 /* Calc approx time to dispatch */
3328 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3330     /* When the I/O rate at runtime exceeds the limits,
3331      * bs->slice_end needs to be extended so that the current statistics
3332      * are kept until the timer fires; the amount of the extension was
3333      * tuned based on experimental results.
3335 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3336 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3338 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
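/*
 * Worked example (sketch): with bps_limit = 1 MiB/s and a 0.1 s slice,
 * bytes_limit is ~104858 bytes. If ~90 KiB were already counted in this
 * slice (bytes_base) and a 32 KiB request arrives (bytes_res), the sum
 * exceeds bytes_limit, so the request is delayed by roughly
 * (bytes_base + bytes_res) / bps_limit - elapsed_time seconds.
 */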
3344 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3345 double elapsed_time, uint64_t *wait)
3347 uint64_t iops_limit = 0;
3348 double ios_limit, ios_base;
3349 double slice_time, wait_time;
3351 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3352 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3353 } else if (bs->io_limits.iops[is_write]) {
3354 iops_limit = bs->io_limits.iops[is_write];
3363 slice_time = bs->slice_end - bs->slice_start;
3364 slice_time /= (NANOSECONDS_PER_SECOND);
3365 ios_limit = iops_limit * slice_time;
3366 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3367 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3368 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3371 if (ios_base + 1 <= ios_limit) {
3379 /* Calc approx time to dispatch */
3380 wait_time = (ios_base + 1) / iops_limit;
3381 if (wait_time > elapsed_time) {
3382 wait_time = wait_time - elapsed_time;
3387 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3388 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3390 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3396 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3397 bool is_write, int64_t *wait)
3399 int64_t now, max_wait;
3400 uint64_t bps_wait = 0, iops_wait = 0;
3401 double elapsed_time;
3402 int bps_ret, iops_ret;
3404 now = qemu_get_clock_ns(vm_clock);
3405 if ((bs->slice_start < now)
3406 && (bs->slice_end > now)) {
3407 bs->slice_end = now + bs->slice_time;
3409 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3410 bs->slice_start = now;
3411 bs->slice_end = now + bs->slice_time;
3413 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3414 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3416 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3417 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3420 elapsed_time = now - bs->slice_start;
3421 elapsed_time /= (NANOSECONDS_PER_SECOND);
3423 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3424 is_write, elapsed_time, &bps_wait);
3425 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3426 elapsed_time, &iops_wait);
3427 if (bps_ret || iops_ret) {
3428 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3433 now = qemu_get_clock_ns(vm_clock);
3434 if (bs->slice_end < now + max_wait) {
3435 bs->slice_end = now + max_wait;
3448 /**************************************************************/
3449 /* async block device emulation */
3451 typedef struct BlockDriverAIOCBSync {
3452 BlockDriverAIOCB common;
3455 /* vector translation state */
3459 } BlockDriverAIOCBSync;
3461 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3463 BlockDriverAIOCBSync *acb =
3464 container_of(blockacb, BlockDriverAIOCBSync, common);
3465 qemu_bh_delete(acb->bh);
3467 qemu_aio_release(acb);
3470 static AIOPool bdrv_em_aio_pool = {
3471 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3472 .cancel = bdrv_aio_cancel_em,
3475 static void bdrv_aio_bh_cb(void *opaque)
3477 BlockDriverAIOCBSync *acb = opaque;
3480 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3481 qemu_vfree(acb->bounce);
3482 acb->common.cb(acb->common.opaque, acb->ret);
3483 qemu_bh_delete(acb->bh);
3485 qemu_aio_release(acb);
3488 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3492 BlockDriverCompletionFunc *cb,
3497 BlockDriverAIOCBSync *acb;
3499 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3500 acb->is_write = is_write;
3502 acb->bounce = qemu_blockalign(bs, qiov->size);
3503 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3506 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3507 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3509 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3512 qemu_bh_schedule(acb->bh);
3514 return &acb->common;
3517 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3518 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3519 BlockDriverCompletionFunc *cb, void *opaque)
3521 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3524 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3525 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3526 BlockDriverCompletionFunc *cb, void *opaque)
3528 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3532 typedef struct BlockDriverAIOCBCoroutine {
3533 BlockDriverAIOCB common;
3537 } BlockDriverAIOCBCoroutine;
3539 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3544 static AIOPool bdrv_em_co_aio_pool = {
3545 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3546 .cancel = bdrv_aio_co_cancel_em,
3549 static void bdrv_co_em_bh(void *opaque)
3551 BlockDriverAIOCBCoroutine *acb = opaque;
3553 acb->common.cb(acb->common.opaque, acb->req.error);
3554 qemu_bh_delete(acb->bh);
3555 qemu_aio_release(acb);
3558 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3559 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3561 BlockDriverAIOCBCoroutine *acb = opaque;
3562 BlockDriverState *bs = acb->common.bs;
3564 if (!acb->is_write) {
3565 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3566 acb->req.nb_sectors, acb->req.qiov, 0);
3568 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3569 acb->req.nb_sectors, acb->req.qiov, 0);
3572 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3573 qemu_bh_schedule(acb->bh);
3576 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3580 BlockDriverCompletionFunc *cb,
3585 BlockDriverAIOCBCoroutine *acb;
3587 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3588 acb->req.sector = sector_num;
3589 acb->req.nb_sectors = nb_sectors;
3590 acb->req.qiov = qiov;
3591 acb->is_write = is_write;
3593 co = qemu_coroutine_create(bdrv_co_do_rw);
3594 qemu_coroutine_enter(co, acb);
3596 return &acb->common;
3599 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3601 BlockDriverAIOCBCoroutine *acb = opaque;
3602 BlockDriverState *bs = acb->common.bs;
3604 acb->req.error = bdrv_co_flush(bs);
3605 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3606 qemu_bh_schedule(acb->bh);
3609 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3610 BlockDriverCompletionFunc *cb, void *opaque)
3612 trace_bdrv_aio_flush(bs, opaque);
3615 BlockDriverAIOCBCoroutine *acb;
3617 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3618 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3619 qemu_coroutine_enter(co, acb);
3621 return &acb->common;
3624 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3626 BlockDriverAIOCBCoroutine *acb = opaque;
3627 BlockDriverState *bs = acb->common.bs;
3629 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3630 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3631 qemu_bh_schedule(acb->bh);
3634 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3635 int64_t sector_num, int nb_sectors,
3636 BlockDriverCompletionFunc *cb, void *opaque)
3639 BlockDriverAIOCBCoroutine *acb;
3641 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3643 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3644 acb->req.sector = sector_num;
3645 acb->req.nb_sectors = nb_sectors;
3646 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3647 qemu_coroutine_enter(co, acb);
3649 return &acb->common;
3652 void bdrv_init(void)
3654 module_call_init(MODULE_INIT_BLOCK);
3657 void bdrv_init_with_whitelist(void)
3659 use_bdrv_whitelist = 1;
3663 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3664 BlockDriverCompletionFunc *cb, void *opaque)
3666 BlockDriverAIOCB *acb;
3668 if (pool->free_aiocb) {
3669 acb = pool->free_aiocb;
3670 pool->free_aiocb = acb->next;
3672 acb = g_malloc0(pool->aiocb_size);
3677 acb->opaque = opaque;
3681 void qemu_aio_release(void *p)
3683 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3684 AIOPool *pool = acb->pool;
3685 acb->next = pool->free_aiocb;
3686 pool->free_aiocb = acb;
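/*
 * Usage sketch (illustrative; MyAIOCB and my_cancel are hypothetical):
 * an AIO implementation embeds BlockDriverAIOCB as its first member and
 * recycles completed ACBs through the pool's free list.
 *
 *     typedef struct MyAIOCB {
 *         BlockDriverAIOCB common;
 *         int my_state;
 *     } MyAIOCB;
 *
 *     static AIOPool my_pool = {
 *         .aiocb_size = sizeof(MyAIOCB),
 *         .cancel     = my_cancel,
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_pool, bs, cb, opaque);
 *     // ... when the request completes or is cancelled ...
 *     qemu_aio_release(acb);
 */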
3689 /**************************************************************/
3690 /* Coroutine block device emulation */
3692 typedef struct CoroutineIOCompletion {
3693 Coroutine *coroutine;
3695 } CoroutineIOCompletion;
3697 static void bdrv_co_io_em_complete(void *opaque, int ret)
3699 CoroutineIOCompletion *co = opaque;
3702 qemu_coroutine_enter(co->coroutine, NULL);
3705 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3706 int nb_sectors, QEMUIOVector *iov,
3709 CoroutineIOCompletion co = {
3710 .coroutine = qemu_coroutine_self(),
3712 BlockDriverAIOCB *acb;
3715 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3716 bdrv_co_io_em_complete, &co);
3718 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3719 bdrv_co_io_em_complete, &co);
3722 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3726 qemu_coroutine_yield();
3731 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3732 int64_t sector_num, int nb_sectors,
3735 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3738 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3739 int64_t sector_num, int nb_sectors,
3742 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3745 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3747 RwCo *rwco = opaque;
3749 rwco->ret = bdrv_co_flush(rwco->bs);
3752 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3756 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3760 /* Write back cached data to the OS even with cache=unsafe */
3761 if (bs->drv->bdrv_co_flush_to_os) {
3762 ret = bs->drv->bdrv_co_flush_to_os(bs);
3768 /* But don't actually force it to the disk with cache=unsafe */
3769 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3773 if (bs->drv->bdrv_co_flush_to_disk) {
3774 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3775 } else if (bs->drv->bdrv_aio_flush) {
3776 BlockDriverAIOCB *acb;
3777 CoroutineIOCompletion co = {
3778 .coroutine = qemu_coroutine_self(),
3781 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3785 qemu_coroutine_yield();
3790          * Some block drivers always operate in either writethrough or unsafe
3791          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3792 * know how the server works (because the behaviour is hardcoded or
3793 * depends on server-side configuration), so we can't ensure that
3794 * everything is safe on disk. Returning an error doesn't work because
3795 * that would break guests even if the server operates in writethrough
3798 * Let's hope the user knows what he's doing.
3806 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3807 * in the case of cache=unsafe, so there are no useless flushes.
3809 return bdrv_co_flush(bs->file);
3812 void bdrv_invalidate_cache(BlockDriverState *bs)
3814 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3815 bs->drv->bdrv_invalidate_cache(bs);
3819 void bdrv_invalidate_cache_all(void)
3821 BlockDriverState *bs;
3823 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3824 bdrv_invalidate_cache(bs);
3828 void bdrv_clear_incoming_migration_all(void)
3830 BlockDriverState *bs;
3832 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3833 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3837 int bdrv_flush(BlockDriverState *bs)
3845 if (qemu_in_coroutine()) {
3846 /* Fast-path if already in coroutine context */
3847 bdrv_flush_co_entry(&rwco);
3849 co = qemu_coroutine_create(bdrv_flush_co_entry);
3850 qemu_coroutine_enter(co, &rwco);
3851 while (rwco.ret == NOT_DONE) {
3859 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3861 RwCo *rwco = opaque;
3863 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3866 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3871 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3873 } else if (bs->read_only) {
3875 } else if (bs->drv->bdrv_co_discard) {
3876 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3877 } else if (bs->drv->bdrv_aio_discard) {
3878 BlockDriverAIOCB *acb;
3879 CoroutineIOCompletion co = {
3880 .coroutine = qemu_coroutine_self(),
3883 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3884 bdrv_co_io_em_complete, &co);
3888 qemu_coroutine_yield();
3896 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3901 .sector_num = sector_num,
3902 .nb_sectors = nb_sectors,
3906 if (qemu_in_coroutine()) {
3907 /* Fast-path if already in coroutine context */
3908 bdrv_discard_co_entry(&rwco);
3910 co = qemu_coroutine_create(bdrv_discard_co_entry);
3911 qemu_coroutine_enter(co, &rwco);
3912 while (rwco.ret == NOT_DONE) {
3920 /**************************************************************/
3921 /* removable device support */
3924 * Return TRUE if the media is present
3926 int bdrv_is_inserted(BlockDriverState *bs)
3928 BlockDriver *drv = bs->drv;
3932 if (!drv->bdrv_is_inserted)
3934 return drv->bdrv_is_inserted(bs);
3938 * Return whether the media changed since the last call to this
3939 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3941 int bdrv_media_changed(BlockDriverState *bs)
3943 BlockDriver *drv = bs->drv;
3945 if (drv && drv->bdrv_media_changed) {
3946 return drv->bdrv_media_changed(bs);
3952 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3954 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3956 BlockDriver *drv = bs->drv;
3958 if (drv && drv->bdrv_eject) {
3959 drv->bdrv_eject(bs, eject_flag);
3962 if (bs->device_name[0] != '\0') {
3963 bdrv_emit_qmp_eject_event(bs, eject_flag);
3968 * Lock or unlock the media (if it is locked, the user won't be able
3969 * to eject it manually).
3971 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3973 BlockDriver *drv = bs->drv;
3975 trace_bdrv_lock_medium(bs, locked);
3977 if (drv && drv->bdrv_lock_medium) {
3978 drv->bdrv_lock_medium(bs, locked);
3982 /* needed for generic scsi interface */
3984 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3986 BlockDriver *drv = bs->drv;
3988 if (drv && drv->bdrv_ioctl)
3989 return drv->bdrv_ioctl(bs, req, buf);
3993 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3994 unsigned long int req, void *buf,
3995 BlockDriverCompletionFunc *cb, void *opaque)
3997 BlockDriver *drv = bs->drv;
3999 if (drv && drv->bdrv_aio_ioctl)
4000 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4004 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4006 bs->buffer_alignment = align;
4009 void *qemu_blockalign(BlockDriverState *bs, size_t size)
4011 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4014 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
4016 int64_t bitmap_size;
4018 bs->dirty_count = 0;
4020 if (!bs->dirty_bitmap) {
4021 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
4022 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
4023 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
4025 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
4028 if (bs->dirty_bitmap) {
4029 g_free(bs->dirty_bitmap);
4030 bs->dirty_bitmap = NULL;
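/*
 * Sizing example (sketch, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is 2048
 * and 64-bit longs): one long covers 2048 * 64 sectors = 64 MiB of disk,
 * so tracking a 1 TiB image takes only 16384 longs (128 KiB of bitmap).
 */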
4035 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4037 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
4039 if (bs->dirty_bitmap &&
4040 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
4041 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
4042 (1UL << (chunk % (sizeof(unsigned long) * 8))));
4048 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4051 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
4054 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4056 return bs->dirty_count;
4059 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4061 assert(bs->in_use != in_use);
4062 bs->in_use = in_use;
4065 int bdrv_in_use(BlockDriverState *bs)
4070 void bdrv_iostatus_enable(BlockDriverState *bs)
4072 bs->iostatus_enabled = true;
4073 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4076 /* The I/O status is only enabled if the drive explicitly
4077 * enables it _and_ the VM is configured to stop on errors */
4078 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4080 return (bs->iostatus_enabled &&
4081 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4082 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
4083 bs->on_read_error == BLOCK_ERR_STOP_ANY));
4086 void bdrv_iostatus_disable(BlockDriverState *bs)
4088 bs->iostatus_enabled = false;
4091 void bdrv_iostatus_reset(BlockDriverState *bs)
4093 if (bdrv_iostatus_is_enabled(bs)) {
4094 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
4098 /* XXX: Today this is set by device models because it makes the implementation
4099 quite simple. However, the block layer knows about the error, so it's
4100 possible to implement this without device models being involved */
4101 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4103 if (bdrv_iostatus_is_enabled(bs) &&
4104 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4106 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4107 BLOCK_DEVICE_IO_STATUS_FAILED;
4112 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4113 enum BlockAcctType type)
4115 assert(type < BDRV_MAX_IOTYPE);
4117 cookie->bytes = bytes;
4118 cookie->start_time_ns = get_clock();
4119 cookie->type = type;
4123 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4125 assert(cookie->type < BDRV_MAX_IOTYPE);
4127 bs->nr_bytes[cookie->type] += cookie->bytes;
4128 bs->nr_ops[cookie->type]++;
4129 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
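/*
 * Usage sketch: device models bracket each request with an accounting
 * cookie so the statistics above stay accurate.
 *
 *     BlockAcctCookie cookie;
 *     bdrv_acct_start(bs, &cookie, qiov->size, BDRV_ACCT_READ);
 *     // ... issue the read and wait for completion ...
 *     bdrv_acct_done(bs, &cookie);
 */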
4132 int bdrv_img_create(const char *filename, const char *fmt,
4133 const char *base_filename, const char *base_fmt,
4134 char *options, uint64_t img_size, int flags)
4136 QEMUOptionParameter *param = NULL, *create_options = NULL;
4137 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4138 BlockDriverState *bs = NULL;
4139 BlockDriver *drv, *proto_drv;
4140 BlockDriver *backing_drv = NULL;
4143 /* Find driver and parse its options */
4144 drv = bdrv_find_format(fmt);
4146 error_report("Unknown file format '%s'", fmt);
4151 proto_drv = bdrv_find_protocol(filename);
4153 error_report("Unknown protocol '%s'", filename);
4158 create_options = append_option_parameters(create_options,
4159 drv->create_options);
4160 create_options = append_option_parameters(create_options,
4161 proto_drv->create_options);
4163 /* Create parameter list with default values */
4164 param = parse_option_parameters("", create_options, param);
4166 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4168 /* Parse -o options */
4170 param = parse_option_parameters(options, create_options, param);
4171 if (param == NULL) {
4172 error_report("Invalid options for file format '%s'.", fmt);
4178 if (base_filename) {
4179 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4181 error_report("Backing file not supported for file format '%s'",
4189 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4190 error_report("Backing file format not supported for file "
4191 "format '%s'", fmt);
4197 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4198 if (backing_file && backing_file->value.s) {
4199 if (!strcmp(filename, backing_file->value.s)) {
4200 error_report("Error: Trying to create an image with the "
4201 "same filename as the backing file");
4207 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4208 if (backing_fmt && backing_fmt->value.s) {
4209 backing_drv = bdrv_find_format(backing_fmt->value.s);
4211 error_report("Unknown backing file format '%s'",
4212 backing_fmt->value.s);
4218 // The size for the image must always be specified, with one exception:
4219 // If we are using a backing file, we can obtain the size from there
4220 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4221 if (size && size->value.n == -1) {
4222 if (backing_file && backing_file->value.s) {
4227             /* backing files are always opened read-only */
4229 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4233 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4235 error_report("Could not open '%s'", backing_file->value.s);
4238 bdrv_get_geometry(bs, &size);
4241 snprintf(buf, sizeof(buf), "%" PRId64, size);
4242 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4244 error_report("Image creation needs a size parameter");
4250 printf("Formatting '%s', fmt=%s ", filename, fmt);
4251 print_option_parameters(param);
4254 ret = bdrv_create(drv, filename, param);
4257 if (ret == -ENOTSUP) {
4258 error_report("Formatting or formatting option not supported for "
4259 "file format '%s'", fmt);
4260 } else if (ret == -EFBIG) {
4261 error_report("The image size is too large for file format '%s'",
4264 error_report("%s: error while creating %s: %s", filename, fmt,
4270 free_option_parameters(create_options);
4271 free_option_parameters(param);
4280 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4281 int64_t speed, BlockDriverCompletionFunc *cb,
4282 void *opaque, Error **errp)
4286 if (bs->job || bdrv_in_use(bs)) {
4287 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4290 bdrv_set_in_use(bs, 1);
4292 job = g_malloc0(job_type->instance_size);
4293 job->job_type = job_type;
4296 job->opaque = opaque;
4300 /* Only set speed when necessary to avoid NotSupported error */
4302 Error *local_err = NULL;
4304 block_job_set_speed(job, speed, &local_err);
4305 if (error_is_set(&local_err)) {
4308 bdrv_set_in_use(bs, 0);
4309 error_propagate(errp, local_err);
4316 void block_job_complete(BlockJob *job, int ret)
4318 BlockDriverState *bs = job->bs;
4320 assert(bs->job == job);
4321 job->cb(job->opaque, ret);
4324 bdrv_set_in_use(bs, 0);
4327 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4329 Error *local_err = NULL;
4331 if (!job->job_type->set_speed) {
4332 error_set(errp, QERR_NOT_SUPPORTED);
4335 job->job_type->set_speed(job, speed, &local_err);
4336 if (error_is_set(&local_err)) {
4337 error_propagate(errp, local_err);
4344 void block_job_cancel(BlockJob *job)
4346 job->cancelled = true;
4347 if (job->co && !job->busy) {
4348 qemu_coroutine_enter(job->co, NULL);
4352 bool block_job_is_cancelled(BlockJob *job)
4354 return job->cancelled;
4357 struct BlockCancelData {
4359 BlockDriverCompletionFunc *cb;
4365 static void block_job_cancel_cb(void *opaque, int ret)
4367 struct BlockCancelData *data = opaque;
4369 data->cancelled = block_job_is_cancelled(data->job);
4371 data->cb(data->opaque, ret);
4374 int block_job_cancel_sync(BlockJob *job)
4376 struct BlockCancelData data;
4377 BlockDriverState *bs = job->bs;
4379 assert(bs->job == job);
4381 /* Set up our own callback to store the result and chain to
4382 * the original callback.
4386 data.opaque = job->opaque;
4387 data.ret = -EINPROGRESS;
4388 job->cb = block_job_cancel_cb;
4389 job->opaque = &data;
4390 block_job_cancel(job);
4391 while (data.ret == -EINPROGRESS) {
4394 return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
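/*
 * Usage sketch (illustrative; my_job_type and my_job_cb are hypothetical):
 * creating a job marks the device in-use until block_job_complete()
 * releases it.
 *
 *     BlockJob *job = block_job_create(&my_job_type, bs, 0,
 *                                      my_job_cb, opaque, errp);
 *     if (!job) {
 *         return;  // device busy, errp set
 *     }
 */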
4397 void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4399 /* Check cancellation *before* setting busy = false, too! */
4400 if (!block_job_is_cancelled(job)) {
4402 co_sleep_ns(clock, ns);