2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include "config-host.h"
25 #include "qemu-common.h"
28 #include "block_int.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
36 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79 BlockDriverCompletionFunc *cb,
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
/* NOTE(review): this file is an excerpt with elided lines; the leading
 * integer on each line is the original file's line number, not code. */
/* Return nonzero if filename starts with a DOS drive-letter prefix
 * ("a:".."z:", "A:".."Z:"); the check of filename[1] is on an elided line. */
106 static int is_windows_drive_prefix(const char *filename)
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
/* Return nonzero if filename names a whole Windows drive or device:
 * a bare drive prefix, or a "\\.\" / "//./" device path. */
113 int is_windows_drive(const char *filename)
115 if (is_windows_drive_prefix(filename) &&
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
125 /* throttling disk I/O limits */
/* Turn off I/O throttling for @bs: mark it disabled, drain any coroutines
 * queued in throttled_reqs, tear down the throttle timer, and reset the
 * accounting base. Safe to call when no timer was ever created. */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
128 bs->io_limits_enabled = false;
/* Wake every throttled request; loop body is intentionally empty. */
130 while (qemu_co_queue_next(&bs->throttled_reqs));
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
/* Clear the per-slice accounting baseline. */
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
/* Timer callback for I/O throttling: when the wait period set by
 * bdrv_io_limits_intercept() expires, release the next queued request. */
144 static void bdrv_block_timer(void *opaque)
146 BlockDriverState *bs = opaque;
148 qemu_co_queue_next(&bs->throttled_reqs);
/* Turn on I/O throttling for @bs: set up the request queue, a vm_clock
 * timer that releases throttled requests, and the first accounting slice
 * (slice_time appears to be 5 * BLOCK_IO_SLICE_TIME ns). */
151 void bdrv_io_limits_enable(BlockDriverState *bs)
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
/* Return true if any throttling limit (bps or iops; read, write or total)
 * is configured for @bs. */
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
/* Coroutine-context throttling gate: block the calling request until it is
 * within the configured I/O limits, preserving FIFO order, then kick the
 * next waiter. Must be called before submitting the actual I/O. */
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
176 int64_t wait_time = -1;
/* Requests already waiting go first: join the back of the queue. */
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
182 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
183 * throttled requests will not be dequeued until the current request is
184 * allowed to be serviced. So if the current request still exceeds the
185 * limits, it will be inserted to the head. All requests followed it will
186 * be still in throttled_reqs queue.
/* Re-queue at the head and arm the timer until the limits allow us. */
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
/* Allow the next throttled request to retry. */
195 qemu_co_queue_next(&bs->throttled_reqs);
198 /* check if the path starts with "<protocol>:" */
/* Windows drive letters ("c:foo") are NOT protocols; both strcspn variants
 * find the first ':' unless a path separator comes first (the Windows arm
 * also treats '\\' as a separator). The final comparison is elided here. */
199 static int path_has_protocol(const char *path)
204 if (is_windows_drive(path) ||
205 is_windows_drive_prefix(path)) {
208 p = path + strcspn(path, ":/\\");
210 p = path + strcspn(path, ":/");
/* Return nonzero if @path is absolute. On Windows, drive-letter and device
 * paths count as absolute and '\\' is accepted as a separator; elsewhere
 * only a leading '/' qualifies. */
216 int path_is_absolute(const char *path)
219 /* specific case for names like: "\\.\d:" */
220 if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
223 return (*path == '/' || *path == '\\');
225 return (*path == '/');
229 /* if filename is absolute, just copy it to dest. Otherwise, build a
230 path to it by considering it is relative to base_path. URL are
232 void path_combine(char *dest, int dest_size,
233 const char *base_path,
234 const char *filename)
241 if (path_is_absolute(filename)) {
242 pstrcpy(dest, dest_size, filename);
244 p = strchr(base_path, ':');
249 p1 = strrchr(base_path, '/');
253 p2 = strrchr(base_path, '\\');
265 if (len > dest_size - 1)
267 memcpy(dest, base_path, len);
269 pstrcat(dest, dest_size, filename);
/* Write the backing file's full name into @dest (at most @sz bytes).
 * A backing file that is empty or already carries a "<protocol>:" prefix
 * is copied verbatim; otherwise it is resolved relative to bs->filename. */
273 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
275 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
276 pstrcpy(dest, sz, bs->backing_file);
278 path_combine(dest, sz, bs->filename, bs->backing_file);
/* Register a block driver with the global bdrv_drivers list, installing
 * emulation shims for any missing coroutine/AIO entry points so every
 * driver ends up exposing both interfaces. */
282 void bdrv_register(BlockDriver *bdrv)
284 /* Block drivers without coroutine functions need emulation */
285 if (!bdrv->bdrv_co_readv) {
286 bdrv->bdrv_co_readv = bdrv_co_readv_em;
287 bdrv->bdrv_co_writev = bdrv_co_writev_em;
289 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
290 * the block driver lacks aio we need to emulate that too.
292 if (!bdrv->bdrv_aio_readv) {
293 /* add AIO emulation layer */
294 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
295 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
299 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
302 /* create a new block device (by default it is empty) */
/* Allocate a zeroed BlockDriverState named @device_name. Only named
 * devices are linked into the global bdrv_states list; anonymous ones
 * (empty name) stay off the list. iostatus starts disabled. */
303 BlockDriverState *bdrv_new(const char *device_name)
305 BlockDriverState *bs;
307 bs = g_malloc0(sizeof(BlockDriverState));
308 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
309 if (device_name[0] != '\0') {
310 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
312 bdrv_iostatus_disable(bs);
/* Look up a registered block driver by its format name (e.g. "qcow2");
 * the return statements are on elided lines — presumably the matching
 * driver, or NULL when none matches. */
316 BlockDriver *bdrv_find_format(const char *format_name)
319 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
320 if (!strcmp(drv1->format_name, format_name)) {
/* Return nonzero if @drv may be used under -drive format whitelisting.
 * With an empty CONFIG_BDRV_WHITELIST every driver is allowed. */
327 static int bdrv_is_whitelisted(BlockDriver *drv)
329 static const char *whitelist[] = {
330 CONFIG_BDRV_WHITELIST
335 return 1; /* no whitelist, anything goes */
337 for (p = whitelist; *p; p++) {
338 if (!strcmp(drv->format_name, *p)) {
/* Like bdrv_find_format(), but also require the driver to be whitelisted. */
345 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
347 BlockDriver *drv = bdrv_find_format(format_name);
348 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
351 typedef struct CreateCo {
354 QEMUOptionParameter *options;
358 static void coroutine_fn bdrv_create_co_entry(void *opaque)
360 CreateCo *cco = opaque;
363 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
/* Create an image via @drv->bdrv_create(), running it in a coroutine.
 * Uses the CreateCo fast path when already in coroutine context, otherwise
 * spawns a coroutine and busy-waits until cco.ret leaves NOT_DONE (the
 * qemu_aio_wait() call in that loop is on an elided line).
 * Returns a negative errno if the driver cannot create images. */
366 int bdrv_create(BlockDriver *drv, const char* filename,
367 QEMUOptionParameter *options)
374 .filename = g_strdup(filename),
379 if (!drv->bdrv_create) {
383 if (qemu_in_coroutine()) {
384 /* Fast-path if already in coroutine context */
385 bdrv_create_co_entry(&cco);
387 co = qemu_coroutine_create(bdrv_create_co_entry);
388 qemu_coroutine_enter(co, &cco);
389 while (cco.ret == NOT_DONE) {
/* The strdup'ed filename is owned by this function. */
395 g_free(cco.filename);
/* Create a file using the protocol driver inferred from @filename
 * (error handling for an unknown protocol is on elided lines). */
400 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
404 drv = bdrv_find_protocol(filename);
409 return bdrv_create(drv, filename, options);
413 * Create a uniquely-named empty temporary file.
414 * Return 0 upon success, otherwise a negative errno value.
416 int get_tmp_filename(char *filename, int size)
419 char temp_dir[MAX_PATH];
420 /* GetTempFileName requires that its output buffer (4th param)
421 have length MAX_PATH or greater. */
422 assert(size >= MAX_PATH);
423 return (GetTempPath(MAX_PATH, temp_dir)
424 && GetTempFileName(temp_dir, "qem", 0, filename)
425 ? 0 : -GetLastError());
429 tmpdir = getenv("TMPDIR");
432 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
435 fd = mkstemp(filename);
436 if (fd < 0 || close(fd)) {
444 * Detect host devices. By convention, /dev/cdrom[N] is always
445 * recognized as a host CDROM.
/* Probe every driver that implements bdrv_probe_device() and keep the
 * highest-scoring one; returns NULL when no driver claims @filename
 * (the assignments inside the if and the return are on elided lines). */
447 static BlockDriver *find_hdev_driver(const char *filename)
449 int score_max = 0, score;
450 BlockDriver *drv = NULL, *d;
452 QLIST_FOREACH(d, &bdrv_drivers, list) {
453 if (d->bdrv_probe_device) {
454 score = d->bdrv_probe_device(filename);
455 if (score > score_max) {
/* Resolve the protocol driver for @filename:
 *  1. host-device probing wins (see XXX below),
 *  2. no "<protocol>:" prefix -> the "file" driver,
 *  3. otherwise match the prefix against each driver's protocol_name.
 * The bounded copy of the prefix into protocol[] caps at the buffer size. */
465 BlockDriver *bdrv_find_protocol(const char *filename)
472 /* TODO Drivers without bdrv_file_open must be specified explicitly */
475 * XXX(hch): we really should not let host device detection
476 * override an explicit protocol specification, but moving this
477 * later breaks access to device names with colons in them.
478 * Thanks to the brain-dead persistent naming schemes on udev-
479 * based Linux systems those actually are quite common.
481 drv1 = find_hdev_driver(filename);
486 if (!path_has_protocol(filename)) {
487 return bdrv_find_format("file");
489 p = strchr(filename, ':');
492 if (len > sizeof(protocol) - 1)
493 len = sizeof(protocol) - 1;
494 memcpy(protocol, filename, len);
495 protocol[len] = '\0';
496 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
497 if (drv1->protocol_name &&
498 !strcmp(drv1->protocol_name, protocol)) {
505 static int find_image_format(const char *filename, BlockDriver **pdrv)
507 int ret, score, score_max;
508 BlockDriver *drv1, *drv;
510 BlockDriverState *bs;
512 ret = bdrv_file_open(&bs, filename, 0);
518 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
519 if (bs->sg || !bdrv_is_inserted(bs)) {
521 drv = bdrv_find_format("raw");
529 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
538 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
539 if (drv1->bdrv_probe) {
540 score = drv1->bdrv_probe(buf, ret, filename);
541 if (score > score_max) {
555 * Set the current 'total_sectors' value
/* Prefer the driver's own bdrv_getlength() (converted from bytes to
 * sectors) over the caller-supplied @hint; fall back to the hint when the
 * driver has no getlength callback. Error handling for a negative length
 * is on elided lines. */
557 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
559 BlockDriver *drv = bs->drv;
561 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
565 /* query actual device if possible, otherwise just trust the hint */
566 if (drv->bdrv_getlength) {
567 int64_t length = drv->bdrv_getlength(bs);
571 hint = length >> BDRV_SECTOR_BITS;
574 bs->total_sectors = hint;
579 * Set open flags for a given cache mode
581 * Return 0 on success, -1 if the cache mode was invalid.
/* Map a -drive cache= mode string onto BDRV_O_* open flags, after
 * clearing any previous cache bits:
 *   off/none    -> O_NOCACHE | O_CACHE_WB (no host cache, writeback)
 *   directsync  -> O_NOCACHE
 *   writeback   -> O_CACHE_WB
 *   unsafe      -> O_CACHE_WB | O_NO_FLUSH (flushes ignored)
 *   writethrough-> no extra flags (the default) */
583 int bdrv_parse_cache_flags(const char *mode, int *flags)
585 *flags &= ~BDRV_O_CACHE_MASK;
587 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
588 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
589 } else if (!strcmp(mode, "directsync")) {
590 *flags |= BDRV_O_NOCACHE;
591 } else if (!strcmp(mode, "writeback")) {
592 *flags |= BDRV_O_CACHE_WB;
593 } else if (!strcmp(mode, "unsafe")) {
594 *flags |= BDRV_O_CACHE_WB;
595 *flags |= BDRV_O_NO_FLUSH;
596 } else if (!strcmp(mode, "writethrough")) {
597 /* this is the default */
606 * The copy-on-read flag is actually a reference count so multiple users may
607 * use the feature without worrying about clobbering its previous state.
608 * Copy-on-read stays enabled until all users have called to disable it.
/* Increment the copy-on-read refcount (increment itself is on an elided
 * line). */
610 void bdrv_enable_copy_on_read(BlockDriverState *bs)
/* Decrement the copy-on-read refcount; must never go below zero. */
615 void bdrv_disable_copy_on_read(BlockDriverState *bs)
617 assert(bs->copy_on_read > 0);
622 * Common part for opening disk images and files
624 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
625 int flags, BlockDriver *drv)
630 assert(bs->file == NULL);
632 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
634 bs->open_flags = flags;
635 bs->buffer_alignment = 512;
637 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
638 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
639 bdrv_enable_copy_on_read(bs);
642 pstrcpy(bs->filename, sizeof(bs->filename), filename);
644 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
649 bs->opaque = g_malloc0(drv->instance_size);
651 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
654 * Clear flags that are internal to the block layer before opening the
657 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
660 * Snapshots should be writable.
662 if (bs->is_temporary) {
663 open_flags |= BDRV_O_RDWR;
666 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
668 /* Open the image, either directly or using a protocol */
669 if (drv->bdrv_file_open) {
670 ret = drv->bdrv_file_open(bs, filename, open_flags);
672 ret = bdrv_file_open(&bs->file, filename, open_flags);
674 ret = drv->bdrv_open(bs, open_flags);
682 ret = refresh_total_sectors(bs, bs->total_sectors);
688 if (bs->is_temporary) {
696 bdrv_delete(bs->file);
706 * Opens a file using a protocol (file, host_device, nbd, ...)
708 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
710 BlockDriverState *bs;
714 drv = bdrv_find_protocol(filename);
720 ret = bdrv_open_common(bs, filename, flags, drv);
731 * Opens a disk image (raw, qcow2, vmdk, ...)
733 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
737 char tmp_filename[PATH_MAX];
739 if (flags & BDRV_O_SNAPSHOT) {
740 BlockDriverState *bs1;
743 BlockDriver *bdrv_qcow2;
744 QEMUOptionParameter *options;
745 char backing_filename[PATH_MAX];
747 /* if snapshot, we create a temporary backing file and open it
748 instead of opening 'filename' directly */
750 /* if there is a backing file, use it */
752 ret = bdrv_open(bs1, filename, 0, drv);
757 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
759 if (bs1->drv && bs1->drv->protocol_name)
764 ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
769 /* Real path is meaningless for protocols */
771 snprintf(backing_filename, sizeof(backing_filename),
773 else if (!realpath(filename, backing_filename))
776 bdrv_qcow2 = bdrv_find_format("qcow2");
777 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
779 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
780 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
782 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
786 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
787 free_option_parameters(options);
792 filename = tmp_filename;
794 bs->is_temporary = 1;
797 /* Find the right image format driver */
799 ret = find_image_format(filename, &drv);
803 goto unlink_and_fail;
807 ret = bdrv_open_common(bs, filename, flags, drv);
809 goto unlink_and_fail;
812 /* If there is a backing file, use it */
813 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
814 char backing_filename[PATH_MAX];
816 BlockDriver *back_drv = NULL;
818 bs->backing_hd = bdrv_new("");
819 bdrv_get_full_backing_filename(bs, backing_filename,
820 sizeof(backing_filename));
822 if (bs->backing_format[0] != '\0') {
823 back_drv = bdrv_find_format(bs->backing_format);
826 /* backing files always opened read-only */
828 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
830 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
835 if (bs->is_temporary) {
836 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
838 /* base image inherits from "parent" */
839 bs->backing_hd->keep_read_only = bs->keep_read_only;
843 if (!bdrv_key_required(bs)) {
844 bdrv_dev_change_media_cb(bs, true);
847 /* throttling disk I/O limits */
848 if (bs->io_limits_enabled) {
849 bdrv_io_limits_enable(bs);
855 if (bs->is_temporary) {
861 void bdrv_close(BlockDriverState *bs)
866 block_job_cancel_sync(bs->job);
870 if (bs == bs_snapshots) {
873 if (bs->backing_hd) {
874 bdrv_delete(bs->backing_hd);
875 bs->backing_hd = NULL;
877 bs->drv->bdrv_close(bs);
880 if (bs->is_temporary) {
881 unlink(bs->filename);
886 bs->copy_on_read = 0;
887 bs->backing_file[0] = '\0';
888 bs->backing_format[0] = '\0';
889 bs->total_sectors = 0;
895 if (bs->file != NULL) {
896 bdrv_delete(bs->file);
900 bdrv_dev_change_media_cb(bs, false);
903 /*throttling disk I/O limits*/
904 if (bs->io_limits_enabled) {
905 bdrv_io_limits_disable(bs);
909 void bdrv_close_all(void)
911 BlockDriverState *bs;
913 QTAILQ_FOREACH(bs, &bdrv_states, list) {
919 * Wait for pending requests to complete across all BlockDriverStates
921 * This function does not flush data to disk, use bdrv_flush_all() for that
922 * after calling this function.
924 * Note that completion of an asynchronous I/O operation can trigger any
925 * number of other I/O operations on other devices---for example a coroutine
926 * can be arbitrarily complex and a constant flow of I/O can come until the
927 * coroutine is complete. Because of this, it is not possible to have a
928 * function to drain a single device's I/O queue.
930 void bdrv_drain_all(void)
932 BlockDriverState *bs;
936 busy = qemu_aio_wait();
938 /* FIXME: We do not have timer support here, so this is effectively
941 QTAILQ_FOREACH(bs, &bdrv_states, list) {
942 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
943 qemu_co_queue_restart_all(&bs->throttled_reqs);
949 /* If requests are still pending there is a bug somewhere */
950 QTAILQ_FOREACH(bs, &bdrv_states, list) {
951 assert(QLIST_EMPTY(&bs->tracked_requests));
952 assert(qemu_co_queue_empty(&bs->throttled_reqs));
956 /* make a BlockDriverState anonymous by removing from bdrv_state list.
957    Also, NULL terminate the device_name to prevent double remove */
/* Clearing device_name[0] makes a second call a no-op, since only named
 * states are ever on the bdrv_states list. */
958 void bdrv_make_anon(BlockDriverState *bs)
960 if (bs->device_name[0] != '\0') {
961 QTAILQ_REMOVE(&bdrv_states, bs, list);
963 bs->device_name[0] = '\0';
/* Give the driver a chance to re-attach driver-internal state to @bs after
 * its contents were swapped (see bdrv_append); optional callback. */
966 static void bdrv_rebind(BlockDriverState *bs)
968 if (bs->drv && bs->drv->bdrv_rebind) {
969 bs->drv->bdrv_rebind(bs);
974 * Add new bs contents at the top of an image chain while the chain is
975 * live, while keeping required fields on the top layer.
977 * This will modify the BlockDriverState fields, and swap contents
978 * between bs_new and bs_top. Both bs_new and bs_top are modified.
980 * bs_new is required to be anonymous.
982 * This function does not create any image files.
984 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
986 BlockDriverState tmp;
988 /* bs_new must be anonymous */
989 assert(bs_new->device_name[0] == '\0');
993 /* there are some fields that need to stay on the top layer: */
994 tmp.open_flags = bs_top->open_flags;
997 tmp.dev_ops = bs_top->dev_ops;
998 tmp.dev_opaque = bs_top->dev_opaque;
999 tmp.dev = bs_top->dev;
1000 tmp.buffer_alignment = bs_top->buffer_alignment;
1001 tmp.copy_on_read = bs_top->copy_on_read;
1003 tmp.enable_write_cache = bs_top->enable_write_cache;
1005 /* i/o timing parameters */
1006 tmp.slice_time = bs_top->slice_time;
1007 tmp.slice_start = bs_top->slice_start;
1008 tmp.slice_end = bs_top->slice_end;
1009 tmp.io_limits = bs_top->io_limits;
1010 tmp.io_base = bs_top->io_base;
1011 tmp.throttled_reqs = bs_top->throttled_reqs;
1012 tmp.block_timer = bs_top->block_timer;
1013 tmp.io_limits_enabled = bs_top->io_limits_enabled;
1016 tmp.cyls = bs_top->cyls;
1017 tmp.heads = bs_top->heads;
1018 tmp.secs = bs_top->secs;
1019 tmp.translation = bs_top->translation;
1022 tmp.on_read_error = bs_top->on_read_error;
1023 tmp.on_write_error = bs_top->on_write_error;
1026 tmp.iostatus_enabled = bs_top->iostatus_enabled;
1027 tmp.iostatus = bs_top->iostatus;
1029 /* keep the same entry in bdrv_states */
1030 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1031 tmp.list = bs_top->list;
1033 /* The contents of 'tmp' will become bs_top, as we are
1034 * swapping bs_new and bs_top contents. */
1035 tmp.backing_hd = bs_new;
1036 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1037 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1039 /* swap contents of the fixed new bs and the current top */
1043 /* device_name[] was carried over from the old bs_top. bs_new
1044 * shouldn't be in bdrv_states, so we need to make device_name[]
1045 * reflect the anonymity of bs_new
1047 bs_new->device_name[0] = '\0';
1049 /* clear the copied fields in the new backing file */
1050 bdrv_detach_dev(bs_new, bs_new->dev);
1052 qemu_co_queue_init(&bs_new->throttled_reqs);
1053 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1054 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1055 bdrv_iostatus_disable(bs_new);
1057 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1058 * to affect or delete the block_timer, as it has been moved to bs_top */
1059 bs_new->io_limits_enabled = false;
1060 bs_new->block_timer = NULL;
1061 bs_new->slice_time = 0;
1062 bs_new->slice_start = 0;
1063 bs_new->slice_end = 0;
1065 bdrv_rebind(bs_new);
1066 bdrv_rebind(bs_top);
1069 void bdrv_delete(BlockDriverState *bs)
1073 assert(!bs->in_use);
1075 /* remove from list, if necessary */
1080 assert(bs != bs_snapshots);
1084 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1085 /* TODO change to DeviceState *dev when all users are qdevified */
1091 bdrv_iostatus_reset(bs);
1095 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1096 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1098 if (bdrv_attach_dev(bs, dev) < 0) {
1103 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1104 /* TODO change to DeviceState *dev when all users are qdevified */
1106 assert(bs->dev == dev);
1109 bs->dev_opaque = NULL;
1110 bs->buffer_alignment = 512;
1113 /* TODO change to return DeviceState * when all users are qdevified */
1114 void *bdrv_get_attached_dev(BlockDriverState *bs)
1119 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1123 bs->dev_opaque = opaque;
1124 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1125 bs_snapshots = NULL;
1129 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1130 BlockQMPEventAction action, int is_read)
1133 const char *action_str;
1136 case BDRV_ACTION_REPORT:
1137 action_str = "report";
1139 case BDRV_ACTION_IGNORE:
1140 action_str = "ignore";
1142 case BDRV_ACTION_STOP:
1143 action_str = "stop";
1149 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1152 is_read ? "read" : "write");
1153 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1155 qobject_decref(data);
1158 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1162 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1163 bdrv_get_device_name(bs), ejected);
1164 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1166 qobject_decref(data);
1169 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1171 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1172 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1173 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1174 if (tray_was_closed) {
1176 bdrv_emit_qmp_eject_event(bs, true);
1180 bdrv_emit_qmp_eject_event(bs, false);
1185 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1187 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1190 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1192 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1193 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1197 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1199 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1200 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1205 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1207 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1208 bs->dev_ops->resize_cb(bs->dev_opaque);
1212 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1214 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1215 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1221 * Run consistency checks on an image
1223 * Returns 0 if the check could be completed (it doesn't mean that the image is
1224 * free of errors) or -errno when an internal error occurred. The results of the
1225 * check are stored in res.
/* Delegates to the format driver; drivers without bdrv_check cannot be
 * checked (the error return for that case is on an elided line). */
1227 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1229 if (bs->drv->bdrv_check == NULL) {
1233 memset(res, 0, sizeof(*res));
1234 return bs->drv->bdrv_check(bs, res, fix);
1237 #define COMMIT_BUF_SECTORS 2048
1239 /* commit COW file into the raw image */
1240 int bdrv_commit(BlockDriverState *bs)
1242 BlockDriver *drv = bs->drv;
1243 BlockDriver *backing_drv;
1244 int64_t sector, total_sectors;
1245 int n, ro, open_flags;
1246 int ret = 0, rw_ret = 0;
1248 char filename[1024];
1249 BlockDriverState *bs_rw, *bs_ro;
1254 if (!bs->backing_hd) {
1258 if (bs->backing_hd->keep_read_only) {
1262 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1266 backing_drv = bs->backing_hd->drv;
1267 ro = bs->backing_hd->read_only;
1268 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1269 open_flags = bs->backing_hd->open_flags;
1273 bdrv_delete(bs->backing_hd);
1274 bs->backing_hd = NULL;
1275 bs_rw = bdrv_new("");
1276 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1280 /* try to re-open read-only */
1281 bs_ro = bdrv_new("");
1282 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1286 /* drive not functional anymore */
1290 bs->backing_hd = bs_ro;
1293 bs->backing_hd = bs_rw;
1296 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1297 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1299 for (sector = 0; sector < total_sectors; sector += n) {
1300 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1302 if (bdrv_read(bs, sector, buf, n) != 0) {
1307 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1314 if (drv->bdrv_make_empty) {
1315 ret = drv->bdrv_make_empty(bs);
1320 * Make sure all data we wrote to the backing device is actually
1324 bdrv_flush(bs->backing_hd);
1331 bdrv_delete(bs->backing_hd);
1332 bs->backing_hd = NULL;
1333 bs_ro = bdrv_new("");
1334 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1338 /* drive not functional anymore */
1342 bs->backing_hd = bs_ro;
1343 bs->backing_hd->keep_read_only = 0;
/* Commit the COW overlay of every named block device; the handling of a
 * failed bdrv_commit() (early return of ret) is on elided lines. */
1349 int bdrv_commit_all(void)
1351 BlockDriverState *bs;
1353 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1354 int ret = bdrv_commit(bs);
1362 struct BdrvTrackedRequest {
1363 BlockDriverState *bs;
1367 QLIST_ENTRY(BdrvTrackedRequest) list;
1368 Coroutine *co; /* owner, used for deadlock detection */
1369 CoQueue wait_queue; /* coroutines blocked on this request */
1373 * Remove an active request from the tracked requests list
1375 * This function should be called when a tracked request is completing.
1377 static void tracked_request_end(BdrvTrackedRequest *req)
1379 QLIST_REMOVE(req, list);
1380 qemu_co_queue_restart_all(&req->wait_queue);
1384 * Add an active request to the tracked requests list
1386 static void tracked_request_begin(BdrvTrackedRequest *req,
1387 BlockDriverState *bs,
1389 int nb_sectors, bool is_write)
1391 *req = (BdrvTrackedRequest){
1393 .sector_num = sector_num,
1394 .nb_sectors = nb_sectors,
1395 .is_write = is_write,
1396 .co = qemu_coroutine_self(),
1399 qemu_co_queue_init(&req->wait_queue);
1401 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1405 * Round a region to cluster boundaries
/* Expand [sector_num, sector_num + nb_sectors) outward to cluster
 * boundaries using the driver-reported cluster size. When the driver
 * reports no cluster size (or bdrv_get_info fails), the region is
 * returned unchanged. */
1407 static void round_to_clusters(BlockDriverState *bs,
1408 int64_t sector_num, int nb_sectors,
1409 int64_t *cluster_sector_num,
1410 int *cluster_nb_sectors)
1412 BlockDriverInfo bdi;
1414 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1415 *cluster_sector_num = sector_num;
1416 *cluster_nb_sectors = nb_sectors;
1418 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1419 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1420 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
/* Return false when [sector_num, sector_num + nb_sectors) lies entirely
 * before or after @req's range; the 'return true' overlap case is on an
 * elided line. */
1425 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1426 int64_t sector_num, int nb_sectors) {
1428 if (sector_num >= req->sector_num + req->nb_sectors) {
1432 if (req->sector_num >= sector_num + nb_sectors) {
/* Block the calling coroutine until no in-flight tracked request touches
 * the same cluster-rounded region; the retry loop around the FOREACH is
 * on elided lines. */
1438 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1439 int64_t sector_num, int nb_sectors)
1441 BdrvTrackedRequest *req;
1442 int64_t cluster_sector_num;
1443 int cluster_nb_sectors;
1446 /* If we touch the same cluster it counts as an overlap. This guarantees
1447 * that allocating writes will be serialized and not race with each other
1448 * for the same cluster. For example, in copy-on-read it ensures that the
1449 * CoR read and write operations are atomic and guest writes cannot
1450 * interleave between them.
1452 round_to_clusters(bs, sector_num, nb_sectors,
1453 &cluster_sector_num, &cluster_nb_sectors);
1457 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1458 if (tracked_request_overlaps(req, cluster_sector_num,
1459 cluster_nb_sectors)) {
1460 /* Hitting this means there was a reentrant request, for
1461 * example, a block driver issuing nested requests. This must
1462 * never happen since it means deadlock.
1464 assert(qemu_coroutine_self() != req->co);
1466 qemu_co_queue_wait(&req->wait_queue);
1477 * -EINVAL - backing format specified, but no file
1478 * -ENOSPC - can't update the backing file because no space is left in the
1480 * -ENOTSUP - format driver doesn't support changing the backing file
1482 int bdrv_change_backing_file(BlockDriverState *bs,
1483 const char *backing_file, const char *backing_fmt)
1485 BlockDriver *drv = bs->drv;
1488 /* Backing file format doesn't make sense without a backing file */
1489 if (backing_fmt && !backing_file) {
1493 if (drv->bdrv_change_backing_file != NULL) {
1494 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1500 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1501 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
/* Validate a byte-granularity request against the current device length:
 * rejects requests on ejected media and requests extending past EOF.
 * Note the overflow-safe form (len - offset < size) rather than
 * (offset + size > len). Error return values are on elided lines. */
1506 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1511 if (!bdrv_is_inserted(bs))
1517 len = bdrv_getlength(bs);
1522 if ((offset > len) || (len - offset < size))
/* Sector-granularity wrapper around bdrv_check_byte_request(). */
1528 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1531 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1532 nb_sectors * BDRV_SECTOR_SIZE);
/* Parameter/result bundle passed to bdrv_rw_co_entry() so a synchronous
 * read/write can be executed inside a coroutine. */
typedef struct RwCo {
    BlockDriverState *bs;
/* Coroutine entry point for bdrv_rw_co(): dispatch to the coroutine read
 * or write path and store the result in rwco->ret. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
 * Process a synchronous request using coroutines.
 *
 * Wraps buf in a single-element iovec, then either runs bdrv_rw_co_entry()
 * directly (already in coroutine context) or spawns a coroutine and spins
 * until the result is ready.  Returns the coroutine's result.
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* NOT_DONE is the sentinel initial value of rwco.ret; busy-wait
         * (pumping the event loop) until the coroutine finishes. */
        while (rwco.ret == NOT_DONE) {
/* Synchronous sector read.  Return < 0 if error.  See bdrv_write() for the
 * return codes. */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
/* Number of bits in an unsigned long (assumes 8-bit bytes). */
#define BITS_PER_LONG  (sizeof(unsigned long) * 8)

/* Mark or clear the dirty bitmap for the sector range
 * [sector_num, sector_num + nb_sectors).  The bitmap is kept at
 * BDRV_SECTORS_PER_DIRTY_CHUNK granularity, one bit per chunk, packed into
 * an array of unsigned long. */
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / BITS_PER_LONG;
        bit = start % BITS_PER_LONG;
        val = bs->dirty_bitmap[idx];
        /* Only update (and presumably adjust the dirty counter — elided
         * here) when the bit actually changes state. */
        if (!(val & (1UL << bit))) {
        if (val & (1UL << bit)) {
            val &= ~(1UL << bit);
        bs->dirty_bitmap[idx] = val;
/* Synchronous sector write.  Return < 0 if error.  Important errors are:
   -EIO         generic I/O error (may happen for all errors)
   -ENOMEDIUM   No media inserted.
   -EINVAL      Invalid sector number or nb_sectors
   -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
    /* Cast away const: bdrv_rw_co() takes a uint8_t* but does not modify
     * the buffer on the write path. */
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
/* Byte-granularity synchronous read.  Implemented on top of sector reads:
 * an unaligned head and tail go through a sector-sized bounce buffer, the
 * aligned middle is read directly into the caller's buffer.
 * Returns the number of bytes read on success, < 0 on error. */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;

    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    sector_num = offset >> BDRV_SECTOR_BITS;
    if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
    memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;

    /* add data from the last sector */
    if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
    memcpy(buf, tmp_buf, count);
/* Byte-granularity synchronous write.  Mirror of bdrv_pread(): unaligned
 * head and tail use read-modify-write through a sector bounce buffer, the
 * aligned middle is written directly.
 * Returns the number of bytes written on success, < 0 on error. */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;

    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    sector_num = offset >> BDRV_SECTOR_BITS;
    /* read-modify-write of the partial leading sector */
    if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
    memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
    if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;

    /* read-modify-write of the partial trailing sector */
    if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
    memcpy(tmp_buf, buf, count);
    if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
                     const void *buf, int count)
    ret = bdrv_pwrite(bs, offset, buf, count);

    /* No flush needed for cache modes that already do it */
    if (bs->enable_write_cache) {
/* Copy-on-read helper: read a whole cluster from the backing chain through
 * a bounce buffer, write it into the image file, then copy the requested
 * sub-range to the caller's qiov. */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory. */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file. */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    /* Read the full cluster into the bounce buffer. */
    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,

    /* Prefer the efficient zero-write path when the cluster is all zeroes
     * and the driver supports it. */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
                                      cluster_nb_sectors);
        /* This does not change the data on the disk, it is not necessary
         * to flush even in cache=writethrough mode. */
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,

        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases. */

    /* Copy only the originally-requested window out of the cluster. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

    qemu_vfree(bounce_buffer);
 * Handle a read request in coroutine context.
 *
 * Applies I/O throttling, copy-on-read serialization, and request tracking
 * around the driver's bdrv_co_readv implementation.
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    if (bdrv_check_request(bs, sector_num, nb_sectors)) {

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);

    /* A device-wide copy_on_read setting forces the per-request flag. */
    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;

    /* While any CoR request is in flight, reads must serialize against
     * overlapping requests to keep CoR read+write atomic. */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);

        /* Any unallocated part of the range goes through the CoR path. */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
/* Public coroutine read: trace and delegate with no special flags. */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);

/* Public coroutine read that forces copy-on-read for this request. */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
/* Write zeroes to [sector_num, sector_num + nb_sectors).  Tries the
 * driver's dedicated write-zeroes operation first; if unsupported, falls
 * back to writing an explicitly zeroed bounce buffer. */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
    BlockDriver *drv = bs->drv;

    /* TODO Emulate only part of misaligned requests instead of letting block
     * drivers return -ENOTSUP and emulate everything */

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
        if (ret != -ENOTSUP) {

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
 * Handle a write request in coroutine context.
 *
 * Applies read-only / range checks, I/O throttling, serialization against
 * in-flight copy-on-read, request tracking, optional write-through flush,
 * and dirty-bitmap / high-water-mark bookkeeping.
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    if (bs->read_only) {
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);

    /* Writes must not race with an in-progress copy-on-read of the same
     * cluster(s). */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);

    /* Emulate writethrough: flush after each successful write when the
     * write cache is disabled. */
    if (ret == 0 && !bs->enable_write_cache) {
        ret = bdrv_co_flush(bs);

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);

    /* Track the highest sector ever written (wr_highest_offset stat). */
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;

    tracked_request_end(&req);
/* Public coroutine write: trace and delegate with no special flags. */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);

/* Public coroutine zero-write: qiov is NULL because the data is implicit. */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors)
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
 * Truncate file to 'offset' bytes (needed only for file protocols).
 * Refuses devices that are in use, then updates the cached total-sector
 * count and notifies the attached device of the resize.
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
    BlockDriver *drv = bs->drv;

    if (!drv->bdrv_truncate)
    if (bdrv_in_use(bs))
    ret = drv->bdrv_truncate(bs, offset);
    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    bdrv_dev_resize_cb(bs);
 * Length of an allocated file in bytes.  Sparse files are counted by actual
 * allocated space.  Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    /* Fall through to the underlying protocol file. */
    return bdrv_get_allocated_file_size(bs->file);

/**
 * Length of a file in bytes.  Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
    BlockDriver *drv = bs->drv;

    /* Growable or removable media may change size behind our back, so ask
     * the driver; otherwise use the cached sector count. */
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
    return bs->total_sectors * BDRV_SECTOR_SIZE;
/* Store the device size in sectors into *nb_sectors_ptr.
 * Returns 0 as number of sectors if no device present or error. */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
    length = bdrv_getlength(bs);
    length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
    /* One 16-byte MBR partition table entry (little-endian on disk). */
    uint8_t boot_ind;           /* 0x80 - active */
    uint8_t head;               /* starting head */
    uint8_t sector;             /* starting sector */
    uint8_t cyl;                /* starting cylinder */
    uint8_t sys_ind;            /* What partition type */
    uint8_t end_head;           /* end head */
    uint8_t end_sector;         /* end sector */
    uint8_t end_cyl;            /* end cylinder */
    uint32_t start_sect;        /* starting sector counting from 0 */
    uint32_t nr_sects;          /* nr of sectors in partition */
/* Try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess. */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint64_t nb_sectors;

    bdrv_get_geometry(bs, &nb_sectors);

    /**
     * The function will be invoked during startup not only in sync I/O mode,
     * but also in async I/O mode.  So the I/O throttling function has to
     * be disabled temporarily here, not permanently.
     */
    enabled = bs->io_limits_enabled;
    bs->io_limits_enabled = false;
    ret = bdrv_read(bs, 0, buf, 1);
    bs->io_limits_enabled = enabled;

    /* test msdos magic (0x55AA signature at the end of the boot sector) */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
    for(i = 0; i < 4; i++) {
        /* 0x1be is the fixed offset of the MBR partition table. */
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;   /* low 6 bits hold the sector */
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
            *psectors = sectors;
            *pcylinders = cylinders;
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
/* Fill *pcyls/*pheads/*psecs with a CHS geometry for the device: use a
 * stored geometry hint if present, otherwise guess from the partition
 * table, otherwise fall back to a standard physical geometry.  Also picks
 * an appropriate BIOS translation mode and records the result as the new
 * hint. */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
    if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
        /* if heads > 16, it means that a BIOS LBA
           translation was active, so the default
           hardware geometry is OK */
        goto default_geometry;
        /* disable any translation to be in sync with
           the logical geometry */
        if (translation == BIOS_ATA_TRANSLATION_AUTO) {
            bdrv_set_translation_hint(bs,
                                      BIOS_ATA_TRANSLATION_NONE);
    /* if no geometry, use a standard physical disk geometry */
    cylinders = nb_sectors / (16 * 63);

    /* Clamp cylinders to the ATA-addressable range [2, 16383]. */
    if (cylinders > 16383)
    else if (cylinders < 2)
    if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
        /* 131072 = 1024 cyl * 128 head boundary for LARGE translation */
        if ((*pcyls * *pheads) <= 131072) {
            bdrv_set_translation_hint(bs,
                                      BIOS_ATA_TRANSLATION_LARGE);
            bdrv_set_translation_hint(bs,
                                      BIOS_ATA_TRANSLATION_LBA);
    bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
/* Record a CHS geometry hint on the device state. */
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)

/* Record the BIOS ATA translation mode hint. */
void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
    bs->translation = translation;

/* Read back the stored CHS geometry hint. */
void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
    *pheads = bs->heads;

/* throttling disk io limits: install the limits and (re)compute whether
 * throttling is active for this device. */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
/* Recognize floppy formats */
typedef struct FDFormat {

/* Table of known floppy geometries, matched against the image size in
 * bdrv_get_floppy_geometry_hint().  Terminated by FDRIVE_DRV_NONE. */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
    { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
    { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120,  9, 40, 0, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
    { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, FDRIVE_RATE_250K, },
    { FDRIVE_DRV_120,  8, 40, 0, FDRIVE_RATE_250K, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, FDRIVE_RATE_250K, },
    /* end of table sentinel */
    { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
/* Determine floppy geometry: honor a user-supplied hint if complete,
 * otherwise pick the fd_formats[] entry whose total size matches the image,
 * preferring entries for the requested drive type (drive_in). */
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive,
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
        *rate = FDRIVE_RATE_500K;
        bdrv_get_geometry(bs, &nb_sectors);
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                if (nb_sectors == size) {
                /* Remember the first size-agnostic candidate as fallback. */
                if (first_match == -1) {
        if (first_match == -1) {
            match = first_match;
        parse = &fd_formats[match];

    /* max_head is the highest head index, hence the +1 for the count. */
    *nb_heads = parse->max_head + 1;
    *max_track = parse->max_track;
    *last_sect = parse->last_sect;
    *drive = parse->drive;
    *rate = parse->rate;
/* Simple accessors for per-device state. */

int bdrv_get_translation_hint(BlockDriverState *bs)
    return bs->translation;

/* Configure the action taken on read/write errors (report/ignore/stop). */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
    return is_read ? bs->on_read_error : bs->on_write_error;

int bdrv_is_read_only(BlockDriverState *bs)
    return bs->read_only;

int bdrv_is_sg(BlockDriverState *bs)

int bdrv_enable_write_cache(BlockDriverState *bs)
    return bs->enable_write_cache;

void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
    bs->enable_write_cache = wce;
/* True if this image or its backing file is encrypted. */
int bdrv_is_encrypted(BlockDriverState *bs)
    if (bs->backing_hd && bs->backing_hd->encrypted)
    return bs->encrypted;

/* True if a key still needs to be supplied for this image or its backing
 * file before I/O can proceed. */
int bdrv_key_required(BlockDriverState *bs)
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
    return (bs->encrypted && !bs->valid_key);

/* Supply the encryption key; recurses into the backing file first.
 * On first successful key set, fires the deferred media-change callback. */
int bdrv_set_key(BlockDriverState *bs, const char *key)
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
    if (!bs->encrypted) {
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
    ret = bs->drv->bdrv_set_key(bs, key);
    } else if (!bs->valid_key) {
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
/* Copy the format driver's name into buf (truncated to buf_size). */
void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
    pstrcpy(buf, buf_size, bs->drv->format_name);

/* Invoke it(opaque, name) for every registered block driver format. */
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
/* Look up a device by name in the global bdrv_states list; NULL if absent. */
BlockDriverState *bdrv_find(const char *name)
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {

/* Iterate the global device list: NULL argument yields the first device,
 * otherwise the successor of bs. */
BlockDriverState *bdrv_next(BlockDriverState *bs)
    return QTAILQ_FIRST(&bdrv_states);
    return QTAILQ_NEXT(bs, list);

/* Invoke it(opaque, bs) for every registered device. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {

const char *bdrv_get_device_name(BlockDriverState *bs)
    return bs->device_name;

int bdrv_get_flags(BlockDriverState *bs)
    return bs->open_flags;

/* Flush every registered device. */
void bdrv_flush_all(void)
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {

/* True if freshly allocated blocks read back as zeroes for this format. */
int bdrv_has_zero_init(BlockDriverState *bs)
    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
/* Parameter/result bundle for the synchronous bdrv_is_allocated() wrapper
 * around the coroutine-based bdrv_co_is_allocated(). */
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
} BdrvCoIsAllocatedData;
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
    if (sector_num >= bs->total_sectors) {

    /* Clamp the query to the end of the image. */
    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {

    /* Drivers without the hook have no backing files: report allocated. */
    if (!bs->drv->bdrv_co_is_allocated) {

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,

/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
    BdrvCoIsAllocatedData data = {
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    /* Pump the event loop until the coroutine sets data.done. */
    while (!data.done) {
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if the given sector is allocated in any image between
 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * sector is allocated in any image of the chain.  Return false otherwise.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 */
int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
                                            BlockDriverState *base,
                                            int nb_sectors, int *pnum)
    BlockDriverState *intermediate;
    int ret, n = nb_sectors;

    /* Walk the backing chain from top down to (but not including) base. */
    while (intermediate && intermediate != base) {
        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
        /*
         * [sector_num, nb_sectors] is unallocated on top but intermediate
         * might have
         * [sector_num+x, nr_sectors] allocated.
         */
        if (n > pnum_inter) {

        intermediate = intermediate->backing_hd;
/* QMP 'query-block': build a BlockInfoList describing every registered
 * device (medium state, I/O status, inserted-image details, throttling). */
BlockInfoList *qmp_query_block(Error **errp)
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;

        /* Details of the inserted image (only when a medium is present). */
        info->value->has_inserted = true;
        info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
        info->value->inserted->file = g_strdup(bs->filename);
        info->value->inserted->ro = bs->read_only;
        info->value->inserted->drv = g_strdup(bs->drv->format_name);
        info->value->inserted->encrypted = bs->encrypted;
        if (bs->backing_file[0]) {
            info->value->inserted->has_backing_file = true;
            info->value->inserted->backing_file = g_strdup(bs->backing_file);

        if (bs->io_limits_enabled) {
            info->value->inserted->bps =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
            info->value->inserted->bps_rd =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
            info->value->inserted->bps_wr =
                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
            info->value->inserted->iops =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
            info->value->inserted->iops_rd =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
            info->value->inserted->iops_wr =
                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];

        /* XXX: waiting for the qapi to support GSList */
        head = cur_item = info;
            cur_item->next = info;
/* Build the BlockStats for one device, recursing into bs->file as the
 * 'parent' stats.  Consider exposing this as a full fledged QMP command. */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    /* Stats of the underlying protocol file, if any. */
    s->has_parent = true;
    s->parent = qmp_query_blockstat(bs->file, NULL);
/* QMP 'query-blockstats': collect per-device statistics for every
 * registered device. */
BlockStatsList *qmp_query_blockstats(Error **errp)
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        head = cur_item = info;
            cur_item->next = info;
/* Return the filename of whichever image in the chain is encrypted
 * (backing file first), or fall through for unencrypted devices. */
const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;

/* Copy the backing file name into the caller's buffer (truncating). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
    pstrcpy(filename, filename_size, bs->backing_file);
/* Write sectors with format-level compression (used by qemu-img convert).
 * Fails with -ENOTSUP when the driver lacks the operation. */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
    BlockDriver *drv = bs->drv;

    if (!drv->bdrv_write_compressed)
    if (bdrv_check_request(bs, sector_num, nb_sectors))

    /* Compressed writes still dirty the migration bitmap. */
    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
/* Fill *bdi with driver-reported image information (cluster size etc.). */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    BlockDriver *drv = bs->drv;

    if (!drv->bdrv_get_info)
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);

/* Save VM state data at 'pos'; falls through to the protocol file when the
 * format driver does not handle vmstate itself. */
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    return bdrv_save_vmstate(bs->file, buf, pos, size);

/* Counterpart of bdrv_save_vmstate() for loading. */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    return bdrv_load_vmstate(bs->file, buf, pos, size);

/* Forward a blkdebug event to the driver; no-op when unsupported. */
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {

    return drv->bdrv_debug_event(bs, event);
/**************************************************************/
/* handling of snapshots */

/* True if the device can take internal snapshots: needs a driver, an
 * inserted writable medium, and snapshot support either in the format
 * driver or in the underlying protocol file. */
int bdrv_can_snapshot(BlockDriverState *bs)
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);

/* True if the device was opened with BDRV_O_SNAPSHOT (temporary snapshot). */
int bdrv_is_snapshot(BlockDriverState *bs)
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);

/* Return a snapshot-capable device, caching the result in bs_snapshots. */
BlockDriverState *bdrv_snapshots(void)
    BlockDriverState *bs;

    return bs_snapshots;

    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
/* Create an internal snapshot; delegates to the format driver or, failing
 * that, to the protocol file. */
int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
    BlockDriver *drv = bs->drv;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
        return bdrv_snapshot_create(bs->file, sn_info);

/* Revert to a snapshot.  When only the protocol file supports snapshots,
 * the format driver is closed around the operation and reopened after,
 * since the underlying data changes beneath it. */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
            /* Reopen failed: the device is unusable, drop the file. */
            bdrv_delete(bs->file);

/* Delete an internal snapshot (format driver first, then protocol file). */
int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
    BlockDriver *drv = bs->drv;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
        return bdrv_snapshot_delete(bs->file, snapshot_id);

/* List snapshots into *psn_info (format driver first, then protocol file). */
int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
    BlockDriver *drv = bs->drv;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
        return bdrv_snapshot_list(bs->file, psn_info);

/* Load a snapshot for read-only access (requires a read-only device). */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
                           const char *snapshot_name)
    BlockDriver *drv = bs->drv;

    if (!bs->read_only) {

    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
/* Walk the backing chain below bs looking for the image whose backing_file
 * name equals backing_file; NULL when not found. */
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
                                          const char *backing_file)
    if (bs->backing_hd) {
        if (strcmp(bs->backing_file, backing_file) == 0) {
            return bs->backing_hd;
            return bdrv_find_backing_image(bs->backing_hd, backing_file);
2980 #define NB_SUFFIXES 4

/* Format @size into @buf as a human-readable string using the K/M/G/T
 * suffixes; small values are printed as a plain decimal, values under
 * ten units get one decimal place, larger ones are rounded to an
 * integer.  Returns @buf.
 * NOTE(review): the loop's base-scaling and return lines are missing
 * from this extract. */
2982 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2984 static const char suffixes[NB_SUFFIXES] = "KMGT";
2989 snprintf(buf, buf_size, "%" PRId64, size);
2992 for(i = 0; i < NB_SUFFIXES; i++) {
2993 if (size < (10 * base)) {
/* below ten units of this suffix: show one decimal place */
2994 snprintf(buf, buf_size, "%0.1f%c",
2995 (double)size / base,
2998 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
/* (base >> 1) rounds to the nearest whole unit */
2999 snprintf(buf, buf_size, "%" PRId64 "%c",
3000 ((size + (base >> 1)) / base),

/* Render one snapshot row for "info snapshots" into @buf; the header
 * row (ID/TAG/...) is emitted by one branch — presumably when sn is
 * NULL, but the selecting condition is on a missing line. */
3010 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3012 char buf1[128], date_buf[128], clock_buf[128];
3022 snprintf(buf, buf_size,
3023 "%-10s%-20s%7s%20s%15s",
3024 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
/* non-reentrant localtime() variant — likely a platform #ifdef branch;
 * TODO(review): confirm against the missing preprocessor lines */
3028 ptm = localtime(&ti);
3029 strftime(date_buf, sizeof(date_buf),
3030 "%Y-%m-%d %H:%M:%S", ptm);
/* thread-safe localtime_r() variant */
3032 localtime_r(&ti, &tm);
3033 strftime(date_buf, sizeof(date_buf),
3034 "%Y-%m-%d %H:%M:%S", &tm);
/* vm_clock_nsec is nanoseconds; break it into h:m:s.ms */
3036 secs = sn->vm_clock_nsec / 1000000000;
3037 snprintf(clock_buf, sizeof(clock_buf),
3038 "%02d:%02d:%02d.%03d",
3040 (int)((secs / 60) % 60),
3042 (int)((sn->vm_clock_nsec / 1000000) % 1000));
3043 snprintf(buf, buf_size,
3044 "%-10s%-20s%7s%20s%15s",
3045 sn->id_str, sn->name,
3046 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3053 /**************************************************************/

/* Public asynchronous vectored read: trace and hand off to the common
 * coroutine-backed path (read direction; the direction argument is on
 * the missing continuation line). */
3056 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
3057 QEMUIOVector *qiov, int nb_sectors,
3058 BlockDriverCompletionFunc *cb, void *opaque)
3060 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3062 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,

/* Public asynchronous vectored write: same shape as bdrv_aio_readv but
 * write direction. */
3066 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3067 QEMUIOVector *qiov, int nb_sectors,
3068 BlockDriverCompletionFunc *cb, void *opaque)
3070 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3072 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
/* Shared book-keeping for one bdrv_aio_multiwrite() batch: per-request
 * user callbacks plus a merged qiov that must be freed on completion.
 * NOTE(review): the num_requests/num_callbacks/error members referenced
 * by the functions below are on lines missing from this extract. */
3077 typedef struct MultiwriteCB {
3082 BlockDriverCompletionFunc *cb;
3084 QEMUIOVector *free_qiov;

/* Invoke every caller-supplied completion callback with the batch's
 * accumulated error, destroying and freeing any qiov that was allocated
 * when requests were merged. */
3088 static void multiwrite_user_cb(MultiwriteCB *mcb)
3092 for (i = 0; i < mcb->num_callbacks; i++) {
3093 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
3094 if (mcb->callbacks[i].free_qiov) {
3095 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3097 g_free(mcb->callbacks[i].free_qiov);

/* AIO completion for one (possibly merged) request: latch the first
 * error, and when the last outstanding request finishes, deliver all
 * user callbacks. */
3101 static void multiwrite_cb(void *opaque, int ret)
3103 MultiwriteCB *mcb = opaque;
3105 trace_multiwrite_cb(mcb, ret);
3107 if (ret < 0 && !mcb->error) {
3111 mcb->num_requests--;
3112 if (mcb->num_requests == 0) {
3113 multiwrite_user_cb(mcb);

/* qsort() comparator ordering BlockRequests by start sector. */
3118 static int multiwrite_req_compare(const void *a, const void *b)
3120 const BlockRequest *req1 = a, *req2 = b;
3123 * Note that we can't simply subtract req2->sector from req1->sector
3124 * here as that could overflow the return value.
3126 if (req1->sector > req2->sector) {
3128 } else if (req1->sector < req2->sector) {
3136 * Takes a bunch of requests and tries to merge them. Returns the number of
3137 * requests that remain after merging.
3139 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3140 int num_reqs, MultiwriteCB *mcb)
3144 // Sort requests by start sector
3145 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3147 // Check if adjacent requests touch the same clusters. If so, combine them,
3148 // filling up gaps with zero sectors.
3150 for (i = 1; i < num_reqs; i++) {
// one-past-the-end sector of the current merge candidate
3152 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3154 // Handle exactly sequential writes and overlapping writes.
3155 if (reqs[i].sector <= oldreq_last) {
// refuse to merge when the combined iovec would exceed IOV_MAX
3159 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3165 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3166 qemu_iovec_init(qiov,
3167 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3169 // Add the first request to the merged one. If the requests are
3170 // overlapping, drop the last sectors of the first request.
3171 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3172 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3174 // We shouldn't need to add any zeros between the two requests
3175 assert (reqs[i].sector <= oldreq_last);
3177 // Add the second request
3178 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3180 reqs[outidx].nb_sectors = qiov->size >> 9;
3181 reqs[outidx].qiov = qiov;
// remember the merged qiov so multiwrite_user_cb() can free it later
3183 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
// not mergeable: this request becomes the new merge candidate
3186 reqs[outidx].sector = reqs[i].sector;
3187 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3188 reqs[outidx].qiov = reqs[i].qiov;
3196 * Submit multiple AIO write requests at once.
3198 * On success, the function returns 0 and all requests in the reqs array have
3199 * been submitted. In error case this function returns -1, and any of the
3200 * requests may or may not be submitted yet. In particular, this means that the
3201 * callback will be called for some of the requests, for others it won't. The
3202 * caller must check the error field of the BlockRequest to wait for the right
3203 * callbacks (if error != 0, no callback will be called).
3205 * The implementation may modify the contents of the reqs array, e.g. to merge
3206 * requests. However, the fields opaque and error are left unmodified as they
3207 * are used to signal failure for a single request to the caller.
3209 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3214 /* don't submit writes if we don't have a medium */
3215 if (bs->drv == NULL) {
3216 for (i = 0; i < num_reqs; i++) {
3217 reqs[i].error = -ENOMEDIUM;
3222 if (num_reqs == 0) {
3226 // Create MultiwriteCB structure
3227 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3228 mcb->num_requests = 0;
3229 mcb->num_callbacks = num_reqs;
/* capture the per-request callbacks before merging rearranges reqs[] */
3231 for (i = 0; i < num_reqs; i++) {
3232 mcb->callbacks[i].cb = reqs[i].cb;
3233 mcb->callbacks[i].opaque = reqs[i].opaque;
3236 // Check for mergeable requests
3237 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3239 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3241 /* Run the aio requests. */
3242 mcb->num_requests = num_reqs;
3243 for (i = 0; i < num_reqs; i++) {
3244 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3245 reqs[i].nb_sectors, multiwrite_cb, mcb);
/* Cancel an in-flight AIO request via its pool's cancel hook; the hook
 * is responsible for completing/releasing the AIOCB. */
3251 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3253 acb->pool->cancel(acb);
3256 /* block I/O throttling */

/* Decide whether this request would exceed the configured
 * bytes-per-second limit within the current accounting slice.  Returns
 * true and stores a suggested wait time in *wait when the request must
 * be throttled; the early "no limit configured" return is on a missing
 * line. */
3257 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3258 bool is_write, double elapsed_time, uint64_t *wait)
3260 uint64_t bps_limit = 0;
3261 double bytes_limit, bytes_base, bytes_res;
3262 double slice_time, wait_time;
/* the total (read+write) limit takes precedence over per-direction */
3264 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3265 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3266 } else if (bs->io_limits.bps[is_write]) {
3267 bps_limit = bs->io_limits.bps[is_write];
/* budget for this slice = limit (bytes/s) * slice length (s) */
3276 slice_time = bs->slice_end - bs->slice_start;
3277 slice_time /= (NANOSECONDS_PER_SECOND);
3278 bytes_limit = bps_limit * slice_time;
3279 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3280 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3281 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3284 /* bytes_base: the bytes of data which have been read/written; and
3285 * it is obtained from the history statistic info.
3286 * bytes_res: the remaining bytes of data which need to be read/written.
3287 * (bytes_base + bytes_res) / bps_limit: used to calculate
3288 * the total time for completing reading/writing all data.
3290 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3292 if (bytes_base + bytes_res <= bytes_limit) {
3300 /* Calc approx time to dispatch */
3301 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3303 /* When the I/O rate at runtime exceeds the limits,
3304 * bs->slice_end need to be extended in order that the current statistic
3305 * info can be kept until the timer fire, so it is increased and tuned
3306 * based on the result of experiment.
3308 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3309 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3311 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
/* IOPS analogue of bdrv_exceed_bps_limits(): each request counts as one
 * operation.  Returns true and a suggested wait time in *wait when the
 * iops limit for the current slice would be exceeded. */
3317 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3318 double elapsed_time, uint64_t *wait)
3320 uint64_t iops_limit = 0;
3321 double ios_limit, ios_base;
3322 double slice_time, wait_time;
/* the total (read+write) limit takes precedence over per-direction */
3324 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3325 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3326 } else if (bs->io_limits.iops[is_write]) {
3327 iops_limit = bs->io_limits.iops[is_write];
3336 slice_time = bs->slice_end - bs->slice_start;
3337 slice_time /= (NANOSECONDS_PER_SECOND);
3338 ios_limit = iops_limit * slice_time;
3339 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3340 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3341 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
/* the +1 accounts for the request being decided on right now */
3344 if (ios_base + 1 <= ios_limit) {
3352 /* Calc approx time to dispatch */
3353 wait_time = (ios_base + 1) / iops_limit;
3354 if (wait_time > elapsed_time) {
3355 wait_time = wait_time - elapsed_time;
/* extend the slice so current statistics survive until the timer fires
 * (same empirically tuned factors as the bps path) */
3360 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3361 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3363 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
/* Combine the bps and iops throttling checks for one request.  Manages
 * the accounting slice: extends it while active, or starts a fresh one
 * (snapshotting the byte/op counters into io_base).  On throttle,
 * reports the larger of the two computed wait times through *wait. */
3369 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3370 bool is_write, int64_t *wait)
3372 int64_t now, max_wait;
3373 uint64_t bps_wait = 0, iops_wait = 0;
3374 double elapsed_time;
3375 int bps_ret, iops_ret;
3377 now = qemu_get_clock_ns(vm_clock);
3378 if ((bs->slice_start < now)
3379 && (bs->slice_end > now)) {
/* still inside the current slice: just push its end out */
3380 bs->slice_end = now + bs->slice_time;
/* slice expired: open a fresh one and snapshot the I/O counters */
3382 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3383 bs->slice_start = now;
3384 bs->slice_end = now + bs->slice_time;
3386 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3387 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3389 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3390 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3393 elapsed_time = now - bs->slice_start;
3394 elapsed_time /= (NANOSECONDS_PER_SECOND);
3396 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3397 is_write, elapsed_time, &bps_wait);
3398 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3399 elapsed_time, &iops_wait);
3400 if (bps_ret || iops_ret) {
3401 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
/* make sure the slice lasts at least until the wait has elapsed */
3406 now = qemu_get_clock_ns(vm_clock);
3407 if (bs->slice_end < now + max_wait) {
3408 bs->slice_end = now + max_wait;
3421 /**************************************************************/
3422 /* async block device emulation */

/* AIOCB used to emulate AIO on top of a driver's synchronous
 * bdrv_read/bdrv_write, completing through a bottom half.
 * NOTE(review): the bh/ret/bounce/qiov/is_write members used below are
 * on lines missing from this extract. */
3424 typedef struct BlockDriverAIOCBSync {
3425 BlockDriverAIOCB common;
3428 /* vector translation state */
3432 } BlockDriverAIOCBSync;

/* Cancel hook for the sync-emulation pool: the synchronous I/O already
 * ran at submit time, so just drop the pending bottom half and release
 * the AIOCB without calling the user callback. */
3434 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3436 BlockDriverAIOCBSync *acb =
3437 container_of(blockacb, BlockDriverAIOCBSync, common);
3438 qemu_bh_delete(acb->bh);
3440 qemu_aio_release(acb);

/* AIOCB pool for the synchronous-emulation path. */
3443 static AIOPool bdrv_em_aio_pool = {
3444 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3445 .cancel = bdrv_aio_cancel_em,

/* Bottom half: scatter the bounce buffer back into the qiov (read
 * path), free it, invoke the user callback and release the AIOCB. */
3448 static void bdrv_aio_bh_cb(void *opaque)
3450 BlockDriverAIOCBSync *acb = opaque;
3453 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3454 qemu_vfree(acb->bounce);
3455 acb->common.cb(acb->common.opaque, acb->ret);
3456 qemu_bh_delete(acb->bh);
3458 qemu_aio_release(acb);

/* Emulate vectored AIO using the driver's synchronous interface: do the
 * I/O immediately through an aligned bounce buffer, then schedule a BH
 * so completion is still delivered asynchronously. */
3461 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3465 BlockDriverCompletionFunc *cb,
3470 BlockDriverAIOCBSync *acb;
3472 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3473 acb->is_write = is_write;
3475 acb->bounce = qemu_blockalign(bs, qiov->size);
3476 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
/* write path: gather the qiov into the bounce buffer first */
3479 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3480 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3482 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3485 qemu_bh_schedule(acb->bh);
3487 return &acb->common;

/* Sync-emulated AIO read entry point (is_write = 0). */
3490 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3491 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3492 BlockDriverCompletionFunc *cb, void *opaque)
3494 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);

/* Sync-emulated AIO write entry point (is_write = 1). */
3497 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3498 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3499 BlockDriverCompletionFunc *cb, void *opaque)
3501 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
/* AIOCB used to emulate AIO on top of the coroutine interface.
 * NOTE(review): the req/bh/is_write members used below are on lines
 * missing from this extract. */
3505 typedef struct BlockDriverAIOCBCoroutine {
3506 BlockDriverAIOCB common;
3510 } BlockDriverAIOCBCoroutine;

/* Cancel hook for coroutine-emulated AIO.
 * NOTE(review): the body is not visible in this extract. */
3512 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)

/* AIOCB pool for the coroutine-emulation path. */
3517 static AIOPool bdrv_em_co_aio_pool = {
3518 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3519 .cancel = bdrv_aio_co_cancel_em,

/* Bottom half that delivers the coroutine request's result to the user
 * callback and releases the AIOCB. */
3522 static void bdrv_co_em_bh(void *opaque)
3524 BlockDriverAIOCBCoroutine *acb = opaque;
3526 acb->common.cb(acb->common.opaque, acb->req.error);
3527 qemu_bh_delete(acb->bh);
3528 qemu_aio_release(acb);

3531 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3532 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3534 BlockDriverAIOCBCoroutine *acb = opaque;
3535 BlockDriverState *bs = acb->common.bs;
3537 if (!acb->is_write) {
3538 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3539 acb->req.nb_sectors, acb->req.qiov, 0);
3541 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3542 acb->req.nb_sectors, acb->req.qiov, 0);
/* completion is always delivered via BH, never re-entrantly */
3545 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3546 qemu_bh_schedule(acb->bh);

/* Common AIO entry point: package the request into an AIOCB and start a
 * coroutine running bdrv_co_do_rw(). */
3549 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3553 BlockDriverCompletionFunc *cb,
3558 BlockDriverAIOCBCoroutine *acb;
3560 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3561 acb->req.sector = sector_num;
3562 acb->req.nb_sectors = nb_sectors;
3563 acb->req.qiov = qiov;
3564 acb->is_write = is_write;
3566 co = qemu_coroutine_create(bdrv_co_do_rw);
3567 qemu_coroutine_enter(co, acb);
3569 return &acb->common;

/* Coroutine body for bdrv_aio_flush(). */
3572 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3574 BlockDriverAIOCBCoroutine *acb = opaque;
3575 BlockDriverState *bs = acb->common.bs;
3577 acb->req.error = bdrv_co_flush(bs);
3578 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3579 qemu_bh_schedule(acb->bh);

/* Asynchronous flush: run bdrv_co_flush() in a coroutine and complete
 * through the common BH path. */
3582 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3583 BlockDriverCompletionFunc *cb, void *opaque)
3585 trace_bdrv_aio_flush(bs, opaque);
3588 BlockDriverAIOCBCoroutine *acb;
3590 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3591 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3592 qemu_coroutine_enter(co, acb);
3594 return &acb->common;

/* Coroutine body for bdrv_aio_discard(). */
3597 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3599 BlockDriverAIOCBCoroutine *acb = opaque;
3600 BlockDriverState *bs = acb->common.bs;
3602 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3603 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3604 qemu_bh_schedule(acb->bh);

/* Asynchronous discard of nb_sectors starting at sector_num,
 * implemented over bdrv_co_discard(). */
3607 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3608 int64_t sector_num, int nb_sectors,
3609 BlockDriverCompletionFunc *cb, void *opaque)
3612 BlockDriverAIOCBCoroutine *acb;
3614 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3616 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3617 acb->req.sector = sector_num;
3618 acb->req.nb_sectors = nb_sectors;
3619 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3620 qemu_coroutine_enter(co, acb);
3622 return &acb->common;
/* Register all built-in block drivers (module init hook). */
3625 void bdrv_init(void)
3627 module_call_init(MODULE_INIT_BLOCK);

/* Like bdrv_init(), but restricts format probing to the configured
 * whitelist.  NOTE(review): the call into bdrv_init() is on a line
 * missing from this extract. */
3630 void bdrv_init_with_whitelist(void)
3632 use_bdrv_whitelist = 1;

/* Allocate an AIOCB from @pool's free list (or zeroed from the heap)
 * and initialise the common completion fields. */
3636 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3637 BlockDriverCompletionFunc *cb, void *opaque)
3639 BlockDriverAIOCB *acb;
3641 if (pool->free_aiocb) {
/* reuse a previously released AIOCB */
3642 acb = pool->free_aiocb;
3643 pool->free_aiocb = acb->next;
3645 acb = g_malloc0(pool->aiocb_size);
3650 acb->opaque = opaque;

/* Return an AIOCB to its pool's free list — pooled AIOCBs are never
 * handed back to the heap. */
3654 void qemu_aio_release(void *p)
3656 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3657 AIOPool *pool = acb->pool;
3658 acb->next = pool->free_aiocb;
3659 pool->free_aiocb = acb;
3662 /**************************************************************/
3663 /* Coroutine block device emulation */

/* Completion token: lets a coroutine wait for a callback-style AIO
 * request; the callback records the result and re-enters the
 * coroutine. */
3665 typedef struct CoroutineIOCompletion {
3666 Coroutine *coroutine;
3668 } CoroutineIOCompletion;

/* AIO completion callback: store @ret (assignment on a missing line)
 * and wake the waiting coroutine. */
3670 static void bdrv_co_io_em_complete(void *opaque, int ret)
3672 CoroutineIOCompletion *co = opaque;
3675 qemu_coroutine_enter(co->coroutine, NULL);

/* Emulate coroutine I/O on top of the driver's callback-based AIO
 * interface: submit, yield until bdrv_co_io_em_complete() re-enters us,
 * then return the stored result (return statement on a missing line). */
3678 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3679 int nb_sectors, QEMUIOVector *iov,
3682 CoroutineIOCompletion co = {
3683 .coroutine = qemu_coroutine_self(),
3685 BlockDriverAIOCB *acb;
3688 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3689 bdrv_co_io_em_complete, &co);
3691 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3692 bdrv_co_io_em_complete, &co);
3695 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3699 qemu_coroutine_yield();

/* Coroutine read emulated over the AIO interface. */
3704 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3705 int64_t sector_num, int nb_sectors,
3708 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);

/* Coroutine write emulated over the AIO interface. */
3711 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3712 int64_t sector_num, int nb_sectors,
3715 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);

/* Coroutine trampoline used by the synchronous bdrv_flush(). */
3718 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3720 RwCo *rwco = opaque;
3722 rwco->ret = bdrv_co_flush(rwco->bs);

/* Flush cached data for @bs to stable storage.  Order: flush the guest
 * view to the OS first, honour BDRV_O_NO_FLUSH (cache=unsafe), then
 * issue the disk flush through whichever interface the driver offers
 * (coroutine, AIO bridge, or none), and finally recurse into the
 * protocol layer (bs->file). */
3725 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3729 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3733 /* Write back cached data to the OS even with cache=unsafe */
3734 if (bs->drv->bdrv_co_flush_to_os) {
3735 ret = bs->drv->bdrv_co_flush_to_os(bs);
3741 /* But don't actually force it to the disk with cache=unsafe */
3742 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3746 if (bs->drv->bdrv_co_flush_to_disk) {
3747 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3748 } else if (bs->drv->bdrv_aio_flush) {
/* callback-based driver: bridge to coroutine via a completion token */
3749 BlockDriverAIOCB *acb;
3750 CoroutineIOCompletion co = {
3751 .coroutine = qemu_coroutine_self(),
3754 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3758 qemu_coroutine_yield();
3763 * Some block drivers always operate in either writethrough or unsafe
3764 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3765 * know how the server works (because the behaviour is hardcoded or
3766 * depends on server-side configuration), so we can't ensure that
3767 * everything is safe on disk. Returning an error doesn't work because
3768 * that would break guests even if the server operates in writethrough
3771 * Let's hope the user knows what he's doing.
3779 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3780 * in the case of cache=unsafe, so there are no useless flushes.
3782 return bdrv_co_flush(bs->file);
/* Ask the driver to drop any cached data/metadata so it is re-read from
 * the image (used e.g. after incoming migration). */
3785 void bdrv_invalidate_cache(BlockDriverState *bs)
3787 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3788 bs->drv->bdrv_invalidate_cache(bs);

/* Invalidate caches on every registered BlockDriverState. */
3792 void bdrv_invalidate_cache_all(void)
3794 BlockDriverState *bs;
3796 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3797 bdrv_invalidate_cache(bs);

/* Clear BDRV_O_INCOMING on every device once incoming migration has
 * finished. */
3801 void bdrv_clear_incoming_migration_all(void)
3803 BlockDriverState *bs;
3805 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3806 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);

/* Synchronous flush wrapper around bdrv_co_flush(): run inline when
 * already in coroutine context, otherwise spawn a coroutine and poll
 * the main loop until rwco.ret leaves NOT_DONE. */
3810 int bdrv_flush(BlockDriverState *bs)
3818 if (qemu_in_coroutine()) {
3819 /* Fast-path if already in coroutine context */
3820 bdrv_flush_co_entry(&rwco);
3822 co = qemu_coroutine_create(bdrv_flush_co_entry);
3823 qemu_coroutine_enter(co, &rwco);
3824 while (rwco.ret == NOT_DONE) {

/* Coroutine trampoline used by the synchronous bdrv_discard(). */
3832 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3834 RwCo *rwco = opaque;
3836 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);

/* Discard (TRIM/UNMAP) a sector range: validate the request, reject
 * read-only devices, then use whichever discard interface the driver
 * implements.  NOTE(review): the final return for drivers with neither
 * interface is on a missing line — presumably a no-op success, since
 * discard is advisory; confirm against the full source. */
3839 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3844 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3846 } else if (bs->read_only) {
3848 } else if (bs->drv->bdrv_co_discard) {
3849 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3850 } else if (bs->drv->bdrv_aio_discard) {
/* callback-based driver: bridge to coroutine via a completion token */
3851 BlockDriverAIOCB *acb;
3852 CoroutineIOCompletion co = {
3853 .coroutine = qemu_coroutine_self(),
3856 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3857 bdrv_co_io_em_complete, &co);
3861 qemu_coroutine_yield();

/* Synchronous discard wrapper — same coroutine/poll pattern as
 * bdrv_flush(). */
3869 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3874 .sector_num = sector_num,
3875 .nb_sectors = nb_sectors,
3879 if (qemu_in_coroutine()) {
3880 /* Fast-path if already in coroutine context */
3881 bdrv_discard_co_entry(&rwco);
3883 co = qemu_coroutine_create(bdrv_discard_co_entry);
3884 qemu_coroutine_enter(co, &rwco);
3885 while (rwco.ret == NOT_DONE) {
3893 /**************************************************************/
3894 /* removable device support */

3897 * Return TRUE if the media is present
3899 int bdrv_is_inserted(BlockDriverState *bs)
3901 BlockDriver *drv = bs->drv;
/* no driver hook: the result for that case is on a missing line
 * (presumably "inserted" whenever a driver is attached — confirm) */
3905 if (!drv->bdrv_is_inserted)
3907 return drv->bdrv_is_inserted(bs);

3911 * Return whether the media changed since the last call to this
3912 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3914 int bdrv_media_changed(BlockDriverState *bs)
3916 BlockDriver *drv = bs->drv;
3918 if (drv && drv->bdrv_media_changed) {
3919 return drv->bdrv_media_changed(bs);

3925 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3927 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3929 BlockDriver *drv = bs->drv;
3931 if (drv && drv->bdrv_eject) {
3932 drv->bdrv_eject(bs, eject_flag);
/* emit the QMP tray-event for devices that have a name */
3935 if (bs->device_name[0] != '\0') {
3936 bdrv_emit_qmp_eject_event(bs, eject_flag);

3941 * Lock or unlock the media (if it is locked, the user won't be able
3942 * to eject it manually).
3944 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3946 BlockDriver *drv = bs->drv;
3948 trace_bdrv_lock_medium(bs, locked);
3950 if (drv && drv->bdrv_lock_medium) {
3951 drv->bdrv_lock_medium(bs, locked);

3955 /* needed for generic scsi interface */

/* Synchronous ioctl pass-through to the driver (SCSI generic);
 * the unsupported-case return is on a missing line. */
3957 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3959 BlockDriver *drv = bs->drv;
3961 if (drv && drv->bdrv_ioctl)
3962 return drv->bdrv_ioctl(bs, req, buf);

/* Asynchronous ioctl pass-through; the unsupported-case return is on a
 * missing line. */
3966 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3967 unsigned long int req, void *buf,
3968 BlockDriverCompletionFunc *cb, void *opaque)
3970 BlockDriver *drv = bs->drv;
3972 if (drv && drv->bdrv_aio_ioctl)
3973 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);

/* Record the minimum buffer alignment the device requires for I/O. */
3977 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3979 bs->buffer_alignment = align;

/* Allocate @size bytes suitably aligned for I/O on @bs; falls back to
 * 512-byte alignment when no device constraint is recorded. */
3982 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3984 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
/* Enable or tear down the dirty bitmap used by block migration; one
 * bitmap bit covers BDRV_SECTORS_PER_DIRTY_CHUNK sectors.
 * NOTE(review): the enable/disable branch condition lines are missing
 * from this extract. */
3987 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3989 int64_t bitmap_size;
3991 bs->dirty_count = 0;
3993 if (!bs->dirty_bitmap) {
/* round the device size up to a whole number of bitmap words */
3994 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3995 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
3996 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
3998 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
4001 if (bs->dirty_bitmap) {
4002 g_free(bs->dirty_bitmap);
4003 bs->dirty_bitmap = NULL;

/* Test whether the chunk containing @sector is dirty; sectors beyond
 * the device length, or with tracking off, read as clean (the fallback
 * return is on a missing line). */
4008 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4010 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
4012 if (bs->dirty_bitmap &&
4013 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
4014 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
4015 (1UL << (chunk % (sizeof(unsigned long) * 8))));

/* Clear the dirty bits covering [cur_sector, cur_sector + nr_sectors). */
4021 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4024 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);

/* Number of dirty chunks currently set. */
4027 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4029 return bs->dirty_count;

/* Mark/unmark the device as owned by a block job; the assert guards
 * against double acquire or double release. */
4032 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4034 assert(bs->in_use != in_use);
4035 bs->in_use = in_use;

/* Query the in-use flag (the return statement is on a missing line). */
4038 int bdrv_in_use(BlockDriverState *bs)
/* Turn on I/O status reporting for @bs and reset the status to OK. */
4043 void bdrv_iostatus_enable(BlockDriverState *bs)
4045 bs->iostatus_enabled = true;
4046 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;

4049 /* The I/O status is only enabled if the drive explicitly
4050 * enables it _and_ the VM is configured to stop on errors */
4051 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4053 return (bs->iostatus_enabled &&
4054 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
4055 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
4056 bs->on_read_error == BLOCK_ERR_STOP_ANY));

/* Turn off I/O status reporting. */
4059 void bdrv_iostatus_disable(BlockDriverState *bs)
4061 bs->iostatus_enabled = false;

/* Reset the status to OK, but only when reporting is effective. */
4064 void bdrv_iostatus_reset(BlockDriverState *bs)
4066 if (bdrv_iostatus_is_enabled(bs)) {
4067 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;

4071 /* XXX: Today this is set by device models because it makes the implementation
4072 quite simple. However, the block layer knows about the error, so it's
4073 possible to implement this without device models being involved */
/* Latch the first error only: ENOSPC maps to NOSPACE, everything else
 * to FAILED; subsequent errors don't overwrite the first. */
4074 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4076 if (bdrv_iostatus_is_enabled(bs) &&
4077 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
4079 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4080 BLOCK_DEVICE_IO_STATUS_FAILED;

/* Begin accounting one I/O: record byte count, type and a start
 * timestamp in the caller-provided cookie. */
4085 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4086 enum BlockAcctType type)
4088 assert(type < BDRV_MAX_IOTYPE);
4090 cookie->bytes = bytes;
4091 cookie->start_time_ns = get_clock();
4092 cookie->type = type;

/* Finish accounting: fold the cookie into the per-type byte, operation
 * and cumulative-latency counters on @bs. */
4096 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4098 assert(cookie->type < BDRV_MAX_IOTYPE);
4100 bs->nr_bytes[cookie->type] += cookie->bytes;
4101 bs->nr_ops[cookie->type]++;
4102 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
/* Create a disk image: resolve the format and protocol drivers, build
 * the creation-option list (size, backing file/format), infer the size
 * from the backing image when it was not given, then call
 * bdrv_create().  Option lists are freed on every exit path. */
4105 int bdrv_img_create(const char *filename, const char *fmt,
4106 const char *base_filename, const char *base_fmt,
4107 char *options, uint64_t img_size, int flags)
4109 QEMUOptionParameter *param = NULL, *create_options = NULL;
4110 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4111 BlockDriverState *bs = NULL;
4112 BlockDriver *drv, *proto_drv;
4113 BlockDriver *backing_drv = NULL;
4116 /* Find driver and parse its options */
4117 drv = bdrv_find_format(fmt);
4119 error_report("Unknown file format '%s'", fmt);
4124 proto_drv = bdrv_find_protocol(filename);
4126 error_report("Unknown protocol '%s'", filename);
/* combine the format driver's and protocol driver's create options */
4131 create_options = append_option_parameters(create_options,
4132 drv->create_options);
4133 create_options = append_option_parameters(create_options,
4134 proto_drv->create_options);
4136 /* Create parameter list with default values */
4137 param = parse_option_parameters("", create_options, param);
4139 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4141 /* Parse -o options */
4143 param = parse_option_parameters(options, create_options, param);
4144 if (param == NULL) {
4145 error_report("Invalid options for file format '%s'.", fmt);
/* explicit -b / -F arguments override anything given via -o */
4151 if (base_filename) {
4152 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4154 error_report("Backing file not supported for file format '%s'",
4162 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4163 error_report("Backing file format not supported for file "
4164 "format '%s'", fmt);
4170 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4171 if (backing_file && backing_file->value.s) {
/* refuse an image that would back onto itself */
4172 if (!strcmp(filename, backing_file->value.s)) {
4173 error_report("Error: Trying to create an image with the "
4174 "same filename as the backing file");
4180 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4181 if (backing_fmt && backing_fmt->value.s) {
4182 backing_drv = bdrv_find_format(backing_fmt->value.s);
4184 error_report("Unknown backing file format '%s'",
4185 backing_fmt->value.s);
4191 // The size for the image must always be specified, with one exception:
4192 // If we are using a backing file, we can obtain the size from there
4193 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4194 if (size && size->value.n == -1) {
4195 if (backing_file && backing_file->value.s) {
4200 /* backing files always opened read-only */
4202 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
4206 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
4208 error_report("Could not open '%s'", backing_file->value.s);
/* NOTE(review): a local reusing the name `size` (fed to snprintf
 * below) is declared on lines missing from this extract */
4211 bdrv_get_geometry(bs, &size);
4214 snprintf(buf, sizeof(buf), "%" PRId64, size);
4215 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4217 error_report("Image creation needs a size parameter");
4223 printf("Formatting '%s', fmt=%s ", filename, fmt);
4224 print_option_parameters(param);
4227 ret = bdrv_create(drv, filename, param);
/* translate the common creation failures into friendlier messages */
4230 if (ret == -ENOTSUP) {
4231 error_report("Formatting or formatting option not supported for "
4232 "file format '%s'", fmt);
4233 } else if (ret == -EFBIG) {
4234 error_report("The image size is too large for file format '%s'",
4237 error_report("%s: error while creating %s: %s", filename, fmt,
4243 free_option_parameters(create_options);
4244 free_option_parameters(param);
/* Allocate and initialise a BlockJob of @job_type on @bs.  Fails with
 * QERR_DEVICE_IN_USE when a job is already running or the device is
 * busy; marks the device in-use for the job's lifetime.  @speed is
 * applied via block_job_set_speed() (the guarding condition is on a
 * missing line) and a failure there rolls the whole creation back. */
4253 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4254 int64_t speed, BlockDriverCompletionFunc *cb,
4255 void *opaque, Error **errp)
4259 if (bs->job || bdrv_in_use(bs)) {
4260 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4263 bdrv_set_in_use(bs, 1);
4265 job = g_malloc0(job_type->instance_size);
4266 job->job_type = job_type;
4269 job->opaque = opaque;
4273 /* Only set speed when necessary to avoid NotSupported error */
4275 Error *local_err = NULL;
4277 block_job_set_speed(job, speed, &local_err);
4278 if (error_is_set(&local_err)) {
/* roll back: release the device and hand the error to the caller */
4281 bdrv_set_in_use(bs, 0);
4282 error_propagate(errp, local_err);

/* Deliver the final status to the job's callback, free the job and
 * release the device. */
4289 void block_job_complete(BlockJob *job, int ret)
4291 BlockDriverState *bs = job->bs;
4293 assert(bs->job == job);
4294 job->cb(job->opaque, ret);
4297 bdrv_set_in_use(bs, 0);

/* Change the job's rate limit through the job type's set_speed hook;
 * errors (including missing hook) are propagated to @errp. */
4300 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4302 Error *local_err = NULL;
4304 if (!job->job_type->set_speed) {
4305 error_set(errp, QERR_NOT_SUPPORTED);
4308 job->job_type->set_speed(job, speed, &local_err);
4309 if (error_is_set(&local_err)) {
4310 error_propagate(errp, local_err);

/* Request cancellation; if the job's coroutine exists and is sleeping
 * (not busy), re-enter it so it notices the flag promptly. */
4317 void block_job_cancel(BlockJob *job)
4319 job->cancelled = true;
4320 if (job->co && !job->busy) {
4321 qemu_coroutine_enter(job->co, NULL);

/* Query the cancellation flag. */
4325 bool block_job_is_cancelled(BlockJob *job)
4327 return job->cancelled;

/* State shared between block_job_cancel_sync() and its interposed
 * completion callback.  NOTE(review): the job/opaque/ret/cancelled
 * members used below are on lines missing from this extract. */
4330 struct BlockCancelData {
4332 BlockDriverCompletionFunc *cb;

/* Interposed completion callback: record whether the job was cancelled
 * and its result, then chain to the original callback. */
4338 static void block_job_cancel_cb(void *opaque, int ret)
4340 struct BlockCancelData *data = opaque;
4342 data->cancelled = block_job_is_cancelled(data->job);
4344 data->cb(data->opaque, ret);

/* Cancel @job and wait (polling, via the loop below) until it has
 * completed.  Returns -ECANCELED when the job terminated because of the
 * cancellation, otherwise the job's own return value. */
4347 int block_job_cancel_sync(BlockJob *job)
4349 struct BlockCancelData data;
4350 BlockDriverState *bs = job->bs;
4352 assert(bs->job == job);
4354 /* Set up our own callback to store the result and chain to
4355 * the original callback.
4359 data.opaque = job->opaque;
4360 data.ret = -EINPROGRESS;
4361 job->cb = block_job_cancel_cb;
4362 job->opaque = &data;
4363 block_job_cancel(job);
4364 while (data.ret == -EINPROGRESS) {
4367 return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
4370 void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4372 /* Check cancellation *before* setting busy = false, too! */
4373 if (!block_job_is_cancelled(job)) {
4375 co_sleep_ns(clock, ns);