4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
29 #include "monitor/monitor.h"
30 #include "qapi/qmp/qerror.h"
31 #include "qemu/error-report.h"
32 #include "sysemu/sysemu.h"
33 #include "sysemu/block-backend.h"
34 #include "exec/gdbstub.h"
35 #include "sysemu/dma.h"
36 #include "sysemu/kvm.h"
37 #include "sysemu/hax.h"
38 #include "qmp-commands.h"
39 #include "exec/exec-all.h"
41 #include "qemu/thread.h"
42 #include "sysemu/cpus.h"
43 #include "sysemu/qtest.h"
44 #include "qemu/main-loop.h"
45 #include "qemu/bitmap.h"
46 #include "qemu/seqlock.h"
47 #include "qapi-event.h"
49 #include "sysemu/replay.h"
52 #include "qemu/compatfd.h"
57 #include <sys/prctl.h>
60 #define PR_MCE_KILL 33
63 #ifndef PR_MCE_KILL_SET
64 #define PR_MCE_KILL_SET 1
67 #ifndef PR_MCE_KILL_EARLY
68 #define PR_MCE_KILL_EARLY 1
71 #endif /* CONFIG_LINUX */
76 /* vcpu throttling controls */
77 static QEMUTimer *throttle_timer;
78 static unsigned int throttle_percentage;
80 #define CPU_THROTTLE_PCT_MIN 1
81 #define CPU_THROTTLE_PCT_MAX 99
82 #define CPU_THROTTLE_TIMESLICE_NS 10000000
84 bool cpu_is_stopped(CPUState *cpu)
86 return cpu->stopped || !runstate_is_running();
89 static bool cpu_thread_is_idle(CPUState *cpu)
91 if (cpu->stop || cpu->queued_work_first) {
94 if (cpu_is_stopped(cpu)) {
97 if (!cpu->halted || cpu_has_work(cpu) ||
98 kvm_halt_in_kernel()) {
104 static bool all_cpu_threads_idle(void)
109 if (!cpu_thread_is_idle(cpu)) {
116 /***********************************************************/
117 /* guest cycle counter */
119 /* Protected by TimersState seqlock */
121 static bool icount_sleep = true;
122 static int64_t vm_clock_warp_start = -1;
123 /* Conversion factor from emulated instructions to virtual clock ticks. */
124 static int icount_time_shift;
125 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
126 #define MAX_ICOUNT_SHIFT 10
128 static QEMUTimer *icount_rt_timer;
129 static QEMUTimer *icount_vm_timer;
130 static QEMUTimer *icount_warp_timer;
132 typedef struct TimersState {
133 /* Protected by BQL. */
134 int64_t cpu_ticks_prev;
135 int64_t cpu_ticks_offset;
137 /* cpu_clock_offset can be read out of BQL, so protect it with
140 QemuSeqLock vm_clock_seqlock;
141 int64_t cpu_clock_offset;
142 int32_t cpu_ticks_enabled;
145 /* Compensate for varying guest execution speed. */
146 int64_t qemu_icount_bias;
147 /* Only written by TCG thread */
151 static TimersState timers_state;
153 int64_t cpu_get_icount_raw(void)
156 CPUState *cpu = current_cpu;
158 icount = timers_state.qemu_icount;
160 if (!cpu->can_do_io) {
161 fprintf(stderr, "Bad icount read\n");
164 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
169 /* Return the virtual CPU time, based on the instruction counter. */
170 static int64_t cpu_get_icount_locked(void)
172 int64_t icount = cpu_get_icount_raw();
173 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
176 int64_t cpu_get_icount(void)
182 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
183 icount = cpu_get_icount_locked();
184 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
189 int64_t cpu_icount_to_ns(int64_t icount)
191 return icount << icount_time_shift;
194 /* return the time elapsed in VM between vm_start and vm_stop. Unless
195 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
198 * Caller must hold the BQL
200 int64_t cpu_get_ticks(void)
205 return cpu_get_icount();
208 ticks = timers_state.cpu_ticks_offset;
209 if (timers_state.cpu_ticks_enabled) {
210 ticks += cpu_get_host_ticks();
213 if (timers_state.cpu_ticks_prev > ticks) {
214 /* Note: non increasing ticks may happen if the host uses
216 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
217 ticks = timers_state.cpu_ticks_prev;
220 timers_state.cpu_ticks_prev = ticks;
224 static int64_t cpu_get_clock_locked(void)
228 time = timers_state.cpu_clock_offset;
229 if (timers_state.cpu_ticks_enabled) {
236 /* Return the monotonic time elapsed in VM, i.e.,
237 * the time between vm_start and vm_stop
239 int64_t cpu_get_clock(void)
245 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
246 ti = cpu_get_clock_locked();
247 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
252 /* enable cpu_get_ticks()
253 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
255 void cpu_enable_ticks(void)
257 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
258 seqlock_write_begin(&timers_state.vm_clock_seqlock);
259 if (!timers_state.cpu_ticks_enabled) {
260 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
261 timers_state.cpu_clock_offset -= get_clock();
262 timers_state.cpu_ticks_enabled = 1;
264 seqlock_write_end(&timers_state.vm_clock_seqlock);
267 /* disable cpu_get_ticks() : the clock is stopped. You must not call
268 * cpu_get_ticks() after that.
269 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
271 void cpu_disable_ticks(void)
273 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
274 seqlock_write_begin(&timers_state.vm_clock_seqlock);
275 if (timers_state.cpu_ticks_enabled) {
276 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
277 timers_state.cpu_clock_offset = cpu_get_clock_locked();
278 timers_state.cpu_ticks_enabled = 0;
280 seqlock_write_end(&timers_state.vm_clock_seqlock);
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
289 static void icount_adjust(void)
295 /* Protected by TimersState mutex. */
296 static int64_t last_delta;
298 /* If the VM is not running, then do nothing. */
299 if (!runstate_is_running()) {
303 seqlock_write_begin(&timers_state.vm_clock_seqlock);
304 cur_time = cpu_get_clock_locked();
305 cur_icount = cpu_get_icount_locked();
307 delta = cur_icount - cur_time;
308 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
310 && last_delta + ICOUNT_WOBBLE < delta * 2
311 && icount_time_shift > 0) {
312 /* The guest is getting too far ahead. Slow time down. */
316 && last_delta - ICOUNT_WOBBLE > delta * 2
317 && icount_time_shift < MAX_ICOUNT_SHIFT) {
318 /* The guest is getting too far behind. Speed time up. */
322 timers_state.qemu_icount_bias = cur_icount
323 - (timers_state.qemu_icount << icount_time_shift);
324 seqlock_write_end(&timers_state.vm_clock_seqlock);
327 static void icount_adjust_rt(void *opaque)
329 timer_mod(icount_rt_timer,
330 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
334 static void icount_adjust_vm(void *opaque)
336 timer_mod(icount_vm_timer,
337 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
338 NANOSECONDS_PER_SECOND / 10);
342 static int64_t qemu_icount_round(int64_t count)
344 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
347 static void icount_warp_rt(void)
352 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
353 * changes from -1 to another value, so the race here is okay.
356 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
357 warp_start = vm_clock_warp_start;
358 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
360 if (warp_start == -1) {
364 seqlock_write_begin(&timers_state.vm_clock_seqlock);
365 if (runstate_is_running()) {
366 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
367 cpu_get_clock_locked());
370 warp_delta = clock - vm_clock_warp_start;
371 if (use_icount == 2) {
373 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
374 * far ahead of real time.
376 int64_t cur_icount = cpu_get_icount_locked();
377 int64_t delta = clock - cur_icount;
378 warp_delta = MIN(warp_delta, delta);
380 timers_state.qemu_icount_bias += warp_delta;
382 vm_clock_warp_start = -1;
383 seqlock_write_end(&timers_state.vm_clock_seqlock);
385 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
386 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
/* Callback of icount_warp_timer; @opaque is unused. */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
398 void qtest_clock_warp(int64_t dest)
400 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
401 AioContext *aio_context;
402 assert(qtest_enabled());
403 aio_context = qemu_get_aio_context();
404 while (clock < dest) {
405 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
406 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
408 seqlock_write_begin(&timers_state.vm_clock_seqlock);
409 timers_state.qemu_icount_bias += warp;
410 seqlock_write_end(&timers_state.vm_clock_seqlock);
412 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
413 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
414 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
416 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
419 void qemu_start_warp_timer(void)
428 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
429 * do not fire, so computing the deadline does not make sense.
431 if (!runstate_is_running()) {
435 /* warp clock deterministically in record/replay mode */
436 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
440 if (!all_cpu_threads_idle()) {
444 if (qtest_enabled()) {
445 /* When testing, qtest commands advance icount. */
449 /* We want to use the earliest deadline from ALL vm_clocks */
450 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
451 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
453 static bool notified;
454 if (!icount_sleep && !notified) {
455 error_report("WARNING: icount sleep disabled and no active timers");
463 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
464 * sleep. Otherwise, the CPU might be waiting for a future timer
465 * interrupt to wake it up, but the interrupt never comes because
466 * the vCPU isn't running any insns and thus doesn't advance the
467 * QEMU_CLOCK_VIRTUAL.
471 * We never let VCPUs sleep in no sleep icount mode.
472 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
473 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
474 * It is useful when we want a deterministic execution time,
475 * isolated from host latencies.
477 seqlock_write_begin(&timers_state.vm_clock_seqlock);
478 timers_state.qemu_icount_bias += deadline;
479 seqlock_write_end(&timers_state.vm_clock_seqlock);
480 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
483 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
484 * "real" time, (related to the time left until the next event) has
485 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
486 * This avoids that the warps are visible externally; for example,
487 * you will not be sending network packets continuously instead of
490 seqlock_write_begin(&timers_state.vm_clock_seqlock);
491 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
492 vm_clock_warp_start = clock;
494 seqlock_write_end(&timers_state.vm_clock_seqlock);
495 timer_mod_anticipate(icount_warp_timer, clock + deadline);
497 } else if (deadline == 0) {
498 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
502 static void qemu_account_warp_timer(void)
504 if (!use_icount || !icount_sleep) {
508 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
509 * do not fire, so computing the deadline does not make sense.
511 if (!runstate_is_running()) {
515 /* warp clock deterministically in record/replay mode */
516 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
520 timer_del(icount_warp_timer);
524 static bool icount_state_needed(void *opaque)
530 * This is a subsection for icount migration.
532 static const VMStateDescription icount_vmstate_timers = {
533 .name = "timer/icount",
535 .minimum_version_id = 1,
536 .needed = icount_state_needed,
537 .fields = (VMStateField[]) {
538 VMSTATE_INT64(qemu_icount_bias, TimersState),
539 VMSTATE_INT64(qemu_icount, TimersState),
540 VMSTATE_END_OF_LIST()
544 static const VMStateDescription vmstate_timers = {
547 .minimum_version_id = 1,
548 .fields = (VMStateField[]) {
549 VMSTATE_INT64(cpu_ticks_offset, TimersState),
550 VMSTATE_INT64(dummy, TimersState),
551 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
552 VMSTATE_END_OF_LIST()
554 .subsections = (const VMStateDescription*[]) {
555 &icount_vmstate_timers,
560 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
563 double throttle_ratio;
566 if (!cpu_throttle_get_percentage()) {
570 pct = (double)cpu_throttle_get_percentage()/100;
571 throttle_ratio = pct / (1 - pct);
572 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
574 qemu_mutex_unlock_iothread();
575 atomic_set(&cpu->throttle_thread_scheduled, 0);
576 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
577 qemu_mutex_lock_iothread();
580 static void cpu_throttle_timer_tick(void *opaque)
585 /* Stop the timer if needed */
586 if (!cpu_throttle_get_percentage()) {
590 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
591 async_run_on_cpu(cpu, cpu_throttle_thread,
596 pct = (double)cpu_throttle_get_percentage()/100;
597 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
598 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
601 void cpu_throttle_set(int new_throttle_pct)
603 /* Ensure throttle percentage is within valid range */
604 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
605 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
607 atomic_set(&throttle_percentage, new_throttle_pct);
609 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
610 CPU_THROTTLE_TIMESLICE_NS);
613 void cpu_throttle_stop(void)
615 atomic_set(&throttle_percentage, 0);
/* Return true while a non-zero throttle percentage is set. */
bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}
623 int cpu_throttle_get_percentage(void)
625 return atomic_read(&throttle_percentage);
628 void cpu_ticks_init(void)
630 seqlock_init(&timers_state.vm_clock_seqlock);
631 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
632 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
633 cpu_throttle_timer_tick, NULL);
636 void configure_icount(QemuOpts *opts, Error **errp)
639 char *rem_str = NULL;
641 option = qemu_opt_get(opts, "shift");
643 if (qemu_opt_get(opts, "align") != NULL) {
644 error_setg(errp, "Please specify shift option when using align");
649 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
651 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
652 icount_timer_cb, NULL);
655 icount_align_option = qemu_opt_get_bool(opts, "align", false);
657 if (icount_align_option && !icount_sleep) {
658 error_setg(errp, "align=on and sleep=off are incompatible");
660 if (strcmp(option, "auto") != 0) {
662 icount_time_shift = strtol(option, &rem_str, 0);
663 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
664 error_setg(errp, "icount: Invalid shift value");
668 } else if (icount_align_option) {
669 error_setg(errp, "shift=auto and align=on are incompatible");
670 } else if (!icount_sleep) {
671 error_setg(errp, "shift=auto and sleep=off are incompatible");
676 /* 125MIPS seems a reasonable initial guess at the guest speed.
677 It will be corrected fairly quickly anyway. */
678 icount_time_shift = 3;
680 /* Have both realtime and virtual time triggers for speed adjustment.
681 The realtime trigger catches emulated time passing too slowly,
682 the virtual time trigger catches emulated time passing too fast.
683 Realtime triggers occur even when idle, so use them less frequently
685 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
686 icount_adjust_rt, NULL);
687 timer_mod(icount_rt_timer,
688 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
689 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
690 icount_adjust_vm, NULL);
691 timer_mod(icount_vm_timer,
692 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
693 NANOSECONDS_PER_SECOND / 10);
696 /***********************************************************/
697 void hw_error(const char *fmt, ...)
703 fprintf(stderr, "qemu: hardware error: ");
704 vfprintf(stderr, fmt, ap);
705 fprintf(stderr, "\n");
707 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
708 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
/* Calls cpu_synchronize_state() on a CPU and, when HAX is enabled on a
 * hax_ug_platform() host, hax_cpu_synchronize_state() as well.
 * NOTE(review): the loop header, braces and any preprocessor guards are
 * missing from this listing; presumably this iterates over all CPUs -
 * confirm against the full file.
 */
714 void cpu_synchronize_all_states(void)
719 cpu_synchronize_state(cpu);
721 if (hax_enabled() && hax_ug_platform()) {
722 hax_cpu_synchronize_state(cpu);
/* Calls cpu_synchronize_post_reset() on a CPU and, when HAX is enabled on
 * a hax_ug_platform() host, hax_cpu_synchronize_post_reset() as well.
 * NOTE(review): the loop header, braces and any preprocessor guards are
 * missing from this listing; presumably this iterates over all CPUs -
 * confirm against the full file.
 */
728 void cpu_synchronize_all_post_reset(void)
733 cpu_synchronize_post_reset(cpu);
735 if (hax_enabled() && hax_ug_platform())
736 hax_cpu_synchronize_post_reset(cpu);
/* Calls cpu_synchronize_post_init() on a CPU and, when HAX is enabled on
 * a hax_ug_platform() host, hax_cpu_synchronize_post_init() as well.
 * NOTE(review): the loop header, braces and any preprocessor guards are
 * missing from this listing; presumably this iterates over all CPUs -
 * confirm against the full file.
 */
741 void cpu_synchronize_all_post_init(void)
746 cpu_synchronize_post_init(cpu);
748 if (hax_enabled() && hax_ug_platform())
749 hax_cpu_synchronize_post_init(cpu);
754 static int do_vm_stop(RunState state)
758 if (runstate_is_running()) {
762 vm_state_notify(0, state);
763 qapi_event_send_stop(&error_abort);
767 replay_disable_events();
768 ret = bdrv_flush_all();
773 static bool cpu_can_run(CPUState *cpu)
778 if (cpu_is_stopped(cpu)) {
784 static void cpu_handle_guest_debug(CPUState *cpu)
786 gdb_set_stop_cpu(cpu);
787 qemu_system_debug_request();
792 static void sigbus_reraise(void)
795 struct sigaction action;
797 memset(&action, 0, sizeof(action));
798 action.sa_handler = SIG_DFL;
799 if (!sigaction(SIGBUS, &action, NULL)) {
802 sigaddset(&set, SIGBUS);
803 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
805 perror("Failed to re-raise SIGBUS!\n");
809 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
812 if (kvm_on_sigbus(siginfo->ssi_code,
813 (void *)(intptr_t)siginfo->ssi_addr)) {
818 static void qemu_init_sigbus(void)
820 struct sigaction action;
822 memset(&action, 0, sizeof(action));
823 action.sa_flags = SA_SIGINFO;
824 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
825 sigaction(SIGBUS, &action, NULL);
827 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
830 static void qemu_kvm_eat_signals(CPUState *cpu)
832 struct timespec ts = { 0, 0 };
838 sigemptyset(&waitset);
839 sigaddset(&waitset, SIG_IPI);
840 sigaddset(&waitset, SIGBUS);
843 r = sigtimedwait(&waitset, &siginfo, &ts);
844 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
845 perror("sigtimedwait");
851 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
859 r = sigpending(&chkset);
861 perror("sigpending");
864 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
867 #else /* !CONFIG_LINUX */
/* Non-Linux variant: nothing to set up. */
static void qemu_init_sigbus(void)
{
}
873 static void qemu_kvm_eat_signals(CPUState *cpu)
876 #endif /* !CONFIG_LINUX */
/* Empty handler installed for SIG_IPI; @sig is unused. */
static void dummy_signal(int sig)
{
}
883 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
887 struct sigaction sigact;
889 memset(&sigact, 0, sizeof(sigact));
890 sigact.sa_handler = dummy_signal;
891 sigaction(SIG_IPI, &sigact, NULL);
893 pthread_sigmask(SIG_BLOCK, NULL, &set);
894 sigdelset(&set, SIG_IPI);
895 sigdelset(&set, SIGBUS);
896 r = kvm_set_signal_mask(cpu, &set);
898 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
904 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
910 static QemuMutex qemu_global_mutex;
911 static QemuCond qemu_io_proceeded_cond;
912 static unsigned iothread_requesting_mutex;
914 static QemuThread io_thread;
917 static QemuCond qemu_cpu_cond;
919 static QemuCond qemu_pause_cond;
921 void qemu_init_cpu_loop(void)
924 qemu_cond_init(&qemu_cpu_cond);
925 qemu_cond_init(&qemu_pause_cond);
926 qemu_cond_init(&qemu_io_proceeded_cond);
927 qemu_mutex_init(&qemu_global_mutex);
929 qemu_thread_get_self(&io_thread);
932 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
934 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
937 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
939 if (kvm_destroy_vcpu(cpu) < 0) {
940 error_report("kvm_destroy_vcpu failed");
945 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
949 static void qemu_wait_io_event_common(CPUState *cpu)
954 qemu_cond_broadcast(&qemu_pause_cond);
956 process_queued_cpu_work(cpu);
957 cpu->thread_kicked = false;
960 static void qemu_tcg_wait_io_event(CPUState *cpu)
962 while (all_cpu_threads_idle()) {
963 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
966 while (iothread_requesting_mutex) {
967 qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
971 qemu_wait_io_event_common(cpu);
976 static void qemu_hax_wait_io_event(CPUState *cpu)
978 while (cpu_thread_is_idle(cpu)) {
979 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
982 qemu_wait_io_event_common(cpu);
986 static void qemu_kvm_wait_io_event(CPUState *cpu)
988 while (cpu_thread_is_idle(cpu)) {
989 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
992 qemu_kvm_eat_signals(cpu);
993 qemu_wait_io_event_common(cpu);
996 static void *qemu_kvm_cpu_thread_fn(void *arg)
1001 rcu_register_thread();
1003 qemu_mutex_lock_iothread();
1004 qemu_thread_get_self(cpu->thread);
1005 cpu->thread_id = qemu_get_thread_id();
1009 r = kvm_init_vcpu(cpu);
1011 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1015 qemu_kvm_init_cpu_signals(cpu);
1017 /* signal CPU creation */
1018 cpu->created = true;
1019 qemu_cond_signal(&qemu_cpu_cond);
1022 if (cpu_can_run(cpu)) {
1023 r = kvm_cpu_exec(cpu);
1024 if (r == EXCP_DEBUG) {
1025 cpu_handle_guest_debug(cpu);
1028 qemu_kvm_wait_io_event(cpu);
1029 } while (!cpu->unplug || cpu_can_run(cpu));
1031 qemu_kvm_destroy_vcpu(cpu);
1032 cpu->created = false;
1033 qemu_cond_signal(&qemu_cpu_cond);
1034 qemu_mutex_unlock_iothread();
1038 static void *qemu_dummy_cpu_thread_fn(void *arg)
1041 fprintf(stderr, "qtest is not supported under Windows\n");
1044 CPUState *cpu = arg;
1048 rcu_register_thread();
1050 qemu_mutex_lock_iothread();
1051 qemu_thread_get_self(cpu->thread);
1052 cpu->thread_id = qemu_get_thread_id();
1055 sigemptyset(&waitset);
1056 sigaddset(&waitset, SIG_IPI);
1058 /* signal CPU creation */
1059 cpu->created = true;
1060 qemu_cond_signal(&qemu_cpu_cond);
1065 qemu_mutex_unlock_iothread();
1068 r = sigwait(&waitset, &sig);
1069 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1074 qemu_mutex_lock_iothread();
1076 qemu_wait_io_event_common(cpu);
1083 static int64_t tcg_get_icount_limit(void)
1087 if (replay_mode != REPLAY_MODE_PLAY) {
1088 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1090 /* Maintain prior (possibly buggy) behaviour where if no deadline
1091 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1092 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1095 if ((deadline < 0) || (deadline > INT32_MAX)) {
1096 deadline = INT32_MAX;
1099 return qemu_icount_round(deadline);
1101 return replay_get_instructions();
1105 static void handle_icount_deadline(void)
1109 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1111 if (deadline == 0) {
1112 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1117 static int tcg_cpu_exec(CPUState *cpu)
1120 #ifdef CONFIG_PROFILER
1124 #ifdef CONFIG_PROFILER
1125 ti = profile_getclock();
1130 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1131 + cpu->icount_extra);
1132 cpu->icount_decr.u16.low = 0;
1133 cpu->icount_extra = 0;
1134 count = tcg_get_icount_limit();
1135 timers_state.qemu_icount += count;
1136 decr = (count > 0xffff) ? 0xffff : count;
1138 cpu->icount_decr.u16.low = decr;
1139 cpu->icount_extra = count;
1141 cpu_exec_start(cpu);
1142 ret = cpu_exec(cpu);
1144 #ifdef CONFIG_PROFILER
1145 tcg_time += profile_getclock() - ti;
1148 /* Fold pending instructions back into the
1149 instruction counter, and clear the interrupt flag. */
1150 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1151 + cpu->icount_extra);
1152 cpu->icount_decr.u32 = 0;
1153 cpu->icount_extra = 0;
1154 replay_account_executed_instructions();
1159 /* Destroy any remaining vCPUs which have been unplugged and have
1162 static void deal_with_unplugged_cpus(void)
1167 if (cpu->unplug && !cpu_can_run(cpu)) {
1168 qemu_tcg_destroy_vcpu(cpu);
1169 cpu->created = false;
1170 qemu_cond_signal(&qemu_cpu_cond);
1176 static void *qemu_tcg_cpu_thread_fn(void *arg)
1178 CPUState *cpu = arg;
1180 rcu_register_thread();
1182 qemu_mutex_lock_iothread();
1183 qemu_thread_get_self(cpu->thread);
1186 cpu->thread_id = qemu_get_thread_id();
1187 cpu->created = true;
1190 qemu_cond_signal(&qemu_cpu_cond);
1192 /* wait for initial kick-off after machine start */
1193 while (first_cpu->stopped) {
1194 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1196 /* process any pending work */
1198 qemu_wait_io_event_common(cpu);
1202 /* process any pending work */
1203 atomic_mb_set(&exit_request, 1);
1208 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1209 qemu_account_warp_timer();
1215 for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {
1217 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1218 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1220 if (cpu_can_run(cpu)) {
1222 r = tcg_cpu_exec(cpu);
1223 if (r == EXCP_DEBUG) {
1224 cpu_handle_guest_debug(cpu);
1227 } else if (cpu->stop || cpu->stopped) {
1229 cpu = CPU_NEXT(cpu);
1236 /* Pairs with smp_wmb in qemu_cpu_kick. */
1237 atomic_mb_set(&exit_request, 0);
1239 handle_icount_deadline();
1241 qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
1242 deal_with_unplugged_cpus();
/* Thread body of a HAX vCPU (arg is the CPUState).
 * Takes the global mutex directly, marks the CPU created and signals
 * qemu_cpu_cond, then runs hax_smp_cpu_exec() when the CPU can run,
 * handles EXCP_DEBUG exits and waits in qemu_hax_wait_io_event().
 * NOTE(review): the loop construct and several short statements are
 * missing from this listing - confirm against the full file.
 */
1249 static void *qemu_hax_cpu_thread_fn(void *arg)
1251 CPUState *cpu = arg;
1253 qemu_thread_get_self(cpu->thread);
1254 qemu_mutex_lock(&qemu_global_mutex);
1256 cpu->thread_id = qemu_get_thread_id();
1257 cpu->created = true;
1262 qemu_cond_signal(&qemu_cpu_cond);
1265 if (cpu_can_run(cpu)) {
1266 r = hax_smp_cpu_exec(cpu);
1267 if (r == EXCP_DEBUG) {
1268 cpu_handle_guest_debug(cpu);
1271 qemu_hax_wait_io_event(cpu);
/* Force the thread running a vCPU out of guest execution.
 * One visible variant sends SIG_IPI with pthread_kill() unless the thread
 * was already kicked; another suspends and resumes the thread through
 * SuspendThread/GetThreadContext/ResumeThread.  With HAX on a
 * hax_ug_platform() host, cpu->exit_request is set as well.
 * NOTE(review): the preprocessor conditionals selecting between these
 * variants are mostly missing from this listing - confirm against the
 * full file.
 * NOTE(review): the pthread_kill failure message below has no trailing
 * newline, unlike the other fatal messages in this file - consider
 * adding one.
 */
1277 static void qemu_cpu_kick_thread(CPUState *cpu)
1282 if (cpu->thread_kicked) {
1285 cpu->thread_kicked = true;
1286 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1288 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1292 # ifdef CONFIG_DARWIN
1293 /* The cpu thread cannot catch it reliably when shutdown the guest on Mac.
1294 * We can double check it and resend it
1296 if (!exit_request) {
1297 // FIXME: check it soon
1301 if (hax_enabled() && hax_ug_platform()) {
1302 cpu->exit_request = 1;
1309 // FIXME: check it soon
1311 if (!qemu_cpu_is_self(cpu)) {
1314 if (SuspendThread(cpu->hThread) == (DWORD)-1) {
1315 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1320 /* On multi-core systems, we are not sure that the thread is actually
1321 * suspended until we can get the context.
1323 tcgContext.ContextFlags = CONTEXT_CONTROL;
1324 while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
1330 if(hax_enabled() && hax_ug_platform()) {
1331 cpu->exit_request = 1;
1335 if (ResumeThread(cpu->hThread) == (DWORD)-1) {
1336 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1342 if (!qemu_cpu_is_self(cpu)) {
1343 if(hax_enabled() && hax_ug_platform()) {
1344 cpu->exit_request = 1;
1351 static void qemu_cpu_kick_no_halt(void)
1354 /* Ensure whatever caused the exit has reached the CPU threads before
1355 * writing exit_request.
1357 atomic_mb_set(&exit_request, 1);
1358 cpu = atomic_mb_read(&tcg_current_cpu);
1364 void qemu_cpu_kick(CPUState *cpu)
1366 qemu_cond_broadcast(cpu->halt_cond);
1367 if (tcg_enabled()) {
1368 qemu_cpu_kick_no_halt();
1370 qemu_cpu_kick_thread(cpu);
1374 void qemu_cpu_kick_self(void)
1376 assert(current_cpu);
1377 qemu_cpu_kick_thread(current_cpu);
1380 bool qemu_cpu_is_self(CPUState *cpu)
1382 return qemu_thread_is_self(cpu->thread);
1385 bool qemu_in_vcpu_thread(void)
1387 return current_cpu && qemu_cpu_is_self(current_cpu);
/* Per-thread flag: set while this thread holds the iothread lock. */
static __thread bool iothread_locked = false;

/* Return true if the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1397 void qemu_mutex_lock_iothread(void)
1399 atomic_inc(&iothread_requesting_mutex);
1400 /* In the simple case there is no need to bump the VCPU thread out of
1401 * TCG code execution.
1403 if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1404 !first_cpu || !first_cpu->created) {
1405 qemu_mutex_lock(&qemu_global_mutex);
1406 atomic_dec(&iothread_requesting_mutex);
1408 if (qemu_mutex_trylock(&qemu_global_mutex)) {
1409 qemu_cpu_kick_no_halt();
1410 qemu_mutex_lock(&qemu_global_mutex);
1412 atomic_dec(&iothread_requesting_mutex);
1413 qemu_cond_broadcast(&qemu_io_proceeded_cond);
1415 iothread_locked = true;
1418 void qemu_mutex_unlock_iothread(void)
1420 iothread_locked = false;
1421 qemu_mutex_unlock(&qemu_global_mutex);
1424 static bool all_vcpus_paused(void)
1429 if (!cpu->stopped) {
1437 void pause_all_vcpus(void)
1441 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1447 if (qemu_in_vcpu_thread()) {
1449 if (!kvm_enabled()) {
1452 cpu->stopped = true;
1458 while (!all_vcpus_paused()) {
1459 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1466 void cpu_resume(CPUState *cpu)
1469 cpu->stopped = false;
1473 void resume_all_vcpus(void)
1477 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1483 void cpu_remove(CPUState *cpu)
1490 void cpu_remove_sync(CPUState *cpu)
1493 while (cpu->created) {
1494 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
/* Size of the temporary buffers used to format vCPU thread names */
#define VCPU_THREAD_NAME_SIZE 16
/* Attach a vCPU to the TCG thread.
 * The first call allocates the thread and halt condition, starts
 * qemu_tcg_cpu_thread_fn and waits on qemu_cpu_cond until the CPU is
 * created; later calls reuse the thread and condition remembered in the
 * function-local statics.
 * NOTE(review): the body of the hax_enabled() branch and the surrounding
 * braces/conditionals are missing from this listing - confirm against
 * the full file.
 */
1501 static void qemu_tcg_init_vcpu(CPUState *cpu)
1504 if (hax_enabled()) {
1508 char thread_name[VCPU_THREAD_NAME_SIZE];
1509 static QemuCond *tcg_halt_cond;
1510 static QemuThread *tcg_cpu_thread;
1512 /* share a single thread for all cpus with TCG */
1513 if (!tcg_cpu_thread) {
1514 cpu->thread = g_malloc0(sizeof(QemuThread));
1515 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1516 qemu_cond_init(cpu->halt_cond);
1517 tcg_halt_cond = cpu->halt_cond;
1518 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1520 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1521 cpu, QEMU_THREAD_JOINABLE);
1523 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1525 while (!cpu->created) {
1526 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1528 tcg_cpu_thread = cpu->thread;
1530 cpu->thread = tcg_cpu_thread;
1531 cpu->halt_cond = tcg_halt_cond;
1536 static void qemu_hax_start_vcpu(CPUState *cpu)
1538 char thread_name[VCPU_THREAD_NAME_SIZE];
1540 cpu->thread = g_malloc0(sizeof(QemuThread));
1541 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1542 qemu_cond_init(cpu->halt_cond);
1544 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1547 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1548 cpu, QEMU_THREAD_JOINABLE);
1550 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1552 while (!cpu->created) {
1553 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1558 static void qemu_kvm_start_vcpu(CPUState *cpu)
1560 char thread_name[VCPU_THREAD_NAME_SIZE];
1562 cpu->thread = g_malloc0(sizeof(QemuThread));
1563 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1564 qemu_cond_init(cpu->halt_cond);
1565 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1567 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1568 cpu, QEMU_THREAD_JOINABLE);
1569 while (!cpu->created) {
1570 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1574 static void qemu_dummy_start_vcpu(CPUState *cpu)
1576 char thread_name[VCPU_THREAD_NAME_SIZE];
1578 cpu->thread = g_malloc0(sizeof(QemuThread));
1579 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1580 qemu_cond_init(cpu->halt_cond);
1581 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1583 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1584 QEMU_THREAD_JOINABLE);
1585 while (!cpu->created) {
1586 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1590 void qemu_init_vcpu(CPUState *cpu)
1592 cpu->nr_cores = smp_cores;
1593 cpu->nr_threads = smp_threads;
1594 cpu->stopped = true;
1597 /* If the target cpu hasn't set up any address spaces itself,
1598 * give it the default one.
1600 AddressSpace *as = address_space_init_shareable(cpu->memory,
1603 cpu_address_space_init(cpu, as, 0);
1606 if (kvm_enabled()) {
1607 qemu_kvm_start_vcpu(cpu);
1609 } else if (hax_enabled() && hax_ug_platform()) {
1610 qemu_hax_start_vcpu(cpu);
1612 } else if (tcg_enabled()) {
1613 qemu_tcg_init_vcpu(cpu);
1615 qemu_dummy_start_vcpu(cpu);
1619 void cpu_stop_current(void)
1622 current_cpu->stop = false;
1623 current_cpu->stopped = true;
1624 cpu_exit(current_cpu);
1625 qemu_cond_broadcast(&qemu_pause_cond);
1629 int vm_stop(RunState state)
1631 if (qemu_in_vcpu_thread()) {
1632 qemu_system_vmstop_request_prepare();
1633 qemu_system_vmstop_request(state);
1635 * FIXME: should not return to device code in case
1636 * vm_stop() has been requested.
1642 return do_vm_stop(state);
1645 /* does a state transition even if the VM is already stopped,
1646 current state is forgotten forever */
1647 int vm_stop_force_state(RunState state)
1649 if (runstate_is_running()) {
1650 return vm_stop(state);
1652 runstate_set(state);
1655 /* Make sure to return an error if the flush in a previous vm_stop()
1657 return bdrv_flush_all();
1661 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1663 /* XXX: implement xxx_cpu_list for targets that still miss it */
1664 #if defined(cpu_list)
1665 cpu_list(f, cpu_fprintf);
1669 CpuInfoList *qmp_query_cpus(Error **errp)
1671 CpuInfoList *head = NULL, *cur_item = NULL;
1676 #if defined(TARGET_I386)
1677 X86CPU *x86_cpu = X86_CPU(cpu);
1678 CPUX86State *env = &x86_cpu->env;
1679 #elif defined(TARGET_PPC)
1680 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1681 CPUPPCState *env = &ppc_cpu->env;
1682 #elif defined(TARGET_SPARC)
1683 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1684 CPUSPARCState *env = &sparc_cpu->env;
1685 #elif defined(TARGET_MIPS)
1686 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1687 CPUMIPSState *env = &mips_cpu->env;
1688 #elif defined(TARGET_TRICORE)
1689 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1690 CPUTriCoreState *env = &tricore_cpu->env;
1693 cpu_synchronize_state(cpu);
1695 info = g_malloc0(sizeof(*info));
1696 info->value = g_malloc0(sizeof(*info->value));
1697 info->value->CPU = cpu->cpu_index;
1698 info->value->current = (cpu == first_cpu);
1699 info->value->halted = cpu->halted;
1700 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1701 info->value->thread_id = cpu->thread_id;
1702 #if defined(TARGET_I386)
1703 info->value->arch = CPU_INFO_ARCH_X86;
1704 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1705 #elif defined(TARGET_PPC)
1706 info->value->arch = CPU_INFO_ARCH_PPC;
1707 info->value->u.ppc.nip = env->nip;
1708 #elif defined(TARGET_SPARC)
1709 info->value->arch = CPU_INFO_ARCH_SPARC;
1710 info->value->u.q_sparc.pc = env->pc;
1711 info->value->u.q_sparc.npc = env->npc;
1712 #elif defined(TARGET_MIPS)
1713 info->value->arch = CPU_INFO_ARCH_MIPS;
1714 info->value->u.q_mips.PC = env->active_tc.PC;
1715 #elif defined(TARGET_TRICORE)
1716 info->value->arch = CPU_INFO_ARCH_TRICORE;
1717 info->value->u.tricore.PC = env->PC;
1719 info->value->arch = CPU_INFO_ARCH_OTHER;
1722 /* XXX: waiting for the qapi to support GSList */
1724 head = cur_item = info;
1726 cur_item->next = info;
1734 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1735 bool has_cpu, int64_t cpu_index, Error **errp)
1741 int64_t orig_addr = addr, orig_size = size;
1747 cpu = qemu_get_cpu(cpu_index);
1749 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1754 f = fopen(filename, "wb");
1756 error_setg_file_open(errp, errno, filename);
1764 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1765 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1766 " specified", orig_addr, orig_size);
1769 if (fwrite(buf, 1, l, f) != l) {
1770 error_setg(errp, QERR_IO_ERROR);
1781 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1788 f = fopen(filename, "wb");
1790 error_setg_file_open(errp, errno, filename);
1798 cpu_physical_memory_read(addr, buf, l);
1799 if (fwrite(buf, 1, l, f) != l) {
1800 error_setg(errp, QERR_IO_ERROR);
1811 void qmp_inject_nmi(Error **errp)
1813 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1816 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1822 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1823 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1824 if (icount_align_option) {
1825 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1826 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1828 cpu_fprintf(f, "Max guest delay NA\n");
1829 cpu_fprintf(f, "Max guest advance NA\n");