/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static CPUState *next_cpu;
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10
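/* A worked example of the shift: cpu_icount_to_ns() below computes
 * ns = icount << icount_time_shift, so shift 0 is 1 ns/insn (~1000 MIPS),
 * shift 3 is 8 ns/insn (~125 MIPS), and shift 10 is 1024 ns/insn (~1 MIPS),
 * which is why MAX_ICOUNT_SHIFT caps the shift at 10.
 */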

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}

/* return the host CPU cycle counter and handle stop/restart */
/* Caller must hold the BQL */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t ticks;

    ticks = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += get_clock();
    }

    return ticks;
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the only thing the seqlock really protects is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the only thing the seqlock really protects is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
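/* ICOUNT_WOBBLE is 100ms worth of nanoseconds; icount_adjust() below uses
 * it as slack, so drift roughly within that margin leaves the shift alone.
 */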

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

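/* Round an ns deadline up to a whole number of instructions: a ceiling
 * division by 2^icount_time_shift, the inverse of cpu_icount_to_ns() above.
 */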
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no-sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * This is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This keeps the warps from being visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

static void cpu_throttle_thread(void *opaque)
{
    CPUState *cpu = opaque;
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
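    /* For example: at pct = 0.2 (20% throttle) throttle_ratio is 0.25, so
     * the vCPU sleeps 2.5ms after each 10ms timeslice; at pct = 0.5 it
     * sleeps 10ms per 10ms of run time, a 50% duty cycle.
     */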

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                       CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}

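/* Parse the -icount option; e.g. "-icount shift=7,sleep=on" requests a fixed
 * 2^7 ns per instruction, while "-icount shift=auto" lets icount_adjust()
 * above track real time adaptively.
 */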
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;
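    /* shift 3 gives 2^3 = 8 ns per instruction, i.e. 10^9 / 8 = 125 MIPS. */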

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
#ifdef CONFIG_HAX
        if (hax_enabled() && hax_ug_platform()) {
            hax_cpu_synchronize_post_reset(cpu);
        }
#endif
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
#ifdef CONFIG_HAX
        if (hax_enabled() && hax_ug_platform()) {
            hax_cpu_synchronize_post_init(cpu);
        }
#endif
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    ret = blk_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    wi.free = false;

    qemu_mutex_lock(&cpu->work_mutex);
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;
    qemu_mutex_unlock(&cpu->work_mutex);

    qemu_cpu_kick(cpu);
    while (!atomic_mb_read(&wi.done)) {
        CPUState *self_cpu = current_cpu;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        current_cpu = self_cpu;
    }
}

void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item *wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi = g_malloc0(sizeof(struct qemu_work_item));
    wi->func = func;
    wi->data = data;
    wi->free = true;

    qemu_mutex_lock(&cpu->work_mutex);
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = wi;
    } else {
        cpu->queued_work_last->next = wi;
    }
    cpu->queued_work_last = wi;
    wi->next = NULL;
    wi->done = false;
    qemu_mutex_unlock(&cpu->work_mutex);

    qemu_cpu_kick(cpu);
}
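
/* For an in-file example of async_run_on_cpu(), see cpu_throttle_timer_tick()
 * above: it queues cpu_throttle_thread() on every vCPU, and because wi->free
 * is true the work item is freed by flush_queued_work() below.
 */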

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void flush_queued_work(CPUState *cpu)
{
    struct qemu_work_item *wi;

    if (cpu->queued_work_first == NULL) {
        return;
    }

    qemu_mutex_lock(&cpu->work_mutex);
    while (cpu->queued_work_first != NULL) {
        wi = cpu->queued_work_first;
        cpu->queued_work_first = wi->next;
        if (!cpu->queued_work_first) {
            cpu->queued_work_last = NULL;
        }
        qemu_mutex_unlock(&cpu->work_mutex);
        wi->func(wi->data);
        qemu_mutex_lock(&cpu->work_mutex);
        if (wi->free) {
            g_free(wi);
        } else {
            atomic_mb_set(&wi->done, true);
        }
    }
    qemu_mutex_unlock(&cpu->work_mutex);
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

#ifdef CONFIG_HAX
static void qemu_hax_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_wait_io_event_common(cpu);
}
#endif

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static void tcg_exec_all(void);

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    CPUState *remove_cpu = NULL;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    while (1) {
        tcg_exec_all();

        if (use_icount) {
            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

            if (deadline == 0) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
        CPU_FOREACH(cpu) {
            if (cpu->unplug && !cpu_can_run(cpu)) {
                remove_cpu = cpu;
                break;
            }
        }
        if (remove_cpu) {
            qemu_tcg_destroy_vcpu(remove_cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            remove_cpu = NULL;
        }
    }

    return NULL;
}

#ifdef CONFIG_HAX
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;
    qemu_thread_get_self(cpu->thread);
    qemu_mutex_lock(&qemu_global_mutex);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_hax_wait_io_event(cpu);
    }
    return NULL;
}
#endif

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }

# ifdef CONFIG_DARWIN
    /* The cpu thread cannot catch the signal reliably when shutting down
     * the guest on Mac, so double-check and resend it.
     */
    if (!exit_request) {
    // FIXME: check it soon
//        cpu_signal(0);
    }

    if (hax_enabled() && hax_ug_platform()) {
        cpu->exit_request = 1;
    }
# endif
#else /* _WIN32 */
# ifndef CONFIG_HAX
    abort();
# else
    // FIXME: check it soon
#if 0
    if (!qemu_cpu_is_self(cpu)) {
        CONTEXT tcgContext;

        if (SuspendThread(cpu->hThread) == (DWORD)-1) {
            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
                    GetLastError());
            exit(1);
        }

        /* On multi-core systems, we are not sure that the thread is actually
         * suspended until we can get the context.
         */
        tcgContext.ContextFlags = CONTEXT_CONTROL;
        while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
            continue;
        }

//        cpu_signal(0);

        if (hax_enabled() && hax_ug_platform()) {
            cpu->exit_request = 1;
        } else {
            abort();
        }
        if (ResumeThread(cpu->hThread) == (DWORD)-1) {
            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
                    GetLastError());
            exit(1);
        }
    }
#endif
    if (!qemu_cpu_is_self(cpu)) {
        if (hax_enabled() && hax_ug_platform()) {
            cpu->exit_request = 1;
        }
    }
# endif
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return 0;
        }
    }

    return 1;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
#ifdef CONFIG_HAX
    if (hax_enabled()) {
        hax_init_vcpu(cpu);
    }
#endif
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

#ifdef CONFIG_HAX
static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);

    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
#endif

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
#ifdef CONFIG_HAX
    } else if (hax_enabled() && hax_ug_platform()) {
        qemu_hax_start_vcpu(cpu);
#endif
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return blk_flush_all();
    }
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
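        /* The decrementer (icount_decr.u16.low) is only 16 bits wide, so a
         * budget above 65535 instructions spills into icount_extra; any
         * unexecuted remainder of both is subtracted back from
         * timers_state.qemu_icount once cpu_exec() returns.
         */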
    }
    ret = cpu_exec(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}

static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
    qemu_account_warp_timer();

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
        CPUState *cpu = next_cpu;

        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            if (cpu->unplug) {
                next_cpu = CPU_NEXT(cpu);
            }
            break;
        }
    }

    /* Pairs with smp_wmb in qemu_cpu_kick.  */
    atomic_mb_set(&exit_request, 0);
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay     NA\n");
        cpu_fprintf(f, "Max guest advance   NA\n");
    }
}